14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
49#include "ruby_assert.h"
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
67#undef rb_usascii_str_new
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
149#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
150#define TERM_FILL(ptr, termlen) do {\
151 char *const term_fill_ptr = (ptr);\
152 const int term_fill_len = (termlen);\
153 *term_fill_ptr = '\0';\
154 if (UNLIKELY(term_fill_len > 1))\
155 memset(term_fill_ptr, 0, term_fill_len);\
158#define RESIZE_CAPA(str,capacity) do {\
159 const int termlen = TERM_LEN(str);\
160 RESIZE_CAPA_TERM(str,capacity,termlen);\
162#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
163 if (STR_EMBED_P(str)) {\
164 if (str_embed_capa(str) < capacity + termlen) {\
165 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
166 const long tlen = RSTRING_LEN(str);\
167 memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
168 RSTRING(str)->as.heap.ptr = tmp;\
169 RSTRING(str)->len = tlen;\
170 STR_SET_NOEMBED(str);\
171 RSTRING(str)->as.heap.aux.capa = (capacity);\
175 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
176 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
177 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
178 RSTRING(str)->as.heap.aux.capa = (capacity);\
182#define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 rb_gc_register_pinning_obj(str); \
189 FL_SET((shared_str), STR_SHARED_ROOT); \
190 if (RBASIC_CLASS((shared_str)) == 0) \
191 FL_SET_RAW((shared_str), STR_BORROWED); \
195#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
196#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
199#define STR_ENC_GET(str) get_encoding(str)
202zero_filled(
const char *s,
int n)
205 if (*s++)
return false;
210#if !defined SHARABLE_MIDDLE_SUBSTRING
211# define SHARABLE_MIDDLE_SUBSTRING 0
215SHARABLE_SUBSTRING_P(
VALUE str,
long beg,
long len)
217#if SHARABLE_MIDDLE_SUBSTRING
220 long end = beg +
len;
221 long source_len = RSTRING_LEN(str);
222 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
227str_embed_capa(
VALUE str)
229 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
233rb_str_reembeddable_p(
VALUE str)
235 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239rb_str_embed_size(
long capa,
long termlen)
247rb_str_size_as_embedded(
VALUE str)
250 if (STR_EMBED_P(str)) {
252 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
254 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
258 else if (rb_str_reembeddable_p(str)) {
260 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
262 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
265 real_size =
sizeof(
struct RString);
272STR_EMBEDDABLE_P(
long len,
long termlen)
274 return rb_gc_size_allocatable_p(rb_str_embed_size(
len, termlen));
279static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
280static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
282static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
283static inline void str_modifiable(
VALUE str);
288str_make_independent(
VALUE str)
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str),
len, 0L, termlen);
295static inline int str_dependent_p(
VALUE str);
298rb_str_make_independent(
VALUE str)
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
306rb_str_make_embedded(
VALUE str)
311 int termlen = TERM_LEN(str);
312 char *buf =
RSTRING(str)->as.heap.ptr;
313 long old_capa =
RSTRING(str)->as.heap.aux.capa + termlen;
317 STR_SET_LEN(str,
len);
320 memcpy(RSTRING_PTR(str), buf,
len);
321 SIZED_FREE_N(buf, old_capa);
328rb_debug_rstring_null_ptr(
const char *func)
330 fprintf(stderr,
"%s is returning NULL!! "
331 "SIGSEGV is highly expected to follow immediately.\n"
332 "If you could reproduce, attach your debugger here, "
333 "and look at the passed string.\n",
338static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341get_encoding(
VALUE str)
347mustnot_broken(
VALUE str)
349 if (is_broken_string(str)) {
350 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
355mustnot_wchar(
VALUE str)
358 if (rb_enc_mbminlen(enc) > 1) {
359 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
363static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
365#if SIZEOF_LONG == SIZEOF_VOIDP
366#define PRECOMPUTED_FAKESTR_HASH 1
371BARE_STRING_P(
VALUE str)
376static inline st_index_t
377str_do_hash(
VALUE str)
379 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
381 if (e && !is_ascii_string(str)) {
388str_store_precomputed_hash(
VALUE str, st_index_t hash)
394 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
395 size_t free_bytes = str_embed_capa(str) - used_bytes;
399 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
401 FL_SET(str, STR_PRECOMPUTED_HASH);
414 if (
FL_TEST(str, RSTRING_FSTR))
417 bare = BARE_STRING_P(str);
419 if (STR_EMBED_P(str)) {
424 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
431 rb_str_resize(str, RSTRING_LEN(str));
433 fstr = register_fstring(str,
false,
false);
436 str_replace_shared_without_enc(str, fstr);
443static VALUE fstring_table_obj;
446fstring_concurrent_set_hash(
VALUE str)
448#ifdef PRECOMPUTED_FAKESTR_HASH
452 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
469 const char *aptr, *bptr;
476 return (alen == blen &&
478 memcmp(aptr, bptr, alen) == 0);
483 bool force_precompute_hash;
487fstring_concurrent_set_create(
VALUE str,
void *data)
497 long len = RSTRING_LEN(str);
498 long capa =
len +
sizeof(st_index_t);
499 int term_len = TERM_LEN(str);
501 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
503 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
504 STR_SET_LEN(new_str, RSTRING_LEN(str));
506 rb_enc_copy(new_str, str);
507 str_store_precomputed_hash(new_str, str_do_hash(str));
511 rb_enc_copy(new_str, str);
512#ifdef PRECOMPUTED_FAKESTR_HASH
513 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
514 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
528 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 if (STR_SHARED_P(str)) {
533 str_make_independent(str);
536 if (!BARE_STRING_P(str)) {
542 RBASIC(str)->flags |= RSTRING_FSTR;
544 RB_OBJ_SET_SHAREABLE(str);
558 .hash = fstring_concurrent_set_hash,
559 .cmp = fstring_concurrent_set_cmp,
560 .create = fstring_concurrent_set_create,
565Init_fstring_table(
void)
567 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
568 rb_gc_register_address(&fstring_table_obj);
572register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
576 .force_precompute_hash = force_precompute_hash
579#if SIZEOF_VOIDP == SIZEOF_LONG
583 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
587 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
589 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
601rb_obj_is_fstring_table(
VALUE obj)
605 return obj == fstring_table_obj;
609rb_gc_free_fstring(
VALUE obj)
611 ASSERT_vm_locking_with_barrier();
617 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
619 RB_DEBUG_COUNTER_INC(obj_str_fstr);
625rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
627 if (fstring_table_obj) {
628 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
633setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
636 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
649 return (
VALUE)fake_str;
658 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
667rb_fstring_new(
const char *ptr,
long len)
669 struct RString fake_str = {RBASIC_INIT};
670 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
676 struct RString fake_str = {RBASIC_INIT};
677 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
681rb_fstring_cstr(
const char *
ptr)
683 return rb_fstring_new(
ptr, strlen(
ptr));
687single_byte_optimizable(
VALUE str)
691 case ENCINDEX_ASCII_8BIT:
692 case ENCINDEX_US_ASCII:
714static inline const char *
715search_nonascii(
const char *p,
const char *e)
719#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
720# if SIZEOF_UINTPTR_T == 8
721# define NONASCII_MASK UINT64_C(0x8080808080808080)
722# elif SIZEOF_UINTPTR_T == 4
723# define NONASCII_MASK UINT32_C(0x80808080)
725# error "don't know what to do."
728# if SIZEOF_UINTPTR_T == 8
729# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
730# elif SIZEOF_UINTPTR_T == 4
731# define NONASCII_MASK 0x80808080UL
733# error "don't know what to do."
737 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
738#if !UNALIGNED_WORD_ACCESS
739 if ((uintptr_t)p % SIZEOF_VOIDP) {
740 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
745 case 7:
if (p[-7]&0x80)
return p-7;
746 case 6:
if (p[-6]&0x80)
return p-6;
747 case 5:
if (p[-5]&0x80)
return p-5;
748 case 4:
if (p[-4]&0x80)
return p-4;
750 case 3:
if (p[-3]&0x80)
return p-3;
751 case 2:
if (p[-2]&0x80)
return p-2;
752 case 1:
if (p[-1]&0x80)
return p-1;
757#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
758#define aligned_ptr(value) \
759 __builtin_assume_aligned((value), sizeof(uintptr_t))
761#define aligned_ptr(value) (value)
764 t = (e - (SIZEOF_VOIDP-1));
766 for (;s < t; s +=
sizeof(uintptr_t)) {
768 memcpy(&word, s,
sizeof(word));
769 if (word & NONASCII_MASK) {
770#ifdef WORDS_BIGENDIAN
771 return (
const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
773 return (
const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
783 case 7:
if (e[-7]&0x80)
return e-7;
784 case 6:
if (e[-6]&0x80)
return e-6;
785 case 5:
if (e[-5]&0x80)
return e-5;
786 case 4:
if (e[-4]&0x80)
return e-4;
788 case 3:
if (e[-3]&0x80)
return e-3;
789 case 2:
if (e[-2]&0x80)
return e-2;
790 case 1:
if (e[-1]&0x80)
return e-1;
798 const char *e = p +
len;
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
802 p = search_nonascii(p, e);
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
810 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p = search_nonascii(p, e);
820 int ret = rb_enc_precise_mbclen(p, e, enc);
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
839 p = search_nonascii(p, e);
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
850 int ret = rb_enc_precise_mbclen(p, e, enc);
857 p = search_nonascii(p, e);
863 int ret = rb_enc_precise_mbclen(p, e, enc);
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
896rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
926rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
928 str_enc_copy(dest, src);
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
941 return enc_coderange_scan(str, enc);
950 cr = enc_coderange_scan(str, get_encoding(str));
957rb_enc_str_asciicompat(
VALUE str)
960 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
968 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
977str_mod_check(
VALUE s,
const char *p,
long len)
979 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
985str_capacity(
VALUE str,
const int termlen)
987 if (STR_EMBED_P(str)) {
988 return str_embed_capa(str) - termlen;
990 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
994 return RSTRING(str)->as.heap.aux.capa;
1001 return str_capacity(str, TERM_LEN(str));
1005must_not_null(
const char *
ptr)
1008 rb_raise(rb_eArgError,
"NULL pointer given");
1013str_alloc_embed(
VALUE klass,
size_t capa)
1015 size_t size = rb_str_embed_size(
capa, 0);
1019 NEWOBJ_OF(str,
struct RString, klass,
1023 str->as.embed.ary[0] = 0;
1029str_alloc_heap(
VALUE klass)
1031 NEWOBJ_OF(str,
struct RString, klass,
1035 str->as.heap.aux.capa = 0;
1036 str->as.heap.ptr = NULL;
1042empty_str_alloc(
VALUE klass)
1044 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1045 VALUE str = str_alloc_embed(klass, 0);
1046 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1057 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1061 enc = rb_ascii8bit_encoding();
1064 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1066 int termlen = rb_enc_mbminlen(enc);
1068 if (STR_EMBEDDABLE_P(
len, termlen)) {
1069 str = str_alloc_embed(klass,
len + termlen);
1075 str = str_alloc_heap(klass);
1081 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1084 rb_enc_raw_set(str, enc);
1087 memcpy(RSTRING_PTR(str),
ptr,
len);
1090 memset(RSTRING_PTR(str), 0,
len);
1093 STR_SET_LEN(str,
len);
1094 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1101 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1136 __msan_unpoison_string(
ptr);
1156 if (rb_enc_mbminlen(enc) != 1) {
1157 rb_raise(rb_eArgError,
"wchar encoding given");
1159 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1163str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1168 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1172 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1175 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1176 str = str_alloc_heap(klass);
1180 RBASIC(str)->flags |= STR_NOFREE;
1181 rb_enc_associate_index(str, encindex);
1210static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1212 int ecflags,
VALUE ecopts);
1217 int encidx = rb_enc_to_index(enc);
1218 if (rb_enc_get_index(str) == encidx)
1219 return is_ascii_string(str);
1230 if (!to)
return str;
1231 if (!from) from = rb_enc_get(str);
1232 if (from == to)
return str;
1233 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1234 rb_is_ascii8bit_enc(to)) {
1235 if (STR_ENC_GET(str) != to) {
1237 rb_enc_associate(str, to);
1244 from, to, ecflags, ecopts);
1245 if (
NIL_P(newstr)) {
1253rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1258 olen = RSTRING_LEN(newstr);
1259 if (ofs < -olen || olen < ofs)
1261 if (ofs < 0) ofs += olen;
1263 STR_SET_LEN(newstr, ofs);
1267 rb_str_modify(newstr);
1268 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1276 STR_SET_LEN(str, 0);
1277 rb_enc_associate(str, enc);
1283str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1285 int ecflags,
VALUE ecopts)
1290 VALUE econv_wrapper;
1291 const unsigned char *start, *sp;
1292 unsigned char *dest, *dp;
1293 size_t converted_output = (size_t)ofs;
1298 RBASIC_CLEAR_CLASS(econv_wrapper);
1300 if (!ec)
return Qnil;
1303 sp = (
unsigned char*)
ptr;
1305 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1306 (dp = dest + converted_output),
1310 size_t converted_input = sp - start;
1311 size_t rest =
len - converted_input;
1312 converted_output = dp - dest;
1314 if (converted_input && converted_output &&
1315 rest < (LONG_MAX / converted_output)) {
1316 rest = (rest * converted_output) / converted_input;
1321 olen += rest < 2 ? 2 : rest;
1322 rb_str_resize(newstr, olen);
1329 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1331 rb_enc_associate(newstr, to);
1350 const int eidx = rb_enc_to_index(eenc);
1353 return rb_enc_str_new(
ptr,
len, eenc);
1357 if ((eidx == rb_ascii8bit_encindex()) ||
1358 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1362 ienc = rb_default_internal_encoding();
1363 if (!ienc || eenc == ienc) {
1364 return rb_enc_str_new(
ptr,
len, eenc);
1368 if ((eidx == rb_ascii8bit_encindex()) ||
1369 (eidx == rb_usascii_encindex()) ||
1370 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1371 return rb_enc_str_new(
ptr,
len, ienc);
1374 str = rb_enc_str_new(NULL, 0, ienc);
1377 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1378 rb_str_initialize(str,
ptr,
len, eenc);
1386 int eidx = rb_enc_to_index(eenc);
1387 if (eidx == rb_usascii_encindex() &&
1388 !is_ascii_string(str)) {
1389 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1392 rb_enc_associate_index(str, eidx);
1451str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1453 const int termlen = TERM_LEN(str);
1458 if (str_embed_capa(str2) >=
len + termlen) {
1459 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1460 STR_SET_EMBED(str2);
1461 memcpy(ptr2, RSTRING_PTR(str),
len);
1462 TERM_FILL(ptr2+
len, termlen);
1466 if (STR_SHARED_P(str)) {
1467 root =
RSTRING(str)->as.heap.aux.shared;
1476 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1478 rb_fatal(
"about to free a possible shared root");
1480 char *ptr2 = STR_HEAP_PTR(str2);
1482 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1485 FL_SET(str2, STR_NOEMBED);
1487 STR_SET_SHARED(str2, root);
1490 STR_SET_LEN(str2,
len);
1498 str_replace_shared_without_enc(str2, str);
1499 rb_enc_cr_str_exact_copy(str2, str);
1506 return str_replace_shared(str_alloc_heap(klass), str);
1523rb_str_new_frozen_String(
VALUE orig)
1531rb_str_frozen_bare_string(
VALUE orig)
1533 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1538rb_str_tmp_frozen_acquire(
VALUE orig)
1541 return str_new_frozen_buffer(0, orig, FALSE);
1545rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1547 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1548 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1550 VALUE str = str_alloc_heap(0);
1553 FL_SET(str, STR_SHARED_ROOT);
1555 size_t capa = str_capacity(orig, TERM_LEN(orig));
1561 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1562 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1569 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1570 RBASIC(orig)->flags &= ~STR_NOFREE;
1571 STR_SET_SHARED(orig, str);
1573 RB_OBJ_SET_SHAREABLE(str);
1579 RSTRING(str)->as.heap.aux.capa =
capa + (TERM_LEN(orig) - TERM_LEN(str));
1585rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1590 if (STR_EMBED_P(tmp)) {
1593 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1599 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1603 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1604 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1609 STR_SET_LEN(tmp, 0);
1617 return str_new_frozen_buffer(klass, orig, TRUE);
1627 VALUE str = str_alloc_heap(klass);
1628 STR_SET_LEN(str, RSTRING_LEN(orig));
1629 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1630 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1631 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1632 RBASIC(orig)->flags &= ~STR_NOFREE;
1633 STR_SET_SHARED(orig, str);
1640str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1644 long len = RSTRING_LEN(orig);
1645 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1646 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1648 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1649 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1655 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1656 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1662 if ((ofs > 0) || (rest > 0) ||
1665 str = str_new_shared(klass,
shared);
1667 RSTRING(str)->as.heap.ptr += ofs;
1668 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1676 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1677 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1679 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1680 STR_SET_LEN(str, RSTRING_LEN(orig));
1686 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1689 str = heap_str_make_shared(klass, orig);
1694 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1706str_new_empty_String(
VALUE str)
1709 rb_enc_copy(v, str);
1713#define STR_BUF_MIN_SIZE 63
1718 if (STR_EMBEDDABLE_P(
capa, 1)) {
1726 RSTRING(str)->as.heap.ptr[0] =
'\0';
1746 return str_new(0, 0,
len);
1752 if (STR_EMBED_P(str)) {
1753 RB_DEBUG_COUNTER_INC(obj_str_embed);
1755 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1756 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1757 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1760 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1761 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1766rb_str_memsize(
VALUE str)
1768 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1769 return STR_HEAP_SIZE(str);
1779 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1782static inline void str_discard(
VALUE str);
1783static void str_shared_replace(
VALUE str,
VALUE str2);
1788 if (str != str2) str_shared_replace(str, str2);
1799 enc = STR_ENC_GET(str2);
1802 termlen = rb_enc_mbminlen(enc);
1804 STR_SET_LEN(str, RSTRING_LEN(str2));
1806 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1808 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1809 rb_enc_associate(str, enc);
1813 if (STR_EMBED_P(str2)) {
1815 long len = RSTRING_LEN(str2);
1818 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1819 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1820 RSTRING(str2)->as.heap.ptr = new_ptr;
1821 STR_SET_LEN(str2,
len);
1823 STR_SET_NOEMBED(str2);
1826 STR_SET_NOEMBED(str);
1828 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1830 if (
FL_TEST(str2, STR_SHARED)) {
1832 STR_SET_SHARED(str,
shared);
1835 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1839 STR_SET_EMBED(str2);
1840 RSTRING_PTR(str2)[0] = 0;
1841 STR_SET_LEN(str2, 0);
1842 rb_enc_associate(str, enc);
1856 return rb_obj_as_string_result(str, obj);
1872 len = RSTRING_LEN(str2);
1873 if (STR_SHARED_P(str2)) {
1876 STR_SET_NOEMBED(str);
1877 STR_SET_LEN(str,
len);
1878 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1879 STR_SET_SHARED(str,
shared);
1880 rb_enc_cr_str_exact_copy(str, str2);
1883 str_replace_shared(str, str2);
1892 size_t size = rb_str_embed_size(
capa, 0);
1896 NEWOBJ_OF(str,
struct RString, klass,
1907 NEWOBJ_OF(str,
struct RString, klass,
1910 str->as.heap.aux.capa = 0;
1911 str->as.heap.ptr = NULL;
1921 encidx = rb_enc_get_index(str);
1922 flags &= ~ENCODING_MASK;
1925 if (encidx) rb_enc_associate_index(dup, encidx);
1935 long len = RSTRING_LEN(str);
1940 STR_SET_LEN(dup, RSTRING_LEN(str));
1941 return str_duplicate_setup_encoding(str, dup, flags);
1950 root =
RSTRING(str)->as.heap.aux.shared;
1953 root = str = str_new_frozen(klass, str);
1959 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1961 STR_SET_SHARED(dup, root);
1962 flags |= RSTRING_NOEMBED | STR_SHARED;
1964 STR_SET_LEN(dup, RSTRING_LEN(str));
1965 return str_duplicate_setup_encoding(str, dup, flags);
1971 if (STR_EMBED_P(str)) {
1972 return str_duplicate_setup_embed(klass, str, dup);
1975 return str_duplicate_setup_heap(klass, str, dup);
1983 if (STR_EMBED_P(str)) {
1984 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 dup = str_alloc_heap(klass);
1990 return str_duplicate_setup(klass, str, dup);
2001rb_str_dup_m(
VALUE str)
2003 if (LIKELY(BARE_STRING_P(str))) {
2014 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2021 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2025 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2026 str_duplicate_setup_embed(klass, str, new_str);
2029 new_str = ec_str_alloc_heap(ec, klass);
2030 str_duplicate_setup_heap(klass, str, new_str);
2039rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2041 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, venc, vcapa;
2071 if (!keyword_ids[0]) {
2072 keyword_ids[0] = rb_id_encoding();
2073 CONST_ID(keyword_ids[1],
"capacity");
2081 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2082 enc = rb_to_encoding(venc);
2084 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2087 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2089 if (
capa < STR_BUF_MIN_SIZE) {
2090 capa = STR_BUF_MIN_SIZE;
2094 len = RSTRING_LEN(orig);
2098 if (orig == str) n = 0;
2100 str_modifiable(str);
2101 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2103 const size_t size = (size_t)
capa + termlen;
2104 const char *
const old_ptr = RSTRING_PTR(str);
2105 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2106 char *new_ptr =
ALLOC_N(
char, size);
2107 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2108 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2110 RSTRING(str)->as.heap.ptr = new_ptr;
2112 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2113 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2114 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2116 STR_SET_LEN(str,
len);
2119 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2120 rb_enc_cr_str_exact_copy(str, orig);
2122 FL_SET(str, STR_NOEMBED);
2129 rb_enc_associate(str, enc);
2141rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2147 static ID keyword_ids[2];
2157 keyword_ids[0] = rb_id_encoding();
2158 CONST_ID(keyword_ids[1],
"capacity");
2160 encoding = kwargs[0];
2161 capacity = kwargs[1];
2170 if (UNDEF_P(encoding)) {
2172 encoding = rb_obj_encoding(orig);
2176 if (!UNDEF_P(encoding)) {
2177 enc = rb_to_encoding(encoding);
2181 if (UNDEF_P(capacity)) {
2183 VALUE empty_str = str_new(klass,
"", 0);
2185 rb_enc_associate(empty_str, enc);
2189 VALUE copy = str_duplicate(klass, orig);
2190 rb_enc_associate(copy, enc);
2203 if (orig_capa >
capa) {
2208 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2209 STR_SET_LEN(str, 0);
2220#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2235static inline uintptr_t
2236count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2241 d = (d>>6) | (~d>>7);
2242 d &= NONASCII_MASK >> 7;
2245#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2247 return rb_popcount_intptr(d);
2251# if SIZEOF_VOIDP == 8
2260enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2266 long diff = (long)(e - p);
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2272 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2273 const uintptr_t *s, *t;
2274 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2275 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2276 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2277 while (p < (
const char *)s) {
2278 if (is_utf8_lead_byte(*p))
len++;
2282 len += count_utf8_lead_bytes_with_word(s);
2285 p = (
const char *)s;
2288 if (is_utf8_lead_byte(*p))
len++;
2294 else if (rb_enc_asciicompat(enc)) {
2299 q = search_nonascii(p, e);
2305 p += rb_enc_fast_mbclen(p, e, enc);
2312 q = search_nonascii(p, e);
2318 p += rb_enc_mbclen(p, e, enc);
2325 for (c=0; p<e; c++) {
2326 p += rb_enc_mbclen(p, e, enc);
2341rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2349 long diff = (long)(e - p);
2350 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2352 else if (rb_enc_asciicompat(enc)) {
2356 q = search_nonascii(p, e);
2364 ret = rb_enc_precise_mbclen(p, e, enc);
2379 for (c=0; p<e; c++) {
2380 ret = rb_enc_precise_mbclen(p, e, enc);
2387 if (p + rb_enc_mbminlen(enc) <= e)
2388 p += rb_enc_mbminlen(enc);
2404 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2405 if (!enc) enc = STR_ENC_GET(str);
2406 p = RSTRING_PTR(str);
2411 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2416 return enc_strlen(p, e, enc, cr);
2423 return str_strlen(str, NULL);
2437 return LONG2NUM(str_strlen(str, NULL));
2449rb_str_bytesize(
VALUE str)
2468rb_str_empty(
VALUE str)
2470 return RBOOL(RSTRING_LEN(str) == 0);
2489 char *ptr1, *ptr2, *ptr3;
2494 enc = rb_enc_check_str(str1, str2);
2497 termlen = rb_enc_mbminlen(enc);
2498 if (len1 > LONG_MAX - len2) {
2499 rb_raise(rb_eArgError,
"string size too big");
2501 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2502 ptr3 = RSTRING_PTR(str3);
2503 memcpy(ptr3, ptr1, len1);
2504 memcpy(ptr3+len1, ptr2, len2);
2505 TERM_FILL(&ptr3[len1+len2], termlen);
2521 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2524 int enc1 = rb_enc_get_index(str1);
2525 int enc2 = rb_enc_get_index(str2);
2530 else if (enc2 < 0) {
2533 else if (enc1 != enc2) {
2536 else if (len1 > LONG_MAX - len2) {
2570 rb_enc_copy(str2, str);
2575 rb_raise(rb_eArgError,
"negative argument");
2577 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2578 if (STR_EMBEDDABLE_P(
len, 1)) {
2580 memset(RSTRING_PTR(str2), 0,
len + 1);
2587 STR_SET_LEN(str2,
len);
2588 rb_enc_copy(str2, str);
2591 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2592 rb_raise(rb_eArgError,
"argument too big");
2595 len *= RSTRING_LEN(str);
2596 termlen = TERM_LEN(str);
2598 ptr2 = RSTRING_PTR(str2);
2600 n = RSTRING_LEN(str);
2601 memcpy(ptr2, RSTRING_PTR(str), n);
2602 while (n <=
len/2) {
2603 memcpy(ptr2 + n, ptr2, n);
2606 memcpy(ptr2 + n, ptr2,
len-n);
2608 STR_SET_LEN(str2,
len);
2609 TERM_FILL(&ptr2[
len], termlen);
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2649rb_check_lockedtmp(
VALUE str)
2651 if (
FL_TEST(str, STR_TMPLOCK)) {
2658#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2660str_modifiable(
VALUE str)
2664 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2665 if (CHILLED_STRING_P(str)) {
2666 CHILLED_STRING_MUTATED(str);
2668 rb_check_lockedtmp(str);
2669 rb_check_frozen(str);
2674str_dependent_p(
VALUE str)
2676 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2686#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2688str_independent(
VALUE str)
2692 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2693 str_modifiable(str);
2694 return !str_dependent_p(str);
2700str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2710 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2715 STR_SET_LEN(str,
len);
2720 oldptr = RSTRING_PTR(str);
2722 memcpy(
ptr, oldptr,
len);
2724 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2725 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2727 STR_SET_NOEMBED(str);
2728 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2729 TERM_FILL(
ptr +
len, termlen);
2731 STR_SET_LEN(str,
len);
2738 if (!str_independent(str))
2739 str_make_independent(str);
2748 int termlen = TERM_LEN(str);
2749 long len = RSTRING_LEN(str);
2752 rb_raise(rb_eArgError,
"negative expanding string size");
2754 if (expand >= LONG_MAX -
len) {
2755 rb_raise(rb_eArgError,
"string size too big");
2758 if (!str_independent(str)) {
2759 str_make_independent_expand(str,
len, expand, termlen);
2761 else if (expand > 0) {
2762 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2769str_modify_keep_cr(
VALUE str)
2771 if (!str_independent(str))
2772 str_make_independent(str);
/*
 * Drops the contents of `str`.
 *
 * str_modifiable() is called first.  Then, for a string that is not
 * embedded and has neither STR_SHARED nor STR_NOFREE set, the heap buffer
 * is released with SIZED_FREE_N(), the heap pointer is reset to 0 and the
 * length is set to 0.
 *
 * NOTE(review): the end of the function is not visible in this excerpt.
 */
2779str_discard(
VALUE str)
2781 str_modifiable(str);
/* Only a non-embedded buffer without STR_SHARED/STR_NOFREE is freed. */
2782 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2783 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/* Clear the pointer just freed and leave the string with length 0. */
2784 RSTRING(str)->as.heap.ptr = 0;
2785 STR_SET_LEN(str, 0);
2792 int encindex = rb_enc_get_index(str);
2794 if (RB_UNLIKELY(encindex == -1)) {
2798 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2803 if (!rb_enc_asciicompat(enc)) {
2825 return RSTRING_PTR(str);
/*
 * Scans the `len` bytes starting at `s` for a position accepted by
 * zero_filled(s, minlen) and returns a pointer to the first such position.
 *
 * s      - start of the buffer to scan
 * len    - number of bytes in the buffer
 * minlen - number of bytes passed to zero_filled() at each position
 *          (presumably the width of a NUL character in `enc` -- TODO confirm)
 * enc    - encoding used to step from one character to the next
 *
 * The value returned when nothing matches is not visible in this excerpt.
 */
2829str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2831 const char *e = s +
len;
/* Advance by rb_enc_mbclen() per step; stop once fewer than `minlen`
 * bytes remain before `e`. */
2833 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2834 if (zero_filled(s, minlen))
return s;
2840str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2845 if (str_dependent_p(str)) {
2846 if (!zero_filled(s +
len, termlen))
2847 str_make_independent_expand(str,
len, 0L, termlen);
2850 TERM_FILL(s +
len, termlen);
2853 return RSTRING_PTR(str);
2857rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2859 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2860 long len = RSTRING_LEN(str);
2864 rb_check_lockedtmp(str);
2865 str_make_independent_expand(str,
len, 0L, termlen);
2867 else if (str_dependent_p(str)) {
2868 if (termlen > oldtermlen)
2869 str_make_independent_expand(str,
len, 0L, termlen);
2872 if (!STR_EMBED_P(str)) {
2877 if (termlen > oldtermlen) {
2878 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2886str_null_check(
VALUE str,
int *w)
2888 char *s = RSTRING_PTR(str);
2889 long len = RSTRING_LEN(str);
2892 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2894 minlen = rb_enc_mbminlen(enc);
2898 if (str_null_char(s,
len, minlen, enc)) {
2901 return str_fill_term(str, s,
len, minlen);
2906 if (!s || memchr(s, 0,
len)) {
2910 s = str_fill_term(str, s,
len, minlen);
2916rb_str_null_check(
VALUE str)
2924 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2925 if (!s || memchr(s, 0,
len)) {
2926 rb_raise(rb_eArgError,
"string contains null byte");
2931 const char *s = str_null_check(str, &w);
2934 rb_raise(rb_eArgError,
"string contains null char");
2936 rb_raise(rb_eArgError,
"string contains null byte");
2944rb_str_to_cstr(
VALUE str)
2947 return str_null_check(str, &w);
2955 char *s = str_null_check(str, &w);
2958 rb_raise(rb_eArgError,
"string contains null char");
2960 rb_raise(rb_eArgError,
"string contains null byte");
/*
 * Thin wrapper around str_fill_term(): passes the string's current data
 * pointer and byte length, with `newminlen` as the terminator length, and
 * returns str_fill_term()'s result unchanged.
 *
 * str       - the string to operate on
 * newminlen - forwarded as str_fill_term()'s `termlen` argument
 */
2966rb_str_fill_terminator(
VALUE str,
const int newminlen)
2968 char *s = RSTRING_PTR(str);
2969 long len = RSTRING_LEN(str);
2970 return str_fill_term(str, s,
len, newminlen);
2976 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
3002str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
3011 else if (rb_enc_asciicompat(enc)) {
3012 const char *p2, *e2;
3015 while (p < e && 0 < nth) {
3022 p2 = search_nonascii(p, e2);
3031 n = rb_enc_mbclen(p, e, enc);
3042 while (p < e && nth--) {
3043 p += rb_enc_mbclen(p, e, enc);
3054 return str_nth_len(p, e, &nth, enc);
3058str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3063 p = str_nth_len(p, e, &nth, enc);
/*
 * Computes an offset relative to `p` for position `nth` within the byte
 * range [p, e), using str_nth() to find the position.
 *
 * When str_nth() returns NULL the result is `e - p`, the full byte length
 * of the range.  The return statement for the non-NULL case is not visible
 * in this excerpt.
 *
 * All five arguments are forwarded to str_nth() unchanged.
 */
3072str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3074 const char *pp = str_nth(p, e, nth, enc, singlebyte);
/* No position found: fall back to the end of the range. */
3075 if (!pp)
return e - p;
3082 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3083 STR_ENC_GET(str), single_byte_optimizable(str));
3088str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3091 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3092 const uintptr_t *s, *t;
3093 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3094 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3095 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3096 while (p < (
const char *)s) {
3097 if (is_utf8_lead_byte(*p)) nth--;
3101 nth -= count_utf8_lead_bytes_with_word(s);
3103 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3107 if (is_utf8_lead_byte(*p)) {
3108 if (nth == 0)
break;
3118str_utf8_offset(
const char *p,
const char *e,
long nth)
3120 const char *pp = str_utf8_nth(p, e, &nth);
3129 if (single_byte_optimizable(str) || pos < 0)
3132 char *p = RSTRING_PTR(str);
3133 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3138str_subseq(
VALUE str,
long beg,
long len)
3146 const int termlen = TERM_LEN(str);
3147 if (!SHARABLE_SUBSTRING_P(str, beg,
len)) {
3148 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg,
len, rb_str_enc_get(str));
3157 if (str_embed_capa(str2) >=
len + termlen) {
3158 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3159 STR_SET_EMBED(str2);
3160 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3161 TERM_FILL(ptr2+
len, termlen);
3163 STR_SET_LEN(str2,
len);
3171 str_replace_shared(str2, str);
3177 RSTRING(str2)->as.heap.ptr += beg;
3178 if (RSTRING_LEN(str2) >
len) {
3179 STR_SET_LEN(str2,
len);
3189 VALUE str2 = str_subseq(str, beg,
len);
3190 rb_enc_cr_str_copy_for_substr(str2, str);
3199 const long blen = RSTRING_LEN(str);
3201 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3203 if (
len < 0)
return 0;
3204 if (beg < 0 && -beg < 0)
return 0;
3208 if (single_byte_optimizable(str)) {
3209 if (beg > blen)
return 0;
3212 if (beg < 0)
return 0;
3214 if (
len > blen - beg)
3216 if (
len < 0)
return 0;
3221 if (
len > -beg)
len = -beg;
3225 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3228 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3234 slen = str_strlen(str, enc);
3236 if (beg < 0)
return 0;
3238 if (
len == 0)
goto end;
3241 else if (beg > 0 && beg > blen) {
3245 if (beg > str_strlen(str, enc))
return 0;
3250 enc == rb_utf8_encoding()) {
3251 p = str_utf8_nth(s, e, &beg);
3252 if (beg > 0)
return 0;
3253 len = str_utf8_offset(p, e,
len);
3259 p = s + beg * char_sz;
3263 else if (
len * char_sz > e - p)
3268 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3269 if (beg > 0)
return 0;
3273 len = str_offset(p, e,
len, enc, 0);
3281static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3286 return str_substr(str, beg,
len, TRUE);
3296str_substr(
VALUE str,
long beg,
long len,
int empty)
3300 if (!p)
return Qnil;
3301 if (!
len && !empty)
return Qnil;
3303 beg = p - RSTRING_PTR(str);
3305 VALUE str2 = str_subseq(str, beg,
len);
3306 rb_enc_cr_str_copy_for_substr(str2, str);
3314 if (CHILLED_STRING_P(str)) {
3319 rb_str_resize(str, RSTRING_LEN(str));
3337 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3380str_uminus(
VALUE str)
3385 return rb_fstring(str);
/* From this point in the file, the name rb_str_dup_frozen expands to
 * rb_str_new_frozen (the name is #undef'd near the top of the file). */
3389#define rb_str_dup_frozen rb_str_new_frozen
3394 rb_check_frozen(str);
3395 if (
FL_TEST(str, STR_TMPLOCK)) {
3398 FL_SET(str, STR_TMPLOCK);
3405 rb_check_frozen(str);
3406 if (!
FL_TEST(str, STR_TMPLOCK)) {
3426 const int termlen = TERM_LEN(str);
3428 str_modifiable(str);
3429 if (STR_SHARED_P(str)) {
3432 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3433 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3444 else if (
len > RSTRING_LEN(str)) {
3448 const char *
const new_end = RSTRING_PTR(str) +
len;
3458 else if (
len < RSTRING_LEN(str)) {
3466 STR_SET_LEN(str,
len);
3467 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3474 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3477 int independent = str_independent(str);
3478 long slen = RSTRING_LEN(str);
3479 const int termlen = TERM_LEN(str);
3481 if (slen >
len || (termlen != 1 && slen <
len)) {
3487 if (STR_EMBED_P(str)) {
3488 if (
len == slen)
return str;
3489 if (str_embed_capa(str) >=
len + termlen) {
3490 STR_SET_LEN(str,
len);
3494 str_make_independent_expand(str, slen,
len - slen, termlen);
3496 else if (str_embed_capa(str) >=
len + termlen) {
3498 char *
ptr = STR_HEAP_PTR(str);
3500 if (slen >
len) slen =
len;
3503 STR_SET_LEN(str,
len);
3505 SIZED_FREE_N(
ptr,
capa + termlen);
3509 else if (!independent) {
3510 if (
len == slen)
return str;
3511 str_make_independent_expand(str, slen,
len - slen, termlen);
3515 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3516 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3519 else if (
len == slen)
return str;
3520 STR_SET_LEN(str,
len);
3527str_ensure_available_capa(
VALUE str,
long len)
3529 str_modify_keep_cr(str);
3531 const int termlen = TERM_LEN(str);
3532 long olen = RSTRING_LEN(str);
3534 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3535 rb_raise(rb_eArgError,
"string sizes too big");
3538 long total = olen +
len;
3539 long capa = str_capacity(str, termlen);
3542 if (total >= LONG_MAX / 2) {
3545 while (total >
capa) {
3548 RESIZE_CAPA_TERM(str,
capa, termlen);
3553str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3556 str_modify_keep_cr(str);
3561 if (
len == 0)
return 0;
3563 long total, olen,
off = -1;
3565 const int termlen = TERM_LEN(str);
3568 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3572 long capa = str_capacity(str, termlen);
3574 if (olen > LONG_MAX -
len) {
3575 rb_raise(rb_eArgError,
"string sizes too big");
3579 if (total >= LONG_MAX / 2) {
3582 while (total >
capa) {
3585 RESIZE_CAPA_TERM(str,
capa, termlen);
3586 sptr = RSTRING_PTR(str);
3591 memcpy(sptr + olen,
ptr,
len);
3592 STR_SET_LEN(str, total);
3593 TERM_FILL(sptr + total, termlen);
/* str_buf_cat: shorthand for str_buf_cat4() with keep_cr fixed to false. */
3598#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
/* str_buf_cat2: like str_buf_cat, but the length is taken from
 * rb_strlen_lit(ptr).  Presumably `ptr` must be a string literal for that
 * to be valid -- TODO confirm against rb_strlen_lit's definition. */
3599#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3604 if (
len == 0)
return str;
3606 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3608 return str_buf_cat(str,
ptr,
len);
3619rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3624 if (UNLIKELY(!str_independent(str))) {
3625 str_make_independent(str);
3628 long string_length = -1;
3629 const int null_terminator_length = 1;
3634 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3635 rb_raise(rb_eArgError,
"string sizes too big");
3638 long string_capacity = str_capacity(str, null_terminator_length);
3644 if (LIKELY(string_capacity >= string_length + 1)) {
3646 sptr[string_length] = byte;
3647 STR_SET_LEN(str, string_length + 1);
3648 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3652 str_buf_cat(str, (
char *)&
byte, 1);
3668 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3679rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3680 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3689 if (str_encindex == ptr_encindex) {
3691 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3695 str_enc = rb_enc_from_index(str_encindex);
3696 ptr_enc = rb_enc_from_index(ptr_encindex);
3697 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3700 if (RSTRING_LEN(str) == 0) {
3703 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3709 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3718 *ptr_cr_ret = ptr_cr;
3720 if (str_encindex != ptr_encindex &&
3723 str_enc = rb_enc_from_index(str_encindex);
3724 ptr_enc = rb_enc_from_index(ptr_encindex);
3729 res_encindex = str_encindex;
3734 res_encindex = str_encindex;
3738 res_encindex = ptr_encindex;
3743 res_encindex = str_encindex;
3750 res_encindex = str_encindex;
3756 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3758 str_buf_cat(str,
ptr,
len);
3764 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3771 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3781 if (rb_enc_asciicompat(enc)) {
3782 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3788 unsigned int c = (
unsigned char)*
ptr;
3789 int len = rb_enc_codelen(c, enc);
3790 rb_enc_mbcput(c, buf, enc);
3791 rb_enc_cr_str_buf_cat(str, buf,
len,
3804 if (rb_str_enc_fastpath(str)) {
3808 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3814 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3825 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3841rb_str_concat_literals(
size_t num,
const VALUE *strary)
3845 unsigned long len = 1;
3850 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3852 str_enc_copy_direct(str, strary[0]);
3854 for (i = s; i < num; ++i) {
3855 const VALUE v = strary[i];
3859 if (encidx != ENCINDEX_US_ASCII) {
3861 rb_enc_set_index(str, encidx);
3874rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3876 str_modifiable(str);
3881 else if (argc > 1) {
3884 rb_enc_copy(arg_str, str);
3885 for (i = 0; i < argc; i++) {
3920rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3922 long needed_capacity = 0;
3926 for (
int index = 0; index < argc; index++) {
3927 VALUE obj = argv[index];
3935 needed_capacity += RSTRING_LEN(obj);
3940 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3947 str_ensure_available_capa(str, needed_capacity);
3950 for (
int index = 0; index < argc; index++) {
3951 VALUE obj = argv[index];
3956 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3957 char byte = (char)(
NUM2INT(obj) & 0xFF);
3971 rb_bug(
"append_as_bytes arguments should have been validated");
3975 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3976 TERM_FILL(sptr, TERM_LEN(str));
3981 for (
int index = 0; index < argc; index++) {
3982 VALUE obj = argv[index];
3999 rb_bug(
"append_as_bytes arguments should have been validated");
4078 if (rb_num_to_uint(str2, &code) == 0) {
4091 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4094 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4097 long pos = RSTRING_LEN(str1);
4102 switch (
len = rb_enc_codelen(code, enc)) {
4103 case ONIGERR_INVALID_CODE_POINT_VALUE:
4104 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4106 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4112 rb_enc_mbcput(code, buf, enc);
4113 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4114 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4116 rb_str_resize(str1, pos+
len);
4117 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4130rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4132 int encidx = rb_enc_to_index(enc);
4134 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4139 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4140 return ENCINDEX_ASCII_8BIT;
4162rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4164 str_modifiable(str);
4169 else if (argc > 1) {
4172 rb_enc_copy(arg_str, str);
4173 for (i = 0; i < argc; i++) {
4186 st_index_t precomputed_hash;
4187 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4189 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4190 return precomputed_hash;
4193 return str_do_hash(str);
4200 const char *ptr1, *ptr2;
4203 return (len1 != len2 ||
4205 memcmp(ptr1, ptr2, len1) != 0);
4217rb_str_hash_m(
VALUE str)
/* Yields the smaller of `a` and `b` (`a` when they compare equal).
 * Each argument appears twice in the expansion, so the one that is
 * selected is evaluated twice: avoid arguments with side effects. */
4223#define lesser(a,b) (((a)>(b))?(b):(a))
4231 if (RSTRING_LEN(str1) == 0)
return TRUE;
4232 if (RSTRING_LEN(str2) == 0)
return TRUE;
4235 if (idx1 == idx2)
return TRUE;
4240 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4244 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4254 const char *ptr1, *ptr2;
4257 if (str1 == str2)
return 0;
4260 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4269 if (len1 > len2)
return 1;
4272 if (retval > 0)
return 1;
4306 if (str1 == str2)
return Qtrue;
4313 return rb_str_eql_internal(str1, str2);
4327 if (str1 == str2)
return Qtrue;
4329 return rb_str_eql_internal(str1, str2);
4367 return rb_invcmp(str1, str2);
4409 return str_casecmp(str1, s);
4417 const char *p1, *p1end, *p2, *p2end;
4419 enc = rb_enc_compatible(str1, str2);
4424 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4425 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4426 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4427 while (p1 < p1end && p2 < p2end) {
4429 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4430 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4432 return INT2FIX(c1 < c2 ? -1 : 1);
4439 while (p1 < p1end && p2 < p2end) {
4440 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4441 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4443 if (0 <= c1 && 0 <= c2) {
4447 return INT2FIX(c1 < c2 ? -1 : 1);
4451 l1 = rb_enc_mbclen(p1, p1end, enc);
4452 l2 = rb_enc_mbclen(p2, p2end, enc);
4453 len = l1 < l2 ? l1 : l2;
4454 r = memcmp(p1, p2,
len);
4456 return INT2FIX(r < 0 ? -1 : 1);
4458 return INT2FIX(l1 < l2 ? -1 : 1);
4464 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4465 if (p1 == p1end)
return INT2FIX(-1);
4498 return str_casecmp_p(str1, s);
4505 VALUE folded_str1, folded_str2;
4506 VALUE fold_opt = sym_fold;
4508 enc = rb_enc_compatible(str1, str2);
4513 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4514 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4516 return rb_str_eql(folded_str1, folded_str2);
4520strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4521 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4523 const char *search_start = str_ptr;
4524 long pos, search_len = str_len - offset;
4528 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4529 if (pos < 0)
return pos;
4531 if (t == search_start + pos)
break;
4532 search_len -= t - search_start;
4533 if (search_len <= 0)
return -1;
4534 offset += t - search_start;
4537 return pos + offset;
/* rb_str_index: rb_strseq_index() with in_byte = 0.  In that mode
 * rb_strseq_index() converts `offset` with str_offset() before searching. */
4541#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
/* rb_str_byteindex: rb_strseq_index() with in_byte = 1.  In that mode
 * `offset` is used without the str_offset() conversion and lengths are the
 * raw byte lengths. */
4542#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4545rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4547 const char *str_ptr, *str_ptr_end, *sub_ptr;
4548 long str_len, sub_len;
4551 enc = rb_enc_check(str, sub);
4552 if (is_broken_string(sub))
return -1;
4554 str_ptr = RSTRING_PTR(str);
4556 str_len = RSTRING_LEN(str);
4557 sub_ptr = RSTRING_PTR(sub);
4558 sub_len = RSTRING_LEN(sub);
4560 if (str_len < sub_len)
return -1;
4563 long str_len_char, sub_len_char;
4564 int single_byte = single_byte_optimizable(str);
4565 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4566 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4568 offset += str_len_char;
4569 if (offset < 0)
return -1;
4571 if (str_len_char - offset < sub_len_char)
return -1;
4572 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4575 if (sub_len == 0)
return offset;
4578 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4591rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4598 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4599 long slen = str_strlen(str, enc);
4601 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4613 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4614 enc, single_byte_optimizable(str));
4625 pos = rb_str_index(str, sub, pos);
4639str_ensure_byte_pos(
VALUE str,
long pos)
4641 if (!single_byte_optimizable(str)) {
4642 const char *s = RSTRING_PTR(str);
4644 const char *p = s + pos;
4645 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4647 "offset %ld does not land on character boundary", pos);
4720rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4726 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4727 long slen = RSTRING_LEN(str);
4729 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4740 str_ensure_byte_pos(str, pos);
4752 pos = rb_str_byteindex(str, sub, pos);
4753 if (pos >= 0)
return LONG2NUM(pos);
/*
 * Searches the `search_len` bytes starting at `search_str` backwards, from
 * the last byte toward the first, and returns a pointer to the first byte
 * found that equals `chr` (i.e. the last occurrence in memory order).
 *
 * Each byte is read as unsigned char before being compared with `chr`, so
 * `chr` has to be in unsigned char range for a match to be possible.
 *
 * The value returned when no byte matches is not visible in this excerpt.
 */
4760memrchr(
const char *search_str,
int chr,
long search_len)
/* Start one past the last byte; the pre-decrement below steps onto it. */
4762 const char *ptr = search_str + search_len;
4763 while (ptr > search_str) {
4764 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4774 char *hit, *adjusted;
4776 long slen, searchlen;
4779 sbeg = RSTRING_PTR(str);
4780 slen = RSTRING_LEN(sub);
4781 if (slen == 0)
return s - sbeg;
4783 t = RSTRING_PTR(sub);
4785 searchlen = s - sbeg + 1;
4787 if (memcmp(s, t, slen) == 0) {
4792 hit = memrchr(sbeg, c, searchlen);
4795 if (hit != adjusted) {
4796 searchlen = adjusted - sbeg;
4799 if (memcmp(hit, t, slen) == 0)
4801 searchlen = adjusted - sbeg;
4802 }
while (searchlen > 0);
4816 enc = rb_enc_check(str, sub);
4817 if (is_broken_string(sub))
return -1;
4818 singlebyte = single_byte_optimizable(str);
4819 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4820 slen = str_strlen(sub, enc);
4823 if (
len < slen)
return -1;
4824 if (
len - pos < slen) pos =
len - slen;
4825 if (
len == 0)
return pos;
4827 sbeg = RSTRING_PTR(str);
4830 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4836 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4837 return str_rindex(str, sub, s, enc);
4849rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4854 long pos,
len = str_strlen(str, enc);
4856 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4858 if (pos < 0 && (pos +=
len) < 0) {
4864 if (pos >
len) pos =
len;
4872 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4873 enc, single_byte_optimizable(str));
4884 pos = rb_str_rindex(str, sub, pos);
4894rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4900 enc = rb_enc_check(str, sub);
4901 if (is_broken_string(sub))
return -1;
4902 len = RSTRING_LEN(str);
4903 slen = RSTRING_LEN(sub);
4906 if (
len < slen)
return -1;
4907 if (
len - pos < slen) pos =
len - slen;
4908 if (
len == 0)
return pos;
4910 sbeg = RSTRING_PTR(str);
4913 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4920 return str_rindex(str, sub, s, enc);
5010rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
5014 long pos,
len = RSTRING_LEN(str);
5016 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
5018 if (pos < 0 && (pos +=
len) < 0) {
5024 if (pos >
len) pos =
len;
5030 str_ensure_byte_pos(str, pos);
5042 pos = rb_str_byterindex(str, sub, pos);
5043 if (pos >= 0)
return LONG2NUM(pos);
5085 switch (OBJ_BUILTIN_TYPE(y)) {
5139rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5146 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5177rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5181 re = get_pat(argv[0]);
5182 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5191static enum neighbor_char
5197 if (rb_enc_mbminlen(enc) > 1) {
5199 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5201 return NEIGHBOR_NOT_CHAR;
5203 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5205 if (!l)
return NEIGHBOR_NOT_CHAR;
5206 if (l !=
len)
return NEIGHBOR_WRAPPED;
5207 rb_enc_mbcput(c, p, enc);
5208 r = rb_enc_precise_mbclen(p, p +
len, enc);
5210 return NEIGHBOR_NOT_CHAR;
5212 return NEIGHBOR_FOUND;
5215 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5218 return NEIGHBOR_WRAPPED;
5219 ++((
unsigned char*)p)[i];
5220 l = rb_enc_precise_mbclen(p, p+
len, enc);
5224 return NEIGHBOR_FOUND;
5227 memset(p+l, 0xff,
len-l);
5233 for (len2 =
len-1; 0 < len2; len2--) {
5234 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5238 memset(p+len2+1, 0xff,
len-(len2+1));
5243static enum neighbor_char
5248 if (rb_enc_mbminlen(enc) > 1) {
5250 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5252 return NEIGHBOR_NOT_CHAR;
5254 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5255 if (!c)
return NEIGHBOR_NOT_CHAR;
5258 if (!l)
return NEIGHBOR_NOT_CHAR;
5259 if (l !=
len)
return NEIGHBOR_WRAPPED;
5260 rb_enc_mbcput(c, p, enc);
5261 r = rb_enc_precise_mbclen(p, p +
len, enc);
5263 return NEIGHBOR_NOT_CHAR;
5265 return NEIGHBOR_FOUND;
5268 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5271 return NEIGHBOR_WRAPPED;
5272 --((
unsigned char*)p)[i];
5273 l = rb_enc_precise_mbclen(p, p+
len, enc);
5277 return NEIGHBOR_FOUND;
5280 memset(p+l, 0,
len-l);
5286 for (len2 =
len-1; 0 < len2; len2--) {
5287 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5291 memset(p+len2+1, 0,
len-(len2+1));
5305static enum neighbor_char
5306enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5308 enum neighbor_char ret;
5312 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5316 const int max_gaps = 1;
5318 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5320 ctype = ONIGENC_CTYPE_DIGIT;
5322 ctype = ONIGENC_CTYPE_ALPHA;
5324 return NEIGHBOR_NOT_CHAR;
5327 for (
try = 0;
try <= max_gaps; ++
try) {
5328 ret = enc_succ_char(p,
len, enc);
5329 if (ret == NEIGHBOR_FOUND) {
5330 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5332 return NEIGHBOR_FOUND;
5339 ret = enc_pred_char(p,
len, enc);
5340 if (ret == NEIGHBOR_FOUND) {
5341 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5354 return NEIGHBOR_NOT_CHAR;
5357 if (ctype != ONIGENC_CTYPE_DIGIT) {
5359 return NEIGHBOR_WRAPPED;
5363 enc_succ_char(carry,
len, enc);
5364 return NEIGHBOR_WRAPPED;
5382 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5383 rb_enc_cr_str_copy_for_substr(str, orig);
5384 return str_succ(str);
5391 char *sbeg, *s, *e, *last_alnum = 0;
5392 int found_alnum = 0;
5394 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5395 long carry_pos = 0, carry_len = 1;
5396 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5398 slen = RSTRING_LEN(str);
5399 if (slen == 0)
return str;
5401 enc = STR_ENC_GET(str);
5402 sbeg = RSTRING_PTR(str);
5403 s = e = sbeg + slen;
5405 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5406 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5412 l = rb_enc_precise_mbclen(s, e, enc);
5413 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5414 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5415 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5417 case NEIGHBOR_NOT_CHAR:
5419 case NEIGHBOR_FOUND:
5421 case NEIGHBOR_WRAPPED:
5426 carry_pos = s - sbeg;
5431 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5432 enum neighbor_char neighbor;
5433 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5434 l = rb_enc_precise_mbclen(s, e, enc);
5435 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5436 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5438 neighbor = enc_succ_char(tmp, l, enc);
5440 case NEIGHBOR_FOUND:
5444 case NEIGHBOR_WRAPPED:
5447 case NEIGHBOR_NOT_CHAR:
5450 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5452 enc_succ_char(s, l, enc);
5454 if (!rb_enc_asciicompat(enc)) {
5455 MEMCPY(carry, s,
char, l);
5458 carry_pos = s - sbeg;
5462 RESIZE_CAPA(str, slen + carry_len);
5463 sbeg = RSTRING_PTR(str);
5464 s = sbeg + carry_pos;
5465 memmove(s + carry_len, s, slen - carry_pos);
5466 memmove(s, carry, carry_len);
5468 STR_SET_LEN(str, slen);
5469 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5485rb_str_succ_bang(
VALUE str)
5493all_digits_p(
const char *s,
long len)
5521 VALUE end, exclusive;
5525 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5531 VALUE current, after_end;
5538 enc = rb_enc_check(beg, end);
5539 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5541 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5542 char c = RSTRING_PTR(beg)[0];
5543 char e = RSTRING_PTR(end)[0];
5545 if (c > e || (excl && c == e))
return beg;
5547 VALUE str = rb_enc_str_new(&c, 1, enc);
5549 if ((*each)(str, arg))
break;
5550 if (!excl && c == e)
break;
5552 if (excl && c == e)
break;
5557 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5558 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5559 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5564 b = rb_str_to_inum(beg, 10, FALSE);
5565 e = rb_str_to_inum(end, 10, FALSE);
5572 if (excl && bi == ei)
break;
5573 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5578 ID op = excl ?
'<' : idLE;
5579 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5584 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5585 b = rb_funcallv(b, succ, 0, 0);
5592 if (n > 0 || (excl && n == 0))
return beg;
5594 after_end = rb_funcallv(end, succ, 0, 0);
5599 next = rb_funcallv(current, succ, 0, 0);
5600 if ((*each)(current, arg))
break;
5601 if (
NIL_P(next))
break;
5605 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5620 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5621 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5622 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5624 b = rb_str_to_inum(beg, 10, FALSE);
5630 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5638 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5639 b = rb_funcallv(b, succ, 0, 0);
5645 VALUE next = rb_funcallv(current, succ, 0, 0);
5646 if ((*each)(current, arg))
break;
5649 if (RSTRING_LEN(current) == 0)
5660 if (!
rb_equal(str, *argp))
return 0;
5674 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5675 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5676 rb_enc_asciicompat(STR_ENC_GET(val))) {
5677 const char *bp = RSTRING_PTR(beg);
5678 const char *ep = RSTRING_PTR(end);
5679 const char *vp = RSTRING_PTR(val);
5680 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5681 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5689 if (b <= v && v < e)
return Qtrue;
5690 return RBOOL(!
RTEST(exclusive) && v == e);
5697 all_digits_p(bp, RSTRING_LEN(beg)) &&
5698 all_digits_p(ep, RSTRING_LEN(end))) {
5703 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5705 return RBOOL(
NIL_P(val));
5728 return rb_str_subpat(str, indx,
INT2FIX(0));
5731 if (rb_str_index(str, indx, 0) != -1)
5737 long beg,
len = str_strlen(str, NULL);
5749 return str_substr(str, idx, 1, FALSE);
5766rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5770 return rb_str_subpat(str, argv[0], argv[1]);
5773 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5777 return rb_str_aref(str, argv[0]);
5783 char *ptr = RSTRING_PTR(str);
5784 long olen = RSTRING_LEN(str), nlen;
5786 str_modifiable(str);
5787 if (
len > olen)
len = olen;
5789 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5791 size_t old_capa =
RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5792 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5794 ptr =
RSTRING(str)->as.embed.ary;
5795 memmove(ptr, oldptr +
len, nlen);
5796 if (fl == STR_NOEMBED) {
5797 SIZED_FREE_N(oldptr, old_capa);
5801 if (!STR_SHARED_P(str)) {
5803 rb_enc_cr_str_exact_copy(shared, str);
5808 STR_SET_LEN(str, nlen);
5810 if (!SHARABLE_MIDDLE_SUBSTRING) {
5811 TERM_FILL(ptr + nlen, TERM_LEN(str));
5818rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5824 if (beg == 0 && vlen == 0) {
5829 str_modify_keep_cr(str);
5833 RESIZE_CAPA(str, slen + vlen -
len);
5834 sptr = RSTRING_PTR(str);
5843 memmove(sptr + beg + vlen,
5845 slen - (beg +
len));
5847 if (vlen < beg &&
len < 0) {
5851 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5854 STR_SET_LEN(str, slen);
5855 TERM_FILL(&sptr[slen], TERM_LEN(str));
5862 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5871 int singlebyte = single_byte_optimizable(str);
5877 enc = rb_enc_check(str, val);
5878 slen = str_strlen(str, enc);
5880 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5889 if (
len > slen - beg) {
5892 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5897 beg = p - RSTRING_PTR(str);
5899 rb_str_update_0(str, beg,
len, val);
5900 rb_enc_associate(str, enc);
5911 long start, end,
len;
5921 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5925 nth += regs->num_regs;
5935 enc = rb_enc_check_str(str, val);
5936 rb_str_update_0(str, start,
len, val);
5937 rb_enc_associate(str, enc);
5945 switch (
TYPE(indx)) {
5947 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5951 beg = rb_str_index(str, indx, 0);
5990rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5994 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6002 return rb_str_aset(str, argv[0], argv[1]);
6054rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6062 str_modify_keep_cr(str);
6070 if ((nth += regs->num_regs) <= 0)
return Qnil;
6072 else if (nth >= regs->num_regs)
return Qnil;
6074 len = END(nth) - beg;
6077 else if (argc == 2) {
6086 beg = p - RSTRING_PTR(str);
6090 beg = rb_str_index(str, indx, 0);
6091 if (beg == -1)
return Qnil;
6092 len = RSTRING_LEN(indx);
6104 beg = p - RSTRING_PTR(str);
6113 beg = p - RSTRING_PTR(str);
6117 rb_enc_cr_str_copy_for_substr(result, str);
6125 char *sptr = RSTRING_PTR(str);
6126 long slen = RSTRING_LEN(str);
6127 if (beg +
len > slen)
6131 slen - (beg +
len));
6133 STR_SET_LEN(str, slen);
6134 TERM_FILL(&sptr[slen], TERM_LEN(str));
6145 switch (OBJ_BUILTIN_TYPE(pat)) {
6164get_pat_quoted(
VALUE pat,
int check)
6168 switch (OBJ_BUILTIN_TYPE(pat)) {
6182 if (check && is_broken_string(pat)) {
6189rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6192 pos = rb_str_byteindex(str, pat, pos);
6193 if (set_backref_str) {
6195 str = rb_str_new_frozen_String(str);
6196 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6198 *match = match_data;
6208 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6213rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6215 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6233rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6248 hash = rb_check_hash_type(repl);
6255 pat = get_pat_quoted(argv[0], 1);
6257 str_modifiable(str);
6258 beg = rb_pat_search(pat, str, 0, 1);
6272 end0 = beg0 + RSTRING_LEN(pat);
6281 if (iter || !
NIL_P(hash)) {
6282 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6288 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6291 str_mod_check(str, p,
len);
6292 rb_check_frozen(str);
6298 enc = rb_enc_compatible(str, repl);
6301 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6305 rb_enc_inspect_name(str_enc),
6306 rb_enc_inspect_name(STR_ENC_GET(repl)));
6308 enc = STR_ENC_GET(repl);
6311 rb_enc_associate(str, enc);
6321 rlen = RSTRING_LEN(repl);
6322 len = RSTRING_LEN(str);
6324 RESIZE_CAPA(str,
len + rlen - plen);
6326 p = RSTRING_PTR(str);
6328 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6330 rp = RSTRING_PTR(repl);
6331 memmove(p + beg0, rp, rlen);
6333 STR_SET_LEN(str,
len);
6334 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6357 rb_str_sub_bang(argc, argv, str);
6362str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6365 long beg, beg0, end0;
6366 long offset, blen, slen,
len, last;
6367 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6369 int need_backref_str = -1;
6380 hash = rb_check_hash_type(repl);
6384 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6393 rb_error_arity(argc, 1, 2);
6396 pat = get_pat_quoted(argv[0], 1);
6397 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6400 if (bang)
return Qnil;
6405 blen = RSTRING_LEN(str) + 30;
6407 sp = RSTRING_PTR(str);
6408 slen = RSTRING_LEN(str);
6410 str_enc = STR_ENC_GET(str);
6411 rb_enc_associate(dest, str_enc);
6418 end0 = beg0 + RSTRING_LEN(pat);
6432 struct RString fake_str = {RBASIC_INIT};
6434 if (mode == FAST_MAP) {
6443 val = rb_hash_aref(hash, key);
6446 str_mod_check(str, sp, slen);
6451 else if (need_backref_str) {
6453 if (need_backref_str < 0) {
6454 need_backref_str = val != repl;
6461 len = beg0 - offset;
6475 if (RSTRING_LEN(str) <= end0)
break;
6476 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6478 offset = end0 +
len;
6480 cp = RSTRING_PTR(str) + offset;
6481 if (offset > RSTRING_LEN(str))
break;
6484 if (mode != FAST_MAP && mode != STR) {
6487 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6492 if (RSTRING_LEN(str) > offset) {
6495 rb_pat_search0(pat, str, last, 1, &match);
6497 str_shared_replace(str, dest);
6522rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6524 str_modify_keep_cr(str);
6525 return str_gsub(argc, argv, str, 1);
6575 return str_gsub(argc, argv, str, 0);
6595 str_modifiable(str);
6596 if (str == str2)
return str;
6600 return str_replace(str, str2);
6617rb_str_clear(
VALUE str)
6621 STR_SET_LEN(str, 0);
6622 RSTRING_PTR(str)[0] = 0;
6623 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6639rb_str_chr(
VALUE str)
6657 pos += RSTRING_LEN(str);
6658 if (pos < 0 || RSTRING_LEN(str) <= pos)
6661 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6681 long len = RSTRING_LEN(str);
6682 char *
ptr, *head, *left = 0;
6686 if (pos < -
len ||
len <= pos)
6693 char byte = (char)(
NUM2INT(w) & 0xFF);
6695 if (!str_independent(str))
6696 str_make_independent(str);
6697 enc = STR_ENC_GET(str);
6698 head = RSTRING_PTR(str);
6700 if (!STR_EMBED_P(str)) {
6707 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6715 width = rb_enc_precise_mbclen(left, head+
len, enc);
6717 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6733str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6735 long n = RSTRING_LEN(str);
6737 if (beg > n ||
len < 0)
return Qnil;
6740 if (beg < 0)
return Qnil;
6745 if (!empty)
return Qnil;
6749 VALUE str2 = str_subseq(str, beg,
len);
6751 str_enc_copy_direct(str2, str);
6753 if (RSTRING_LEN(str2) == 0) {
6754 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6788 long beg,
len = RSTRING_LEN(str);
6796 return str_byte_substr(str, beg,
len, TRUE);
6801 return str_byte_substr(str, idx, 1, FALSE);
6813rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6818 return str_byte_substr(str, beg,
len, TRUE);
6821 return str_byte_aref(str, argv[0]);
6825str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6827 long end, slen = RSTRING_LEN(str);
6830 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6839 if (*
len > slen - *beg) {
6843 str_ensure_byte_pos(str, *beg);
6844 str_ensure_byte_pos(str, end);
6858rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6860 long beg,
len, vbeg, vlen;
6865 if (!(argc == 2 || argc == 3 || argc == 5)) {
6866 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6870 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6871 rb_builtin_class_name(argv[0]));
6878 vlen = RSTRING_LEN(val);
6883 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6884 rb_builtin_class_name(argv[2]));
6896 vlen = RSTRING_LEN(val);
6904 str_check_beg_len(str, &beg, &
len);
6905 str_check_beg_len(val, &vbeg, &vlen);
6906 str_modify_keep_cr(str);
6909 rb_enc_associate(str, rb_enc_check(str, val));
6912 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6934rb_str_reverse(
VALUE str)
6941 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6942 enc = STR_ENC_GET(str);
6948 if (RSTRING_LEN(str) > 1) {
6949 if (single_byte_optimizable(str)) {
6956 int clen = rb_enc_fast_mbclen(s, e, enc);
6964 cr = rb_enc_asciicompat(enc) ?
6967 int clen = rb_enc_mbclen(s, e, enc);
6976 STR_SET_LEN(rev, RSTRING_LEN(str));
6977 str_enc_copy_direct(rev, str);
6999rb_str_reverse_bang(
VALUE str)
7001 if (RSTRING_LEN(str) > 1) {
7002 if (single_byte_optimizable(str)) {
7005 str_modify_keep_cr(str);
7006 s = RSTRING_PTR(str);
7015 str_shared_replace(str, rb_str_reverse(str));
7019 str_modify_keep_cr(str);
7048 i = rb_str_index(str, arg, 0);
7050 return RBOOL(i != -1);
7094 rb_raise(rb_eArgError,
"invalid radix %d", base);
7096 return rb_str_to_inum(str, base, FALSE);
7121rb_str_to_f(
VALUE str)
7138rb_str_to_s(
VALUE str)
7150 char s[RUBY_MAX_CHAR_LEN];
7151 int n = rb_enc_codelen(c, enc);
7153 rb_enc_mbcput(c, s, enc);
/*
 * Size bound used for formatting one escaped character.  Callers in this
 * file declare `char buf[CHAR_ESC_LEN + 1]` and pass CHAR_ESC_LEN as the
 * snprintf() size.  The longest format used with it is "\\x{%X}"; with a
 * 32-bit unsigned argument that is at most 12 characters plus the NUL
 * terminator (NOTE(review): assumes the value is limited to 32 bits --
 * confirm against the full function body).
 */
7158#define CHAR_ESC_LEN 13
7161rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7163 char buf[CHAR_ESC_LEN + 1];
7171 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7173 else if (c < 0x10000) {
7174 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7177 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7182 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7185 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7188 l = (int)strlen(buf);
7194ruby_escaped_char(
int c)
7197 case '\0':
return "\\0";
7198 case '\n':
return "\\n";
7199 case '\r':
return "\\r";
7200 case '\t':
return "\\t";
7201 case '\f':
return "\\f";
7202 case '\013':
return "\\v";
7203 case '\010':
return "\\b";
7204 case '\007':
return "\\a";
7205 case '\033':
return "\\e";
7206 case '\x7f':
return "\\c?";
7212rb_str_escape(
VALUE str)
7216 const char *p = RSTRING_PTR(str);
7218 const char *prev = p;
7219 char buf[CHAR_ESC_LEN + 1];
7221 int unicode_p = rb_enc_unicode_p(enc);
7222 int asciicompat = rb_enc_asciicompat(enc);
7227 int n = rb_enc_precise_mbclen(p, pend, enc);
7229 if (p > prev) str_buf_cat(result, prev, p - prev);
7230 n = rb_enc_mbminlen(enc);
7232 n = (int)(pend - p);
7234 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7235 str_buf_cat(result, buf, strlen(buf));
7241 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7243 cc = ruby_escaped_char(c);
7245 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7246 str_buf_cat(result, cc, strlen(cc));
7249 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7252 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7253 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7257 if (p > prev) str_buf_cat(result, prev, p - prev);
7276 const char *p, *pend, *prev;
7277 char buf[CHAR_ESC_LEN + 1];
7279 rb_encoding *resenc = rb_default_internal_encoding();
7280 int unicode_p = rb_enc_unicode_p(enc);
7281 int asciicompat = rb_enc_asciicompat(enc);
7283 if (resenc == NULL) resenc = rb_default_external_encoding();
7284 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7285 rb_enc_associate(result, resenc);
7286 str_buf_cat2(result,
"\"");
7294 n = rb_enc_precise_mbclen(p, pend, enc);
7296 if (p > prev) str_buf_cat(result, prev, p - prev);
7297 n = rb_enc_mbminlen(enc);
7299 n = (int)(pend - p);
7301 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7302 str_buf_cat(result, buf, strlen(buf));
7308 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7310 if ((asciicompat || unicode_p) &&
7311 (c ==
'"'|| c ==
'\\' ||
7316 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7317 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7318 str_buf_cat2(result,
"\\");
7319 if (asciicompat || enc == resenc) {
7325 case '\n': cc =
'n';
break;
7326 case '\r': cc =
'r';
break;
7327 case '\t': cc =
't';
break;
7328 case '\f': cc =
'f';
break;
7329 case '\013': cc =
'v';
break;
7330 case '\010': cc =
'b';
break;
7331 case '\007': cc =
'a';
break;
7332 case 033: cc =
'e';
break;
7333 default: cc = 0;
break;
7336 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7339 str_buf_cat(result, buf, 2);
7352 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7356 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7357 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7362 if (p > prev) str_buf_cat(result, prev, p - prev);
7363 str_buf_cat2(result,
"\"");
/*
 * True when `p` is still before `e` and the byte at `p` is '$', '@' or '{'.
 * Used on the byte following a '#' to decide whether an extra backslash is
 * needed.  `p` is evaluated several times, so pass side-effect-free
 * expressions only.
 */
7368#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7381 int encidx = rb_enc_get_index(str);
7384 const char *p, *pend;
7387 int u8 = (encidx == rb_utf8_encindex());
7388 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7391 if (!rb_enc_asciicompat(enc)) {
7393 len += strlen(enc->name);
7396 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7399 unsigned char c = *p++;
7402 case '"':
case '\\':
7403 case '\n':
case '\r':
7404 case '\t':
case '\f':
7405 case '\013':
case '\010':
case '\007':
case '\033':
7410 clen = IS_EVSTR(p, pend) ? 2 : 1;
7418 if (u8 && c > 0x7F) {
7419 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7421 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7424 else if (cc <= 0xFFFFF)
7437 if (clen > LONG_MAX -
len) {
7444 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7445 q = RSTRING_PTR(result); qend = q +
len + 1;
7449 unsigned char c = *p++;
7451 if (c ==
'"' || c ==
'\\') {
7455 else if (c ==
'#') {
7456 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7459 else if (c ==
'\n') {
7463 else if (c ==
'\r') {
7467 else if (c ==
'\t') {
7471 else if (c ==
'\f') {
7475 else if (c ==
'\013') {
7479 else if (c ==
'\010') {
7483 else if (c ==
'\007') {
7487 else if (c ==
'\033') {
7497 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7499 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7502 snprintf(q, qend-q,
"u%04X", cc);
7504 snprintf(q, qend-q,
"u{%X}", cc);
7509 snprintf(q, qend-q,
"x%02X", c);
7515 if (!rb_enc_asciicompat(enc)) {
7516 snprintf(q, qend-q, nonascii_suffix, enc->name);
7517 encidx = rb_ascii8bit_encindex();
7520 rb_enc_associate_index(result, encidx);
7526unescape_ascii(
unsigned int c)
7550undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7552 const char *s = *ss;
7556 unsigned char buf[6];
7574 *buf = unescape_ascii(*s);
7586 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7587 if (*penc != enc_utf8) {
7589 rb_enc_associate(undumped, enc_utf8);
7606 if (hexlen == 0 || hexlen > 6) {
7612 if (0xd800 <= c && c <= 0xdfff) {
7615 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7625 if (0xd800 <= c && c <= 0xdfff) {
7628 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7658static VALUE rb_str_is_ascii_only_p(
VALUE str);
7670str_undump(
VALUE str)
7672 const char *s = RSTRING_PTR(str);
7675 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7677 bool binary =
false;
7681 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7684 if (!str_null_check(str, &w)) {
7687 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7688 if (*s !=
'"')
goto invalid_format;
7706 static const char force_encoding_suffix[] =
".force_encoding(\"";
7707 static const char dup_suffix[] =
".dup";
7708 const char *encname;
7713 size =
sizeof(dup_suffix) - 1;
7714 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7716 size =
sizeof(force_encoding_suffix) - 1;
7717 if (s_end - s <= size)
goto invalid_format;
7718 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7722 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7726 s = memchr(s,
'"', s_end-s);
7728 if (!s)
goto invalid_format;
7729 if (s_end - s != 2)
goto invalid_format;
7730 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7732 encidx = rb_enc_find_index2(encname, (
long)size);
7736 rb_enc_associate_index(undumped, encidx);
7746 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7757 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7763 if (rb_enc_dummy_p(enc)) {
7770str_true_enc(
VALUE str)
7773 rb_str_check_dummy_enc(enc);
7777static OnigCaseFoldType
7778check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7783 rb_raise(rb_eArgError,
"too many options");
7784 if (argv[0]==sym_turkic) {
7785 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7787 if (argv[1]==sym_lithuanian)
7788 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7790 rb_raise(rb_eArgError,
"invalid second option");
7793 else if (argv[0]==sym_lithuanian) {
7794 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7796 if (argv[1]==sym_turkic)
7797 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7799 rb_raise(rb_eArgError,
"invalid second option");
7803 rb_raise(rb_eArgError,
"too many options");
7804 else if (argv[0]==sym_ascii)
7805 flags |= ONIGENC_CASE_ASCII_ONLY;
7806 else if (argv[0]==sym_fold) {
7807 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7808 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7810 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7813 rb_raise(rb_eArgError,
"invalid option");
7820 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/*
 * Fixed number of extra bytes added to every case-mapping buffer's computed
 * capacity (remaining source length times the buffer count).
 */
7826#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7827#ifndef CASEMAP_DEBUG
7828# define CASEMAP_DEBUG 0
7836 OnigUChar space[FLEX_ARY_LEN];
7840mapping_buffer_free(
void *p)
7844 while (current_buffer) {
7845 previous_buffer = current_buffer;
7846 current_buffer = current_buffer->next;
7847 ruby_xfree_sized(previous_buffer, offsetof(
mapping_buffer, space) + previous_buffer->capa);
7853 {0, mapping_buffer_free,},
7862 const OnigUChar *source_current, *source_end;
7863 int target_length = 0;
7864 VALUE buffer_anchor;
7867 size_t buffer_count = 0;
7868 int buffer_length_or_invalid;
7870 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7872 source_current = (OnigUChar*)RSTRING_PTR(source);
7877 while (source_current < source_end) {
7879 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7880 if (CASEMAP_DEBUG) {
7881 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7884 *pre_buffer = current_buffer;
7885 pre_buffer = ¤t_buffer->next;
7886 current_buffer->next = NULL;
7887 current_buffer->capa =
capa;
7888 buffer_length_or_invalid = enc->case_map(flags,
7889 &source_current, source_end,
7890 current_buffer->space,
7891 current_buffer->space+current_buffer->capa,
7893 if (buffer_length_or_invalid < 0) {
7894 current_buffer =
DATA_PTR(buffer_anchor);
7896 mapping_buffer_free(current_buffer);
7897 rb_raise(rb_eArgError,
"input string invalid");
7899 target_length += current_buffer->used = buffer_length_or_invalid;
7901 if (CASEMAP_DEBUG) {
7902 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7905 if (buffer_count==1) {
7906 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7909 char *target_current;
7912 target_current = RSTRING_PTR(target);
7913 current_buffer =
DATA_PTR(buffer_anchor);
7914 while (current_buffer) {
7915 memcpy(target_current, current_buffer->space, current_buffer->used);
7916 target_current += current_buffer->used;
7917 current_buffer = current_buffer->next;
7920 current_buffer =
DATA_PTR(buffer_anchor);
7922 mapping_buffer_free(current_buffer);
7927 str_enc_copy_direct(target, source);
7936 const OnigUChar *source_current, *source_end;
7937 OnigUChar *target_current, *target_end;
7938 long old_length = RSTRING_LEN(source);
7939 int length_or_invalid;
7941 if (old_length == 0)
return Qnil;
7943 source_current = (OnigUChar*)RSTRING_PTR(source);
7945 if (source == target) {
7946 target_current = (OnigUChar*)source_current;
7947 target_end = (OnigUChar*)source_end;
7950 target_current = (OnigUChar*)RSTRING_PTR(target);
7954 length_or_invalid = onigenc_ascii_only_case_map(flags,
7955 &source_current, source_end,
7956 target_current, target_end, enc);
7957 if (length_or_invalid < 0)
7958 rb_raise(rb_eArgError,
"input string invalid");
7959 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7960 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7961 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7962 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7963 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7966 str_enc_copy(target, source);
7972upcase_single(
VALUE str)
7974 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7975 bool modified =
false;
7978 unsigned int c = *(
unsigned char*)s;
7980 if (
'a' <= c && c <=
'z') {
7981 *s =
'A' + (c -
'a');
8002rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8005 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8007 flags = check_case_options(argc, argv, flags);
8008 str_modify_keep_cr(str);
8009 enc = str_true_enc(str);
8010 if (case_option_single_p(flags, enc, str)) {
8011 if (upcase_single(str))
8012 flags |= ONIGENC_CASE_MODIFIED;
8014 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8015 rb_str_ascii_casemap(str, str, &flags, enc);
8017 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8019 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8032rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8035 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8038 flags = check_case_options(argc, argv, flags);
8039 enc = str_true_enc(str);
8040 if (case_option_single_p(flags, enc, str)) {
8041 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8042 str_enc_copy_direct(ret, str);
8045 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8047 rb_str_ascii_casemap(str, ret, &flags, enc);
8050 ret = rb_str_casemap(str, &flags, enc);
8057downcase_single(
VALUE str)
8059 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8060 bool modified =
false;
8063 unsigned int c = *(
unsigned char*)s;
8065 if (
'A' <= c && c <=
'Z') {
8066 *s =
'a' + (c -
'A');
8088rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8091 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8093 flags = check_case_options(argc, argv, flags);
8094 str_modify_keep_cr(str);
8095 enc = str_true_enc(str);
8096 if (case_option_single_p(flags, enc, str)) {
8097 if (downcase_single(str))
8098 flags |= ONIGENC_CASE_MODIFIED;
8100 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8101 rb_str_ascii_casemap(str, str, &flags, enc);
8103 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8105 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8119rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8122 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8125 flags = check_case_options(argc, argv, flags);
8126 enc = str_true_enc(str);
8127 if (case_option_single_p(flags, enc, str)) {
8128 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8129 str_enc_copy_direct(ret, str);
8130 downcase_single(ret);
8132 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8134 rb_str_ascii_casemap(str, ret, &flags, enc);
8137 ret = rb_str_casemap(str, &flags, enc);
8157rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8160 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8162 flags = check_case_options(argc, argv, flags);
8163 str_modify_keep_cr(str);
8164 enc = str_true_enc(str);
8165 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8166 if (flags&ONIGENC_CASE_ASCII_ONLY)
8167 rb_str_ascii_casemap(str, str, &flags, enc);
8169 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8171 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8185rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8188 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8191 flags = check_case_options(argc, argv, flags);
8192 enc = str_true_enc(str);
8193 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8194 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8196 rb_str_ascii_casemap(str, ret, &flags, enc);
8199 ret = rb_str_casemap(str, &flags, enc);
8218rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8221 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8223 flags = check_case_options(argc, argv, flags);
8224 str_modify_keep_cr(str);
8225 enc = str_true_enc(str);
8226 if (flags&ONIGENC_CASE_ASCII_ONLY)
8227 rb_str_ascii_casemap(str, str, &flags, enc);
8229 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8231 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8245rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8248 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8251 flags = check_case_options(argc, argv, flags);
8252 enc = str_true_enc(str);
8253 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8254 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8256 rb_str_ascii_casemap(str, ret, &flags, enc);
8259 ret = rb_str_casemap(str, &flags, enc);
8264typedef unsigned char *USTR;
8268 unsigned int now, max;
8280 if (t->p == t->pend)
return -1;
8281 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8284 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8286 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8288 if (t->p < t->pend) {
8289 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8292 if (t->now < 0x80 && c < 0x80) {
8293 rb_raise(rb_eArgError,
8294 "invalid range \"%c-%c\" in string transliteration",
8298 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8302 else if (t->now < c) {
8311 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8312 if (t->now == t->max) {
8317 if (t->now < t->max) {
8333 const unsigned int errc = -1;
8334 unsigned int trans[256];
8336 struct tr trsrc, trrepl;
8338 unsigned int c, c0, last = 0;
8339 int modify = 0, i, l;
8340 unsigned char *s, *send;
8342 int singlebyte = single_byte_optimizable(str);
8346#define CHECK_IF_ASCII(c) \
8347 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8348 (cr = ENC_CODERANGE_VALID) : 0)
8352 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8353 if (RSTRING_LEN(repl) == 0) {
8354 return rb_str_delete_bang(1, &src, str);
8358 e1 = rb_enc_check(str, src);
8359 e2 = rb_enc_check(str, repl);
8364 enc = rb_enc_check(src, repl);
8366 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8367 if (RSTRING_LEN(src) > 1 &&
8368 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8369 trsrc.p + l < trsrc.pend) {
8373 trrepl.p = RSTRING_PTR(repl);
8374 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8375 trsrc.gen = trrepl.gen = 0;
8376 trsrc.now = trrepl.now = 0;
8377 trsrc.max = trrepl.max = 0;
8380 for (i=0; i<256; i++) {
8383 while ((c = trnext(&trsrc, enc)) != errc) {
8388 if (!hash) hash = rb_hash_new();
8392 while ((c = trnext(&trrepl, enc)) != errc)
8395 for (i=0; i<256; i++) {
8396 if (trans[i] != errc) {
8404 for (i=0; i<256; i++) {
8407 while ((c = trnext(&trsrc, enc)) != errc) {
8408 r = trnext(&trrepl, enc);
8409 if (r == errc) r = trrepl.now;
8412 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8415 if (!hash) hash = rb_hash_new();
8423 str_modify_keep_cr(str);
8424 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8425 termlen = rb_enc_mbminlen(enc);
8428 long offset, max = RSTRING_LEN(str);
8429 unsigned int save = -1;
8430 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8435 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8437 SIZED_FREE_N(buf, max + termlen);
8438 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8441 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8443 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8452 if (cflag) c = last;
8455 else if (cflag) c = errc;
8461 if (c != (
unsigned int)-1) {
8467 tlen = rb_enc_codelen(c, enc);
8473 if (enc != e1) may_modify = 1;
8475 if ((offset = t - buf) + tlen > max) {
8476 size_t MAYBE_UNUSED(old) = max + termlen;
8477 max = offset + tlen + (send - s);
8478 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8481 rb_enc_mbcput(c, t, enc);
8482 if (may_modify && memcmp(s, t, tlen) != 0) {
8488 if (!STR_EMBED_P(str)) {
8489 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8491 TERM_FILL((
char *)t, termlen);
8492 RSTRING(str)->as.heap.ptr = (
char *)buf;
8493 STR_SET_LEN(str, t - buf);
8494 STR_SET_NOEMBED(str);
8495 RSTRING(str)->as.heap.aux.capa = max;
8499 c = (
unsigned char)*s;
8500 if (trans[c] != errc) {
8517 long offset, max = (long)((send - s) * 1.2);
8518 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8523 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8525 SIZED_FREE_N(buf, max + termlen);
8526 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8529 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8531 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8539 if (cflag) c = last;
8542 else if (cflag) c = errc;
8546 c = cflag ? last : errc;
8549 tlen = rb_enc_codelen(c, enc);
8554 if (enc != e1) may_modify = 1;
8556 if ((offset = t - buf) + tlen > max) {
8557 size_t MAYBE_UNUSED(old) = max + termlen;
8558 max = offset + tlen + (long)((send - s) * 1.2);
8559 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8563 rb_enc_mbcput(c, t, enc);
8564 if (may_modify && memcmp(s, t, tlen) != 0) {
8572 if (!STR_EMBED_P(str)) {
8573 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8575 TERM_FILL((
char *)t, termlen);
8576 RSTRING(str)->as.heap.ptr = (
char *)buf;
8577 STR_SET_LEN(str, t - buf);
8578 STR_SET_NOEMBED(str);
8579 RSTRING(str)->as.heap.aux.capa = max;
8585 rb_enc_associate(str, enc);
8607 return tr_trans(str, src, repl, 0);
8652 tr_trans(str, src, repl, 0);
/* Number of distinct byte values; code points below this index the table directly. */
8656#define TR_TABLE_MAX (UCHAR_MAX+1)
/*
 * Table length: one slot per byte value plus one extra slot at index
 * TR_TABLE_MAX.  tr_setup_table() stores the negation flag (`cflag`) there
 * and tr_find() returns that slot's truth value on one of its paths
 * (presumably the fallback for code points >= TR_TABLE_MAX -- confirm
 * against the full function body).
 */
8657#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8659tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8662 const unsigned int errc = -1;
8663 char buf[TR_TABLE_MAX];
8666 VALUE table = 0, ptable = 0;
8667 int i, l, cflag = 0;
8669 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8670 tr.gen =
tr.now =
tr.max = 0;
8672 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8677 for (i=0; i<TR_TABLE_MAX; i++) {
8680 stable[TR_TABLE_MAX] = cflag;
8682 else if (stable[TR_TABLE_MAX] && !cflag) {
8683 stable[TR_TABLE_MAX] = 0;
8685 for (i=0; i<TR_TABLE_MAX; i++) {
8689 while ((c = trnext(&
tr, enc)) != errc) {
8690 if (c < TR_TABLE_MAX) {
8691 buf[(
unsigned char)c] = !cflag;
8696 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8699 table = ptable ? ptable : rb_hash_new();
8703 table = rb_hash_new();
8708 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8709 rb_hash_aset(table, key,
Qtrue);
8713 for (i=0; i<TR_TABLE_MAX; i++) {
8714 stable[i] = stable[i] && buf[i];
8716 if (!table && !cflag) {
8723tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8725 if (c < TR_TABLE_MAX) {
8726 return table[c] != 0;
8732 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8733 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8737 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8740 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8755rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8757 char squeez[TR_TABLE_SIZE];
8760 VALUE del = 0, nodel = 0;
8762 int i, ascompat, cr;
8764 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8766 for (i=0; i<argc; i++) {
8770 enc = rb_enc_check(str, s);
8771 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8774 str_modify_keep_cr(str);
8775 ascompat = rb_enc_asciicompat(enc);
8776 s = t = RSTRING_PTR(str);
8783 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8794 c = rb_enc_codepoint_len(s, send, &clen, enc);
8796 if (tr_find(c, squeez, del, nodel)) {
8800 if (t != s) rb_enc_mbcput(c, t, enc);
8807 TERM_FILL(t, TERM_LEN(str));
8808 STR_SET_LEN(str, t - RSTRING_PTR(str));
8811 if (modify)
return str;
8825rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8828 rb_str_delete_bang(argc, argv, str);
8846rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8848 char squeez[TR_TABLE_SIZE];
8850 VALUE del = 0, nodel = 0;
8851 unsigned char *s, *send, *t;
8853 int ascompat, singlebyte = single_byte_optimizable(str);
8857 enc = STR_ENC_GET(str);
8860 for (i=0; i<argc; i++) {
8864 enc = rb_enc_check(str, s);
8865 if (singlebyte && !single_byte_optimizable(s))
8867 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8871 str_modify_keep_cr(str);
8872 s = t = (
unsigned char *)RSTRING_PTR(str);
8873 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8876 ascompat = rb_enc_asciicompat(enc);
8880 unsigned int c = *s++;
8881 if (c != save || (argc > 0 && !squeez[c])) {
8891 if (ascompat && (c = *s) < 0x80) {
8892 if (c != save || (argc > 0 && !squeez[c])) {
8898 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8900 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8901 if (t != s) rb_enc_mbcput(c, t, enc);
8910 TERM_FILL((
char *)t, TERM_LEN(str));
8911 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8912 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8916 if (modify)
return str;
8930rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8933 rb_str_squeeze_bang(argc, argv, str);
8953 return tr_trans(str, src, repl, 1);
8981 tr_trans(str, src, repl, 1);
8994rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8996 char table[TR_TABLE_SIZE];
8998 VALUE del = 0, nodel = 0, tstr;
9008 enc = rb_enc_check(str, tstr);
9011 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9012 (ptstr = RSTRING_PTR(tstr),
9013 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9014 !is_broken_string(str)) {
9016 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9018 s = RSTRING_PTR(str);
9019 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9022 if (*(
unsigned char*)s++ == c) n++;
9028 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9029 for (i=1; i<argc; i++) {
9032 enc = rb_enc_check(str, tstr);
9033 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9036 s = RSTRING_PTR(str);
9037 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9039 ascompat = rb_enc_asciicompat(enc);
9043 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9051 c = rb_enc_codepoint_len(s, send, &clen, enc);
9052 if (tr_find(c, table, del, nodel)) {
9063rb_fs_check(
VALUE val)
9067 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table of 256 entries (16 rows of 16).  The only
 * entries set to 1 are indexes 9 through 13 (first row) and index 32
 * (first slot of the third row); every other entry is 0.
 */
9072static const char isspacetable[256] = {
9073 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9074 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9075 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9076 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9077 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9078 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9079 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9080 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9081 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9082 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9083 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9084 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9085 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9086 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9087 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9088 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Looks c up in isspacetable.  The argument is cast to unsigned char
 * first, so a plain (possibly signed) char never yields a negative
 * subscript.
 */
9091#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9094split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9096 if (empty_count >= 0 &&
len == 0) {
9097 return empty_count + 1;
9099 if (empty_count > 0) {
9104 }
while (--empty_count > 0);
9108 rb_yield(str_new_empty_String(str));
9109 }
while (--empty_count > 0);
9123 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9127literal_split_pattern(
VALUE spat, split_type_t default_type)
9135 return SPLIT_TYPE_CHARS;
9137 else if (rb_enc_asciicompat(enc)) {
9138 if (
len == 1 && ptr[0] ==
' ') {
9139 return SPLIT_TYPE_AWK;
9144 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9145 return SPLIT_TYPE_AWK;
9148 return default_type;
9161rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9166 split_type_t split_type;
9167 long beg, end, i = 0, empty_count = -1;
9172 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9174 if (lim <= 0) limit =
Qnil;
9175 else if (lim == 1) {
9176 if (RSTRING_LEN(str) == 0)
9187 if (
NIL_P(limit) && !lim) empty_count = 0;
9189 enc = STR_ENC_GET(str);
9190 split_type = SPLIT_TYPE_REGEXP;
9192 spat = get_pat_quoted(spat, 0);
9194 else if (
NIL_P(spat = rb_fs)) {
9195 split_type = SPLIT_TYPE_AWK;
9197 else if (!(spat = rb_fs_check(spat))) {
9198 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9203 if (split_type != SPLIT_TYPE_AWK) {
9208 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9209 if (split_type == SPLIT_TYPE_AWK) {
9211 split_type = SPLIT_TYPE_STRING;
9216 mustnot_broken(spat);
9217 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * SPLIT_STR(beg, len): a comma expression that first stores the value
 * returned by split_string(result, str, beg, len, empty_count) back into
 * empty_count and then evaluates str_mod_check(str, str_start, str_len).
 * The expansion refers to result, str, empty_count, str_start and str_len
 * by name, so all of them must be in scope wherever the macro is used.
 */
9225#define SPLIT_STR(beg, len) ( \
9226 empty_count = split_string(result, str, beg, len, empty_count), \
9227 str_mod_check(str, str_start, str_len))
9230 char *ptr = RSTRING_PTR(str);
9231 char *
const str_start = ptr;
9232 const long str_len = RSTRING_LEN(str);
9233 char *
const eptr = str_start + str_len;
9234 if (split_type == SPLIT_TYPE_AWK) {
9241 if (is_ascii_string(str)) {
9242 while (ptr < eptr) {
9243 c = (
unsigned char)*ptr++;
9245 if (ascii_isspace(c)) {
9251 if (!
NIL_P(limit) && lim <= i)
break;
9254 else if (ascii_isspace(c)) {
9255 SPLIT_STR(beg, end-beg);
9258 if (!
NIL_P(limit)) ++i;
9266 while (ptr < eptr) {
9269 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9278 if (!
NIL_P(limit) && lim <= i)
break;
9282 SPLIT_STR(beg, end-beg);
9285 if (!
NIL_P(limit)) ++i;
9293 else if (split_type == SPLIT_TYPE_STRING) {
9294 char *substr_start = ptr;
9295 char *sptr = RSTRING_PTR(spat);
9296 long slen = RSTRING_LEN(spat);
9299 mustnot_broken(str);
9300 enc = rb_enc_check(str, spat);
9301 while (ptr < eptr &&
9302 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9305 if (t != ptr + end) {
9309 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9310 str_mod_check(spat, sptr, slen);
9313 if (!
NIL_P(limit) && lim <= ++i)
break;
9315 beg = ptr - str_start;
9317 else if (split_type == SPLIT_TYPE_CHARS) {
9321 mustnot_broken(str);
9322 enc = rb_enc_get(str);
9323 while (ptr < eptr &&
9324 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9325 SPLIT_STR(ptr - str_start, n);
9327 if (!
NIL_P(limit) && lim <= ++i)
break;
9329 beg = ptr - str_start;
9333 long len = RSTRING_LEN(str);
9341 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9346 if (start == end && BEG(0) == END(0)) {
9351 else if (last_null == 1) {
9352 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9359 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9365 SPLIT_STR(beg, end-beg);
9366 beg = start = END(0);
9370 for (idx=1; idx < regs->num_regs; idx++) {
9371 if (BEG(idx) == -1)
continue;
9372 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9374 if (!
NIL_P(limit) && lim <= ++i)
break;
9376 if (match) rb_match_unbusy(match);
9378 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9379 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9382 return result ? result : str;
9392 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to a new array created with capacity `size` when no block is
 * given, and to 0 when a block is given.  The first argument `m` does not
 * appear in the expansion.
 */
9395#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/* Shorthand that forwards both arguments unchanged to enumerator_element(). */
9410#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9413chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9415 const char *prev = rb_enc_prev_char(p, e, e, enc);
9418 prev = rb_enc_prev_char(p, e, e, enc);
9419 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9431 RSTRING_LEN(rs) != 1 ||
9432 RSTRING_PTR(rs)[0] !=
'\n')) {
9438#define rb_rs get_rs()
9445 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9446 long pos,
len, rslen;
9452 static ID keywords[1];
9457 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9461 if (!ENUM_ELEM(ary, str)) {
9469 if (!RSTRING_LEN(str))
goto end;
9471 ptr = subptr = RSTRING_PTR(str);
9473 len = RSTRING_LEN(str);
9475 rslen = RSTRING_LEN(rs);
9478 enc = rb_enc_get(str);
9480 enc = rb_enc_check(str, rs);
9485 const char *eol = NULL;
9487 while (subend < pend) {
9488 long chomp_rslen = 0;
9490 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9492 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9494 if (eol == subend)
break;
9498 chomp_rslen = -rslen;
9502 if (!subptr) subptr = subend;
9506 }
while (subend < pend);
9508 if (rslen == 0) chomp_rslen = 0;
9510 subend - subptr + (chomp ? chomp_rslen : rslen));
9511 if (ENUM_ELEM(ary, line)) {
9512 str_mod_check(str, ptr,
len);
9514 subptr = eol = NULL;
9519 rsptr = RSTRING_PTR(rs);
9520 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9529 rsptr = RSTRING_PTR(rs);
9530 rslen = RSTRING_LEN(rs);
9533 while (subptr < pend) {
9534 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9538 if (hit != adjusted) {
9542 subend = hit += rslen;
9545 subend = chomp_newline(subptr, subend, enc);
9552 if (ENUM_ELEM(ary, line)) {
9553 str_mod_check(str, ptr,
len);
9558 if (subptr != pend) {
9561 pend = chomp_newline(subptr, pend, enc);
9563 else if (pend - subptr >= rslen &&
9564 memcmp(pend - rslen, rsptr, rslen) == 0) {
9569 ENUM_ELEM(ary, line);
/*
 * Forwards argc/argv/str to rb_str_enumerate_lines() with 0 as the last
 * (array) argument and returns whatever that call returns.
 */
9590rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9593 return rb_str_enumerate_lines(argc, argv, str, 0);
/*
 * Obtains `ary` from WANTARRAY("lines", 0) and passes it, together with
 * argc/argv/str, to rb_str_enumerate_lines(), returning that call's result.
 */
9648rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9650 VALUE ary = WANTARRAY(
"lines", 0);
9651 return rb_str_enumerate_lines(argc, argv, str, ary);
9665 for (i=0; i<RSTRING_LEN(str); i++) {
9666 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
/*
 * Forwards str to rb_str_enumerate_bytes() with 0 as the array argument
 * and returns that call's result.
 */
9684rb_str_each_byte(
VALUE str)
9687 return rb_str_enumerate_bytes(str, 0);
/*
 * Obtains `ary` from WANTARRAY("bytes", RSTRING_LEN(str)), i.e. sized by
 * the byte length of str, and returns rb_str_enumerate_bytes(str, ary).
 */
9699rb_str_bytes(
VALUE str)
9701 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9702 return rb_str_enumerate_bytes(str, ary);
9720 ptr = RSTRING_PTR(str);
9721 len = RSTRING_LEN(str);
9722 enc = rb_enc_get(str);
9725 for (i = 0; i <
len; i += n) {
9726 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9731 for (i = 0; i <
len; i += n) {
9732 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
/*
 * Forwards str to rb_str_enumerate_chars() with 0 as the array argument
 * and returns that call's result.
 */
9753rb_str_each_char(
VALUE str)
9756 return rb_str_enumerate_chars(str, 0);
9768rb_str_chars(
VALUE str)
9771 return rb_str_enumerate_chars(str, ary);
9775rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9780 const char *ptr, *end;
9783 if (single_byte_optimizable(str))
9784 return rb_str_enumerate_bytes(str, ary);
9787 ptr = RSTRING_PTR(str);
9789 enc = STR_ENC_GET(str);
9792 c = rb_enc_codepoint_len(ptr, end, &n, enc);
/*
 * Forwards str to rb_str_enumerate_codepoints() with 0 as the array
 * argument and returns that call's result.
 */
9813rb_str_each_codepoint(
VALUE str)
9816 return rb_str_enumerate_codepoints(str, 0);
9828rb_str_codepoints(
VALUE str)
9831 return rb_str_enumerate_codepoints(str, ary);
9837 int encidx = rb_enc_to_index(enc);
9839 const OnigUChar source_ascii[] =
"\\X";
9840 const OnigUChar *source = source_ascii;
9841 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that expand one code unit into a comma-separated list of
 * OnigUChar values.  CHARS_16BE emits (x)>>8 first and then x, CHARS_16LE
 * emits them in the opposite order; each value is cast to OnigUChar.
 * The 32-bit forms are built from two 16-bit expansions: CHARS_32BE puts
 * the (x)>>16 half first, CHARS_32LE puts it last.
 */
9844#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9845#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9846#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9847#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9848#define CASE_UTF(e) \
9849 case ENCINDEX_UTF_##e: { \
9850 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9851 source = source_UTF_##e; \
9852 source_len = sizeof(source_UTF_##e); \
9855 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9863 regex_t *reg_grapheme_cluster;
/*
 * Compile the pattern bytes [source, source + source_len) for `enc`.
 * onig_new() is handed the ADDRESS of the local regex pointer declared
 * above; the "&reg" prefix had been corrupted into a single "(R)" symbol
 * and is restored here.
 */
9865 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9866 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
/*
 * Failure reporting: convert the code in r (plus einfo) to text and stop
 * via rb_fatal().  NOTE(review): the condition guarding these statements
 * is on a line not shown in this excerpt -- confirm it tests r.
 */
9868 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9869 onig_error_code_to_str(message, r, &einfo);
9870 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
/* Hand the regex pointer filled in by onig_new() back to the caller. */
9873 return reg_grapheme_cluster;
9879 int encidx = rb_enc_to_index(enc);
9880 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9882 if (encidx == rb_utf8_encindex()) {
9883 if (!reg_grapheme_cluster_utf8) {
9884 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9887 return reg_grapheme_cluster_utf8;
9896 size_t grapheme_cluster_count = 0;
9898 const char *ptr, *end;
9900 if (!rb_enc_unicode_p(enc)) {
9904 bool cached_reg_grapheme_cluster =
true;
9905 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9906 if (!reg_grapheme_cluster) {
9907 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9908 cached_reg_grapheme_cluster =
false;
9911 ptr = RSTRING_PTR(str);
9915 OnigPosition
len = onig_match(reg_grapheme_cluster,
9916 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9917 (
const OnigUChar *)ptr, NULL, 0);
9918 if (
len <= 0)
break;
9919 grapheme_cluster_count++;
9923 if (!cached_reg_grapheme_cluster) {
9924 onig_free(reg_grapheme_cluster);
9927 return SIZET2NUM(grapheme_cluster_count);
9931rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9935 const char *ptr0, *ptr, *end;
9937 if (!rb_enc_unicode_p(enc)) {
9938 return rb_str_enumerate_chars(str, ary);
9943 bool cached_reg_grapheme_cluster =
true;
9944 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9945 if (!reg_grapheme_cluster) {
9946 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9947 cached_reg_grapheme_cluster =
false;
9950 ptr0 = ptr = RSTRING_PTR(str);
9954 OnigPosition
len = onig_match(reg_grapheme_cluster,
9955 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9956 (
const OnigUChar *)ptr, NULL, 0);
9957 if (
len <= 0)
break;
9962 if (!cached_reg_grapheme_cluster) {
9963 onig_free(reg_grapheme_cluster);
/*
 * Forwards str to rb_str_enumerate_grapheme_clusters() with 0 as the
 * array argument and returns that call's result.
 */
9983rb_str_each_grapheme_cluster(
VALUE str)
9986 return rb_str_enumerate_grapheme_clusters(str, 0);
9998rb_str_grapheme_clusters(
VALUE str)
10001 return rb_str_enumerate_grapheme_clusters(str, ary);
10005chopped_length(
VALUE str)
10008 const char *p, *p2, *beg, *end;
10010 beg = RSTRING_PTR(str);
10011 end = beg + RSTRING_LEN(str);
10012 if (beg >= end)
return 0;
10013 p = rb_enc_prev_char(beg, end, end, enc);
10015 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10016 p2 = rb_enc_prev_char(beg, p, end, enc);
10017 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10035rb_str_chop_bang(
VALUE str)
10037 str_modify_keep_cr(str);
10038 if (RSTRING_LEN(str) > 0) {
10040 len = chopped_length(str);
10041 STR_SET_LEN(str,
len);
10042 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10061rb_str_chop(
VALUE str)
10067smart_chomp(
VALUE str,
const char *e,
const char *p)
10070 if (rb_enc_mbminlen(enc) > 1) {
10075 pp = e - rb_enc_mbminlen(enc);
10078 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10086 if (--e > p && *(e-1) ==
'\r') {
10103 char *pp, *e, *rsptr;
10105 char *
const p = RSTRING_PTR(str);
10106 long len = RSTRING_LEN(str);
10108 if (
len == 0)
return 0;
10111 return smart_chomp(str, e, p);
10114 enc = rb_enc_get(str);
10117 if (rb_enc_mbminlen(enc) > 1) {
10122 pp -= rb_enc_mbminlen(enc);
10125 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10132 while (e > p && *(e-1) ==
'\n') {
10134 if (e > p && *(e-1) ==
'\r')
10140 if (rslen >
len)
return len;
10142 enc = rb_enc_get(rs);
10143 newline = rsptr[rslen-1];
10144 if (rslen == rb_enc_mbminlen(enc)) {
10146 if (newline ==
'\n')
10147 return smart_chomp(str, e, p);
10151 return smart_chomp(str, e, p);
10155 enc = rb_enc_check(str, rs);
10156 if (is_broken_string(rs)) {
10160 if (p[
len-1] == newline &&
10162 memcmp(rsptr, pp, rslen) == 0)) {
10163 if (at_char_boundary(p, pp, e, enc))
10164 return len - rslen;
10176chomp_rs(
int argc,
const VALUE *argv)
10180 VALUE rs = argv[0];
10192 long olen = RSTRING_LEN(str);
10193 long len = chompped_length(str, rs);
10194 if (
len >= olen)
return Qnil;
10195 str_modify_keep_cr(str);
10196 STR_SET_LEN(str,
len);
10197 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10217rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10220 str_modifiable(str);
10221 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10222 rs = chomp_rs(argc, argv);
10224 return rb_str_chomp_string(str, rs);
10237rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10239 VALUE rs = chomp_rs(argc, argv);
10245tr_setup_table_multi(
char table[TR_TABLE_SIZE],
VALUE *tablep,
VALUE *ctablep,
10246 VALUE str,
int num_selectors,
VALUE *selectors)
10250 for (i=0; i<num_selectors; i++) {
10251 VALUE selector = selectors[i];
10255 enc = rb_enc_check(str, selector);
10256 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10263 const char *
const start = s;
10265 if (!s || s >= e)
return 0;
10268 if (single_byte_optimizable(str)) {
10269 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10274 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10284lstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10285 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10287 const char *
const start = s;
10289 if (!s || s >= e)
return 0;
10294 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10296 if (!tr_find(cc, table, del, nodel))
break;
10315rb_str_lstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10319 long olen, loffset;
10321 str_modify_keep_cr(str);
10322 enc = STR_ENC_GET(str);
10325 char table[TR_TABLE_SIZE];
10326 VALUE del = 0, nodel = 0;
10328 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10329 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10332 loffset = lstrip_offset(str, start, start+olen, enc);
10336 long len = olen-loffset;
10337 s = start + loffset;
10338 memmove(start, s,
len);
10339 STR_SET_LEN(str,
len);
10340 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10375rb_str_lstrip(
int argc,
VALUE *argv,
VALUE str)
10382 char table[TR_TABLE_SIZE];
10383 VALUE del = 0, nodel = 0;
10385 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10386 loffset = lstrip_offset_table(str, start, start+
len, STR_ENC_GET(str), table, del, nodel);
10389 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10391 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10400 rb_str_check_dummy_enc(enc);
10404 if (!s || s >= e)
return 0;
10408 if (single_byte_optimizable(str)) {
10410 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10415 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10425rstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10426 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10431 rb_str_check_dummy_enc(enc);
10435 if (!s || s >= e)
return 0;
10439 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10441 if (!tr_find(c, table, del, nodel))
break;
10461rb_str_rstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10465 long olen, roffset;
10467 str_modify_keep_cr(str);
10468 enc = STR_ENC_GET(str);
10471 char table[TR_TABLE_SIZE];
10472 VALUE del = 0, nodel = 0;
10474 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10475 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10478 roffset = rstrip_offset(str, start, start+olen, enc);
10481 long len = olen - roffset;
10483 STR_SET_LEN(str,
len);
10484 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10518rb_str_rstrip(
int argc,
VALUE *argv,
VALUE str)
10522 long olen, roffset;
10524 enc = STR_ENC_GET(str);
10527 char table[TR_TABLE_SIZE];
10528 VALUE del = 0, nodel = 0;
10530 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10531 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10534 roffset = rstrip_offset(str, start, start+olen, enc);
10536 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10554rb_str_strip_bang(
int argc,
VALUE *argv,
VALUE str)
10557 long olen, loffset, roffset;
10560 str_modify_keep_cr(str);
10561 enc = STR_ENC_GET(str);
10565 char table[TR_TABLE_SIZE];
10566 VALUE del = 0, nodel = 0;
10568 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10569 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10570 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10573 loffset = lstrip_offset(str, start, start+olen, enc);
10574 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10577 if (loffset > 0 || roffset > 0) {
10578 long len = olen-roffset;
10581 memmove(start, start + loffset,
len);
10583 STR_SET_LEN(str,
len);
10584 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10619rb_str_strip(
int argc,
VALUE *argv,
VALUE str)
10622 long olen, loffset, roffset;
10628 char table[TR_TABLE_SIZE];
10629 VALUE del = 0, nodel = 0;
10631 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10632 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10633 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10636 loffset = lstrip_offset(str, start, start+olen, enc);
10637 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10640 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10645scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10648 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10654 end = pos + RSTRING_LEN(pat);
10668 if (RSTRING_LEN(str) > end)
10669 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10678 if (!regs || regs->num_regs == 1) {
10684 for (
int i = 1; i < regs->num_regs; i++) {
10715 long last = -1, prev = 0;
10716 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10718 pat = get_pat_quoted(pat, 1);
10719 mustnot_broken(str);
10723 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10728 if (last >= 0) rb_pat_search(pat, str, last, 1);
10733 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10737 str_mod_check(str, p,
len);
10739 if (last >= 0) rb_pat_search(pat, str, last, 1);
/*
 * Returns rb_str_to_inum(str, 16, FALSE): conversion of str with 16 as
 * the base argument and FALSE as the third argument.
 */
10791rb_str_hex(
VALUE str)
10793 return rb_str_to_inum(str, 16, FALSE);
/*
 * Returns rb_str_to_inum(str, -8, FALSE).
 * NOTE(review): the base is passed as a negative number; presumably this
 * makes 8 a default that a radix prefix in the string may override --
 * confirm against rb_str_to_inum().
 */
10877rb_str_oct(
VALUE str)
10879 return rb_str_to_inum(str, -8, FALSE);
10882#ifndef HAVE_CRYPT_R
10887 rb_nativethread_lock_t lock;
10888} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10957# define CRYPT_END() ALLOCV_END(databuf)
10960 extern char *crypt(
const char *,
const char *);
10961# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10964 const char *s, *saltp;
10967 char salt_8bit_clean[3];
10971 mustnot_wchar(str);
10972 mustnot_wchar(salt);
10974 saltp = RSTRING_PTR(salt);
10975 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10976 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10980 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10981 salt_8bit_clean[0] = saltp[0] & 0x7f;
10982 salt_8bit_clean[1] = saltp[1] & 0x7f;
10983 salt_8bit_clean[2] =
'\0';
10984 saltp = salt_8bit_clean;
10989# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10990 data->initialized = 0;
10992 res = crypt_r(s, saltp, data);
10995 res = crypt(s, saltp);
11010 size_t res_size = strlen(res)+1;
11011 tmp_buf =
ALLOCA_N(
char, res_size);
11012 memcpy(tmp_buf, res, res_size);
11049 char *ptr, *p, *pend;
11052 unsigned long sum0 = 0;
11057 ptr = p = RSTRING_PTR(str);
11058 len = RSTRING_LEN(str);
11064 str_mod_check(str, ptr,
len);
11067 sum0 += (
unsigned char)*p;
11078 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
11079 sum0 &= (((
unsigned long)1)<<bits)-1;
11099rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
11103 long width,
len, flen = 1, fclen = 1;
11106 const char *f =
" ";
11107 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11109 int singlebyte = 1, cr;
11113 enc = STR_ENC_GET(str);
11114 termlen = rb_enc_mbminlen(enc);
11118 enc = rb_enc_check(str, pad);
11119 f = RSTRING_PTR(pad);
11120 flen = RSTRING_LEN(pad);
11121 fclen = str_strlen(pad, enc);
11122 singlebyte = single_byte_optimizable(pad);
11123 if (flen == 0 || fclen == 0) {
11124 rb_raise(rb_eArgError,
"zero width padding");
11127 len = str_strlen(str, enc);
11128 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11130 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11134 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11135 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11137 size = RSTRING_LEN(str);
11138 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11139 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11140 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11141 rb_raise(rb_eArgError,
"argument too big");
11145 p = RSTRING_PTR(res);
11147 memset(p, *f, llen);
11151 while (llen >= fclen) {
11157 memcpy(p, f, llen2);
11161 memcpy(p, RSTRING_PTR(str), size);
11164 memset(p, *f, rlen);
11168 while (rlen >= fclen) {
11174 memcpy(p, f, rlen2);
11178 TERM_FILL(p, termlen);
11179 STR_SET_LEN(res, p-RSTRING_PTR(res));
/*
 * Thin wrapper: calls rb_str_justify() with the same argc/argv/str and
 * the justification flag 'l', returning its result.
 */
11200rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11202 return rb_str_justify(argc, argv, str,
'l');
/*
 * Thin wrapper: calls rb_str_justify() with the same argc/argv/str and
 * the justification flag 'r', returning its result.
 */
11214rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11216 return rb_str_justify(argc, argv, str,
'r');
/*
 * Thin wrapper: calls rb_str_justify() with the same argc/argv/str and
 * the justification flag 'c', returning its result.
 */
11229rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11231 return rb_str_justify(argc, argv, str,
'c');
11247 sep = get_pat_quoted(sep, 0);
11259 pos = rb_str_index(str, sep, 0);
11260 if (pos < 0)
goto failed;
11265 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11268 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11282 long pos = RSTRING_LEN(str);
11284 sep = get_pat_quoted(sep, 0);
11297 pos = rb_str_rindex(str, sep, pos);
11306 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11308 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11320rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11324 for (i=0; i<argc; i++) {
11325 VALUE tmp = argv[i];
11327 if (rb_reg_start_with_p(tmp, str))
11331 const char *p, *s, *e;
11336 enc = rb_enc_check(str, tmp);
11337 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11338 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11339 p = RSTRING_PTR(str);
11342 if (!at_char_right_boundary(p, s, e, enc))
11344 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11360rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11364 for (i=0; i<argc; i++) {
11365 VALUE tmp = argv[i];
11366 const char *p, *s, *e;
11371 enc = rb_enc_check(str, tmp);
11372 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11373 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11374 p = RSTRING_PTR(str);
11377 if (!at_char_boundary(p, s, e, enc))
11379 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11395deleted_prefix_length(
VALUE str,
VALUE prefix)
11397 const char *strptr, *prefixptr;
11398 long olen, prefixlen;
11403 if (!is_broken_string(prefix) ||
11404 !rb_enc_asciicompat(enc) ||
11405 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11406 enc = rb_enc_check(str, prefix);
11410 prefixlen = RSTRING_LEN(prefix);
11411 if (prefixlen <= 0)
return 0;
11412 olen = RSTRING_LEN(str);
11413 if (olen < prefixlen)
return 0;
11414 strptr = RSTRING_PTR(str);
11415 prefixptr = RSTRING_PTR(prefix);
11416 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11417 if (is_broken_string(prefix)) {
11418 if (!is_broken_string(str)) {
11422 const char *strend = strptr + olen;
11423 const char *after_prefix = strptr + prefixlen;
11424 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11445rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11448 str_modify_keep_cr(str);
11450 prefixlen = deleted_prefix_length(str, prefix);
11451 if (prefixlen <= 0)
return Qnil;
11465rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11469 prefixlen = deleted_prefix_length(str, prefix);
11470 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11472 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11485deleted_suffix_length(
VALUE str,
VALUE suffix)
11487 const char *strptr, *suffixptr;
11488 long olen, suffixlen;
11492 if (is_broken_string(suffix))
return 0;
11493 enc = rb_enc_check(str, suffix);
11496 suffixlen = RSTRING_LEN(suffix);
11497 if (suffixlen <= 0)
return 0;
11498 olen = RSTRING_LEN(str);
11499 if (olen < suffixlen)
return 0;
11500 strptr = RSTRING_PTR(str);
11501 suffixptr = RSTRING_PTR(suffix);
11502 const char *strend = strptr + olen;
11503 const char *before_suffix = strend - suffixlen;
11504 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11505 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11521rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11523 long olen, suffixlen,
len;
11524 str_modifiable(str);
11526 suffixlen = deleted_suffix_length(str, suffix);
11527 if (suffixlen <= 0)
return Qnil;
11529 olen = RSTRING_LEN(str);
11530 str_modify_keep_cr(str);
11531 len = olen - suffixlen;
11532 STR_SET_LEN(str,
len);
11533 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11549rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11553 suffixlen = deleted_suffix_length(str, suffix);
11554 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11556 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11563 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
/*
 * Emits a deprecation warning through rb_warn_deprecated(), formatting
 * the name obtained from rb_id2str(id) into the "non-nil '...'" message;
 * NULL is passed as the second argument.
 */
11569nil_setter_warning(
ID id)
11571 rb_warn_deprecated(
"non-nil '%"PRIsVALUE
"'", NULL, rb_id2str(
id));
11578 if (!
NIL_P(*var)) {
11579 nil_setter_warning(
id);
11586 val = rb_fs_check(val);
11589 "value of %"PRIsVALUE
" must be String or Regexp",
11593 nil_setter_warning(
id);
11610 str_modifiable(str);
11613 int idx = rb_enc_to_index(encoding);
11620 rb_enc_associate_index(str, idx);
11644 if (STR_EMBED_P(str)) {
11645 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11650 str_replace_shared_without_enc(str2, str);
11652 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11682rb_str_valid_encoding_p(
VALUE str)
11702rb_str_is_ascii_only_p(
VALUE str)
11712 static const char ellipsis[] =
"...";
11713 const long ellipsislen =
sizeof(ellipsis) - 1;
11715 const long blen = RSTRING_LEN(str);
11716 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11717 VALUE estr, ret = 0;
11720 if (
len * rb_enc_mbminlen(enc) >= blen ||
11724 else if (
len <= ellipsislen ||
11726 if (rb_enc_asciicompat(enc)) {
11728 rb_enc_associate(ret, enc);
11735 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11740 rb_enc_from_encoding(enc), 0,
Qnil);
11753 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11759 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11778 if (enc == STR_ENC_GET(str)) {
11783 return enc_str_scrub(enc, str, repl, cr);
11791 const char *rep, *p, *e, *p1, *sp;
11797 rb_raise(rb_eArgError,
"both of block and replacement given");
11804 if (!
NIL_P(repl)) {
11805 repl = str_compat_and_valid(repl, enc);
11808 if (rb_enc_dummy_p(enc)) {
11811 encidx = rb_enc_to_index(enc);
11813#define DEFAULT_REPLACE_CHAR(str) do { \
11814 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11815 rep = replace; replen = (int)sizeof(replace); \
11818 slen = RSTRING_LEN(str);
11819 p = RSTRING_PTR(str);
11824 if (rb_enc_asciicompat(enc)) {
11830 else if (!
NIL_P(repl)) {
11831 rep = RSTRING_PTR(repl);
11832 replen = RSTRING_LEN(repl);
11835 else if (encidx == rb_utf8_encindex()) {
11836 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11840 DEFAULT_REPLACE_CHAR(
"?");
11845 p = search_nonascii(p, e);
11850 int ret = rb_enc_precise_mbclen(p, e, enc);
11869 if (e - p < clen) clen = e - p;
11876 for (; clen > 1; clen--) {
11877 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11888 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11889 str_mod_check(str, sp, slen);
11890 repl = str_compat_and_valid(repl, enc);
11897 p = search_nonascii(p, e);
11923 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11924 str_mod_check(str, sp, slen);
11925 repl = str_compat_and_valid(repl, enc);
11934 long mbminlen = rb_enc_mbminlen(enc);
11938 else if (!
NIL_P(repl)) {
11939 rep = RSTRING_PTR(repl);
11940 replen = RSTRING_LEN(repl);
11942 else if (encidx == ENCINDEX_UTF_16BE) {
11943 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11945 else if (encidx == ENCINDEX_UTF_16LE) {
11946 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11948 else if (encidx == ENCINDEX_UTF_32BE) {
11949 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11951 else if (encidx == ENCINDEX_UTF_32LE) {
11952 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11955 DEFAULT_REPLACE_CHAR(
"?");
11959 int ret = rb_enc_precise_mbclen(p, e, enc);
11972 if (e - p < clen) clen = e - p;
11973 if (clen <= mbminlen * 2) {
11978 for (; clen > mbminlen; clen-=mbminlen) {
11979 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11989 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11990 str_mod_check(str, sp, slen);
11991 repl = str_compat_and_valid(repl, enc);
12016 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
12017 str_mod_check(str, sp, slen);
12018 repl = str_compat_and_valid(repl, enc);
12058str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
12066static ID id_normalize;
12067static ID id_normalized_p;
12068static VALUE mUnicodeNormalize;
12071unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
12073 static int UnicodeNormalizeRequired = 0;
12076 if (!UnicodeNormalizeRequired) {
12077 rb_require(
"unicode_normalize/normalize.rb");
12078 UnicodeNormalizeRequired = 1;
12082 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
/*
 * Returns the result of unicode_normalize_common() called with the
 * caller's argc/argv/str and the method id id_normalize.
 */
12093rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
12095 return unicode_normalize_common(argc, argv, str, id_normalize);
/*
 * Computes the same value as the id_normalize call to
 * unicode_normalize_common() and then passes it to rb_str_replace() with
 * str as the target, returning what rb_str_replace() returns.
 */
12109rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12111 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
/*
 * Returns the result of unicode_normalize_common() called with the
 * caller's argc/argv/str and the method id id_normalized_p.
 */
12138rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12140 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12272#define sym_equal rb_obj_equal
12275sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12279 int c = rb_enc_precise_mbclen(s, send, enc);
12283 c = rb_enc_mbc_to_codepoint(s, send, enc);
12291rb_str_symname_p(
VALUE sym)
12296 rb_encoding *resenc = rb_default_internal_encoding();
12298 if (resenc == NULL) resenc = rb_default_external_encoding();
12299 enc = STR_ENC_GET(sym);
12300 ptr = RSTRING_PTR(sym);
12301 len = RSTRING_LEN(sym);
12302 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12310rb_str_quote_unprintable(
VALUE str)
12318 resenc = rb_default_internal_encoding();
12319 if (resenc == NULL) resenc = rb_default_external_encoding();
12320 enc = STR_ENC_GET(str);
12321 ptr = RSTRING_PTR(str);
12322 len = RSTRING_LEN(str);
12323 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12324 !sym_printable(ptr, ptr +
len, enc)) {
12325 return rb_str_escape(str);
12331rb_id_quote_unprintable(
ID id)
12333 VALUE str = rb_id2str(
id);
12334 if (!rb_str_symname_p(str)) {
12335 return rb_str_escape(str);
12353sym_inspect(
VALUE sym)
12360 if (!rb_str_symname_p(str)) {
12362 len = RSTRING_LEN(str);
12363 rb_str_resize(str,
len + 1);
12364 dest = RSTRING_PTR(str);
12365 memmove(dest + 1, dest,
len);
12369 VALUE orig_str = str;
12371 len = RSTRING_LEN(orig_str);
12372 str = rb_enc_str_new(0,
len + 1, enc);
12375 ptr = RSTRING_PTR(orig_str);
12376 dest = RSTRING_PTR(str);
12377 memcpy(dest + 1, ptr,
len);
12397rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12402 rb_raise(rb_eArgError,
"no receiver given");
12505 return rb_str_match(
rb_sym2str(sym), other);
12520sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12522 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12535sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12537 return rb_str_match_m_p(argc, argv, sym);
12555 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12566sym_length(
VALUE sym)
12580sym_empty(
VALUE sym)
12614sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12630sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12646sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12660sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12662 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12675sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12677 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12689sym_encoding(
VALUE sym)
12695string_for_symbol(
VALUE name)
12700 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12714 name = string_for_symbol(name);
12715 return rb_intern_str(name);
12724 name = string_for_symbol(name);
12748 return rb_fstring(str);
12754 struct RString fake_str = {RBASIC_INIT};
12755 int encidx = ENCINDEX_US_ASCII;
12758 encidx = ENCINDEX_ASCII_8BIT;
12761 VALUE str = setup_fake_str(&fake_str,
ptr,
len, encidx);
12763 return register_fstring(str,
true,
false);
12775 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12776 rb_enc_autoload(enc);
12779 struct RString fake_str = {RBASIC_INIT};
12780 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12786 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12787 rb_enc_autoload(enc);
12790 struct RString fake_str = {RBASIC_INIT};
12791 VALUE str = register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12802#if USE_YJIT || USE_ZJIT
12804rb_jit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12809 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12810 rb_str_buf_cat_byte(str, (
char) code);
12820fstring_set_class_i(
VALUE *str,
void *data)
12824 return ST_CONTINUE;
12832 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12999 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that is checked regardless of RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_cObject
Object class.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously classified as shareable or not.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RString::@53::@55 embed
Embedded contents.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@53 as
String's specific fields.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
struct RString::@53::@54 heap
Strings that use separated memory region for contents use this pattern.
union RString::@53::@54::@56 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.