14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
46#include "ruby_assert.h"
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
/* Entry `no` of the beg[] / end[] arrays of a `regs` pointer.  `regs` is not
 * a macro parameter: it must already be in scope where these are expanded. */
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
62#undef rb_usascii_str_new
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
126#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits, each bound to one of the generic FL_USERn
 * flag slots. */
127#define STR_PRECOMPUTED_HASH FL_USER4
128#define STR_SHARED_ROOT FL_USER5
129#define STR_BORROWED FL_USER6
130#define STR_TMPLOCK FL_USER7
131#define STR_NOFREE FL_USER18
132#define STR_FAKESTR FL_USER19
/* Sets STR_NOEMBED on `str` and clears STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED.  NOTE(review): the closing line of the do-block is not
 * present in this listing. */
134#define STR_SET_NOEMBED(str) do {\
135 FL_SET((str), STR_NOEMBED);\
136 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clears STR_NOEMBED, STR_SHARED and STR_NOFREE on `str` in one FL_UNSET. */
138#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
/* Stores `n` into the len field of `str`.  NOTE(review): only the assignment
 * is visible here; the rest of the do-block is elided from this listing. */
140#define STR_SET_LEN(str, n) do { \
141 RSTRING(str)->len = (n); \
145str_encindex_fastpath(
int encindex)
149 case ENCINDEX_ASCII_8BIT:
151 case ENCINDEX_US_ASCII:
159str_enc_fastpath(
VALUE str)
/* Terminator length for `str`: 1 when str_enc_fastpath(str) holds, otherwise
 * the minimum character length (rb_enc_mbminlen) of str's encoding. */
164#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Writes a terminator at `ptr`: always stores one '\0' byte, and when
 * `termlen` is greater than 1 zero-fills `termlen` bytes starting at `ptr`.
 * Both arguments are copied into locals first, so each is evaluated once. */
165#define TERM_FILL(ptr, termlen) do {\
166 char *const term_fill_ptr = (ptr);\
167 const int term_fill_len = (termlen);\
168 *term_fill_ptr = '\0';\
169 if (UNLIKELY(term_fill_len > 1))\
170 memset(term_fill_ptr, 0, term_fill_len);\
/* Convenience form of RESIZE_CAPA_TERM that computes the terminator length
 * itself via TERM_LEN(str). */
173#define RESIZE_CAPA(str,capacity) do {\
174 const int termlen = TERM_LEN(str);\
175 RESIZE_CAPA_TERM(str,capacity,termlen);\
/* Changes the capacity of `str` to `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * Embedded string whose embed capacity is below capacity + termlen: a heap
 * buffer of that size is allocated, the current bytes are copied into it,
 * the string is switched to non-embedded and aux.capa is set.  The length is
 * read into `tlen` before as.heap.ptr is written and stored back afterwards.
 *
 * The later visible lines assert the string is not STR_SHARED, reallocate
 * as.heap.ptr from STR_HEAP_SIZE(str) to capacity + termlen bytes and update
 * aux.capa.  NOTE(review): the lines joining the two parts (presumably the
 * else branch) are elided from this listing.
 *
 * NOTE(review): `capacity` is not parenthesized in the comparison below,
 * unlike its other uses; an argument containing a lower-precedence operator
 * would be parsed differently there. */
177#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
178 if (STR_EMBED_P(str)) {\
179 if (str_embed_capa(str) < capacity + termlen) {\
180 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
181 const long tlen = RSTRING_LEN(str);\
182 memcpy(tmp, RSTRING_PTR(str), tlen);\
183 RSTRING(str)->as.heap.ptr = tmp;\
184 RSTRING(str)->len = tlen;\
185 STR_SET_NOEMBED(str);\
186 RSTRING(str)->as.heap.aux.capa = (capacity);\
190 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
191 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
192 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
193 RSTRING(str)->as.heap.aux.capa = (capacity);\
/* Links `str` to `shared_str` as the owner of its buffer.
 *
 * Nothing is done when `str` carries STR_FAKESTR.  Otherwise the macro
 * asserts that str's pointer lies within [ptr, ptr + len] of shared_str,
 * stores shared_str into str's aux.shared through RB_OBJ_WRITE, sets
 * STR_SHARED on str and STR_SHARED_ROOT on shared_str, and additionally
 * sets STR_BORROWED on shared_str when its class is 0. */
197#define STR_SET_SHARED(str, shared_str) do { \
198 if (!FL_TEST(str, STR_FAKESTR)) { \
199 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
200 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
201 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
202 FL_SET((str), STR_SHARED); \
203 FL_SET((shared_str), STR_SHARED_ROOT); \
204 if (RBASIC_CLASS((shared_str)) == 0) \
205 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap buffer pointer of a string (the as.heap.ptr field). */
209#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: recorded capacity plus the terminator length. */
210#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Encoding lookup for a string; an alias for get_encoding(). */
213#define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 when not defined beforehand.
 * With it off, SHARABLE_SUBSTRING_P holds only when beg + len equals `end`,
 * i.e. the substring runs to the end; the second definition is always 1.
 * NOTE(review): the #endif/#else lines separating the branches are elided
 * from this listing. */
215#if !defined SHARABLE_MIDDLE_SUBSTRING
216# define SHARABLE_MIDDLE_SUBSTRING 0
218#if !SHARABLE_MIDDLE_SUBSTRING
219#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
221#define SHARABLE_SUBSTRING_P(beg, len, end) 1
/* Number of bytes available for embedded contents in `str`: the size of its
 * GC slot (rb_gc_obj_slot_size) minus the offset of as.embed.ary within
 * struct RString. */
226str_embed_capa(
VALUE str)
228 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/* Returns true when `str` has none of STR_NOFREE, STR_SHARED_ROOT or
 * STR_SHARED set. */
232rb_str_reembeddable_p(
VALUE str)
234 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
238rb_str_embed_size(
long capa)
244rb_str_size_as_embedded(
VALUE str)
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
256 real_size =
sizeof(
struct RString);
260 real_size +=
sizeof(st_index_t);
/* Reports whether a string of `len` bytes plus `termlen` terminator bytes can
 * be embedded: computes rb_str_embed_size(len + termlen) and returns what
 * rb_gc_size_allocatable_p says about that size. */
267STR_EMBEDDABLE_P(
long len,
long termlen)
269 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/* Forward declarations of file-local helpers that are used before their
 * definitions appear later in this file. */
274static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
275static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
277static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
278static inline void str_modifiable(
VALUE str);
/* Makes `str` independent without growing it: forwards its current length
 * and terminator length to str_make_independent_expand with an expansion
 * of 0. */
283str_make_independent(
VALUE str)
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str),
len, 0L, termlen);
290static inline int str_dependent_p(
VALUE str);
/* Calls str_make_independent(str) only when str_dependent_p(str) reports the
 * string as dependent; otherwise does nothing in the visible code. */
293rb_str_make_independent(
VALUE str)
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
301rb_str_make_embedded(
VALUE str)
306 char *buf =
RSTRING(str)->as.heap.ptr;
310 STR_SET_LEN(str,
len);
313 memcpy(RSTRING_PTR(str), buf,
len);
317 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
/* Debugging aid: prints a warning to stderr that something is returning NULL
 * and that a SIGSEGV is likely to follow.  NOTE(review): the fprintf argument
 * list is cut off in this listing; the %s is presumably `func` -- confirm. */
321rb_debug_rstring_null_ptr(
const char *func)
323 fprintf(stderr,
"%s is returning NULL!! "
324 "SIGSEGV is highly expected to follow immediately.\n"
325 "If you could reproduce, attach your debugger here, "
326 "and look at the passed string.\n",
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
334get_encoding(
VALUE str)
/* Guard: raises ArgumentError ("invalid byte sequence in <encoding name>")
 * when is_broken_string(str) is true. */
340mustnot_broken(
VALUE str)
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348mustnot_wchar(
VALUE str)
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
358static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
360#if SIZEOF_LONG == SIZEOF_VOIDP
361#define PRECOMPUTED_FAKESTR_HASH 1
365#ifdef PRECOMPUTED_FAKESTR_HASH
367fstring_hash(
VALUE str)
372 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
382#define fstring_hash rb_str_hash
/* True when `str` does not have the FL_EXIVAR flag set and its class is
 * exactly rb_cString. */
390#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
392static inline st_index_t
393str_do_hash(
VALUE str)
395 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
397 if (e && !is_ascii_string(str)) {
404str_store_precomputed_hash(
VALUE str, st_index_t hash)
410 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
411 size_t free_bytes = str_embed_capa(str) - used_bytes;
415 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
417 FL_SET(str, STR_PRECOMPUTED_HASH);
425 bool force_precompute_hash;
429fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
438 if (rb_objspace_garbage_object_p(str)) {
457 long len = RSTRING_LEN(str);
458 long capa =
len +
sizeof(st_index_t);
459 int term_len = TERM_LEN(str);
461 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
463 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
464 STR_SET_LEN(new_str, RSTRING_LEN(str));
466 rb_enc_copy(new_str, str);
467 str_store_precomputed_hash(new_str, str_do_hash(str));
471 rb_enc_copy(new_str, str);
472#ifdef PRECOMPUTED_FAKESTR_HASH
473 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
474 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
488 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
491 if (STR_SHARED_P(str)) {
493 str_make_independent(str);
496 if (!BARE_STRING_P(str)) {
502 RBASIC(str)->flags |= RSTRING_FSTR;
504 *key = *value = arg->fstr = str;
517 if (
FL_TEST(str, RSTRING_FSTR))
520 bare = BARE_STRING_P(str);
522 if (STR_EMBED_P(str)) {
527 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
534 rb_str_resize(str, RSTRING_LEN(str));
536 fstr = register_fstring(str,
false,
false);
539 str_replace_shared_without_enc(str, fstr);
547register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
551 .force_precompute_hash = force_precompute_hash
554#if SIZEOF_VOIDP == SIZEOF_LONG
558 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
564 st_table *frozen_strings = rb_vm_fstring_table();
567 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
568 }
while (UNDEF_P(args.fstr));
581setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
596 return (
VALUE)fake_str;
605 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
/* Returns the result of register_fstring for the `len` bytes at `ptr`.
 * The bytes are wrapped in a fake string tagged ENCINDEX_US_ASCII, and
 * register_fstring is called with copy and force_precompute_hash both false.
 * NOTE(review): the declaration of `fake_str` is elided from this listing. */
614rb_fstring_new(
const char *ptr,
long len)
617 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
624 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
/* NUL-terminated variant of rb_fstring_new: measures `ptr` with strlen and
 * forwards both to rb_fstring_new.  `ptr` must therefore be a valid C string. */
628rb_fstring_cstr(
const char *
ptr)
630 return rb_fstring_new(
ptr, strlen(
ptr));
634fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
644 const char *aptr, *bptr;
647 return (alen != blen ||
649 memcmp(aptr, bptr, alen) != 0);
653single_byte_optimizable(
VALUE str)
657 case ENCINDEX_ASCII_8BIT:
658 case ENCINDEX_US_ASCII:
/* Searches the byte range [p, e) for a byte with the high bit (0x80) set and
 * returns a pointer to it.
 *
 * Visible structure:
 *  - NONASCII_MASK is defined as 0x80 repeated in every byte of a uintptr_t
 *    (8- and 4-byte variants; any other size is a compile-time #error).
 *  - When unaligned word access is allowed, or at least SIZEOF_VOIDP bytes
 *    remain, the scan works a word at a time.  Without unaligned access the
 *    bytes before the first word boundary are tested individually
 *    (the `p[-k] & 0x80` cases).
 *  - A word that intersects NONASCII_MASK pinpoints its first offending byte
 *    with nlz_intptr (big-endian) or ntz_intptr (otherwise), shifted by 3 to
 *    turn a bit index into a byte index.
 *  - The remaining tail bytes are tested individually (the `e[-k] & 0x80`
 *    cases).
 *
 * NOTE(review): the switch headers, the word loop statement, the pointer
 * adjustments and the final return are elided from this listing; the result
 * for an all-ASCII range is presumably NULL -- confirm against the full
 * source. */
680static inline const char *
681search_nonascii(
const char *p,
const char *e)
683 const uintptr_t *s, *t;
685#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
686# if SIZEOF_UINTPTR_T == 8
687# define NONASCII_MASK UINT64_C(0x8080808080808080)
688# elif SIZEOF_UINTPTR_T == 4
689# define NONASCII_MASK UINT32_C(0x80808080)
691# error "don't know what to do."
694# if SIZEOF_UINTPTR_T == 8
695# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
696# elif SIZEOF_UINTPTR_T == 4
697# define NONASCII_MASK 0x80808080UL
699# error "don't know what to do."
703 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
704#if !UNALIGNED_WORD_ACCESS
705 if ((uintptr_t)p % SIZEOF_VOIDP) {
706 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
711 case 7:
if (p[-7]&0x80)
return p-7;
712 case 6:
if (p[-6]&0x80)
return p-6;
713 case 5:
if (p[-5]&0x80)
return p-5;
714 case 4:
if (p[-4]&0x80)
return p-4;
716 case 3:
if (p[-3]&0x80)
return p-3;
717 case 2:
if (p[-2]&0x80)
return p-2;
718 case 1:
if (p[-1]&0x80)
return p-1;
723#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
724#define aligned_ptr(value) \
725 __builtin_assume_aligned((value), sizeof(uintptr_t))
727#define aligned_ptr(value) (uintptr_t *)(value)
730 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
733 if (*s & NONASCII_MASK) {
734#ifdef WORDS_BIGENDIAN
735 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
737 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
747 case 7:
if (e[-7]&0x80)
return e-7;
748 case 6:
if (e[-6]&0x80)
return e-6;
749 case 5:
if (e[-5]&0x80)
return e-5;
750 case 4:
if (e[-4]&0x80)
return e-4;
752 case 3:
if (e[-3]&0x80)
return e-3;
753 case 2:
if (e[-2]&0x80)
return e-2;
754 case 1:
if (e[-1]&0x80)
return e-1;
762 const char *e = p +
len;
764 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
766 p = search_nonascii(p, e);
770 if (rb_enc_asciicompat(enc)) {
771 p = search_nonascii(p, e);
774 int ret = rb_enc_precise_mbclen(p, e, enc);
778 p = search_nonascii(p, e);
784 int ret = rb_enc_precise_mbclen(p, e, enc);
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
803 p = search_nonascii(p, e);
807 else if (rb_enc_asciicompat(enc)) {
808 p = search_nonascii(p, e);
814 int ret = rb_enc_precise_mbclen(p, e, enc);
821 p = search_nonascii(p, e);
827 int ret = rb_enc_precise_mbclen(p, e, enc);
852 rb_enc_set_index(str1, rb_enc_get_index(str2));
860rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
865 str_enc_copy(dest, src);
866 if (RSTRING_LEN(dest) == 0) {
867 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
878 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
879 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
890rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
892 str_enc_copy(dest, src);
899 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
905 return enc_coderange_scan(str, enc);
914 cr = enc_coderange_scan(str, get_encoding(str));
/* Returns true when `encindex` passes str_encindex_fastpath, or when the
 * encoding looked up from it is ASCII-compatible.  NOTE(review): the
 * initialisation of `encindex` is elided here; presumably it is the encoding
 * index of `str` -- confirm. */
921rb_enc_str_asciicompat(
VALUE str)
924 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
932 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
941str_mod_check(
VALUE s,
const char *p,
long len)
943 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/* Capacity of `str` excluding a terminator of `termlen` bytes.
 *  - embedded: the embed capacity minus termlen;
 *  - STR_SHARED or STR_NOFREE set: NOTE(review) the value returned by this
 *    branch is elided from this listing;
 *  - otherwise: the recorded heap capacity (aux.capa). */
949str_capacity(
VALUE str,
const int termlen)
951 if (STR_EMBED_P(str)) {
952 return str_embed_capa(str) - termlen;
954 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
958 return RSTRING(str)->as.heap.aux.capa;
965 return str_capacity(str, TERM_LEN(str));
969must_not_null(
const char *
ptr)
972 rb_raise(rb_eArgError,
"NULL pointer given");
979 size_t size = rb_str_embed_size(
capa);
983 NEWOBJ_OF(str,
struct RString, klass,
990str_alloc_heap(
VALUE klass)
992 NEWOBJ_OF(str,
struct RString, klass,
/* Allocates an empty embedded string of class `klass`: fires the DTrace
 * string-creation hook with length 0, allocates with capacity 0, then zeroes
 * the whole embed area (str_embed_capa bytes), not just the first byte. */
999empty_str_alloc(
VALUE klass)
1001 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1002 VALUE str = str_alloc_embed(klass, 0);
1003 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1014 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1018 enc = rb_ascii8bit_encoding();
1021 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1023 int termlen = rb_enc_mbminlen(enc);
1025 if (STR_EMBEDDABLE_P(
len, termlen)) {
1026 str = str_alloc_embed(klass,
len + termlen);
1032 str = str_alloc_heap(klass);
1038 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1041 rb_enc_raw_set(str, enc);
1044 memcpy(RSTRING_PTR(str),
ptr,
len);
1047 STR_SET_LEN(str,
len);
1048 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1055 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1090 __msan_unpoison_string(
ptr);
1110 if (rb_enc_mbminlen(enc) != 1) {
1111 rb_raise(rb_eArgError,
"wchar encoding given");
1113 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1117str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1122 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1126 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1129 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1130 str = str_alloc_heap(klass);
1134 RBASIC(str)->flags |= STR_NOFREE;
1135 rb_enc_associate_index(str, encindex);
1164static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1166 int ecflags,
VALUE ecopts);
1171 int encidx = rb_enc_to_index(enc);
1172 if (rb_enc_get_index(str) == encidx)
1173 return is_ascii_string(str);
1184 if (!to)
return str;
1185 if (!from) from = rb_enc_get(str);
1186 if (from == to)
return str;
1187 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1188 rb_is_ascii8bit_enc(to)) {
1189 if (STR_ENC_GET(str) != to) {
1191 rb_enc_associate(str, to);
1198 from, to, ecflags, ecopts);
1199 if (
NIL_P(newstr)) {
1207rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1212 olen = RSTRING_LEN(newstr);
1213 if (ofs < -olen || olen < ofs)
1215 if (ofs < 0) ofs += olen;
1217 STR_SET_LEN(newstr, ofs);
1221 rb_str_modify(newstr);
1222 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1230 STR_SET_LEN(str, 0);
1231 rb_enc_associate(str, enc);
1237str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1239 int ecflags,
VALUE ecopts)
1244 VALUE econv_wrapper;
1245 const unsigned char *start, *sp;
1246 unsigned char *dest, *dp;
1247 size_t converted_output = (size_t)ofs;
1252 RBASIC_CLEAR_CLASS(econv_wrapper);
1254 if (!ec)
return Qnil;
1257 sp = (
unsigned char*)
ptr;
1259 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1260 (dp = dest + converted_output),
1264 size_t converted_input = sp - start;
1265 size_t rest =
len - converted_input;
1266 converted_output = dp - dest;
1268 if (converted_input && converted_output &&
1269 rest < (LONG_MAX / converted_output)) {
1270 rest = (rest * converted_output) / converted_input;
1275 olen += rest < 2 ? 2 : rest;
1276 rb_str_resize(newstr, olen);
1283 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1285 rb_enc_associate(newstr, to);
1304 const int eidx = rb_enc_to_index(eenc);
1307 return rb_enc_str_new(
ptr,
len, eenc);
1311 if ((eidx == rb_ascii8bit_encindex()) ||
1312 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1316 ienc = rb_default_internal_encoding();
1317 if (!ienc || eenc == ienc) {
1318 return rb_enc_str_new(
ptr,
len, eenc);
1322 if ((eidx == rb_ascii8bit_encindex()) ||
1323 (eidx == rb_usascii_encindex()) ||
1324 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1325 return rb_enc_str_new(
ptr,
len, ienc);
1328 str = rb_enc_str_new(NULL, 0, ienc);
1331 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1332 rb_str_initialize(str,
ptr,
len, eenc);
1340 int eidx = rb_enc_to_index(eenc);
1341 if (eidx == rb_usascii_encindex() &&
1342 !is_ascii_string(str)) {
1343 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1346 rb_enc_associate_index(str, eidx);
1405str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1407 const int termlen = TERM_LEN(str);
1412 if (str_embed_capa(str2) >=
len + termlen) {
1413 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1414 STR_SET_EMBED(str2);
1415 memcpy(ptr2, RSTRING_PTR(str),
len);
1416 TERM_FILL(ptr2+
len, termlen);
1420 if (STR_SHARED_P(str)) {
1421 root =
RSTRING(str)->as.heap.aux.shared;
1430 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1432 rb_fatal(
"about to free a possible shared root");
1434 char *ptr2 = STR_HEAP_PTR(str2);
1436 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1439 FL_SET(str2, STR_NOEMBED);
1441 STR_SET_SHARED(str2, root);
1444 STR_SET_LEN(str2,
len);
1452 str_replace_shared_without_enc(str2, str);
1453 rb_enc_cr_str_exact_copy(str2, str);
1460 return str_replace_shared(str_alloc_heap(klass), str);
1477rb_str_new_frozen_String(
VALUE orig)
1485rb_str_frozen_bare_string(
VALUE orig)
1487 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
/* The visible return builds a frozen buffer for `orig` via
 * str_new_frozen_buffer with klass 0 and copy_encoding FALSE.
 * NOTE(review): a line between the header and this return is elided, so an
 * earlier exit may exist. */
1492rb_str_tmp_frozen_acquire(
VALUE orig)
1495 return str_new_frozen_buffer(0, orig, FALSE);
1499rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1501 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1502 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1504 VALUE str = str_alloc_heap(0);
1507 FL_SET(str, STR_SHARED_ROOT);
1509 size_t capa = str_capacity(orig, TERM_LEN(orig));
1515 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1516 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1523 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1524 RBASIC(orig)->flags &= ~STR_NOFREE;
1525 STR_SET_SHARED(orig, str);
1535rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1540 if (STR_EMBED_P(tmp)) {
1549 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1553 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1554 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1559 STR_SET_LEN(tmp, 0);
1567 return str_new_frozen_buffer(klass, orig, TRUE);
1576 VALUE str = str_alloc_heap(klass);
1577 STR_SET_LEN(str, RSTRING_LEN(orig));
1578 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1579 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1580 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1581 RBASIC(orig)->flags &= ~STR_NOFREE;
1582 STR_SET_SHARED(orig, str);
1589str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1593 long len = RSTRING_LEN(orig);
1594 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1595 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1597 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1598 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1604 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1605 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1611 if ((ofs > 0) || (rest > 0) ||
1614 str = str_new_shared(klass,
shared);
1616 RSTRING(str)->as.heap.ptr += ofs;
1617 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1625 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1626 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1628 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1629 STR_SET_LEN(str, RSTRING_LEN(orig));
1634 str = heap_str_make_shared(klass, orig);
1638 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1650str_new_empty_String(
VALUE str)
1653 rb_enc_copy(v, str);
/* Lower bound applied to requested buffer capacities: elsewhere in this file
 * a `capa` below this value is raised to it. */
1657#define STR_BUF_MIN_SIZE 63
1662 if (STR_EMBEDDABLE_P(
capa, 1)) {
1670 RSTRING(str)->as.heap.ptr[0] =
'\0';
1690 return str_new(0, 0,
len);
1696 if (STR_EMBED_P(str)) {
1697 RB_DEBUG_COUNTER_INC(obj_str_embed);
1699 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1700 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1701 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1704 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1705 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/* Reports STR_HEAP_SIZE(str) when, of STR_NOEMBED, STR_SHARED and STR_NOFREE,
 * only STR_NOEMBED is set.  NOTE(review): the return for every other case is
 * elided from this listing. */
1710rb_str_memsize(
VALUE str)
1712 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1713 return STR_HEAP_SIZE(str);
1723 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1726static inline void str_discard(
VALUE str);
1727static void str_shared_replace(
VALUE str,
VALUE str2);
1732 if (str != str2) str_shared_replace(str, str2);
1743 enc = STR_ENC_GET(str2);
1746 termlen = rb_enc_mbminlen(enc);
1748 STR_SET_LEN(str, RSTRING_LEN(str2));
1750 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1752 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1753 rb_enc_associate(str, enc);
1757 if (STR_EMBED_P(str2)) {
1759 long len = RSTRING_LEN(str2);
1762 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1763 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1764 RSTRING(str2)->as.heap.ptr = new_ptr;
1765 STR_SET_LEN(str2,
len);
1767 STR_SET_NOEMBED(str2);
1770 STR_SET_NOEMBED(str);
1772 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1774 if (
FL_TEST(str2, STR_SHARED)) {
1776 STR_SET_SHARED(str,
shared);
1779 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1783 STR_SET_EMBED(str2);
1784 RSTRING_PTR(str2)[0] = 0;
1785 STR_SET_LEN(str2, 0);
1786 rb_enc_associate(str, enc);
1800 return rb_obj_as_string_result(str, obj);
1816 len = RSTRING_LEN(str2);
1817 if (STR_SHARED_P(str2)) {
1820 STR_SET_NOEMBED(str);
1821 STR_SET_LEN(str,
len);
1822 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1823 STR_SET_SHARED(str,
shared);
1824 rb_enc_cr_str_exact_copy(str, str2);
1827 str_replace_shared(str, str2);
1836 size_t size = rb_str_embed_size(
capa);
1840 NEWOBJ_OF(str,
struct RString, klass,
1849 NEWOBJ_OF(str,
struct RString, klass,
1860 encidx = rb_enc_get_index(str);
1861 flags &= ~ENCODING_MASK;
1864 if (encidx) rb_enc_associate_index(dup, encidx);
1874 long len = RSTRING_LEN(str);
1879 STR_SET_LEN(dup, RSTRING_LEN(str));
1880 return str_duplicate_setup_encoding(str, dup, flags);
1889 root =
RSTRING(str)->as.heap.aux.shared;
1891 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1892 root = str = str_new_frozen(klass, str);
1898 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1899 FL_SET(root, STR_SHARED_ROOT);
1901 flags |= RSTRING_NOEMBED | STR_SHARED;
1903 STR_SET_LEN(dup, RSTRING_LEN(str));
1904 return str_duplicate_setup_encoding(str, dup, flags);
1910 if (STR_EMBED_P(str)) {
1911 return str_duplicate_setup_embed(klass, str, dup);
1914 return str_duplicate_setup_heap(klass, str, dup);
1922 if (STR_EMBED_P(str)) {
1923 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1926 dup = str_alloc_heap(klass);
1929 return str_duplicate_setup(klass, str, dup);
1940rb_str_dup_m(
VALUE str)
1942 if (LIKELY(BARE_STRING_P(str))) {
1953 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1960 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1964 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1965 str_duplicate_setup_embed(klass, str, new_str);
1968 new_str = ec_str_alloc_heap(ec, klass);
1969 str_duplicate_setup_heap(klass, str, new_str);
/* Attaches creation-site information to `str`: builds a two-element array
 * [path, line-as-Fixnum], freezes it and stores it in the instance variable
 * identified by id_debug_created_info.  NOTE(review): the return statement is
 * elided from this listing. */
1978rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
1980 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
1982 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1999 static ID keyword_ids[2];
2000 VALUE orig, opt, venc, vcapa;
2005 if (!keyword_ids[0]) {
2006 keyword_ids[0] = rb_id_encoding();
2007 CONST_ID(keyword_ids[1],
"capacity");
2015 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2016 enc = rb_to_encoding(venc);
2018 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2021 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2023 if (
capa < STR_BUF_MIN_SIZE) {
2024 capa = STR_BUF_MIN_SIZE;
2028 len = RSTRING_LEN(orig);
2032 if (orig == str) n = 0;
2034 str_modifiable(str);
2035 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2037 const size_t size = (size_t)
capa + termlen;
2038 const char *
const old_ptr = RSTRING_PTR(str);
2039 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2040 char *new_ptr =
ALLOC_N(
char, size);
2041 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2042 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2044 RSTRING(str)->as.heap.ptr = new_ptr;
2046 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2047 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2048 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2050 STR_SET_LEN(str,
len);
2053 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2054 rb_enc_cr_str_exact_copy(str, orig);
2056 FL_SET(str, STR_NOEMBED);
2063 rb_enc_associate(str, enc);
2075rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2081 static ID keyword_ids[2];
2091 keyword_ids[0] = rb_id_encoding();
2092 CONST_ID(keyword_ids[1],
"capacity");
2094 encoding = kwargs[0];
2095 capacity = kwargs[1];
2104 if (UNDEF_P(encoding)) {
2106 encoding = rb_obj_encoding(orig);
2110 if (!UNDEF_P(encoding)) {
2111 enc = rb_to_encoding(encoding);
2115 if (UNDEF_P(capacity)) {
2117 VALUE empty_str = str_new(klass,
"", 0);
2119 rb_enc_associate(empty_str, enc);
2123 VALUE copy = str_duplicate(klass, orig);
2124 rb_enc_associate(copy, enc);
2137 if (orig_capa >
capa) {
2142 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2143 STR_SET_LEN(str, 0);
/* True for every byte whose top two bits are not binary 10, i.e. any byte
 * that is not of the form 10xxxxxx. */
2154#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2169static inline uintptr_t
2170count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2175 d = (d>>6) | (~d>>7);
2176 d &= NONASCII_MASK >> 7;
2179#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2181 return rb_popcount_intptr(d);
2185# if SIZEOF_VOIDP == 8
2194enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2200 long diff = (long)(e - p);
2201 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2206 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2207 const uintptr_t *s, *t;
2208 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2209 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2210 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2211 while (p < (
const char *)s) {
2212 if (is_utf8_lead_byte(*p))
len++;
2216 len += count_utf8_lead_bytes_with_word(s);
2219 p = (
const char *)s;
2222 if (is_utf8_lead_byte(*p))
len++;
2228 else if (rb_enc_asciicompat(enc)) {
2233 q = search_nonascii(p, e);
2239 p += rb_enc_fast_mbclen(p, e, enc);
2246 q = search_nonascii(p, e);
2252 p += rb_enc_mbclen(p, e, enc);
2259 for (c=0; p<e; c++) {
2260 p += rb_enc_mbclen(p, e, enc);
2275rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2283 long diff = (long)(e - p);
2284 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2286 else if (rb_enc_asciicompat(enc)) {
2290 q = search_nonascii(p, e);
2298 ret = rb_enc_precise_mbclen(p, e, enc);
2313 for (c=0; p<e; c++) {
2314 ret = rb_enc_precise_mbclen(p, e, enc);
2321 if (p + rb_enc_mbminlen(enc) <= e)
2322 p += rb_enc_mbminlen(enc);
2338 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2339 if (!enc) enc = STR_ENC_GET(str);
2340 p = RSTRING_PTR(str);
2345 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2350 return enc_strlen(p, e, enc, cr);
2357 return str_strlen(str, NULL);
2371 return LONG2NUM(str_strlen(str, NULL));
2383rb_str_bytesize(
VALUE str)
/* Returns the Ruby boolean for "byte length of str is zero". */
2401rb_str_empty(
VALUE str)
2403 return RBOOL(RSTRING_LEN(str) == 0);
2421 char *ptr1, *ptr2, *ptr3;
2426 enc = rb_enc_check_str(str1, str2);
2429 termlen = rb_enc_mbminlen(enc);
2430 if (len1 > LONG_MAX - len2) {
2431 rb_raise(rb_eArgError,
"string size too big");
2433 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2434 ptr3 = RSTRING_PTR(str3);
2435 memcpy(ptr3, ptr1, len1);
2436 memcpy(ptr3+len1, ptr2, len2);
2437 TERM_FILL(&ptr3[len1+len2], termlen);
2453 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2456 int enc1 = rb_enc_get_index(str1);
2457 int enc2 = rb_enc_get_index(str2);
2462 else if (enc2 < 0) {
2465 else if (enc1 != enc2) {
2468 else if (len1 > LONG_MAX - len2) {
2501 rb_enc_copy(str2, str);
2506 rb_raise(rb_eArgError,
"negative argument");
2508 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2509 if (STR_EMBEDDABLE_P(
len, 1)) {
2511 memset(RSTRING_PTR(str2), 0,
len + 1);
2518 STR_SET_LEN(str2,
len);
2519 rb_enc_copy(str2, str);
2522 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2523 rb_raise(rb_eArgError,
"argument too big");
2526 len *= RSTRING_LEN(str);
2527 termlen = TERM_LEN(str);
2529 ptr2 = RSTRING_PTR(str2);
2531 n = RSTRING_LEN(str);
2532 memcpy(ptr2, RSTRING_PTR(str), n);
2533 while (n <=
len/2) {
2534 memcpy(ptr2 + n, ptr2, n);
2537 memcpy(ptr2 + n, ptr2,
len-n);
2539 STR_SET_LEN(str2,
len);
2540 TERM_FILL(&ptr2[
len], termlen);
2541 rb_enc_cr_str_copy_for_substr(str2, str);
2567 VALUE tmp = rb_check_array_type(arg);
2576rb_check_lockedtmp(
VALUE str)
2578 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags that, when set, mean a string must be checked before modification:
 * frozen, temporarily locked, or chilled. */
2585#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
/* Enforces that `str` may be modified.  Work is done only when at least one
 * STR_UNMODIFIABLE_MASK flag is set: a chilled string is passed to
 * CHILLED_STRING_MUTATED, then rb_check_lockedtmp and rb_check_frozen run
 * (the names suggest they raise; their bodies are not fully shown here). */
2587str_modifiable(
VALUE str)
2589 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2590 if (CHILLED_STRING_P(str)) {
2591 CHILLED_STRING_MUTATED(str);
2593 rb_check_lockedtmp(str);
2594 rb_check_frozen(str);
2599str_dependent_p(
VALUE str)
2601 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_UNMODIFIABLE_MASK extended with the two flags that mark a buffer as not
 * exclusively owned: STR_SHARED and STR_NOFREE. */
2611#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
/* When any STR_DEPENDANT_MASK flag is set, first enforces modifiability
 * (str_modifiable) and then returns whether str is NOT dependent.
 * NOTE(review): the return for the case where no flag is set is elided from
 * this listing. */
2613str_independent(
VALUE str)
2615 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2616 str_modifiable(str);
2617 return !str_dependent_p(str);
2623str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2631 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2636 STR_SET_LEN(str,
len);
2641 oldptr = RSTRING_PTR(str);
2643 memcpy(
ptr, oldptr,
len);
2645 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2648 STR_SET_NOEMBED(str);
2649 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2650 TERM_FILL(
ptr +
len, termlen);
2652 STR_SET_LEN(str,
len);
2659 if (!str_independent(str))
2660 str_make_independent(str);
2667 int termlen = TERM_LEN(str);
2668 long len = RSTRING_LEN(str);
2671 rb_raise(rb_eArgError,
"negative expanding string size");
2673 if (expand >= LONG_MAX -
len) {
2674 rb_raise(rb_eArgError,
"string size too big");
2677 if (!str_independent(str)) {
2678 str_make_independent_expand(str,
len, expand, termlen);
2680 else if (expand > 0) {
2681 RESIZE_CAPA_TERM(str,
len + expand, termlen);
/* Ensures `str` owns its buffer before modification: if str_independent(str)
 * is false the string is made independent.  NOTE(review): any statements
 * after this check are elided from this listing. */
2688str_modify_keep_cr(
VALUE str)
2690 if (!str_independent(str))
2691 str_make_independent(str);
/* Drops the contents of `str`.  After the modifiability check, a string that
 * is neither embedded nor STR_SHARED/STR_NOFREE has its heap buffer freed
 * (with its recorded size), its heap pointer cleared and its length set
 * to 0.  Strings that do not own a heap buffer are left untouched by the
 * visible code. */
2698str_discard(
VALUE str)
2700 str_modifiable(str);
2701 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2702 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2703 RSTRING(str)->as.heap.ptr = 0;
2704 STR_SET_LEN(str, 0);
2711 int encindex = rb_enc_get_index(str);
2713 if (RB_UNLIKELY(encindex == -1)) {
2717 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2722 if (!rb_enc_asciicompat(enc)) {
2742 return RSTRING_PTR(str);
/*
 * Walks +n+ bytes starting at +s+.  The loop body is not part of this
 * excerpt; callers in this file treat a true result as "these bytes are
 * all zero" (TODO confirm against the full source).
 */
2746zero_filled(
const char *s,
int n)
2748 for (; n > 0; --n) {
/*
 * Looks through the +len+ bytes at +s+ for a position whose first
 * +minlen+ bytes satisfy zero_filled().  The scan advances one character
 * at a time using rb_enc_mbclen() and stops once fewer than +minlen+
 * bytes remain.  Returns the matching position; the not-found return
 * value is outside this excerpt.
 */
2755str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2757 const char *e = s +
len;
2759 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2760 if (zero_filled(s, minlen))
return s;
/*
 * Makes sure +termlen+ terminator bytes follow the +len+ content bytes
 * at +s+.
 *
 * For a dependent string (str_dependent_p) the buffer is left alone if
 * those bytes already satisfy zero_filled(); otherwise the string is
 * made independent through str_make_independent_expand().  TERM_FILL is
 * also applied at s + len; the exact branch structure between these
 * statements is not fully visible here.  The visible return yields
 * RSTRING_PTR(str).
 */
2766str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2771 if (str_dependent_p(str)) {
2772 if (!zero_filled(s +
len, termlen))
2773 str_make_independent_expand(str,
len, 0L, termlen);
2776 TERM_FILL(s +
len, termlen);
2779 return RSTRING_PTR(str);
/*
 * Adapts the buffer of +str+ when its terminator width changes from
 * +oldtermlen+ to +termlen+ bytes.
 *
 * +capa+ is the string capacity plus the old terminator.  In the first
 * visible branch (its condition is not shown) the temporary lock is
 * checked and the string is rebuilt with str_make_independent_expand().
 * A dependent string is rebuilt only when the terminator grows.  When
 * termlen > oldtermlen the new terminator is written after the content.
 */
2783rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2785 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2786 long len = RSTRING_LEN(str);
2790 rb_check_lockedtmp(str);
2791 str_make_independent_expand(str,
len, 0L, termlen);
2793 else if (str_dependent_p(str)) {
2794 if (termlen > oldtermlen)
2795 str_make_independent_expand(str,
len, 0L, termlen);
2798 if (!STR_EMBED_P(str)) {
2803 if (termlen > oldtermlen) {
2804 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
/*
 * Produces a terminated pointer to the contents of +str+ while looking
 * for embedded NULs.
 *
 * Two checks are visible: one using str_null_char() with the encoding's
 * minimum character length, and one using memchr() for a zero byte (also
 * taken when the pointer is NULL).  What is returned when a NUL is found,
 * and what is stored through +w+, is not part of this excerpt --
 * presumably +w+ tells callers which check applied (TODO confirm).
 * On the visible success paths the terminator is ensured with
 * str_fill_term().
 */
2812str_null_check(
VALUE str,
int *w)
2814 char *s = RSTRING_PTR(str);
2815 long len = RSTRING_LEN(str);
2817 const int minlen = rb_enc_mbminlen(enc);
2821 if (str_null_char(s,
len, minlen, enc)) {
2824 return str_fill_term(str, s,
len, minlen);
2827 if (!s || memchr(s, 0,
len)) {
2831 s = str_fill_term(str, s,
len, minlen);
/* Returns the result of str_null_check() for +str+; the declaration of
 * the local +w+ passed by address is not visible in this excerpt. */
2837rb_str_to_cstr(
VALUE str)
2840 return str_null_check(str, &w);
2848 char *s = str_null_check(str, &w);
2851 rb_raise(rb_eArgError,
"string contains null char");
2853 rb_raise(rb_eArgError,
"string contains null byte");
/*
 * Ensures a terminator of +newminlen+ bytes follows the current contents
 * of +str+ by delegating to str_fill_term(), and returns that function's
 * result.
 */
2859rb_str_fill_terminator(
VALUE str,
const int newminlen)
2861 char *s = RSTRING_PTR(str);
2862 long len = RSTRING_LEN(str);
2863 return str_fill_term(str, s,
len, newminlen);
2869 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
/*
 * Advances from +p+ towards +e+ by a number of characters taken from
 * *nthp.
 *
 * Visible branches: for ASCII-compatible encodings the loop uses
 * search_nonascii() to find the next non-ASCII byte and rb_enc_mbclen()
 * to step over characters; the generic loop steps with rb_enc_mbclen()
 * while characters remain to be skipped.  The first branch and the
 * way *nthp is written back are outside this excerpt; a caller in
 * this file tests the count for "> 0" afterwards, so it presumably holds
 * the number of characters that could not be skipped (TODO confirm).
 */
2895str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2904 else if (rb_enc_asciicompat(enc)) {
2905 const char *p2, *e2;
2908 while (p < e && 0 < nth) {
2915 p2 = search_nonascii(p, e2);
2924 n = rb_enc_mbclen(p, e, enc);
2935 while (p < e && nth--) {
2936 p += rb_enc_mbclen(p, e, enc);
2947 return str_nth_len(p, e, &nth, enc);
/*
 * Locates the +nth+ character between +p+ and +e+.  The only visible
 * statement delegates to str_nth_len(); how +singlebyte+ is used and what
 * is returned are not part of this excerpt.
 */
2951str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2956 p = str_nth_len(p, e, &nth, enc);
/*
 * Byte offset, relative to +p+, of the +nth+ character as located by
 * str_nth().  When str_nth() yields NULL the whole span (e - p) is
 * returned; the return for the non-NULL case is outside this excerpt.
 */
2965str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2967 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2968 if (!pp)
return e - p;
2975 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
2976 STR_ENC_GET(str), single_byte_optimizable(str));
/*
 * UTF-8 specific character skipping between +p+ and +e+, driven by the
 * count passed through +nthp+.
 *
 * When both the remaining byte count and the character count exceed
 * twice the pointer size, a word-at-a-time path is used: bytes up to the
 * next word boundary are handled one by one, then whole words are
 * consumed with count_utf8_lead_bytes_with_word() while at least
 * SIZEOF_VOIDP characters remain to be skipped.  A final byte loop
 * stops at a lead byte once the count reaches 0.
 */
2981str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2984 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2985 const uintptr_t *s, *t;
2986 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2987 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2988 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2989 while (p < (
const char *)s) {
2990 if (is_utf8_lead_byte(*p)) nth--;
2994 nth -= count_utf8_lead_bytes_with_word(s);
2996 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3000 if (is_utf8_lead_byte(*p)) {
3001 if (nth == 0)
break;
/* UTF-8 counterpart of str_offset(): locates the +nth+ character with
 * str_utf8_nth().  The computation of the returned offset is not visible
 * in this excerpt. */
3011str_utf8_offset(
const char *p,
const char *e,
long nth)
3013 const char *pp = str_utf8_nth(p, e, &nth);
3022 if (single_byte_optimizable(str) || pos < 0)
3025 char *p = RSTRING_PTR(str);
3026 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
/*
 * Builds a new string +str2+ holding +len+ bytes of +str+ starting at
 * byte offset +beg+.
 *
 * Visible paths: a test on SHARABLE_SUBSTRING_P (its body is not shown);
 * a copy into the embed area of +str2+ when len + terminator fits there;
 * and a sharing path where +str2+ takes over the buffer of +str+ via
 * str_replace_shared(), its heap pointer is advanced by +beg+ and its
 * length is cut down to +len+ if it is longer.
 */
3031str_subseq(
VALUE str,
long beg,
long len)
3039 const int termlen = TERM_LEN(str);
3040 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3047 if (str_embed_capa(str2) >=
len + termlen) {
3048 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3049 STR_SET_EMBED(str2);
3050 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3051 TERM_FILL(ptr2+
len, termlen);
3053 STR_SET_LEN(str2,
len);
3057 str_replace_shared(str2, str);
3060 RSTRING(str2)->as.heap.ptr += beg;
3061 if (RSTRING_LEN(str2) >
len) {
3062 STR_SET_LEN(str2,
len);
3072 VALUE str2 = str_subseq(str, beg,
len);
3073 rb_enc_cr_str_copy_for_substr(str2, str);
3082 const long blen = RSTRING_LEN(str);
3084 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3086 if (
len < 0)
return 0;
3087 if (beg < 0 && -beg < 0)
return 0;
3091 if (single_byte_optimizable(str)) {
3092 if (beg > blen)
return 0;
3095 if (beg < 0)
return 0;
3097 if (
len > blen - beg)
3099 if (
len < 0)
return 0;
3104 if (
len > -beg)
len = -beg;
3108 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3111 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3117 slen = str_strlen(str, enc);
3119 if (beg < 0)
return 0;
3121 if (
len == 0)
goto end;
3124 else if (beg > 0 && beg > blen) {
3128 if (beg > str_strlen(str, enc))
return 0;
3133 enc == rb_utf8_encoding()) {
3134 p = str_utf8_nth(s, e, &beg);
3135 if (beg > 0)
return 0;
3136 len = str_utf8_offset(p, e,
len);
3142 p = s + beg * char_sz;
3146 else if (
len * char_sz > e - p)
3151 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3152 if (beg > 0)
return 0;
3156 len = str_offset(p, e,
len, enc, 0);
/* Forward declaration so str_substr() can be called before its body. */
3164static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3169 return str_substr(str, beg,
len, TRUE);
/*
 * Returns a substring of +str+, or Qnil.
 *
 * Qnil is returned when +p+ is NULL (the statement that computes +p+ is
 * not part of this excerpt) and when +len+ is 0 while +empty+ is false.
 * Otherwise +p+ is converted to a byte offset, the substring is built
 * with str_subseq() and rb_enc_cr_str_copy_for_substr() is applied to it.
 */
3179str_substr(
VALUE str,
long beg,
long len,
int empty)
3183 if (!p)
return Qnil;
3184 if (!
len && !empty)
return Qnil;
3186 beg = p - RSTRING_PTR(str);
3188 VALUE str2 = str_subseq(str, beg,
len);
3189 rb_enc_cr_str_copy_for_substr(str2, str);
3197 if (CHILLED_STRING_P(str)) {
3202 rb_str_resize(str, RSTRING_LEN(str));
3218 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
/* Ends by returning rb_fstring(str); any preparation of +str+ before
 * that call is outside this excerpt. */
3248str_uminus(
VALUE str)
3253 return rb_fstring(str);
3257#define rb_str_dup_frozen rb_str_new_frozen
3262 if (
FL_TEST(str, STR_TMPLOCK)) {
3265 FL_SET(str, STR_TMPLOCK);
3272 if (!
FL_TEST(str, STR_TMPLOCK)) {
3290 const int termlen = TERM_LEN(str);
3292 str_modifiable(str);
3293 if (STR_SHARED_P(str)) {
3296 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3297 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3308 else if (
len > RSTRING_LEN(str)) {
3312 const char *
const new_end = RSTRING_PTR(str) +
len;
3322 else if (
len < RSTRING_LEN(str)) {
3330 STR_SET_LEN(str,
len);
3331 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3338 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3341 int independent = str_independent(str);
3342 long slen = RSTRING_LEN(str);
3343 const int termlen = TERM_LEN(str);
3345 if (slen >
len || (termlen != 1 && slen <
len)) {
3351 if (STR_EMBED_P(str)) {
3352 if (
len == slen)
return str;
3353 if (str_embed_capa(str) >=
len + termlen) {
3354 STR_SET_LEN(str,
len);
3358 str_make_independent_expand(str, slen,
len - slen, termlen);
3360 else if (str_embed_capa(str) >=
len + termlen) {
3361 char *
ptr = STR_HEAP_PTR(str);
3363 if (slen >
len) slen =
len;
3366 STR_SET_LEN(str,
len);
3367 if (independent) ruby_xfree(
ptr);
3370 else if (!independent) {
3371 if (
len == slen)
return str;
3372 str_make_independent_expand(str, slen,
len - slen, termlen);
3376 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3377 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3380 else if (
len == slen)
return str;
3381 STR_SET_LEN(str,
len);
/*
 * Makes room in +str+ for +len+ additional bytes.
 *
 * The string is first made writable with str_modify_keep_cr().  If the
 * current length plus +len+ would exceed LONG_MAX an ArgumentError
 * ("string sizes too big") is raised.  The required total is then compared
 * with the current capacity; the statements that compute the new
 * capacity are not part of this excerpt, and the buffer is resized with
 * RESIZE_CAPA_TERM.
 */
3388str_ensure_available_capa(
VALUE str,
long len)
3390 str_modify_keep_cr(str);
3392 const int termlen = TERM_LEN(str);
3393 long olen = RSTRING_LEN(str);
3395 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3396 rb_raise(rb_eArgError,
"string sizes too big");
3399 long total = olen +
len;
3400 long capa = str_capacity(str, termlen);
3403 if (total >= LONG_MAX / 2) {
3406 while (total >
capa) {
3409 RESIZE_CAPA_TERM(str,
capa, termlen);
/*
 * Appends the +len+ bytes at +ptr+ to +str+.
 *
 *   str     - destination; made writable with str_modify_keep_cr()
 *   ptr     - source bytes
 *   len     - number of bytes; 0 makes the function return 0 at once
 *   keep_cr - its use is not visible in this excerpt
 *
 * The function notices when +ptr+ points into the destination's own
 * buffer (between its start and its current end); +off+ starts at -1 and
 * presumably records that position so the source can be found again
 * after a resize (TODO confirm).  An ArgumentError ("string sizes too
 * big") guards against length overflow.  After any resize the bytes are
 * copied behind the old content, the length is updated and the
 * terminator rewritten.
 */
3414str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3417 str_modify_keep_cr(str);
3422 if (
len == 0)
return 0;
3424 long total, olen,
off = -1;
3426 const int termlen = TERM_LEN(str);
3429 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3433 long capa = str_capacity(str, termlen);
3435 if (olen > LONG_MAX -
len) {
3436 rb_raise(rb_eArgError,
"string sizes too big");
3440 if (total >= LONG_MAX / 2) {
3443 while (total >
capa) {
3446 RESIZE_CAPA_TERM(str,
capa, termlen);
/* The buffer may have moved; reload the data pointer. */
3447 sptr = RSTRING_PTR(str);
3452 memcpy(sptr + olen,
ptr,
len);
3453 STR_SET_LEN(str, total);
3454 TERM_FILL(sptr + total, termlen);
/*
 * Shorthands for str_buf_cat4() with keep_cr = false.  str_buf_cat2()
 * obtains the length from its +ptr+ argument with rb_strlen_lit().
 */
3459#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3460#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3465 if (
len == 0)
return str;
3467 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3469 return str_buf_cat(str,
ptr,
len);
/*
 * Appends the single byte +byte+ to +str+.
 *
 * The string is made independent first.  An ArgumentError ("string sizes
 * too big") is raised when the current length is above LONG_MAX - 1.
 * When the capacity already has room for one more byte the byte is
 * stored directly, the length is bumped and a 1-byte terminator is
 * written; a call to the general str_buf_cat() is also visible,
 * presumably as the fallback when there is no room.
 */
3480rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3485 if (UNLIKELY(!str_independent(str))) {
3486 str_make_independent(str);
3489 long string_length = -1;
3490 const int null_terminator_length = 1;
3495 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3496 rb_raise(rb_eArgError,
"string sizes too big");
3499 long string_capacity = str_capacity(str, null_terminator_length);
3505 if (LIKELY(string_capacity >= string_length + 1)) {
3507 sptr[string_length] = byte;
3508 STR_SET_LEN(str, string_length + 1);
3509 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3513 str_buf_cat(str, (
char *)&
byte, 1);
3529 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
/*
 * Appends +len+ bytes at +ptr+ to +str+ while reconciling encodings and
 * code ranges.
 *
 *   ptr_encindex - encoding index of the appended bytes
 *   ptr_cr       - their code range; recomputed with coderange_scan() on
 *                  some paths
 *   ptr_cr_ret   - receives the final +ptr_cr+ (the guard around that
 *                  store is not visible here)
 *
 * Visible behavior: a same-encoding case; a case where either encoding
 * is not ASCII-compatible, in which an empty receiver has its terminator
 * length changed to that of the appended encoding; selection of the
 * resulting encoding index (+res_encindex+) from one of the two inputs;
 * an ArgumentError for a bad +len+; the append itself through
 * str_buf_cat(); and an error path that formats both encoding names.
 * The conditions that connect these steps are largely outside this
 * excerpt.
 */
3540rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3541 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3550 if (str_encindex == ptr_encindex) {
3552 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3556 str_enc = rb_enc_from_index(str_encindex);
3557 ptr_enc = rb_enc_from_index(ptr_encindex);
3558 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3561 if (RSTRING_LEN(str) == 0) {
3564 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3570 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3579 *ptr_cr_ret = ptr_cr;
3581 if (str_encindex != ptr_encindex &&
3584 str_enc = rb_enc_from_index(str_encindex);
3585 ptr_enc = rb_enc_from_index(ptr_encindex);
3590 res_encindex = str_encindex;
3595 res_encindex = str_encindex;
3599 res_encindex = ptr_encindex;
3604 res_encindex = str_encindex;
3611 res_encindex = str_encindex;
3617 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3619 str_buf_cat(str,
ptr,
len);
3625 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3632 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3642 if (rb_enc_asciicompat(enc)) {
3643 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3649 unsigned int c = (
unsigned char)*
ptr;
3650 int len = rb_enc_codelen(c, enc);
3651 rb_enc_mbcput(c, buf, enc);
3652 rb_enc_cr_str_buf_cat(str, buf,
len,
3665 if (str_enc_fastpath(str)) {
3669 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3675 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3686 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
/*
 * Concatenates the +num+ strings in +strary+ into one string.
 *
 * The lengths of all elements are summed (starting from 1) before the
 * result is built; the result's encoding is copied from the first
 * element.  While the elements are appended, one whose encoding index is
 * not US-ASCII can lead to the result's encoding index being set to it --
 * the full condition is not visible in this excerpt.
 */
3702rb_str_concat_literals(
size_t num,
const VALUE *strary)
3706 unsigned long len = 1;
3711 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3713 str_enc_copy_direct(str, strary[0]);
3715 for (i = s; i < num; ++i) {
3716 const VALUE v = strary[i];
3720 if (encidx != ENCINDEX_US_ASCII) {
3722 rb_enc_set_index(str, encidx);
/*
 * Method body taking (argc, argv, str).  The receiver is checked with
 * str_modifiable() up front.  With more than one argument an
 * intermediate +arg_str+ is given the receiver's encoding and the
 * arguments are visited in order; what is done with each argument and
 * the return value are outside this excerpt.
 */
3747rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3749 str_modifiable(str);
3754 else if (argc > 1) {
3757 rb_enc_copy(arg_str, str);
3758 for (i = 0; i < argc; i++) {
/*
 * Appends the arguments in +argv+ to +str+ as raw bytes.
 *
 * A first pass over the arguments accumulates +needed_capacity+ (string
 * arguments add their byte length) and rejects unsupported objects with
 * a message ending in "(expected String or Integer)".  The capacity is
 * then reserved in one step with str_ensure_available_capa().  In a
 * later pass integer arguments are reduced to their low 8 bits and the
 * masked value is stored back into argv.  Finally the length is
 * increased by +needed_capacity+ and the terminator rewritten.  The
 * rb_bug() calls mark branches that the validation pass is supposed to
 * have excluded.
 */
3791rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3793 long needed_capacity = 0;
3797 for (
int index = 0; index < argc; index++) {
3798 VALUE obj = argv[index];
3806 needed_capacity += RSTRING_LEN(obj);
3811 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3818 str_ensure_available_capa(str, needed_capacity);
3821 for (
int index = 0; index < argc; index++) {
3822 VALUE obj = argv[index];
3827 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3828 char byte = (char)(
NUM2INT(obj) & 0xFF);
3842 rb_bug(
"append_as_bytes arguments should have been validated");
3846 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3847 TERM_FILL(sptr, TERM_LEN(str));
3852 for (
int index = 0; index < argc; index++) {
3853 VALUE obj = argv[index];
3870 rb_bug(
"append_as_bytes arguments should have been validated");
3944 if (rb_num_to_uint(str2, &code) == 0) {
3957 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3960 rb_str_buf_cat_byte(str1, (
unsigned char)code);
3963 long pos = RSTRING_LEN(str1);
3968 switch (
len = rb_enc_codelen(code, enc)) {
3969 case ONIGERR_INVALID_CODE_POINT_VALUE:
3970 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3972 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3978 rb_enc_mbcput(code, buf, enc);
3979 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3980 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3982 rb_str_resize(str1, pos+
len);
3983 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
/*
 * Works out an encoding index for appending the byte value +code+ to a
 * string in encoding +enc+.  Only ASCII-8BIT and US-ASCII pass the first
 * visible test; for US-ASCII with +code+ above 127 the result is
 * ENCINDEX_ASCII_8BIT.  The remaining return values are outside this
 * excerpt.
 */
3996rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
3998 int encidx = rb_enc_to_index(enc);
4000 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4005 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4006 return ENCINDEX_ASCII_8BIT;
/*
 * Method body taking (argc, argv, str).  The receiver is checked with
 * str_modifiable() up front.  With more than one argument an
 * intermediate +arg_str+ is given the receiver's encoding and the
 * arguments are visited in order; what is done with each argument and
 * the return value are outside this excerpt.
 */
4029rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4031 str_modifiable(str);
4036 else if (argc > 1) {
4039 rb_enc_copy(arg_str, str);
4040 for (i = 0; i < argc; i++) {
4053 st_index_t precomputed_hash;
4054 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4056 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4057 return precomputed_hash;
4060 return str_do_hash(str);
4067 const char *ptr1, *ptr2;
4070 return (len1 != len2 ||
4072 memcmp(ptr1, ptr2, len1) != 0);
4086rb_str_hash_m(
VALUE str)
/* Smaller of +a+ and +b+.  Each argument can be evaluated twice, so pass
 * only side-effect-free expressions. */
4092#define lesser(a,b) (((a)>(b))?(b):(a))
4100 if (RSTRING_LEN(str1) == 0)
return TRUE;
4101 if (RSTRING_LEN(str2) == 0)
return TRUE;
4104 if (idx1 == idx2)
return TRUE;
4109 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4113 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4123 const char *ptr1, *ptr2;
4126 if (str1 == str2)
return 0;
4129 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4138 if (len1 > len2)
return 1;
4141 if (retval > 0)
return 1;
4168 if (str1 == str2)
return Qtrue;
4175 return rb_str_eql_internal(str1, str2);
4199 if (str1 == str2)
return Qtrue;
4201 return rb_str_eql_internal(str1, str2);
4232 return rb_invcmp(str1, str2);
4274 return str_casecmp(str1, s);
4282 const char *p1, *p1end, *p2, *p2end;
4284 enc = rb_enc_compatible(str1, str2);
4289 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4290 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4291 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4292 while (p1 < p1end && p2 < p2end) {
4294 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4295 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4297 return INT2FIX(c1 < c2 ? -1 : 1);
4304 while (p1 < p1end && p2 < p2end) {
4305 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4306 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4308 if (0 <= c1 && 0 <= c2) {
4312 return INT2FIX(c1 < c2 ? -1 : 1);
4316 l1 = rb_enc_mbclen(p1, p1end, enc);
4317 l2 = rb_enc_mbclen(p2, p2end, enc);
4318 len = l1 < l2 ? l1 : l2;
4319 r = memcmp(p1, p2,
len);
4321 return INT2FIX(r < 0 ? -1 : 1);
4323 return INT2FIX(l1 < l2 ? -1 : 1);
4329 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4330 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4364 return str_casecmp_p(str1, s);
4371 VALUE folded_str1, folded_str2;
4372 VALUE fold_opt = sym_fold;
4374 enc = rb_enc_compatible(str1, str2);
4379 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4380 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4382 return rb_str_eql(folded_str1, folded_str2);
/*
 * Byte-level search for the +sub_len+ bytes at +sub_ptr+ inside a string
 * of +str_len+ bytes, starting +offset+ bytes in.
 *
 * Candidates come from rb_memsearch().  A negative result from it is
 * returned unchanged.  A hit is accepted only when +t+ equals the hit
 * position; +t+ is computed in a line not shown here, presumably by
 * aligning the hit to a character boundary (TODO confirm).  Otherwise
 * the window is narrowed and the search repeated, giving -1 once the
 * window is empty.  On success the result is the hit position plus the
 * accumulated offset.
 */
4386strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4387 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4389 const char *search_start = str_ptr;
4390 long pos, search_len = str_len - offset;
4394 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4395 if (pos < 0)
return pos;
4397 if (t == search_start + pos)
break;
4398 search_len -= t - search_start;
4399 if (search_len <= 0)
return -1;
4400 offset += t - search_start;
4403 return pos + offset;
/* Front ends to rb_strseq_index(): rb_str_index passes in_byte = 0,
 * rb_str_byteindex passes in_byte = 1. */
4407#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4408#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
/*
 * Finds +sub+ inside +str+ starting at +offset+.
 *
 *   in_byte - non-zero: +offset+ and the length comparison are in bytes;
 *             zero: they are in characters and +offset+ is converted to
 *             a byte offset with str_offset()
 *
 * Returns -1 when +sub+ is a broken string, when +str+ is shorter than
 * +sub+, when an adjusted offset is still negative, or when too little of
 * +str+ remains after +offset+.  An empty +sub+ yields +offset+ itself.
 * Everything else is delegated to strseq_core().  The common encoding is
 * obtained from rb_enc_check().
 */
4411rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4413 const char *str_ptr, *str_ptr_end, *sub_ptr;
4414 long str_len, sub_len;
4417 enc = rb_enc_check(str, sub);
4418 if (is_broken_string(sub))
return -1;
4420 str_ptr = RSTRING_PTR(str);
4422 str_len = RSTRING_LEN(str);
4423 sub_ptr = RSTRING_PTR(sub);
4424 sub_len = RSTRING_LEN(sub);
4426 if (str_len < sub_len)
return -1;
4429 long str_len_char, sub_len_char;
4430 int single_byte = single_byte_optimizable(str);
4431 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4432 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4434 offset += str_len_char;
4435 if (offset < 0)
return -1;
4437 if (str_len_char - offset < sub_len_char)
return -1;
4438 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4441 if (sub_len == 0)
return offset;
4444 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
/*
 * Method body taking one required and one optional argument ("11").
 *
 * When the optional start position is given, a negative value is counted
 * from the end of the string (in characters); a position that is still
 * negative or lies beyond the length takes a branch whose body is not
 * shown.  A character position is converted to a byte offset with
 * str_offset(), and the search is done by rb_str_index().  The handling
 * of the result is outside this excerpt.
 */
4458rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4465 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4466 long slen = str_strlen(str, enc);
4468 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4480 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4481 enc, single_byte_optimizable(str));
4492 pos = rb_str_index(str, sub, pos);
/*
 * Validates that byte offset +pos+ of +str+ falls on a character
 * boundary.  Strings for which single_byte_optimizable() is true skip
 * the check.  A failing check reports "offset %ld does not land on
 * character boundary"; the call that raises it is not visible here.
 */
4506str_ensure_byte_pos(
VALUE str,
long pos)
4508 if (!single_byte_optimizable(str)) {
4509 const char *s = RSTRING_PTR(str);
4511 const char *p = s + pos;
4512 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4514 "offset %ld does not land on character boundary", pos);
/*
 * Byte-oriented counterpart of rb_str_index_m(): one required and one
 * optional argument ("11").
 *
 * A negative start position is counted from the end of the string in
 * bytes; a position that is still negative or beyond the byte length
 * takes a branch whose body is not shown.  The position is validated by
 * str_ensure_byte_pos() and the search is done by rb_str_byteindex().
 * A non-negative result is returned as a Ruby integer; the return for
 * "not found" is outside this excerpt.
 */
4561rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4567 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4568 long slen = RSTRING_LEN(str);
4570 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4581 str_ensure_byte_pos(str, pos);
4593 pos = rb_str_byteindex(str, sub, pos);
4594 if (pos >= 0)
return LONG2NUM(pos);
/*
 * Scans the +search_len+ bytes at +search_str+ backwards and returns a
 * pointer to the last byte equal to +chr+.  The return value when no
 * byte matches is outside this excerpt.
 */
4601memrchr(
const char *search_str,
int chr,
long search_len)
4603 const char *ptr = search_str + search_len;
4604 while (ptr > search_str) {
4605 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4615 char *hit, *adjusted;
4617 long slen, searchlen;
4620 sbeg = RSTRING_PTR(str);
4621 slen = RSTRING_LEN(sub);
4622 if (slen == 0)
return s - sbeg;
4624 t = RSTRING_PTR(sub);
4626 searchlen = s - sbeg + 1;
4628 if (memcmp(s, t, slen) == 0) {
4633 hit = memrchr(sbeg, c, searchlen);
4636 if (hit != adjusted) {
4637 searchlen = adjusted - sbeg;
4640 if (memcmp(hit, t, slen) == 0)
4642 searchlen = adjusted - sbeg;
4643 }
while (searchlen > 0);
4657 enc = rb_enc_check(str, sub);
4658 if (is_broken_string(sub))
return -1;
4659 singlebyte = single_byte_optimizable(str);
4660 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4661 slen = str_strlen(sub, enc);
4664 if (
len < slen)
return -1;
4665 if (
len - pos < slen) pos =
len - slen;
4666 if (
len == 0)
return pos;
4668 sbeg = RSTRING_PTR(str);
4671 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4677 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4678 return str_rindex(str, sub, s, enc);
/*
 * Method body taking one required and one optional argument ("11").
 *
 * +len+ is the receiver's length in characters.  When a position is
 * given, a negative value is counted from the end (a value that stays
 * negative takes a branch not shown here) and a value beyond the end is
 * clamped to +len+.  A str_offset() call converts the position to a
 * byte offset, and the search is done by rb_str_rindex().  The handling
 * of the result is outside this excerpt.
 */
4739rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4744 long pos,
len = str_strlen(str, enc);
4746 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4748 if (pos < 0 && (pos +=
len) < 0) {
4754 if (pos >
len) pos =
len;
4762 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4763 enc, single_byte_optimizable(str));
4774 pos = rb_str_rindex(str, sub, pos);
/*
 * Searches backwards in +str+ for +sub+, starting no later than byte
 * offset +pos+.
 *
 * Returns -1 when +sub+ is a broken string or longer than +str+.  +pos+
 * is pulled back so that +sub+ still fits before the end of +str+; an
 * empty +str+ yields +pos+.  A memcmp() against the start of +str+ is
 * visible, but the condition guarding it is not.  The general case is
 * delegated to str_rindex().
 */
4784rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4790 enc = rb_enc_check(str, sub);
4791 if (is_broken_string(sub))
return -1;
4792 len = RSTRING_LEN(str);
4793 slen = RSTRING_LEN(sub);
4796 if (
len < slen)
return -1;
4797 if (
len - pos < slen) pos =
len - slen;
4798 if (
len == 0)
return pos;
4800 sbeg = RSTRING_PTR(str);
4803 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4810 return str_rindex(str, sub, s, enc);
/*
 * Byte-oriented counterpart of rb_str_rindex_m(): one required and one
 * optional argument ("11").
 *
 * +len+ is the receiver's length in bytes.  When a position is given, a
 * negative value is counted from the end (a value that stays negative
 * takes a branch not shown here) and a value beyond the end is clamped
 * to +len+.  The position is validated by str_ensure_byte_pos() and the
 * search is done by rb_str_byterindex().  A non-negative result is
 * returned as a Ruby integer; the return for "not found" is outside this
 * excerpt.
 */
4875rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4879 long pos,
len = RSTRING_LEN(str);
4881 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4883 if (pos < 0 && (pos +=
len) < 0) {
4889 if (pos >
len) pos =
len;
4895 str_ensure_byte_pos(str, pos);
4907 pos = rb_str_byterindex(str, sub, pos);
4908 if (pos >= 0)
return LONG2NUM(pos);
4944 switch (OBJ_BUILTIN_TYPE(y)) {
/*
 * Method body taking (argc, argv, str).  The pattern +re+ is converted
 * with get_pat() and its "match" method is invoked with argc/argv; how
 * +re+, argc and argv are prepared and what is done with +result+ is
 * outside this excerpt.
 */
4996rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5003 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
/*
 * Method body taking (argc, argv, str).  The first argument is
 * converted to a pattern with get_pat() and tested against +str+ with
 * rb_reg_match_p().  The start position is the second argument
 * converted with NUM2LONG when present, otherwise 0.  Argument-count
 * checking is not visible in this excerpt.
 */
5035rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5039 re = get_pat(argv[0]);
5040 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5049static enum neighbor_char
5055 if (rb_enc_mbminlen(enc) > 1) {
5057 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5059 return NEIGHBOR_NOT_CHAR;
5061 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5063 if (!l)
return NEIGHBOR_NOT_CHAR;
5064 if (l !=
len)
return NEIGHBOR_WRAPPED;
5065 rb_enc_mbcput(c, p, enc);
5066 r = rb_enc_precise_mbclen(p, p +
len, enc);
5068 return NEIGHBOR_NOT_CHAR;
5070 return NEIGHBOR_FOUND;
5073 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5076 return NEIGHBOR_WRAPPED;
5077 ++((
unsigned char*)p)[i];
5078 l = rb_enc_precise_mbclen(p, p+
len, enc);
5082 return NEIGHBOR_FOUND;
5085 memset(p+l, 0xff,
len-l);
5091 for (len2 =
len-1; 0 < len2; len2--) {
5092 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5096 memset(p+len2+1, 0xff,
len-(len2+1));
5101static enum neighbor_char
5106 if (rb_enc_mbminlen(enc) > 1) {
5108 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5110 return NEIGHBOR_NOT_CHAR;
5112 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5113 if (!c)
return NEIGHBOR_NOT_CHAR;
5116 if (!l)
return NEIGHBOR_NOT_CHAR;
5117 if (l !=
len)
return NEIGHBOR_WRAPPED;
5118 rb_enc_mbcput(c, p, enc);
5119 r = rb_enc_precise_mbclen(p, p +
len, enc);
5121 return NEIGHBOR_NOT_CHAR;
5123 return NEIGHBOR_FOUND;
5126 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5129 return NEIGHBOR_WRAPPED;
5130 --((
unsigned char*)p)[i];
5131 l = rb_enc_precise_mbclen(p, p+
len, enc);
5135 return NEIGHBOR_FOUND;
5138 memset(p+l, 0,
len-l);
5144 for (len2 =
len-1; 0 < len2; len2--) {
5145 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5149 memset(p+len2+1, 0,
len-(len2+1));
/*
 * Replaces the +len+-byte character at +p+ by its successor within the
 * same character class and reports the outcome as a neighbor_char.
 *
 * The character's code point selects ONIGENC_CTYPE_DIGIT or
 * ONIGENC_CTYPE_ALPHA; anything else gives NEIGHBOR_NOT_CHAR (the tests
 * themselves are not shown).  enc_succ_char() is tried up to
 * max_gaps + 1 times, with NEIGHBOR_FOUND as one outcome.  A later
 * part steps backwards with enc_pred_char() and can end in
 * NEIGHBOR_WRAPPED, on one path after advancing +carry+ with
 * enc_succ_char().  The conditions connecting these steps, and the use
 * of +save+, are outside this excerpt.
 */
5163static enum neighbor_char
5164enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5166 enum neighbor_char ret;
5170 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5174 const int max_gaps = 1;
5176 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5178 ctype = ONIGENC_CTYPE_DIGIT;
5180 ctype = ONIGENC_CTYPE_ALPHA;
5182 return NEIGHBOR_NOT_CHAR;
5185 for (
try = 0;
try <= max_gaps; ++
try) {
5186 ret = enc_succ_char(p,
len, enc);
5187 if (ret == NEIGHBOR_FOUND) {
5188 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5190 return NEIGHBOR_FOUND;
5197 ret = enc_pred_char(p,
len, enc);
5198 if (ret == NEIGHBOR_FOUND) {
5199 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5212 return NEIGHBOR_NOT_CHAR;
5215 if (ctype != ONIGENC_CTYPE_DIGIT) {
5217 return NEIGHBOR_WRAPPED;
5221 enc_succ_char(carry,
len, enc);
5222 return NEIGHBOR_WRAPPED;
5290 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5291 rb_enc_cr_str_copy_for_substr(str, orig);
5292 return str_succ(str);
5299 char *sbeg, *s, *e, *last_alnum = 0;
5300 int found_alnum = 0;
5302 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5303 long carry_pos = 0, carry_len = 1;
5304 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5306 slen = RSTRING_LEN(str);
5307 if (slen == 0)
return str;
5309 enc = STR_ENC_GET(str);
5310 sbeg = RSTRING_PTR(str);
5311 s = e = sbeg + slen;
5313 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5314 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5320 l = rb_enc_precise_mbclen(s, e, enc);
5321 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5322 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5323 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5325 case NEIGHBOR_NOT_CHAR:
5327 case NEIGHBOR_FOUND:
5329 case NEIGHBOR_WRAPPED:
5334 carry_pos = s - sbeg;
5339 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5340 enum neighbor_char neighbor;
5341 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5342 l = rb_enc_precise_mbclen(s, e, enc);
5343 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5344 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5346 neighbor = enc_succ_char(tmp, l, enc);
5348 case NEIGHBOR_FOUND:
5352 case NEIGHBOR_WRAPPED:
5355 case NEIGHBOR_NOT_CHAR:
5358 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5360 enc_succ_char(s, l, enc);
5362 if (!rb_enc_asciicompat(enc)) {
5363 MEMCPY(carry, s,
char, l);
5366 carry_pos = s - sbeg;
5370 RESIZE_CAPA(str, slen + carry_len);
5371 sbeg = RSTRING_PTR(str);
5372 s = sbeg + carry_pos;
5373 memmove(s + carry_len, s, slen - carry_pos);
5374 memmove(s, carry, carry_len);
5376 STR_SET_LEN(str, slen);
5377 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5391rb_str_succ_bang(
VALUE str)
5399all_digits_p(
const char *s,
long len)
5453 VALUE end, exclusive;
5457 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5463 VALUE current, after_end;
5470 enc = rb_enc_check(beg, end);
5471 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5473 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5474 char c = RSTRING_PTR(beg)[0];
5475 char e = RSTRING_PTR(end)[0];
5477 if (c > e || (excl && c == e))
return beg;
5479 VALUE str = rb_enc_str_new(&c, 1, enc);
5481 if ((*each)(str, arg))
break;
5482 if (!excl && c == e)
break;
5484 if (excl && c == e)
break;
5489 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5490 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5491 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5496 b = rb_str_to_inum(beg, 10, FALSE);
5497 e = rb_str_to_inum(end, 10, FALSE);
5504 if (excl && bi == ei)
break;
5505 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5510 ID op = excl ?
'<' : idLE;
5511 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5516 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5517 b = rb_funcallv(b, succ, 0, 0);
5524 if (n > 0 || (excl && n == 0))
return beg;
5526 after_end = rb_funcallv(end, succ, 0, 0);
5531 next = rb_funcallv(current, succ, 0, 0);
5532 if ((*each)(current, arg))
break;
5533 if (
NIL_P(next))
break;
5537 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5552 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5553 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5554 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5556 b = rb_str_to_inum(beg, 10, FALSE);
5562 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5570 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5571 b = rb_funcallv(b, succ, 0, 0);
5577 VALUE next = rb_funcallv(current, succ, 0, 0);
5578 if ((*each)(current, arg))
break;
5581 if (RSTRING_LEN(current) == 0)
5592 if (!
rb_equal(str, *argp))
return 0;
5606 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5607 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5608 rb_enc_asciicompat(STR_ENC_GET(val))) {
5609 const char *bp = RSTRING_PTR(beg);
5610 const char *ep = RSTRING_PTR(end);
5611 const char *vp = RSTRING_PTR(val);
5612 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5613 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5621 if (b <= v && v < e)
return Qtrue;
5622 return RBOOL(!
RTEST(exclusive) && v == e);
5629 all_digits_p(bp, RSTRING_LEN(beg)) &&
5630 all_digits_p(ep, RSTRING_LEN(end))) {
5635 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5637 return RBOOL(
NIL_P(val));
5660 return rb_str_subpat(str, indx,
INT2FIX(0));
5663 if (rb_str_index(str, indx, 0) != -1)
5669 long beg,
len = str_strlen(str, NULL);
5681 return str_substr(str, idx, 1, FALSE);
/*
 * Method body taking (argc, argv, str) that dispatches on its
 * arguments.  Three targets are visible: rb_str_subpat() and
 * rb_str_substr_two_fixnums(), both called with the first two
 * arguments, and rb_str_aref() called with the first argument only.
 * The conditions that choose between them are outside this excerpt.
 */
5700rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5704 return rb_str_subpat(str, argv[0], argv[1]);
5707 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5711 return rb_str_aref(str, argv[0]);
5717 char *ptr = RSTRING_PTR(str);
5718 long olen = RSTRING_LEN(str), nlen;
5720 str_modifiable(str);
5721 if (
len > olen)
len = olen;
5723 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5725 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5727 ptr =
RSTRING(str)->as.embed.ary;
5728 memmove(ptr, oldptr +
len, nlen);
5729 if (fl == STR_NOEMBED)
xfree(oldptr);
5732 if (!STR_SHARED_P(str)) {
5734 rb_enc_cr_str_exact_copy(shared, str);
5739 STR_SET_LEN(str, nlen);
5741 if (!SHARABLE_MIDDLE_SUBSTRING) {
5742 TERM_FILL(ptr + nlen, TERM_LEN(str));
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with
 * the +vlen+ bytes of +val+ starting at byte offset +vbeg+.
 *
 * A special case for beg == 0 && vlen == 0 is tested first (its body is
 * not shown).  Otherwise the string is made writable, resized to
 * slen + vlen - len, the tail after the replaced range is moved with
 * memmove(), the replacement bytes are copied in, and finally the
 * length and terminator are updated.
 */
5749rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5755 if (beg == 0 && vlen == 0) {
5760 str_modify_keep_cr(str);
5764 RESIZE_CAPA(str, slen + vlen -
len);
/* RESIZE_CAPA may have moved the buffer; reload the data pointer. */
5765 sptr = RSTRING_PTR(str);
5774 memmove(sptr + beg + vlen,
5776 slen - (beg +
len));
5778 if (vlen < beg &&
len < 0) {
5782 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5785 STR_SET_LEN(str, slen);
5786 TERM_FILL(&sptr[slen], TERM_LEN(str));
5793 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5802 int singlebyte = single_byte_optimizable(str);
5808 enc = rb_enc_check(str, val);
5809 slen = str_strlen(str, enc);
5811 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5820 if (
len > slen - beg) {
5823 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5828 beg = p - RSTRING_PTR(str);
5830 rb_str_update_0(str, beg,
len, val);
5831 rb_enc_associate(str, enc);
5842 long start, end,
len;
5852 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5856 nth += regs->num_regs;
5866 enc = rb_enc_check_str(str, val);
5867 rb_str_update_0(str, start,
len, val);
5868 rb_enc_associate(str, enc);
5876 switch (
TYPE(indx)) {
5878 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5882 beg = rb_str_index(str, indx, 0);
/*
 * Method body taking (argc, argv, str) that dispatches on its
 * arguments.  Visible targets: rb_str_subpat_set() with three arguments
 * and rb_str_aset() with two.  The conditions that choose between them
 * are outside this excerpt.
 */
5936rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5940 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5948 return rb_str_aset(str, argv[0], argv[1]);
/*
 * Method body taking (argc, argv, str) that removes part of +str+.
 *
 * The receiver is made writable first.  The byte range (beg, len) to
 * remove is determined in several argument-dependent ways: from a match
 * register +nth+ (Qnil when the index is out of range), from a
 * two-argument form, from the position of a substring found with
 * rb_str_index() (Qnil when absent), and from pointers converted to
 * byte offsets.  rb_enc_cr_str_copy_for_substr() is applied to
 * +result+.  Afterwards the length is updated and the terminator
 * rewritten; the memmove that closes the gap is only partly visible.
 */
6008rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6016 str_modify_keep_cr(str);
6024 if ((nth += regs->num_regs) <= 0)
return Qnil;
6026 else if (nth >= regs->num_regs)
return Qnil;
6028 len = END(nth) - beg;
6031 else if (argc == 2) {
6040 beg = p - RSTRING_PTR(str);
6044 beg = rb_str_index(str, indx, 0);
6045 if (beg == -1)
return Qnil;
6046 len = RSTRING_LEN(indx);
6058 beg = p - RSTRING_PTR(str);
6067 beg = p - RSTRING_PTR(str);
6071 rb_enc_cr_str_copy_for_substr(result, str);
6079 char *sptr = RSTRING_PTR(str);
6080 long slen = RSTRING_LEN(str);
6081 if (beg +
len > slen)
6085 slen - (beg +
len));
6087 STR_SET_LEN(str, slen);
6088 TERM_FILL(&sptr[slen], TERM_LEN(str));
6099 switch (OBJ_BUILTIN_TYPE(pat)) {
/*
 * Normalizes the pattern argument +pat+ by dispatching on its built-in
 * type.  When +check+ is non-zero a broken string pattern is handled
 * specially (the action taken is not shown).  The individual cases and
 * the return value are outside this excerpt.
 */
6118get_pat_quoted(
VALUE pat,
int check)
6122 switch (OBJ_BUILTIN_TYPE(pat)) {
6136 if (check && is_broken_string(pat)) {
/*
 * Searches +str+ for +pat+ from byte offset +pos+.
 *
 * Two search paths are visible.  One locates +pat+ with
 * rb_str_byteindex() and, when +set_backref_str+ is non-zero, records the
 * match through rb_backref_set_string() on a frozen copy of +str+, also
 * storing the match data in *match.  The other delegates to
 * rb_reg_search0().  The test that selects the path, and the guards
 * around the backref update, are outside this excerpt.
 */
6143rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6146 pos = rb_str_byteindex(str, pat, pos);
6147 if (set_backref_str) {
6149 str = rb_str_new_frozen_String(str);
6150 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6152 *match = match_data;
6162 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
/* Same as rb_pat_search0() with no match-data output (match = NULL). */
6167rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6169 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
/*
 * Method body taking (argc, argv, str) that substitutes the first match
 * of a pattern in place.
 *
 * Visible steps: the second argument is probed as a hash; the pattern is
 * normalized with get_pat_quoted(); the receiver is checked with
 * str_modifiable() and searched from offset 0 with the backref set.
 * With a block or a hash, the replacement is produced while guarding
 * against concurrent modification (str_mod_check, rb_check_frozen).
 * The replacement's encoding is reconciled with the receiver's through
 * rb_enc_compatible(), and the receiver is associated with the resulting
 * encoding.  Finally the buffer is resized, the tail after the match is
 * moved, the replacement is copied in and length and terminator are
 * updated.  Conditions between these steps and the return value are
 * outside this excerpt.
 */
6188rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6202 hash = rb_check_hash_type(argv[1]);
6208 pat = get_pat_quoted(argv[0], 1);
6210 str_modifiable(str);
6211 beg = rb_pat_search(pat, str, 0, 1);
6225 end0 = beg0 + RSTRING_LEN(pat);
6234 if (iter || !
NIL_P(hash)) {
6235 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6241 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6244 str_mod_check(str, p,
len);
6245 rb_check_frozen(str);
6251 enc = rb_enc_compatible(str, repl);
6254 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6258 rb_enc_inspect_name(str_enc),
6259 rb_enc_inspect_name(STR_ENC_GET(repl)));
6261 enc = STR_ENC_GET(repl);
6264 rb_enc_associate(str, enc);
6274 rlen = RSTRING_LEN(repl);
6275 len = RSTRING_LEN(str);
6277 RESIZE_CAPA(str,
len + rlen - plen);
6279 p = RSTRING_PTR(str);
6281 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6283 rp = RSTRING_PTR(repl);
6284 memmove(p + beg0, rp, rlen);
6286 STR_SET_LEN(str,
len);
6287 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6316 rb_str_sub_bang(argc, argv, str);
6321str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6324 long beg, beg0, end0;
6325 long offset, blen, slen,
len, last;
6326 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6328 int need_backref_str = -1;
6338 hash = rb_check_hash_type(argv[1]);
6342 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6350 rb_error_arity(argc, 1, 2);
6353 pat = get_pat_quoted(argv[0], 1);
6354 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6357 if (bang)
return Qnil;
6362 blen = RSTRING_LEN(str) + 30;
6364 sp = RSTRING_PTR(str);
6365 slen = RSTRING_LEN(str);
6367 str_enc = STR_ENC_GET(str);
6368 rb_enc_associate(dest, str_enc);
6375 end0 = beg0 + RSTRING_LEN(pat);
6391 if (mode == FAST_MAP) {
6400 val = rb_hash_aref(hash, key);
6403 str_mod_check(str, sp, slen);
6408 else if (need_backref_str) {
6410 if (need_backref_str < 0) {
6411 need_backref_str = val != repl;
6418 len = beg0 - offset;
6432 if (RSTRING_LEN(str) <= end0)
break;
6433 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6435 offset = end0 +
len;
6437 cp = RSTRING_PTR(str) + offset;
6438 if (offset > RSTRING_LEN(str))
break;
6441 if (mode != FAST_MAP && mode != STR) {
6444 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6449 if (RSTRING_LEN(str) > offset) {
6452 rb_pat_search0(pat, str, last, 1, &match);
6454 str_shared_replace(str, dest);
6482rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6484 str_modify_keep_cr(str);
6485 return str_gsub(argc, argv, str, 1);
6508 return str_gsub(argc, argv, str, 0);
6526 str_modifiable(str);
6527 if (str == str2)
return str;
6531 return str_replace(str, str2);
6546rb_str_clear(
VALUE str)
6550 STR_SET_LEN(str, 0);
6551 RSTRING_PTR(str)[0] = 0;
6552 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6571rb_str_chr(
VALUE str)
6595 pos += RSTRING_LEN(str);
6596 if (pos < 0 || RSTRING_LEN(str) <= pos)
6599 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6618 long len = RSTRING_LEN(str);
6619 char *
ptr, *head, *left = 0;
6623 if (pos < -
len ||
len <= pos)
6630 char byte = (char)(
NUM2INT(w) & 0xFF);
6632 if (!str_independent(str))
6633 str_make_independent(str);
6634 enc = STR_ENC_GET(str);
6635 head = RSTRING_PTR(str);
6637 if (!STR_EMBED_P(str)) {
6644 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6652 width = rb_enc_precise_mbclen(left, head+
len, enc);
6654 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6670str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6672 long n = RSTRING_LEN(str);
6674 if (beg > n ||
len < 0)
return Qnil;
6677 if (beg < 0)
return Qnil;
6682 if (!empty)
return Qnil;
6686 VALUE str2 = str_subseq(str, beg,
len);
6688 str_enc_copy_direct(str2, str);
6690 if (RSTRING_LEN(str2) == 0) {
6691 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6725 long beg,
len = RSTRING_LEN(str);
6733 return str_byte_substr(str, beg,
len, TRUE);
6738 return str_byte_substr(str, idx, 1, FALSE);
6785rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6790 return str_byte_substr(str, beg,
len, TRUE);
6793 return str_byte_aref(str, argv[0]);
6797str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6799 long end, slen = RSTRING_LEN(str);
6802 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6811 if (*
len > slen - *beg) {
6815 str_ensure_byte_pos(str, *beg);
6816 str_ensure_byte_pos(str, end);
6841rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6843 long beg,
len, vbeg, vlen;
6848 if (!(argc == 2 || argc == 3 || argc == 5)) {
6849 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6853 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6854 rb_builtin_class_name(argv[0]));
6861 vlen = RSTRING_LEN(val);
6866 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6867 rb_builtin_class_name(argv[2]));
6879 vlen = RSTRING_LEN(val);
6887 str_check_beg_len(str, &beg, &
len);
6888 str_check_beg_len(val, &vbeg, &vlen);
6889 str_modify_keep_cr(str);
6892 rb_enc_associate(str, rb_enc_check(str, val));
6895 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6913rb_str_reverse(
VALUE str)
6920 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6921 enc = STR_ENC_GET(str);
6927 if (RSTRING_LEN(str) > 1) {
6928 if (single_byte_optimizable(str)) {
6935 int clen = rb_enc_fast_mbclen(s, e, enc);
6943 cr = rb_enc_asciicompat(enc) ?
6946 int clen = rb_enc_mbclen(s, e, enc);
6955 STR_SET_LEN(rev, RSTRING_LEN(str));
6956 str_enc_copy_direct(rev, str);
6976rb_str_reverse_bang(
VALUE str)
6978 if (RSTRING_LEN(str) > 1) {
6979 if (single_byte_optimizable(str)) {
6982 str_modify_keep_cr(str);
6983 s = RSTRING_PTR(str);
6992 str_shared_replace(str, rb_str_reverse(str));
6996 str_modify_keep_cr(str);
7021 i = rb_str_index(str, arg, 0);
7023 return RBOOL(i != -1);
7065 rb_raise(rb_eArgError,
"invalid radix %d", base);
7067 return rb_str_to_inum(str, base, FALSE);
7091rb_str_to_f(
VALUE str)
7106rb_str_to_s(
VALUE str)
7118 char s[RUBY_MAX_CHAR_LEN];
7119 int n = rb_enc_codelen(c, enc);
7121 rb_enc_mbcput(c, s, enc);
/*
 * Size bound for one escaped character.  The escape buffers in this file
 * are declared as `char buf[CHAR_ESC_LEN + 1]` and are filled through
 * snprintf() calls limited to CHAR_ESC_LEN, using the formats "%c",
 * "\\u%04X", "\\u{%X}", "\\x%02X" and "\\x{%X}".
 * NOTE(review): 13 presumably covers the longest form for a 32-bit value
 * (backslash, 'x', '{', eight hex digits, '}' = 12 characters, plus the
 * terminating NUL) -- confirm before changing either the value or a format.
 */
7126#define CHAR_ESC_LEN 13
7129rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7131 char buf[CHAR_ESC_LEN + 1];
7139 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7141 else if (c < 0x10000) {
7142 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7145 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7150 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7153 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7156 l = (int)strlen(buf);
7162ruby_escaped_char(
int c)
7165 case '\0':
return "\\0";
7166 case '\n':
return "\\n";
7167 case '\r':
return "\\r";
7168 case '\t':
return "\\t";
7169 case '\f':
return "\\f";
7170 case '\013':
return "\\v";
7171 case '\010':
return "\\b";
7172 case '\007':
return "\\a";
7173 case '\033':
return "\\e";
7174 case '\x7f':
return "\\c?";
7180rb_str_escape(
VALUE str)
7184 const char *p = RSTRING_PTR(str);
7186 const char *prev = p;
7187 char buf[CHAR_ESC_LEN + 1];
7189 int unicode_p = rb_enc_unicode_p(enc);
7190 int asciicompat = rb_enc_asciicompat(enc);
7195 int n = rb_enc_precise_mbclen(p, pend, enc);
7197 if (p > prev) str_buf_cat(result, prev, p - prev);
7198 n = rb_enc_mbminlen(enc);
7200 n = (int)(pend - p);
7202 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7203 str_buf_cat(result, buf, strlen(buf));
7209 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7211 cc = ruby_escaped_char(c);
7213 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7214 str_buf_cat(result, cc, strlen(cc));
7217 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7220 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7221 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7225 if (p > prev) str_buf_cat(result, prev, p - prev);
7249 const char *p, *pend, *prev;
7250 char buf[CHAR_ESC_LEN + 1];
7252 rb_encoding *resenc = rb_default_internal_encoding();
7253 int unicode_p = rb_enc_unicode_p(enc);
7254 int asciicompat = rb_enc_asciicompat(enc);
7256 if (resenc == NULL) resenc = rb_default_external_encoding();
7257 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7258 rb_enc_associate(result, resenc);
7259 str_buf_cat2(result,
"\"");
7267 n = rb_enc_precise_mbclen(p, pend, enc);
7269 if (p > prev) str_buf_cat(result, prev, p - prev);
7270 n = rb_enc_mbminlen(enc);
7272 n = (int)(pend - p);
7274 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7275 str_buf_cat(result, buf, strlen(buf));
7281 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7283 if ((asciicompat || unicode_p) &&
7284 (c ==
'"'|| c ==
'\\' ||
7289 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 str_buf_cat2(result,
"\\");
7292 if (asciicompat || enc == resenc) {
7298 case '\n': cc =
'n';
break;
7299 case '\r': cc =
'r';
break;
7300 case '\t': cc =
't';
break;
7301 case '\f': cc =
'f';
break;
7302 case '\013': cc =
'v';
break;
7303 case '\010': cc =
'b';
break;
7304 case '\007': cc =
'a';
break;
7305 case 033: cc =
'e';
break;
7306 default: cc = 0;
break;
7309 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7312 str_buf_cat(result, buf, 2);
7325 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7329 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7330 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7335 if (p > prev) str_buf_cat(result, prev, p - prev);
7336 str_buf_cat2(result,
"\"");
/*
 * True when p is still before e and *p is one of '$', '@' or '{'.
 * The dump code in this file evaluates it on the byte that follows a '#'
 * (p has already been advanced past the '#') to decide whether a backslash
 * is written to the output and whether the escaped length is 2 or 1.
 * Both arguments are expanded more than once, so they must be free of
 * side effects.
 */
7341#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7361 int encidx = rb_enc_get_index(str);
7364 const char *p, *pend;
7367 int u8 = (encidx == rb_utf8_encindex());
7368 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7371 if (!rb_enc_asciicompat(enc)) {
7373 len += strlen(enc->name);
7376 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7379 unsigned char c = *p++;
7382 case '"':
case '\\':
7383 case '\n':
case '\r':
7384 case '\t':
case '\f':
7385 case '\013':
case '\010':
case '\007':
case '\033':
7390 clen = IS_EVSTR(p, pend) ? 2 : 1;
7398 if (u8 && c > 0x7F) {
7399 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7401 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7404 else if (cc <= 0xFFFFF)
7417 if (clen > LONG_MAX -
len) {
7424 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7425 q = RSTRING_PTR(result); qend = q +
len + 1;
7429 unsigned char c = *p++;
7431 if (c ==
'"' || c ==
'\\') {
7435 else if (c ==
'#') {
7436 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7439 else if (c ==
'\n') {
7443 else if (c ==
'\r') {
7447 else if (c ==
'\t') {
7451 else if (c ==
'\f') {
7455 else if (c ==
'\013') {
7459 else if (c ==
'\010') {
7463 else if (c ==
'\007') {
7467 else if (c ==
'\033') {
7477 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7479 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7482 snprintf(q, qend-q,
"u%04X", cc);
7484 snprintf(q, qend-q,
"u{%X}", cc);
7489 snprintf(q, qend-q,
"x%02X", c);
7495 if (!rb_enc_asciicompat(enc)) {
7496 snprintf(q, qend-q, nonascii_suffix, enc->name);
7497 encidx = rb_ascii8bit_encindex();
7500 rb_enc_associate_index(result, encidx);
7506unescape_ascii(
unsigned int c)
7530undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7532 const char *s = *ss;
7536 unsigned char buf[6];
7554 *buf = unescape_ascii(*s);
7566 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7567 if (*penc != enc_utf8) {
7569 rb_enc_associate(undumped, enc_utf8);
7586 if (hexlen == 0 || hexlen > 6) {
7592 if (0xd800 <= c && c <= 0xdfff) {
7595 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7605 if (0xd800 <= c && c <= 0xdfff) {
7608 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
/* Forward declaration so that str_undump below can call this predicate
 * (its result is compared against Qfalse there). */
7636static VALUE rb_str_is_ascii_only_p(
VALUE str);
7654str_undump(
VALUE str)
7656 const char *s = RSTRING_PTR(str);
7659 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7661 bool binary =
false;
7665 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7668 if (!str_null_check(str, &w)) {
7671 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7672 if (*s !=
'"')
goto invalid_format;
7690 static const char force_encoding_suffix[] =
".force_encoding(\"";
7691 static const char dup_suffix[] =
".dup";
7692 const char *encname;
7697 size =
sizeof(dup_suffix) - 1;
7698 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7700 size =
sizeof(force_encoding_suffix) - 1;
7701 if (s_end - s <= size)
goto invalid_format;
7702 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7706 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7710 s = memchr(s,
'"', s_end-s);
7712 if (!s)
goto invalid_format;
7713 if (s_end - s != 2)
goto invalid_format;
7714 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7716 encidx = rb_enc_find_index2(encname, (
long)size);
7720 rb_enc_associate_index(undumped, encidx);
7730 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7741 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7747 if (rb_enc_dummy_p(enc)) {
7754str_true_enc(
VALUE str)
7757 rb_str_check_dummy_enc(enc);
7761static OnigCaseFoldType
7762check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7767 rb_raise(rb_eArgError,
"too many options");
7768 if (argv[0]==sym_turkic) {
7769 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7771 if (argv[1]==sym_lithuanian)
7772 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7774 rb_raise(rb_eArgError,
"invalid second option");
7777 else if (argv[0]==sym_lithuanian) {
7778 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7780 if (argv[1]==sym_turkic)
7781 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7783 rb_raise(rb_eArgError,
"invalid second option");
7787 rb_raise(rb_eArgError,
"too many options");
7788 else if (argv[0]==sym_ascii)
7789 flags |= ONIGENC_CASE_ASCII_ONLY;
7790 else if (argv[0]==sym_fold) {
7791 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7792 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7794 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7797 rb_raise(rb_eArgError,
"invalid option");
7804 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/* Fixed number of extra bytes added to the computed capacity of every
 * case-mapping buffer (see the `capa` computation in the buffer
 * allocation loop further down in this file). */
7810#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7811#ifndef CASEMAP_DEBUG
7812# define CASEMAP_DEBUG 0
7820 OnigUChar space[FLEX_ARY_LEN];
7824mapping_buffer_free(
void *p)
7828 while (current_buffer) {
7829 previous_buffer = current_buffer;
7830 current_buffer = current_buffer->next;
7831 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7837 {0, mapping_buffer_free,},
7838 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7846 const OnigUChar *source_current, *source_end;
7847 int target_length = 0;
7848 VALUE buffer_anchor;
7851 size_t buffer_count = 0;
7852 int buffer_length_or_invalid;
7854 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7856 source_current = (OnigUChar*)RSTRING_PTR(source);
7861 while (source_current < source_end) {
7863 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7864 if (CASEMAP_DEBUG) {
7865 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7868 *pre_buffer = current_buffer;
7869 pre_buffer = ¤t_buffer->next;
7870 current_buffer->next = NULL;
7871 current_buffer->capa =
capa;
7872 buffer_length_or_invalid = enc->case_map(flags,
7873 &source_current, source_end,
7874 current_buffer->space,
7875 current_buffer->space+current_buffer->capa,
7877 if (buffer_length_or_invalid < 0) {
7878 current_buffer =
DATA_PTR(buffer_anchor);
7880 mapping_buffer_free(current_buffer);
7881 rb_raise(rb_eArgError,
"input string invalid");
7883 target_length += current_buffer->used = buffer_length_or_invalid;
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7889 if (buffer_count==1) {
7890 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7893 char *target_current;
7896 target_current = RSTRING_PTR(target);
7897 current_buffer =
DATA_PTR(buffer_anchor);
7898 while (current_buffer) {
7899 memcpy(target_current, current_buffer->space, current_buffer->used);
7900 target_current += current_buffer->used;
7901 current_buffer = current_buffer->next;
7904 current_buffer =
DATA_PTR(buffer_anchor);
7906 mapping_buffer_free(current_buffer);
7911 str_enc_copy_direct(target, source);
7920 const OnigUChar *source_current, *source_end;
7921 OnigUChar *target_current, *target_end;
7922 long old_length = RSTRING_LEN(source);
7923 int length_or_invalid;
7925 if (old_length == 0)
return Qnil;
7927 source_current = (OnigUChar*)RSTRING_PTR(source);
7929 if (source == target) {
7930 target_current = (OnigUChar*)source_current;
7931 target_end = (OnigUChar*)source_end;
7934 target_current = (OnigUChar*)RSTRING_PTR(target);
7938 length_or_invalid = onigenc_ascii_only_case_map(flags,
7939 &source_current, source_end,
7940 target_current, target_end, enc);
7941 if (length_or_invalid < 0)
7942 rb_raise(rb_eArgError,
"input string invalid");
7943 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7944 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7945 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7946 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7947 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7950 str_enc_copy(target, source);
7956upcase_single(
VALUE str)
7958 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7959 bool modified =
false;
7962 unsigned int c = *(
unsigned char*)s;
7964 if (
'a' <= c && c <=
'z') {
7965 *s =
'A' + (c -
'a');
7993rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7996 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7998 flags = check_case_options(argc, argv, flags);
7999 str_modify_keep_cr(str);
8000 enc = str_true_enc(str);
8001 if (case_option_single_p(flags, enc, str)) {
8002 if (upcase_single(str))
8003 flags |= ONIGENC_CASE_MODIFIED;
8005 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8006 rb_str_ascii_casemap(str, str, &flags, enc);
8008 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8010 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8032rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8035 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8038 flags = check_case_options(argc, argv, flags);
8039 enc = str_true_enc(str);
8040 if (case_option_single_p(flags, enc, str)) {
8041 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8042 str_enc_copy_direct(ret, str);
8045 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8047 rb_str_ascii_casemap(str, ret, &flags, enc);
8050 ret = rb_str_casemap(str, &flags, enc);
8057downcase_single(
VALUE str)
8059 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8060 bool modified =
false;
8063 unsigned int c = *(
unsigned char*)s;
8065 if (
'A' <= c && c <=
'Z') {
8066 *s =
'a' + (c -
'A');
8095rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8098 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8100 flags = check_case_options(argc, argv, flags);
8101 str_modify_keep_cr(str);
8102 enc = str_true_enc(str);
8103 if (case_option_single_p(flags, enc, str)) {
8104 if (downcase_single(str))
8105 flags |= ONIGENC_CASE_MODIFIED;
8107 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8108 rb_str_ascii_casemap(str, str, &flags, enc);
8110 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8112 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8134rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8137 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8140 flags = check_case_options(argc, argv, flags);
8141 enc = str_true_enc(str);
8142 if (case_option_single_p(flags, enc, str)) {
8143 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8144 str_enc_copy_direct(ret, str);
8145 downcase_single(ret);
8147 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8149 rb_str_ascii_casemap(str, ret, &flags, enc);
8152 ret = rb_str_casemap(str, &flags, enc);
8180rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8183 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8185 flags = check_case_options(argc, argv, flags);
8186 str_modify_keep_cr(str);
8187 enc = str_true_enc(str);
8188 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8189 if (flags&ONIGENC_CASE_ASCII_ONLY)
8190 rb_str_ascii_casemap(str, str, &flags, enc);
8192 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8194 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8218rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8221 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8224 flags = check_case_options(argc, argv, flags);
8225 enc = str_true_enc(str);
8226 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8227 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8229 rb_str_ascii_casemap(str, ret, &flags, enc);
8232 ret = rb_str_casemap(str, &flags, enc);
8259rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8262 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8264 flags = check_case_options(argc, argv, flags);
8265 str_modify_keep_cr(str);
8266 enc = str_true_enc(str);
8267 if (flags&ONIGENC_CASE_ASCII_ONLY)
8268 rb_str_ascii_casemap(str, str, &flags, enc);
8270 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8272 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8296rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8299 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8302 flags = check_case_options(argc, argv, flags);
8303 enc = str_true_enc(str);
8304 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8305 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8307 rb_str_ascii_casemap(str, ret, &flags, enc);
8310 ret = rb_str_casemap(str, &flags, enc);
/* Shorthand for a pointer to unsigned bytes. */
8315typedef unsigned char *USTR;
8319 unsigned int now, max;
8331 if (t->p == t->pend)
return -1;
8332 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8335 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8337 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8339 if (t->p < t->pend) {
8340 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8343 if (t->now < 0x80 && c < 0x80) {
8344 rb_raise(rb_eArgError,
8345 "invalid range \"%c-%c\" in string transliteration",
8349 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8353 else if (t->now < c) {
8362 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8363 if (t->now == t->max) {
8368 if (t->now < t->max) {
8384 const unsigned int errc = -1;
8385 unsigned int trans[256];
8387 struct tr trsrc, trrepl;
8389 unsigned int c, c0, last = 0;
8390 int modify = 0, i, l;
8391 unsigned char *s, *send;
8393 int singlebyte = single_byte_optimizable(str);
8397#define CHECK_IF_ASCII(c) \
8398 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8399 (cr = ENC_CODERANGE_VALID) : 0)
8403 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8404 if (RSTRING_LEN(repl) == 0) {
8405 return rb_str_delete_bang(1, &src, str);
8409 e1 = rb_enc_check(str, src);
8410 e2 = rb_enc_check(str, repl);
8415 enc = rb_enc_check(src, repl);
8417 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8418 if (RSTRING_LEN(src) > 1 &&
8419 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8420 trsrc.p + l < trsrc.pend) {
8424 trrepl.p = RSTRING_PTR(repl);
8425 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8426 trsrc.gen = trrepl.gen = 0;
8427 trsrc.now = trrepl.now = 0;
8428 trsrc.max = trrepl.max = 0;
8431 for (i=0; i<256; i++) {
8434 while ((c = trnext(&trsrc, enc)) != errc) {
8439 if (!hash) hash = rb_hash_new();
8443 while ((c = trnext(&trrepl, enc)) != errc)
8446 for (i=0; i<256; i++) {
8447 if (trans[i] != errc) {
8455 for (i=0; i<256; i++) {
8458 while ((c = trnext(&trsrc, enc)) != errc) {
8459 r = trnext(&trrepl, enc);
8460 if (r == errc) r = trrepl.now;
8463 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8466 if (!hash) hash = rb_hash_new();
8474 str_modify_keep_cr(str);
8475 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8476 termlen = rb_enc_mbminlen(enc);
8479 long offset, max = RSTRING_LEN(str);
8480 unsigned int save = -1;
8481 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8486 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8489 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8492 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8494 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8503 if (cflag) c = last;
8506 else if (cflag) c = errc;
8512 if (c != (
unsigned int)-1) {
8518 tlen = rb_enc_codelen(c, enc);
8524 if (enc != e1) may_modify = 1;
8526 if ((offset = t - buf) + tlen > max) {
8527 size_t MAYBE_UNUSED(old) = max + termlen;
8528 max = offset + tlen + (send - s);
8529 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8532 rb_enc_mbcput(c, t, enc);
8533 if (may_modify && memcmp(s, t, tlen) != 0) {
8539 if (!STR_EMBED_P(str)) {
8540 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8542 TERM_FILL((
char *)t, termlen);
8543 RSTRING(str)->as.heap.ptr = (
char *)buf;
8544 STR_SET_LEN(str, t - buf);
8545 STR_SET_NOEMBED(str);
8546 RSTRING(str)->as.heap.aux.capa = max;
8550 c = (
unsigned char)*s;
8551 if (trans[c] != errc) {
8568 long offset, max = (long)((send - s) * 1.2);
8569 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8574 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8577 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8580 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8582 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8590 if (cflag) c = last;
8593 else if (cflag) c = errc;
8597 c = cflag ? last : errc;
8600 tlen = rb_enc_codelen(c, enc);
8605 if (enc != e1) may_modify = 1;
8607 if ((offset = t - buf) + tlen > max) {
8608 size_t MAYBE_UNUSED(old) = max + termlen;
8609 max = offset + tlen + (long)((send - s) * 1.2);
8610 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8614 rb_enc_mbcput(c, t, enc);
8615 if (may_modify && memcmp(s, t, tlen) != 0) {
8623 if (!STR_EMBED_P(str)) {
8624 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8626 TERM_FILL((
char *)t, termlen);
8627 RSTRING(str)->as.heap.ptr = (
char *)buf;
8628 STR_SET_LEN(str, t - buf);
8629 STR_SET_NOEMBED(str);
8630 RSTRING(str)->as.heap.aux.capa = max;
8636 rb_enc_associate(str, enc);
8655 return tr_trans(str, src, repl, 0);
8702 tr_trans(str, src, repl, 0);
/*
 * TR_TABLE_MAX is the number of distinct unsigned char values; code points
 * below it are looked up directly in a char table (`table[c]`).
 * TR_TABLE_SIZE reserves one additional slot: index TR_TABLE_MAX holds a
 * flag that tr_setup_table writes from its `cflag` and that tr_find
 * returns as its fallback result.
 */
8706#define TR_TABLE_MAX (UCHAR_MAX+1)
8707#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8709tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8712 const unsigned int errc = -1;
8713 char buf[TR_TABLE_MAX];
8716 VALUE table = 0, ptable = 0;
8717 int i, l, cflag = 0;
8719 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8720 tr.gen =
tr.now =
tr.max = 0;
8722 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8727 for (i=0; i<TR_TABLE_MAX; i++) {
8730 stable[TR_TABLE_MAX] = cflag;
8732 else if (stable[TR_TABLE_MAX] && !cflag) {
8733 stable[TR_TABLE_MAX] = 0;
8735 for (i=0; i<TR_TABLE_MAX; i++) {
8739 while ((c = trnext(&
tr, enc)) != errc) {
8740 if (c < TR_TABLE_MAX) {
8741 buf[(
unsigned char)c] = !cflag;
8746 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8749 table = ptable ? ptable : rb_hash_new();
8753 table = rb_hash_new();
8758 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8759 rb_hash_aset(table, key,
Qtrue);
8763 for (i=0; i<TR_TABLE_MAX; i++) {
8764 stable[i] = stable[i] && buf[i];
8766 if (!table && !cflag) {
8773tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8775 if (c < TR_TABLE_MAX) {
8776 return table[c] != 0;
8782 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8783 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8787 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8790 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8804rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8806 char squeez[TR_TABLE_SIZE];
8809 VALUE del = 0, nodel = 0;
8811 int i, ascompat, cr;
8813 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8815 for (i=0; i<argc; i++) {
8819 enc = rb_enc_check(str, s);
8820 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8823 str_modify_keep_cr(str);
8824 ascompat = rb_enc_asciicompat(enc);
8825 s = t = RSTRING_PTR(str);
8832 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8843 c = rb_enc_codepoint_len(s, send, &clen, enc);
8845 if (tr_find(c, squeez, del, nodel)) {
8849 if (t != s) rb_enc_mbcput(c, t, enc);
8856 TERM_FILL(t, TERM_LEN(str));
8857 STR_SET_LEN(str, t - RSTRING_PTR(str));
8860 if (modify)
return str;
8880rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8883 rb_str_delete_bang(argc, argv, str);
8897rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8899 char squeez[TR_TABLE_SIZE];
8901 VALUE del = 0, nodel = 0;
8902 unsigned char *s, *send, *t;
8904 int ascompat, singlebyte = single_byte_optimizable(str);
8908 enc = STR_ENC_GET(str);
8911 for (i=0; i<argc; i++) {
8915 enc = rb_enc_check(str, s);
8916 if (singlebyte && !single_byte_optimizable(s))
8918 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8922 str_modify_keep_cr(str);
8923 s = t = (
unsigned char *)RSTRING_PTR(str);
8924 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8927 ascompat = rb_enc_asciicompat(enc);
8931 unsigned int c = *s++;
8932 if (c != save || (argc > 0 && !squeez[c])) {
8942 if (ascompat && (c = *s) < 0x80) {
8943 if (c != save || (argc > 0 && !squeez[c])) {
8949 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8951 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8952 if (t != s) rb_enc_mbcput(c, t, enc);
8961 TERM_FILL((
char *)t, TERM_LEN(str));
8962 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8963 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8967 if (modify)
return str;
8990rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8993 rb_str_squeeze_bang(argc, argv, str);
9011 return tr_trans(str, src, repl, 1);
9034 tr_trans(str, src, repl, 1);
9063rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9065 char table[TR_TABLE_SIZE];
9067 VALUE del = 0, nodel = 0, tstr;
9077 enc = rb_enc_check(str, tstr);
9080 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9081 (ptstr = RSTRING_PTR(tstr),
9082 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9083 !is_broken_string(str)) {
9085 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9087 s = RSTRING_PTR(str);
9088 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9091 if (*(
unsigned char*)s++ == c) n++;
9097 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9098 for (i=1; i<argc; i++) {
9101 enc = rb_enc_check(str, tstr);
9102 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9105 s = RSTRING_PTR(str);
9106 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9108 ascompat = rb_enc_asciicompat(enc);
9112 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9120 c = rb_enc_codepoint_len(s, send, &clen, enc);
9121 if (tr_find(c, table, del, nodel)) {
9132rb_fs_check(
VALUE val)
9136 if (
NIL_P(val))
return 0;
9141static const char isspacetable[256] = {
9142 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9144 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9148 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9149 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9150 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9151 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9152 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9153 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9154 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9156 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9157 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
9160#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9163split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9165 if (empty_count >= 0 &&
len == 0) {
9166 return empty_count + 1;
9168 if (empty_count > 0) {
9172 rb_ary_push(result, str_new_empty_String(str));
9173 }
while (--empty_count > 0);
9177 rb_yield(str_new_empty_String(str));
9178 }
while (--empty_count > 0);
9183 rb_ary_push(result, str);
9192 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9196literal_split_pattern(
VALUE spat, split_type_t default_type)
9204 return SPLIT_TYPE_CHARS;
9206 else if (rb_enc_asciicompat(enc)) {
9207 if (
len == 1 && ptr[0] ==
' ') {
9208 return SPLIT_TYPE_AWK;
9213 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9214 return SPLIT_TYPE_AWK;
9217 return default_type;
9230rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9235 split_type_t split_type;
9236 long beg, end, i = 0, empty_count = -1;
9241 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9243 if (lim <= 0) limit =
Qnil;
9244 else if (lim == 1) {
9245 if (RSTRING_LEN(str) == 0)
9256 if (
NIL_P(limit) && !lim) empty_count = 0;
9258 enc = STR_ENC_GET(str);
9259 split_type = SPLIT_TYPE_REGEXP;
9261 spat = get_pat_quoted(spat, 0);
9263 else if (
NIL_P(spat = rb_fs)) {
9264 split_type = SPLIT_TYPE_AWK;
9266 else if (!(spat = rb_fs_check(spat))) {
9267 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9272 if (split_type != SPLIT_TYPE_AWK) {
9277 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9278 if (split_type == SPLIT_TYPE_AWK) {
9280 split_type = SPLIT_TYPE_STRING;
9285 mustnot_broken(spat);
9286 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9294#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9297 char *ptr = RSTRING_PTR(str);
9299 if (split_type == SPLIT_TYPE_AWK) {
9304 if (result) result = rb_ary_new();
9306 if (is_ascii_string(str)) {
9307 while (ptr < eptr) {
9308 c = (
unsigned char)*ptr++;
9310 if (ascii_isspace(c)) {
9316 if (!
NIL_P(limit) && lim <= i)
break;
9319 else if (ascii_isspace(c)) {
9320 SPLIT_STR(beg, end-beg);
9323 if (!
NIL_P(limit)) ++i;
9331 while (ptr < eptr) {
9334 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9343 if (!
NIL_P(limit) && lim <= i)
break;
9347 SPLIT_STR(beg, end-beg);
9350 if (!
NIL_P(limit)) ++i;
9358 else if (split_type == SPLIT_TYPE_STRING) {
9359 char *str_start = ptr;
9360 char *substr_start = ptr;
9361 char *sptr = RSTRING_PTR(spat);
9362 long slen = RSTRING_LEN(spat);
9364 if (result) result = rb_ary_new();
9365 mustnot_broken(str);
9366 enc = rb_enc_check(str, spat);
9367 while (ptr < eptr &&
9368 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9371 if (t != ptr + end) {
9375 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9378 if (!
NIL_P(limit) && lim <= ++i)
break;
9380 beg = ptr - str_start;
9382 else if (split_type == SPLIT_TYPE_CHARS) {
9383 char *str_start = ptr;
9386 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9387 mustnot_broken(str);
9388 enc = rb_enc_get(str);
9389 while (ptr < eptr &&
9390 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9391 SPLIT_STR(ptr - str_start, n);
9393 if (!
NIL_P(limit) && lim <= ++i)
break;
9395 beg = ptr - str_start;
9398 if (result) result = rb_ary_new();
9399 long len = RSTRING_LEN(str);
9407 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9412 if (start == end && BEG(0) == END(0)) {
9417 else if (last_null == 1) {
9418 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9425 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9431 SPLIT_STR(beg, end-beg);
9432 beg = start = END(0);
9436 for (idx=1; idx < regs->num_regs; idx++) {
9437 if (BEG(idx) == -1)
continue;
9438 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9440 if (!
NIL_P(limit) && lim <= ++i)
break;
9442 if (match) rb_match_unbusy(match);
9444 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9445 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9448 return result ? result : str;
9458 return rb_str_split_m(1, &sep, str);
9461#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9467 rb_ary_push(ary, e);
9476#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9479chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9481 const char *prev = rb_enc_prev_char(p, e, e, enc);
9484 prev = rb_enc_prev_char(p, e, e, enc);
9485 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9497 RSTRING_LEN(rs) != 1 ||
9498 RSTRING_PTR(rs)[0] !=
'\n')) {
9504#define rb_rs get_rs()
9511 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9512 long pos,
len, rslen;
9518 static ID keywords[1];
9523 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9527 if (!ENUM_ELEM(ary, str)) {
9535 if (!RSTRING_LEN(str))
goto end;
9537 ptr = subptr = RSTRING_PTR(str);
9539 len = RSTRING_LEN(str);
9541 rslen = RSTRING_LEN(rs);
9544 enc = rb_enc_get(str);
9546 enc = rb_enc_check(str, rs);
9551 const char *eol = NULL;
9553 while (subend < pend) {
9554 long chomp_rslen = 0;
9556 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9558 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9560 if (eol == subend)
break;
9564 chomp_rslen = -rslen;
9568 if (!subptr) subptr = subend;
9572 }
while (subend < pend);
9574 if (rslen == 0) chomp_rslen = 0;
9576 subend - subptr + (chomp ? chomp_rslen : rslen));
9577 if (ENUM_ELEM(ary, line)) {
9578 str_mod_check(str, ptr,
len);
9580 subptr = eol = NULL;
9585 rsptr = RSTRING_PTR(rs);
9586 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9595 rsptr = RSTRING_PTR(rs);
9596 rslen = RSTRING_LEN(rs);
9599 while (subptr < pend) {
9600 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9604 if (hit != adjusted) {
9608 subend = hit += rslen;
9611 subend = chomp_newline(subptr, subend, enc);
9618 if (ENUM_ELEM(ary, line)) {
9619 str_mod_check(str, ptr,
len);
9624 if (subptr != pend) {
9627 pend = chomp_newline(subptr, pend, enc);
9629 else if (pend - subptr >= rslen &&
9630 memcmp(pend - rslen, rsptr, rslen) == 0) {
9635 ENUM_ELEM(ary, line);
9656rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9659 return rb_str_enumerate_lines(argc, argv, str, 0);
9672rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9674 VALUE ary = WANTARRAY(
"lines", 0);
9675 return rb_str_enumerate_lines(argc, argv, str, ary);
9689 for (i=0; i<RSTRING_LEN(str); i++) {
9690 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9708rb_str_each_byte(
VALUE str)
9711 return rb_str_enumerate_bytes(str, 0);
9723rb_str_bytes(
VALUE str)
9725 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9726 return rb_str_enumerate_bytes(str, ary);
9744 ptr = RSTRING_PTR(str);
9745 len = RSTRING_LEN(str);
9746 enc = rb_enc_get(str);
9749 for (i = 0; i <
len; i += n) {
9750 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9755 for (i = 0; i <
len; i += n) {
9756 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9777rb_str_each_char(
VALUE str)
9780 return rb_str_enumerate_chars(str, 0);
9792rb_str_chars(
VALUE str)
9795 return rb_str_enumerate_chars(str, ary);
9799rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9804 const char *ptr, *end;
9807 if (single_byte_optimizable(str))
9808 return rb_str_enumerate_bytes(str, ary);
9811 ptr = RSTRING_PTR(str);
9813 enc = STR_ENC_GET(str);
9816 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9837rb_str_each_codepoint(
VALUE str)
9840 return rb_str_enumerate_codepoints(str, 0);
9852rb_str_codepoints(
VALUE str)
9855 return rb_str_enumerate_codepoints(str, ary);
9861 int encidx = rb_enc_to_index(enc);
9863 const OnigUChar source_ascii[] =
"\\X";
9864 const OnigUChar *source = source_ascii;
9865 size_t source_len =
sizeof(source_ascii) - 1;
9868#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9869#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9870#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9871#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9872#define CASE_UTF(e) \
9873 case ENCINDEX_UTF_##e: { \
9874 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9875 source = source_UTF_##e; \
9876 source_len = sizeof(source_UTF_##e); \
9879 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9887 regex_t *reg_grapheme_cluster;
9889 int r = onig_new(®_grapheme_cluster, source, source + source_len,
9890 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9892 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9893 onig_error_code_to_str(message, r, &einfo);
9894 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9897 return reg_grapheme_cluster;
9903 int encidx = rb_enc_to_index(enc);
9904 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9906 if (encidx == rb_utf8_encindex()) {
9907 if (!reg_grapheme_cluster_utf8) {
9908 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9911 return reg_grapheme_cluster_utf8;
9920 size_t grapheme_cluster_count = 0;
9922 const char *ptr, *end;
9924 if (!rb_enc_unicode_p(enc)) {
9928 bool cached_reg_grapheme_cluster =
true;
9929 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9930 if (!reg_grapheme_cluster) {
9931 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9932 cached_reg_grapheme_cluster =
false;
9935 ptr = RSTRING_PTR(str);
9939 OnigPosition
len = onig_match(reg_grapheme_cluster,
9940 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9941 (
const OnigUChar *)ptr, NULL, 0);
9942 if (
len <= 0)
break;
9943 grapheme_cluster_count++;
9947 if (!cached_reg_grapheme_cluster) {
9948 onig_free(reg_grapheme_cluster);
9951 return SIZET2NUM(grapheme_cluster_count);
9955rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9959 const char *ptr0, *ptr, *end;
9961 if (!rb_enc_unicode_p(enc)) {
9962 return rb_str_enumerate_chars(str, ary);
9967 bool cached_reg_grapheme_cluster =
true;
9968 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9969 if (!reg_grapheme_cluster) {
9970 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9971 cached_reg_grapheme_cluster =
false;
9974 ptr0 = ptr = RSTRING_PTR(str);
9978 OnigPosition
len = onig_match(reg_grapheme_cluster,
9979 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9980 (
const OnigUChar *)ptr, NULL, 0);
9981 if (
len <= 0)
break;
9986 if (!cached_reg_grapheme_cluster) {
9987 onig_free(reg_grapheme_cluster);
10007rb_str_each_grapheme_cluster(
VALUE str)
10010 return rb_str_enumerate_grapheme_clusters(str, 0);
10022rb_str_grapheme_clusters(
VALUE str)
10025 return rb_str_enumerate_grapheme_clusters(str, ary);
10029chopped_length(
VALUE str)
10032 const char *p, *p2, *beg, *end;
10034 beg = RSTRING_PTR(str);
10035 end = beg + RSTRING_LEN(str);
10036 if (beg >= end)
return 0;
10037 p = rb_enc_prev_char(beg, end, end, enc);
10039 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10040 p2 = rb_enc_prev_char(beg, p, end, enc);
10041 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10057rb_str_chop_bang(
VALUE str)
10059 str_modify_keep_cr(str);
10060 if (RSTRING_LEN(str) > 0) {
10062 len = chopped_length(str);
10063 STR_SET_LEN(str,
len);
10064 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10083rb_str_chop(
VALUE str)
10089smart_chomp(
VALUE str,
const char *e,
const char *p)
10092 if (rb_enc_mbminlen(enc) > 1) {
10097 pp = e - rb_enc_mbminlen(enc);
10100 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10108 if (--e > p && *(e-1) ==
'\r') {
10125 char *pp, *e, *rsptr;
10127 char *
const p = RSTRING_PTR(str);
10128 long len = RSTRING_LEN(str);
10130 if (
len == 0)
return 0;
10133 return smart_chomp(str, e, p);
10136 enc = rb_enc_get(str);
10139 if (rb_enc_mbminlen(enc) > 1) {
10144 pp -= rb_enc_mbminlen(enc);
10147 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10154 while (e > p && *(e-1) ==
'\n') {
10156 if (e > p && *(e-1) ==
'\r')
10162 if (rslen >
len)
return len;
10164 enc = rb_enc_get(rs);
10165 newline = rsptr[rslen-1];
10166 if (rslen == rb_enc_mbminlen(enc)) {
10168 if (newline ==
'\n')
10169 return smart_chomp(str, e, p);
10173 return smart_chomp(str, e, p);
10177 enc = rb_enc_check(str, rs);
10178 if (is_broken_string(rs)) {
10182 if (p[
len-1] == newline &&
10184 memcmp(rsptr, pp, rslen) == 0)) {
10185 if (at_char_boundary(p, pp, e, enc))
10186 return len - rslen;
10198chomp_rs(
int argc,
const VALUE *argv)
10202 VALUE rs = argv[0];
10214 long olen = RSTRING_LEN(str);
10215 long len = chompped_length(str, rs);
10216 if (
len >= olen)
return Qnil;
10217 str_modify_keep_cr(str);
10218 STR_SET_LEN(str,
len);
10219 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10236rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10239 str_modifiable(str);
10240 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10241 rs = chomp_rs(argc, argv);
10243 return rb_str_chomp_string(str, rs);
10256rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10258 VALUE rs = chomp_rs(argc, argv);
10266 const char *
const start = s;
10268 if (!s || s >= e)
return 0;
10271 if (single_byte_optimizable(str)) {
10272 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10277 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10297rb_str_lstrip_bang(
VALUE str)
10301 long olen, loffset;
10303 str_modify_keep_cr(str);
10304 enc = STR_ENC_GET(str);
10306 loffset = lstrip_offset(str, start, start+olen, enc);
10308 long len = olen-loffset;
10309 s = start + loffset;
10310 memmove(start, s,
len);
10311 STR_SET_LEN(str,
len);
10312 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10335rb_str_lstrip(
VALUE str)
10340 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10341 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10350 rb_str_check_dummy_enc(enc);
10354 if (!s || s >= e)
return 0;
10358 if (single_byte_optimizable(str)) {
10360 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10365 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10385rb_str_rstrip_bang(
VALUE str)
10389 long olen, roffset;
10391 str_modify_keep_cr(str);
10392 enc = STR_ENC_GET(str);
10394 roffset = rstrip_offset(str, start, start+olen, enc);
10396 long len = olen - roffset;
10398 STR_SET_LEN(str,
len);
10399 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10422rb_str_rstrip(
VALUE str)
10426 long olen, roffset;
10428 enc = STR_ENC_GET(str);
10430 roffset = rstrip_offset(str, start, start+olen, enc);
10432 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10448rb_str_strip_bang(
VALUE str)
10451 long olen, loffset, roffset;
10454 str_modify_keep_cr(str);
10455 enc = STR_ENC_GET(str);
10457 loffset = lstrip_offset(str, start, start+olen, enc);
10458 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10460 if (loffset > 0 || roffset > 0) {
10461 long len = olen-roffset;
10464 memmove(start, start + loffset,
len);
10466 STR_SET_LEN(str,
len);
10467 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10490rb_str_strip(
VALUE str)
10493 long olen, loffset, roffset;
10497 loffset = lstrip_offset(str, start, start+olen, enc);
10498 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10500 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10505scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10508 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10514 end = pos + RSTRING_LEN(pat);
10528 if (RSTRING_LEN(str) > end)
10529 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10538 if (!regs || regs->num_regs == 1) {
10544 for (
int i = 1; i < regs->num_regs; i++) {
10550 rb_ary_push(result, s);
10605 long last = -1, prev = 0;
10606 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10608 pat = get_pat_quoted(pat, 1);
10609 mustnot_broken(str);
10611 VALUE ary = rb_ary_new();
10613 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10616 rb_ary_push(ary, result);
10618 if (last >= 0) rb_pat_search(pat, str, last, 1);
10623 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10627 str_mod_check(str, p,
len);
10629 if (last >= 0) rb_pat_search(pat, str, last, 1);
10653rb_str_hex(
VALUE str)
10655 return rb_str_to_inum(str, 16, FALSE);
10680rb_str_oct(
VALUE str)
10682 return rb_str_to_inum(str, -8, FALSE);
10685#ifndef HAVE_CRYPT_R
10690 rb_nativethread_lock_t lock;
10691} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10694crypt_mutex_initialize(
void)
10765# define CRYPT_END() ALLOCV_END(databuf)
10767 extern char *crypt(
const char *,
const char *);
10768# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10771 const char *s, *saltp;
10774 char salt_8bit_clean[3];
10778 mustnot_wchar(str);
10779 mustnot_wchar(salt);
10781 saltp = RSTRING_PTR(salt);
10782 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10783 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10787 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10788 salt_8bit_clean[0] = saltp[0] & 0x7f;
10789 salt_8bit_clean[1] = saltp[1] & 0x7f;
10790 salt_8bit_clean[2] =
'\0';
10791 saltp = salt_8bit_clean;
10796# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10797 data->initialized = 0;
10799 res = crypt_r(s, saltp, data);
10801 crypt_mutex_initialize();
10803 res = crypt(s, saltp);
10844 char *ptr, *p, *pend;
10847 unsigned long sum0 = 0;
10852 ptr = p = RSTRING_PTR(str);
10853 len = RSTRING_LEN(str);
10859 str_mod_check(str, ptr,
len);
10862 sum0 += (
unsigned char)*p;
10873 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10874 sum0 &= (((
unsigned long)1)<<bits)-1;
10894rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10898 long width,
len, flen = 1, fclen = 1;
10901 const char *f =
" ";
10902 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10904 int singlebyte = 1, cr;
10908 enc = STR_ENC_GET(str);
10909 termlen = rb_enc_mbminlen(enc);
10913 enc = rb_enc_check(str, pad);
10914 f = RSTRING_PTR(pad);
10915 flen = RSTRING_LEN(pad);
10916 fclen = str_strlen(pad, enc);
10917 singlebyte = single_byte_optimizable(pad);
10918 if (flen == 0 || fclen == 0) {
10919 rb_raise(rb_eArgError,
"zero width padding");
10922 len = str_strlen(str, enc);
10923 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10925 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10929 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10930 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10932 size = RSTRING_LEN(str);
10933 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10934 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10935 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10936 rb_raise(rb_eArgError,
"argument too big");
10940 p = RSTRING_PTR(res);
10942 memset(p, *f, llen);
10946 while (llen >= fclen) {
10952 memcpy(p, f, llen2);
10956 memcpy(p, RSTRING_PTR(str), size);
10959 memset(p, *f, rlen);
10963 while (rlen >= fclen) {
10969 memcpy(p, f, rlen2);
10973 TERM_FILL(p, termlen);
10974 STR_SET_LEN(res, p-RSTRING_PTR(res));
10997rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10999 return rb_str_justify(argc, argv, str,
'l');
11013rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11015 return rb_str_justify(argc, argv, str,
'r');
11030rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11032 return rb_str_justify(argc, argv, str,
'c');
11048 sep = get_pat_quoted(sep, 0);
11060 pos = rb_str_index(str, sep, 0);
11061 if (pos < 0)
goto failed;
11066 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11069 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11083 long pos = RSTRING_LEN(str);
11085 sep = get_pat_quoted(sep, 0);
11098 pos = rb_str_rindex(str, sep, pos);
11107 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11109 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11121rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11125 for (i=0; i<argc; i++) {
11126 VALUE tmp = argv[i];
11128 if (rb_reg_start_with_p(tmp, str))
11132 const char *p, *s, *e;
11137 enc = rb_enc_check(str, tmp);
11138 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11139 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11140 p = RSTRING_PTR(str);
11143 if (!at_char_right_boundary(p, s, e, enc))
11145 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11161rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11165 for (i=0; i<argc; i++) {
11166 VALUE tmp = argv[i];
11167 const char *p, *s, *e;
11172 enc = rb_enc_check(str, tmp);
11173 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11174 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11175 p = RSTRING_PTR(str);
11178 if (!at_char_boundary(p, s, e, enc))
11180 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11196deleted_prefix_length(
VALUE str,
VALUE prefix)
11198 const char *strptr, *prefixptr;
11199 long olen, prefixlen;
11204 if (!is_broken_string(prefix) ||
11205 !rb_enc_asciicompat(enc) ||
11206 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11207 enc = rb_enc_check(str, prefix);
11211 prefixlen = RSTRING_LEN(prefix);
11212 if (prefixlen <= 0)
return 0;
11213 olen = RSTRING_LEN(str);
11214 if (olen < prefixlen)
return 0;
11215 strptr = RSTRING_PTR(str);
11216 prefixptr = RSTRING_PTR(prefix);
11217 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11218 if (is_broken_string(prefix)) {
11219 if (!is_broken_string(str)) {
11223 const char *strend = strptr + olen;
11224 const char *after_prefix = strptr + prefixlen;
11225 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11245rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11248 str_modify_keep_cr(str);
11250 prefixlen = deleted_prefix_length(str, prefix);
11251 if (prefixlen <= 0)
return Qnil;
11265rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11269 prefixlen = deleted_prefix_length(str, prefix);
11270 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11272 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11285deleted_suffix_length(
VALUE str,
VALUE suffix)
11287 const char *strptr, *suffixptr;
11288 long olen, suffixlen;
11292 if (is_broken_string(suffix))
return 0;
11293 enc = rb_enc_check(str, suffix);
11296 suffixlen = RSTRING_LEN(suffix);
11297 if (suffixlen <= 0)
return 0;
11298 olen = RSTRING_LEN(str);
11299 if (olen < suffixlen)
return 0;
11300 strptr = RSTRING_PTR(str);
11301 suffixptr = RSTRING_PTR(suffix);
11302 const char *strend = strptr + olen;
11303 const char *before_suffix = strend - suffixlen;
11304 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11305 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11320rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11322 long olen, suffixlen,
len;
11323 str_modifiable(str);
11325 suffixlen = deleted_suffix_length(str, suffix);
11326 if (suffixlen <= 0)
return Qnil;
11328 olen = RSTRING_LEN(str);
11329 str_modify_keep_cr(str);
11330 len = olen - suffixlen;
11331 STR_SET_LEN(str,
len);
11332 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11348rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11352 suffixlen = deleted_suffix_length(str, suffix);
11353 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11355 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11362 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11370 val = rb_fs_check(val);
11373 "value of %"PRIsVALUE
" must be String or Regexp",
11377 rb_warn_deprecated(
"'$;'", NULL);
11394 str_modifiable(str);
11397 int idx = rb_enc_to_index(encoding);
11404 rb_enc_associate_index(str, idx);
11428 if (STR_EMBED_P(str)) {
11429 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11434 str_replace_shared_without_enc(str2, str);
11436 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11469rb_str_valid_encoding_p(
VALUE str)
11489rb_str_is_ascii_only_p(
VALUE str)
11499 static const char ellipsis[] =
"...";
11500 const long ellipsislen =
sizeof(ellipsis) - 1;
11502 const long blen = RSTRING_LEN(str);
11503 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11504 VALUE estr, ret = 0;
11507 if (
len * rb_enc_mbminlen(enc) >= blen ||
11511 else if (
len <= ellipsislen ||
11513 if (rb_enc_asciicompat(enc)) {
11515 rb_enc_associate(ret, enc);
11522 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11527 rb_enc_from_encoding(enc), 0,
Qnil);
11540 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11546 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11565 if (enc == STR_ENC_GET(str)) {
11570 return enc_str_scrub(enc, str, repl, cr);
11578 const char *rep, *p, *e, *p1, *sp;
11584 rb_raise(rb_eArgError,
"both of block and replacement given");
11591 if (!
NIL_P(repl)) {
11592 repl = str_compat_and_valid(repl, enc);
11595 if (rb_enc_dummy_p(enc)) {
11598 encidx = rb_enc_to_index(enc);
11600#define DEFAULT_REPLACE_CHAR(str) do { \
11601 static const char replace[sizeof(str)-1] = str; \
11602 rep = replace; replen = (int)sizeof(replace); \
11605 slen = RSTRING_LEN(str);
11606 p = RSTRING_PTR(str);
11611 if (rb_enc_asciicompat(enc)) {
11617 else if (!
NIL_P(repl)) {
11618 rep = RSTRING_PTR(repl);
11619 replen = RSTRING_LEN(repl);
11622 else if (encidx == rb_utf8_encindex()) {
11623 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11627 DEFAULT_REPLACE_CHAR(
"?");
11632 p = search_nonascii(p, e);
11637 int ret = rb_enc_precise_mbclen(p, e, enc);
11656 if (e - p < clen) clen = e - p;
11663 for (; clen > 1; clen--) {
11664 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11675 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11676 str_mod_check(str, sp, slen);
11677 repl = str_compat_and_valid(repl, enc);
11684 p = search_nonascii(p, e);
11710 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11711 str_mod_check(str, sp, slen);
11712 repl = str_compat_and_valid(repl, enc);
11721 long mbminlen = rb_enc_mbminlen(enc);
11725 else if (!
NIL_P(repl)) {
11726 rep = RSTRING_PTR(repl);
11727 replen = RSTRING_LEN(repl);
11729 else if (encidx == ENCINDEX_UTF_16BE) {
11730 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11732 else if (encidx == ENCINDEX_UTF_16LE) {
11733 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11735 else if (encidx == ENCINDEX_UTF_32BE) {
11736 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11738 else if (encidx == ENCINDEX_UTF_32LE) {
11739 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11742 DEFAULT_REPLACE_CHAR(
"?");
11746 int ret = rb_enc_precise_mbclen(p, e, enc);
11759 if (e - p < clen) clen = e - p;
11760 if (clen <= mbminlen * 2) {
11765 for (; clen > mbminlen; clen-=mbminlen) {
11766 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11776 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11777 str_mod_check(str, sp, slen);
11778 repl = str_compat_and_valid(repl, enc);
11803 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11804 str_mod_check(str, sp, slen);
11805 repl = str_compat_and_valid(repl, enc);
11841str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11849static ID id_normalize;
11850static ID id_normalized_p;
11851static VALUE mUnicodeNormalize;
11854unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11856 static int UnicodeNormalizeRequired = 0;
11859 if (!UnicodeNormalizeRequired) {
11860 rb_require(
"unicode_normalize/normalize.rb");
11861 UnicodeNormalizeRequired = 1;
11865 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11902rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11904 return unicode_normalize_common(argc, argv, str, id_normalize);
11918rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11920 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11947rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11949 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12081#define sym_equal rb_obj_equal
12084sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12088 int c = rb_enc_precise_mbclen(s, send, enc);
12092 c = rb_enc_mbc_to_codepoint(s, send, enc);
12100rb_str_symname_p(
VALUE sym)
12105 rb_encoding *resenc = rb_default_internal_encoding();
12107 if (resenc == NULL) resenc = rb_default_external_encoding();
12108 enc = STR_ENC_GET(sym);
12109 ptr = RSTRING_PTR(sym);
12110 len = RSTRING_LEN(sym);
12111 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12119rb_str_quote_unprintable(
VALUE str)
12127 resenc = rb_default_internal_encoding();
12128 if (resenc == NULL) resenc = rb_default_external_encoding();
12129 enc = STR_ENC_GET(str);
12130 ptr = RSTRING_PTR(str);
12131 len = RSTRING_LEN(str);
12132 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12133 !sym_printable(ptr, ptr +
len, enc)) {
12134 return rb_str_escape(str);
12140rb_id_quote_unprintable(
ID id)
12142 VALUE str = rb_id2str(
id);
12143 if (!rb_str_symname_p(str)) {
12144 return rb_str_escape(str);
12162sym_inspect(
VALUE sym)
12169 if (!rb_str_symname_p(str)) {
12171 len = RSTRING_LEN(str);
12172 rb_str_resize(str,
len + 1);
12173 dest = RSTRING_PTR(str);
12174 memmove(dest + 1, dest,
len);
12178 VALUE orig_str = str;
12180 len = RSTRING_LEN(orig_str);
12181 str = rb_enc_str_new(0,
len + 1, enc);
12184 ptr = RSTRING_PTR(orig_str);
12185 dest = RSTRING_PTR(str);
12186 memcpy(dest + 1, ptr,
len);
12206rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12211 rb_raise(rb_eArgError,
"no receiver given");
12308 return rb_str_match(
rb_sym2str(sym), other);
12323sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12325 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12338sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12340 return rb_str_match_m_p(argc, argv, sym);
12358 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12369sym_length(
VALUE sym)
12383sym_empty(
VALUE sym)
12417sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12433sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12449sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12463sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12465 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12478sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12480 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12492sym_encoding(
VALUE sym)
12498string_for_symbol(
VALUE name)
12503 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12517 name = string_for_symbol(name);
12518 return rb_intern_str(name);
12527 name = string_for_symbol(name);
12551 return rb_fstring(str);
12558 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12570 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12571 rb_enc_autoload(enc);
12575 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12581 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12582 rb_enc_autoload(enc);
12586 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12597rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12602 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12603 rb_str_buf_cat_byte(str, (
char) code);
12617 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12783 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that is not affected by RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single bits of symbols that have ever interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.