14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/ractor_safe_set.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
48#include "ruby_assert.h"
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
/* Shorthand for entry `no` of the beg[] / end[] arrays reached through a
 * `regs` pointer; `regs` must be in scope wherever these are expanded. */
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
66#undef rb_usascii_str_new
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
/* NOTE(review): presumably an upper bound on the byte length of one
 * character -- confirm against its users. */
130#define RUBY_MAX_CHAR_LEN 16
/*
 * String flag bits, each defined in terms of a generic FL_USERn bit.
 * What the rest of this file shows about them:
 *  - STR_PRECOMPUTED_HASH: set by str_store_precomputed_hash() after it has
 *    copied the hash value just past the string's terminator.
 *  - STR_SHARED_ROOT: set on the `shared_str` argument of STR_SET_SHARED().
 *  - STR_BORROWED: set by STR_SET_SHARED() when RBASIC_CLASS(shared_str) is 0.
 *  - STR_TMPLOCK: tested by rb_check_lockedtmp(); part of
 *    STR_UNMODIFIABLE_MASK.
 *  - STR_NOFREE: set by str_new_static(); carried over when a heap buffer
 *    is handed from one string object to another.
 *  - STR_FAKESTR: when set on `str`, STR_SET_SHARED() does nothing.
 */
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Marks str as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE
 * in a single FL_UNSET call. Only the flags change; the caller is
 * responsible for the embedded bytes and the length. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
149str_encindex_fastpath(
int encindex)
153 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_US_ASCII:
163str_enc_fastpath(
VALUE str)
/* Terminator length for str, in bytes: 1 when str_enc_fastpath(str) holds,
 * otherwise rb_enc_mbminlen() of the encoding selected by ENCODING_GET(str).
 * Note that `str` is evaluated more than once. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
177#define RESIZE_CAPA(str,capacity) do {\
178 const int termlen = TERM_LEN(str);\
179 RESIZE_CAPA_TERM(str,capacity,termlen);\
181#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
182 if (STR_EMBED_P(str)) {\
183 if (str_embed_capa(str) < capacity + termlen) {\
184 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
185 const long tlen = RSTRING_LEN(str);\
186 memcpy(tmp, RSTRING_PTR(str), tlen);\
187 RSTRING(str)->as.heap.ptr = tmp;\
188 RSTRING(str)->len = tlen;\
189 STR_SET_NOEMBED(str);\
190 RSTRING(str)->as.heap.aux.capa = (capacity);\
194 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
195 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
196 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
197 RSTRING(str)->as.heap.aux.capa = (capacity);\
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Raw heap buffer pointer of str (the as.heap.ptr field); reads the field
 * directly, without checking whether str is embedded. */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Size of the heap buffer as a size_t: the recorded capacity
 * (as.heap.aux.capa) plus TERM_LEN(str). This is the size passed to
 * ruby_sized_xfree() and SIZED_REALLOC_N() for the buffer. */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Alias for get_encoding(str), the file-local encoding lookup. */
217#define STR_ENC_GET(str) get_encoding(str)
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
230str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
236rb_str_reembeddable_p(
VALUE str)
238 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242rb_str_embed_size(
long capa)
248rb_str_size_as_embedded(
VALUE str)
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
260 real_size =
sizeof(
struct RString);
264 real_size +=
sizeof(st_index_t);
271STR_EMBEDDABLE_P(
long len,
long termlen)
273 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/* Forward declarations of file-local helpers, declared here ahead of their
 * definitions later in the file. */
278static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
279static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
281static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
282static inline void str_modifiable(
VALUE str);
287str_make_independent(
VALUE str)
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str),
len, 0L, termlen);
/* Forward declaration: rb_str_make_independent() below calls this before
 * its definition appears. */
294static inline int str_dependent_p(
VALUE str);
297rb_str_make_independent(
VALUE str)
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
305rb_str_make_embedded(
VALUE str)
310 char *buf =
RSTRING(str)->as.heap.ptr;
314 STR_SET_LEN(str,
len);
317 memcpy(RSTRING_PTR(str), buf,
len);
321 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
325rb_debug_rstring_null_ptr(
const char *func)
327 fprintf(stderr,
"%s is returning NULL!! "
328 "SIGSEGV is highly expected to follow immediately.\n"
329 "If you could reproduce, attach your debugger here, "
330 "and look at the passed string.\n",
/* File-scope VALUE slots for four symbols. NOTE(review): presumably option
 * symbols initialised during setup outside this view -- confirm. */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
338get_encoding(
VALUE str)
344mustnot_broken(
VALUE str)
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
352mustnot_wchar(
VALUE str)
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
/* Forward declaration: register_fstring() is called before its definition
 * further down in this file. */
360static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
368BARE_STRING_P(
VALUE str)
373static inline st_index_t
374str_do_hash(
VALUE str)
376 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 if (e && !is_ascii_string(str)) {
385str_store_precomputed_hash(
VALUE str, st_index_t hash)
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
398 FL_SET(str, STR_PRECOMPUTED_HASH);
411 if (
FL_TEST(str, RSTRING_FSTR))
414 bare = BARE_STRING_P(str);
416 if (STR_EMBED_P(str)) {
421 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 rb_str_resize(str, RSTRING_LEN(str));
430 fstr = register_fstring(str,
false,
false);
433 str_replace_shared_without_enc(str, fstr);
/* The fstring table object. Init_fstring_table() assigns it the result of
 * rb_ractor_safe_set_new() and registers its address with
 * rb_gc_register_address(). rb_fstring_foreach_with_replace() treats a zero
 * value as "no table yet" and does nothing. */
440static VALUE fstring_table_obj;
443fstring_ractor_safe_set_hash(
VALUE str)
445#ifdef PRECOMPUTED_FAKESTR_HASH
449 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
466 const char *aptr, *bptr;
473 return (alen == blen &&
475 memcmp(aptr, bptr, alen) == 0);
480 bool force_precompute_hash;
484fstring_ractor_safe_set_create(
VALUE str,
void *data)
494 long len = RSTRING_LEN(str);
495 long capa =
len +
sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
525 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
528 if (STR_SHARED_P(str)) {
530 str_make_independent(str);
533 if (!BARE_STRING_P(str)) {
539 RBASIC(str)->flags |= RSTRING_FSTR;
552 .hash = fstring_ractor_safe_set_hash,
553 .cmp = fstring_ractor_safe_set_cmp,
554 .create = fstring_ractor_safe_set_create,
558Init_fstring_table(
void)
560 fstring_table_obj = rb_ractor_safe_set_new(&fstring_ractor_safe_set_funcs, 8192);
561 rb_gc_register_address(&fstring_table_obj);
565register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
569 .force_precompute_hash = force_precompute_hash
572#if SIZEOF_VOIDP == SIZEOF_LONG
576 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
580 VALUE result = rb_ractor_safe_set_find_or_insert(&fstring_table_obj, str, &args);
582 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
592rb_obj_is_fstring_table(
VALUE obj)
596 return obj == fstring_table_obj;
600rb_gc_free_fstring(
VALUE obj)
605 rb_ractor_safe_set_delete_by_identity(fstring_table_obj, obj);
607 RB_DEBUG_COUNTER_INC(obj_str_fstr);
613rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
615 if (fstring_table_obj) {
616 rb_ractor_safe_set_foreach_with_replace(fstring_table_obj, callback, data);
621setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
624 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
637 return (
VALUE)fake_str;
646 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
655rb_fstring_new(
const char *ptr,
long len)
658 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
665 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
669rb_fstring_cstr(
const char *
ptr)
671 return rb_fstring_new(
ptr, strlen(
ptr));
675single_byte_optimizable(
VALUE str)
679 case ENCINDEX_ASCII_8BIT:
680 case ENCINDEX_US_ASCII:
702static inline const char *
703search_nonascii(
const char *p,
const char *e)
705 const uintptr_t *s, *t;
707#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
708# if SIZEOF_UINTPTR_T == 8
709# define NONASCII_MASK UINT64_C(0x8080808080808080)
710# elif SIZEOF_UINTPTR_T == 4
711# define NONASCII_MASK UINT32_C(0x80808080)
713# error "don't know what to do."
716# if SIZEOF_UINTPTR_T == 8
717# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
718# elif SIZEOF_UINTPTR_T == 4
719# define NONASCII_MASK 0x80808080UL
721# error "don't know what to do."
725 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
726#if !UNALIGNED_WORD_ACCESS
727 if ((uintptr_t)p % SIZEOF_VOIDP) {
728 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
733 case 7:
if (p[-7]&0x80)
return p-7;
734 case 6:
if (p[-6]&0x80)
return p-6;
735 case 5:
if (p[-5]&0x80)
return p-5;
736 case 4:
if (p[-4]&0x80)
return p-4;
738 case 3:
if (p[-3]&0x80)
return p-3;
739 case 2:
if (p[-2]&0x80)
return p-2;
740 case 1:
if (p[-1]&0x80)
return p-1;
745#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
746#define aligned_ptr(value) \
747 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#define aligned_ptr(value) (uintptr_t *)(value)
752 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
755 if (*s & NONASCII_MASK) {
756#ifdef WORDS_BIGENDIAN
757 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
769 case 7:
if (e[-7]&0x80)
return e-7;
770 case 6:
if (e[-6]&0x80)
return e-6;
771 case 5:
if (e[-5]&0x80)
return e-5;
772 case 4:
if (e[-4]&0x80)
return e-4;
774 case 3:
if (e[-3]&0x80)
return e-3;
775 case 2:
if (e[-2]&0x80)
return e-2;
776 case 1:
if (e[-1]&0x80)
return e-1;
784 const char *e = p +
len;
786 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 p = search_nonascii(p, e);
792 if (rb_enc_asciicompat(enc)) {
793 p = search_nonascii(p, e);
796 int ret = rb_enc_precise_mbclen(p, e, enc);
800 p = search_nonascii(p, e);
806 int ret = rb_enc_precise_mbclen(p, e, enc);
822 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
825 p = search_nonascii(p, e);
829 else if (rb_enc_asciicompat(enc)) {
830 p = search_nonascii(p, e);
836 int ret = rb_enc_precise_mbclen(p, e, enc);
843 p = search_nonascii(p, e);
849 int ret = rb_enc_precise_mbclen(p, e, enc);
874 rb_enc_set_index(str1, rb_enc_get_index(str2));
882rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
887 str_enc_copy(dest, src);
888 if (RSTRING_LEN(dest) == 0) {
889 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
901 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
912rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
914 str_enc_copy(dest, src);
921 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
927 return enc_coderange_scan(str, enc);
936 cr = enc_coderange_scan(str, get_encoding(str));
943rb_enc_str_asciicompat(
VALUE str)
946 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
954 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
963str_mod_check(
VALUE s,
const char *p,
long len)
965 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
971str_capacity(
VALUE str,
const int termlen)
973 if (STR_EMBED_P(str)) {
974 return str_embed_capa(str) - termlen;
976 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
980 return RSTRING(str)->as.heap.aux.capa;
987 return str_capacity(str, TERM_LEN(str));
991must_not_null(
const char *
ptr)
994 rb_raise(rb_eArgError,
"NULL pointer given");
1001 size_t size = rb_str_embed_size(
capa);
1005 NEWOBJ_OF(str,
struct RString, klass,
1012str_alloc_heap(
VALUE klass)
1014 NEWOBJ_OF(str,
struct RString, klass,
1021empty_str_alloc(
VALUE klass)
1023 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1024 VALUE str = str_alloc_embed(klass, 0);
1025 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1036 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1040 enc = rb_ascii8bit_encoding();
1043 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1045 int termlen = rb_enc_mbminlen(enc);
1047 if (STR_EMBEDDABLE_P(
len, termlen)) {
1048 str = str_alloc_embed(klass,
len + termlen);
1054 str = str_alloc_heap(klass);
1060 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1063 rb_enc_raw_set(str, enc);
1066 memcpy(RSTRING_PTR(str),
ptr,
len);
1069 STR_SET_LEN(str,
len);
1070 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1077 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1112 __msan_unpoison_string(
ptr);
1132 if (rb_enc_mbminlen(enc) != 1) {
1133 rb_raise(rb_eArgError,
"wchar encoding given");
1135 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1139str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1144 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1148 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1151 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1152 str = str_alloc_heap(klass);
1156 RBASIC(str)->flags |= STR_NOFREE;
1157 rb_enc_associate_index(str, encindex);
1186static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1188 int ecflags,
VALUE ecopts);
1193 int encidx = rb_enc_to_index(enc);
1194 if (rb_enc_get_index(str) == encidx)
1195 return is_ascii_string(str);
1206 if (!to)
return str;
1207 if (!from) from = rb_enc_get(str);
1208 if (from == to)
return str;
1209 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1210 rb_is_ascii8bit_enc(to)) {
1211 if (STR_ENC_GET(str) != to) {
1213 rb_enc_associate(str, to);
1220 from, to, ecflags, ecopts);
1221 if (
NIL_P(newstr)) {
1229rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1234 olen = RSTRING_LEN(newstr);
1235 if (ofs < -olen || olen < ofs)
1237 if (ofs < 0) ofs += olen;
1239 STR_SET_LEN(newstr, ofs);
1243 rb_str_modify(newstr);
1244 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1252 STR_SET_LEN(str, 0);
1253 rb_enc_associate(str, enc);
1259str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1261 int ecflags,
VALUE ecopts)
1266 VALUE econv_wrapper;
1267 const unsigned char *start, *sp;
1268 unsigned char *dest, *dp;
1269 size_t converted_output = (size_t)ofs;
1274 RBASIC_CLEAR_CLASS(econv_wrapper);
1276 if (!ec)
return Qnil;
1279 sp = (
unsigned char*)
ptr;
1281 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1282 (dp = dest + converted_output),
1286 size_t converted_input = sp - start;
1287 size_t rest =
len - converted_input;
1288 converted_output = dp - dest;
1290 if (converted_input && converted_output &&
1291 rest < (LONG_MAX / converted_output)) {
1292 rest = (rest * converted_output) / converted_input;
1297 olen += rest < 2 ? 2 : rest;
1298 rb_str_resize(newstr, olen);
1305 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1307 rb_enc_associate(newstr, to);
1326 const int eidx = rb_enc_to_index(eenc);
1329 return rb_enc_str_new(
ptr,
len, eenc);
1333 if ((eidx == rb_ascii8bit_encindex()) ||
1334 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1338 ienc = rb_default_internal_encoding();
1339 if (!ienc || eenc == ienc) {
1340 return rb_enc_str_new(
ptr,
len, eenc);
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex()) ||
1346 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1347 return rb_enc_str_new(
ptr,
len, ienc);
1350 str = rb_enc_str_new(NULL, 0, ienc);
1353 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1354 rb_str_initialize(str,
ptr,
len, eenc);
1362 int eidx = rb_enc_to_index(eenc);
1363 if (eidx == rb_usascii_encindex() &&
1364 !is_ascii_string(str)) {
1365 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1368 rb_enc_associate_index(str, eidx);
1427str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1429 const int termlen = TERM_LEN(str);
1434 if (str_embed_capa(str2) >=
len + termlen) {
1435 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1436 STR_SET_EMBED(str2);
1437 memcpy(ptr2, RSTRING_PTR(str),
len);
1438 TERM_FILL(ptr2+
len, termlen);
1442 if (STR_SHARED_P(str)) {
1443 root =
RSTRING(str)->as.heap.aux.shared;
1452 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1454 rb_fatal(
"about to free a possible shared root");
1456 char *ptr2 = STR_HEAP_PTR(str2);
1458 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1461 FL_SET(str2, STR_NOEMBED);
1463 STR_SET_SHARED(str2, root);
1466 STR_SET_LEN(str2,
len);
1474 str_replace_shared_without_enc(str2, str);
1475 rb_enc_cr_str_exact_copy(str2, str);
1482 return str_replace_shared(str_alloc_heap(klass), str);
1499rb_str_new_frozen_String(
VALUE orig)
1507rb_str_frozen_bare_string(
VALUE orig)
1509 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1514rb_str_tmp_frozen_acquire(
VALUE orig)
1517 return str_new_frozen_buffer(0, orig, FALSE);
1521rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1523 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1524 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1526 VALUE str = str_alloc_heap(0);
1529 FL_SET(str, STR_SHARED_ROOT);
1531 size_t capa = str_capacity(orig, TERM_LEN(orig));
1537 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1538 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1545 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1546 RBASIC(orig)->flags &= ~STR_NOFREE;
1547 STR_SET_SHARED(orig, str);
1557rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1562 if (STR_EMBED_P(tmp)) {
1565 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1571 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1575 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1576 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1581 STR_SET_LEN(tmp, 0);
1589 return str_new_frozen_buffer(klass, orig, TRUE);
1598 VALUE str = str_alloc_heap(klass);
1599 STR_SET_LEN(str, RSTRING_LEN(orig));
1600 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1601 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1602 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1603 RBASIC(orig)->flags &= ~STR_NOFREE;
1604 STR_SET_SHARED(orig, str);
1611str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1615 long len = RSTRING_LEN(orig);
1616 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1617 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1619 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1620 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1626 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1627 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1633 if ((ofs > 0) || (rest > 0) ||
1636 str = str_new_shared(klass,
shared);
1638 RSTRING(str)->as.heap.ptr += ofs;
1639 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1647 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1648 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1650 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1651 STR_SET_LEN(str, RSTRING_LEN(orig));
1656 str = heap_str_make_shared(klass, orig);
1660 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1672str_new_empty_String(
VALUE str)
1675 rb_enc_copy(v, str);
/* Floor for requested buffer capacities: a `capa` below this value is
 * raised to it where the constant is used later in this file. */
1679#define STR_BUF_MIN_SIZE 63
1684 if (STR_EMBEDDABLE_P(
capa, 1)) {
1692 RSTRING(str)->as.heap.ptr[0] =
'\0';
1712 return str_new(0, 0,
len);
1718 if (STR_EMBED_P(str)) {
1719 RB_DEBUG_COUNTER_INC(obj_str_embed);
1721 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1722 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1726 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1727 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1732rb_str_memsize(
VALUE str)
1734 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1735 return STR_HEAP_SIZE(str);
1745 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
/* Forward declarations of two file-local helpers, declared ahead of the
 * code below that uses them. */
1748static inline void str_discard(
VALUE str);
1749static void str_shared_replace(
VALUE str,
VALUE str2);
1754 if (str != str2) str_shared_replace(str, str2);
1765 enc = STR_ENC_GET(str2);
1768 termlen = rb_enc_mbminlen(enc);
1770 STR_SET_LEN(str, RSTRING_LEN(str2));
1772 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1774 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1775 rb_enc_associate(str, enc);
1779 if (STR_EMBED_P(str2)) {
1781 long len = RSTRING_LEN(str2);
1784 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1785 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1786 RSTRING(str2)->as.heap.ptr = new_ptr;
1787 STR_SET_LEN(str2,
len);
1789 STR_SET_NOEMBED(str2);
1792 STR_SET_NOEMBED(str);
1794 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1796 if (
FL_TEST(str2, STR_SHARED)) {
1798 STR_SET_SHARED(str,
shared);
1801 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1805 STR_SET_EMBED(str2);
1806 RSTRING_PTR(str2)[0] = 0;
1807 STR_SET_LEN(str2, 0);
1808 rb_enc_associate(str, enc);
1822 return rb_obj_as_string_result(str, obj);
1838 len = RSTRING_LEN(str2);
1839 if (STR_SHARED_P(str2)) {
1842 STR_SET_NOEMBED(str);
1843 STR_SET_LEN(str,
len);
1844 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1845 STR_SET_SHARED(str,
shared);
1846 rb_enc_cr_str_exact_copy(str, str2);
1849 str_replace_shared(str, str2);
1858 size_t size = rb_str_embed_size(
capa);
1862 NEWOBJ_OF(str,
struct RString, klass,
1871 NEWOBJ_OF(str,
struct RString, klass,
1882 encidx = rb_enc_get_index(str);
1883 flags &= ~ENCODING_MASK;
1886 if (encidx) rb_enc_associate_index(dup, encidx);
1896 long len = RSTRING_LEN(str);
1901 STR_SET_LEN(dup, RSTRING_LEN(str));
1902 return str_duplicate_setup_encoding(str, dup, flags);
1911 root =
RSTRING(str)->as.heap.aux.shared;
1914 root = str = str_new_frozen(klass, str);
1920 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1921 FL_SET(root, STR_SHARED_ROOT);
1923 flags |= RSTRING_NOEMBED | STR_SHARED;
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1932 if (STR_EMBED_P(str)) {
1933 return str_duplicate_setup_embed(klass, str, dup);
1936 return str_duplicate_setup_heap(klass, str, dup);
1944 if (STR_EMBED_P(str)) {
1945 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1948 dup = str_alloc_heap(klass);
1951 return str_duplicate_setup(klass, str, dup);
1962rb_str_dup_m(
VALUE str)
1964 if (LIKELY(BARE_STRING_P(str))) {
1975 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1982 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1986 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 str_duplicate_setup_embed(klass, str, new_str);
1990 new_str = ec_str_alloc_heap(ec, klass);
1991 str_duplicate_setup_heap(klass, str, new_str);
2000rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2002 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2026 static ID keyword_ids[2];
2027 VALUE orig, opt, venc, vcapa;
2032 if (!keyword_ids[0]) {
2033 keyword_ids[0] = rb_id_encoding();
2034 CONST_ID(keyword_ids[1],
"capacity");
2042 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2043 enc = rb_to_encoding(venc);
2045 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2048 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2050 if (
capa < STR_BUF_MIN_SIZE) {
2051 capa = STR_BUF_MIN_SIZE;
2055 len = RSTRING_LEN(orig);
2059 if (orig == str) n = 0;
2061 str_modifiable(str);
2062 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2064 const size_t size = (size_t)
capa + termlen;
2065 const char *
const old_ptr = RSTRING_PTR(str);
2066 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2067 char *new_ptr =
ALLOC_N(
char, size);
2068 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2069 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2071 RSTRING(str)->as.heap.ptr = new_ptr;
2073 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2074 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2075 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2077 STR_SET_LEN(str,
len);
2080 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2081 rb_enc_cr_str_exact_copy(str, orig);
2083 FL_SET(str, STR_NOEMBED);
2090 rb_enc_associate(str, enc);
2102rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2108 static ID keyword_ids[2];
2118 keyword_ids[0] = rb_id_encoding();
2119 CONST_ID(keyword_ids[1],
"capacity");
2121 encoding = kwargs[0];
2122 capacity = kwargs[1];
2131 if (UNDEF_P(encoding)) {
2133 encoding = rb_obj_encoding(orig);
2137 if (!UNDEF_P(encoding)) {
2138 enc = rb_to_encoding(encoding);
2142 if (UNDEF_P(capacity)) {
2144 VALUE empty_str = str_new(klass,
"", 0);
2146 rb_enc_associate(empty_str, enc);
2150 VALUE copy = str_duplicate(klass, orig);
2151 rb_enc_associate(copy, enc);
2164 if (orig_capa >
capa) {
2169 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2170 STR_SET_LEN(str, 0);
/* Non-zero unless the top two bits of c are `10` (i.e. unless c has the
 * form 10xxxxxx). enc_strlen() counts one character for each byte that
 * passes this test. */
2181#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2196static inline uintptr_t
2197count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2202 d = (d>>6) | (~d>>7);
2203 d &= NONASCII_MASK >> 7;
2206#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2208 return rb_popcount_intptr(d);
2212# if SIZEOF_VOIDP == 8
2221enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2227 long diff = (long)(e - p);
2228 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2233 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2234 const uintptr_t *s, *t;
2235 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2236 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2237 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2238 while (p < (
const char *)s) {
2239 if (is_utf8_lead_byte(*p))
len++;
2243 len += count_utf8_lead_bytes_with_word(s);
2246 p = (
const char *)s;
2249 if (is_utf8_lead_byte(*p))
len++;
2255 else if (rb_enc_asciicompat(enc)) {
2260 q = search_nonascii(p, e);
2266 p += rb_enc_fast_mbclen(p, e, enc);
2273 q = search_nonascii(p, e);
2279 p += rb_enc_mbclen(p, e, enc);
2286 for (c=0; p<e; c++) {
2287 p += rb_enc_mbclen(p, e, enc);
2302rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2310 long diff = (long)(e - p);
2311 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2313 else if (rb_enc_asciicompat(enc)) {
2317 q = search_nonascii(p, e);
2325 ret = rb_enc_precise_mbclen(p, e, enc);
2340 for (c=0; p<e; c++) {
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2348 if (p + rb_enc_mbminlen(enc) <= e)
2349 p += rb_enc_mbminlen(enc);
2365 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2366 if (!enc) enc = STR_ENC_GET(str);
2367 p = RSTRING_PTR(str);
2372 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2377 return enc_strlen(p, e, enc, cr);
2384 return str_strlen(str, NULL);
2398 return LONG2NUM(str_strlen(str, NULL));
2410rb_str_bytesize(
VALUE str)
2428rb_str_empty(
VALUE str)
2430 return RBOOL(RSTRING_LEN(str) == 0);
2449 char *ptr1, *ptr2, *ptr3;
2454 enc = rb_enc_check_str(str1, str2);
2457 termlen = rb_enc_mbminlen(enc);
2458 if (len1 > LONG_MAX - len2) {
2459 rb_raise(rb_eArgError,
"string size too big");
2461 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2462 ptr3 = RSTRING_PTR(str3);
2463 memcpy(ptr3, ptr1, len1);
2464 memcpy(ptr3+len1, ptr2, len2);
2465 TERM_FILL(&ptr3[len1+len2], termlen);
2481 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2484 int enc1 = rb_enc_get_index(str1);
2485 int enc2 = rb_enc_get_index(str2);
2490 else if (enc2 < 0) {
2493 else if (enc1 != enc2) {
2496 else if (len1 > LONG_MAX - len2) {
2530 rb_enc_copy(str2, str);
2535 rb_raise(rb_eArgError,
"negative argument");
2537 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2538 if (STR_EMBEDDABLE_P(
len, 1)) {
2540 memset(RSTRING_PTR(str2), 0,
len + 1);
2547 STR_SET_LEN(str2,
len);
2548 rb_enc_copy(str2, str);
2551 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2552 rb_raise(rb_eArgError,
"argument too big");
2555 len *= RSTRING_LEN(str);
2556 termlen = TERM_LEN(str);
2558 ptr2 = RSTRING_PTR(str2);
2560 n = RSTRING_LEN(str);
2561 memcpy(ptr2, RSTRING_PTR(str), n);
2562 while (n <=
len/2) {
2563 memcpy(ptr2 + n, ptr2, n);
2566 memcpy(ptr2 + n, ptr2,
len-n);
2568 STR_SET_LEN(str2,
len);
2569 TERM_FILL(&ptr2[
len], termlen);
2570 rb_enc_cr_str_copy_for_substr(str2, str);
2607rb_check_lockedtmp(
VALUE str)
2609 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags that make str_modifiable() leave its fast path: if any is set it
 * goes on to the chilled-string handling, rb_check_lockedtmp() and
 * rb_check_frozen(). */
2616#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2618str_modifiable(
VALUE str)
2622 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2623 if (CHILLED_STRING_P(str)) {
2624 CHILLED_STRING_MUTATED(str);
2626 rb_check_lockedtmp(str);
2627 rb_check_frozen(str);
2632str_dependent_p(
VALUE str)
2634 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_UNMODIFIABLE_MASK plus STR_SHARED and STR_NOFREE. str_independent()
 * tests these flags first; only when one is set does it call
 * str_modifiable() and str_dependent_p(). */
2644#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2646str_independent(
VALUE str)
2650 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2651 str_modifiable(str);
2652 return !str_dependent_p(str);
2658str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2668 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2673 STR_SET_LEN(str,
len);
2678 oldptr = RSTRING_PTR(str);
2680 memcpy(
ptr, oldptr,
len);
2682 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2685 STR_SET_NOEMBED(str);
2686 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2687 TERM_FILL(
ptr +
len, termlen);
2689 STR_SET_LEN(str,
len);
2696 if (!str_independent(str))
2697 str_make_independent(str);
2706 int termlen = TERM_LEN(str);
2707 long len = RSTRING_LEN(str);
2710 rb_raise(rb_eArgError,
"negative expanding string size");
2712 if (expand >= LONG_MAX -
len) {
2713 rb_raise(rb_eArgError,
"string size too big");
2716 if (!str_independent(str)) {
2717 str_make_independent_expand(str,
len, expand, termlen);
2719 else if (expand > 0) {
2720 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2727str_modify_keep_cr(
VALUE str)
2729 if (!str_independent(str))
2730 str_make_independent(str);
2737str_discard(
VALUE str)
2739 str_modifiable(str);
2740 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2741 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2742 RSTRING(str)->as.heap.ptr = 0;
2743 STR_SET_LEN(str, 0);
2750 int encindex = rb_enc_get_index(str);
2752 if (RB_UNLIKELY(encindex == -1)) {
2756 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2761 if (!rb_enc_asciicompat(enc)) {
2783 return RSTRING_PTR(str);
2787zero_filled(
const char *s,
int n)
2789 for (; n > 0; --n) {
2796str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2798 const char *e = s +
len;
2800 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2801 if (zero_filled(s, minlen))
return s;
2807str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2812 if (str_dependent_p(str)) {
2813 if (!zero_filled(s +
len, termlen))
2814 str_make_independent_expand(str,
len, 0L, termlen);
2817 TERM_FILL(s +
len, termlen);
2820 return RSTRING_PTR(str);
2824rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2826 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2827 long len = RSTRING_LEN(str);
2831 rb_check_lockedtmp(str);
2832 str_make_independent_expand(str,
len, 0L, termlen);
2834 else if (str_dependent_p(str)) {
2835 if (termlen > oldtermlen)
2836 str_make_independent_expand(str,
len, 0L, termlen);
2839 if (!STR_EMBED_P(str)) {
2844 if (termlen > oldtermlen) {
2845 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2853str_null_check(
VALUE str,
int *w)
2855 char *s = RSTRING_PTR(str);
2856 long len = RSTRING_LEN(str);
2858 const int minlen = rb_enc_mbminlen(enc);
2862 if (str_null_char(s,
len, minlen, enc)) {
2865 return str_fill_term(str, s,
len, minlen);
2868 if (!s || memchr(s, 0,
len)) {
2872 s = str_fill_term(str, s,
len, minlen);
2878rb_str_to_cstr(
VALUE str)
2881 return str_null_check(str, &w);
2889 char *s = str_null_check(str, &w);
2892 rb_raise(rb_eArgError,
"string contains null char");
2894 rb_raise(rb_eArgError,
"string contains null byte");
2900rb_str_fill_terminator(
VALUE str,
const int newminlen)
2902 char *s = RSTRING_PTR(str);
2903 long len = RSTRING_LEN(str);
2904 return str_fill_term(str, s,
len, newminlen);
2910 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2936str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2945 else if (rb_enc_asciicompat(enc)) {
2946 const char *p2, *e2;
2949 while (p < e && 0 < nth) {
2956 p2 = search_nonascii(p, e2);
2965 n = rb_enc_mbclen(p, e, enc);
2976 while (p < e && nth--) {
2977 p += rb_enc_mbclen(p, e, enc);
2988 return str_nth_len(p, e, &nth, enc);
2992str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2997 p = str_nth_len(p, e, &nth, enc);
3006str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3008 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3009 if (!pp)
return e - p;
3016 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3017 STR_ENC_GET(str), single_byte_optimizable(str));
3022str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3025 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3026 const uintptr_t *s, *t;
3027 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3028 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3029 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3030 while (p < (
const char *)s) {
3031 if (is_utf8_lead_byte(*p)) nth--;
3035 nth -= count_utf8_lead_bytes_with_word(s);
3037 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3041 if (is_utf8_lead_byte(*p)) {
3042 if (nth == 0)
break;
3052str_utf8_offset(
const char *p,
const char *e,
long nth)
3054 const char *pp = str_utf8_nth(p, e, &nth);
3063 if (single_byte_optimizable(str) || pos < 0)
3066 char *p = RSTRING_PTR(str);
3067 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3072str_subseq(
VALUE str,
long beg,
long len)
3080 const int termlen = TERM_LEN(str);
3081 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3088 if (str_embed_capa(str2) >=
len + termlen) {
3089 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3090 STR_SET_EMBED(str2);
3091 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3092 TERM_FILL(ptr2+
len, termlen);
3094 STR_SET_LEN(str2,
len);
3098 str_replace_shared(str2, str);
3101 RSTRING(str2)->as.heap.ptr += beg;
3102 if (RSTRING_LEN(str2) >
len) {
3103 STR_SET_LEN(str2,
len);
3113 VALUE str2 = str_subseq(str, beg,
len);
3114 rb_enc_cr_str_copy_for_substr(str2, str);
3123 const long blen = RSTRING_LEN(str);
3125 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3127 if (
len < 0)
return 0;
3128 if (beg < 0 && -beg < 0)
return 0;
3132 if (single_byte_optimizable(str)) {
3133 if (beg > blen)
return 0;
3136 if (beg < 0)
return 0;
3138 if (
len > blen - beg)
3140 if (
len < 0)
return 0;
3145 if (
len > -beg)
len = -beg;
3149 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3152 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3158 slen = str_strlen(str, enc);
3160 if (beg < 0)
return 0;
3162 if (
len == 0)
goto end;
3165 else if (beg > 0 && beg > blen) {
3169 if (beg > str_strlen(str, enc))
return 0;
3174 enc == rb_utf8_encoding()) {
3175 p = str_utf8_nth(s, e, &beg);
3176 if (beg > 0)
return 0;
3177 len = str_utf8_offset(p, e,
len);
3183 p = s + beg * char_sz;
3187 else if (
len * char_sz > e - p)
3192 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3193 if (beg > 0)
return 0;
3197 len = str_offset(p, e,
len, enc, 0);
3205static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3210 return str_substr(str, beg,
len, TRUE);
3220str_substr(
VALUE str,
long beg,
long len,
int empty)
3224 if (!p)
return Qnil;
3225 if (!
len && !empty)
return Qnil;
3227 beg = p - RSTRING_PTR(str);
3229 VALUE str2 = str_subseq(str, beg,
len);
3230 rb_enc_cr_str_copy_for_substr(str2, str);
3238 if (CHILLED_STRING_P(str)) {
3243 rb_str_resize(str, RSTRING_LEN(str));
3261 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3304str_uminus(
VALUE str)
3309 return rb_fstring(str);
3313#define rb_str_dup_frozen rb_str_new_frozen
3318 rb_check_frozen(str);
3319 if (
FL_TEST(str, STR_TMPLOCK)) {
3322 FL_SET(str, STR_TMPLOCK);
3329 rb_check_frozen(str);
3330 if (!
FL_TEST(str, STR_TMPLOCK)) {
3350 const int termlen = TERM_LEN(str);
3352 str_modifiable(str);
3353 if (STR_SHARED_P(str)) {
3356 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3357 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3368 else if (
len > RSTRING_LEN(str)) {
3372 const char *
const new_end = RSTRING_PTR(str) +
len;
3382 else if (
len < RSTRING_LEN(str)) {
3390 STR_SET_LEN(str,
len);
3391 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3398 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3401 int independent = str_independent(str);
3402 long slen = RSTRING_LEN(str);
3403 const int termlen = TERM_LEN(str);
3405 if (slen >
len || (termlen != 1 && slen <
len)) {
3411 if (STR_EMBED_P(str)) {
3412 if (
len == slen)
return str;
3413 if (str_embed_capa(str) >=
len + termlen) {
3414 STR_SET_LEN(str,
len);
3418 str_make_independent_expand(str, slen,
len - slen, termlen);
3420 else if (str_embed_capa(str) >=
len + termlen) {
3421 char *
ptr = STR_HEAP_PTR(str);
3423 if (slen >
len) slen =
len;
3426 STR_SET_LEN(str,
len);
3427 if (independent) ruby_xfree(
ptr);
3430 else if (!independent) {
3431 if (
len == slen)
return str;
3432 str_make_independent_expand(str, slen,
len - slen, termlen);
3436 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3437 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3440 else if (
len == slen)
return str;
3441 STR_SET_LEN(str,
len);
3448str_ensure_available_capa(
VALUE str,
long len)
3450 str_modify_keep_cr(str);
3452 const int termlen = TERM_LEN(str);
3453 long olen = RSTRING_LEN(str);
3455 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3456 rb_raise(rb_eArgError,
"string sizes too big");
3459 long total = olen +
len;
3460 long capa = str_capacity(str, termlen);
3463 if (total >= LONG_MAX / 2) {
3466 while (total >
capa) {
3469 RESIZE_CAPA_TERM(str,
capa, termlen);
3474str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3477 str_modify_keep_cr(str);
3482 if (
len == 0)
return 0;
3484 long total, olen,
off = -1;
3486 const int termlen = TERM_LEN(str);
3489 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3493 long capa = str_capacity(str, termlen);
3495 if (olen > LONG_MAX -
len) {
3496 rb_raise(rb_eArgError,
"string sizes too big");
3500 if (total >= LONG_MAX / 2) {
3503 while (total >
capa) {
3506 RESIZE_CAPA_TERM(str,
capa, termlen);
3507 sptr = RSTRING_PTR(str);
3512 memcpy(sptr + olen,
ptr,
len);
3513 STR_SET_LEN(str, total);
3514 TERM_FILL(sptr + total, termlen);
3519#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3520#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3525 if (
len == 0)
return str;
3527 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3529 return str_buf_cat(str,
ptr,
len);
3540rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3545 if (UNLIKELY(!str_independent(str))) {
3546 str_make_independent(str);
3549 long string_length = -1;
3550 const int null_terminator_length = 1;
3555 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3556 rb_raise(rb_eArgError,
"string sizes too big");
3559 long string_capacity = str_capacity(str, null_terminator_length);
3565 if (LIKELY(string_capacity >= string_length + 1)) {
3567 sptr[string_length] = byte;
3568 STR_SET_LEN(str, string_length + 1);
3569 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3573 str_buf_cat(str, (
char *)&
byte, 1);
3589 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3600rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3601 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3610 if (str_encindex == ptr_encindex) {
3612 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3616 str_enc = rb_enc_from_index(str_encindex);
3617 ptr_enc = rb_enc_from_index(ptr_encindex);
3618 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3621 if (RSTRING_LEN(str) == 0) {
3624 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3630 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3639 *ptr_cr_ret = ptr_cr;
3641 if (str_encindex != ptr_encindex &&
3644 str_enc = rb_enc_from_index(str_encindex);
3645 ptr_enc = rb_enc_from_index(ptr_encindex);
3650 res_encindex = str_encindex;
3655 res_encindex = str_encindex;
3659 res_encindex = ptr_encindex;
3664 res_encindex = str_encindex;
3671 res_encindex = str_encindex;
3677 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3679 str_buf_cat(str,
ptr,
len);
3685 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3692 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3702 if (rb_enc_asciicompat(enc)) {
3703 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3709 unsigned int c = (
unsigned char)*
ptr;
3710 int len = rb_enc_codelen(c, enc);
3711 rb_enc_mbcput(c, buf, enc);
3712 rb_enc_cr_str_buf_cat(str, buf,
len,
3725 if (str_enc_fastpath(str)) {
3729 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3735 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3746 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3762rb_str_concat_literals(
size_t num,
const VALUE *strary)
3766 unsigned long len = 1;
3771 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3773 str_enc_copy_direct(str, strary[0]);
3775 for (i = s; i < num; ++i) {
3776 const VALUE v = strary[i];
3780 if (encidx != ENCINDEX_US_ASCII) {
3782 rb_enc_set_index(str, encidx);
3807rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3809 str_modifiable(str);
3814 else if (argc > 1) {
3817 rb_enc_copy(arg_str, str);
3818 for (i = 0; i < argc; i++) {
3853rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3855 long needed_capacity = 0;
3859 for (
int index = 0; index < argc; index++) {
3860 VALUE obj = argv[index];
3868 needed_capacity += RSTRING_LEN(obj);
3873 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3880 str_ensure_available_capa(str, needed_capacity);
3883 for (
int index = 0; index < argc; index++) {
3884 VALUE obj = argv[index];
3889 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3890 char byte = (char)(
NUM2INT(obj) & 0xFF);
3904 rb_bug(
"append_as_bytes arguments should have been validated");
3908 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3909 TERM_FILL(sptr, TERM_LEN(str));
3914 for (
int index = 0; index < argc; index++) {
3915 VALUE obj = argv[index];
3932 rb_bug(
"append_as_bytes arguments should have been validated");
4011 if (rb_num_to_uint(str2, &code) == 0) {
4024 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4027 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4030 long pos = RSTRING_LEN(str1);
4035 switch (
len = rb_enc_codelen(code, enc)) {
4036 case ONIGERR_INVALID_CODE_POINT_VALUE:
4037 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4039 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4045 rb_enc_mbcput(code, buf, enc);
4046 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4047 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4049 rb_str_resize(str1, pos+
len);
4050 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4063rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4065 int encidx = rb_enc_to_index(enc);
4067 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4072 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4073 return ENCINDEX_ASCII_8BIT;
4096rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4098 str_modifiable(str);
4103 else if (argc > 1) {
4106 rb_enc_copy(arg_str, str);
4107 for (i = 0; i < argc; i++) {
4120 st_index_t precomputed_hash;
4121 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4123 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4124 return precomputed_hash;
4127 return str_do_hash(str);
4134 const char *ptr1, *ptr2;
4137 return (len1 != len2 ||
4139 memcmp(ptr1, ptr2, len1) != 0);
4153rb_str_hash_m(
VALUE str)
4159#define lesser(a,b) (((a)>(b))?(b):(a))
4167 if (RSTRING_LEN(str1) == 0)
return TRUE;
4168 if (RSTRING_LEN(str2) == 0)
return TRUE;
4171 if (idx1 == idx2)
return TRUE;
4176 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4180 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4190 const char *ptr1, *ptr2;
4193 if (str1 == str2)
return 0;
4196 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4205 if (len1 > len2)
return 1;
4208 if (retval > 0)
return 1;
4242 if (str1 == str2)
return Qtrue;
4249 return rb_str_eql_internal(str1, str2);
4273 if (str1 == str2)
return Qtrue;
4275 return rb_str_eql_internal(str1, str2);
4307 return rb_invcmp(str1, str2);
4349 return str_casecmp(str1, s);
4357 const char *p1, *p1end, *p2, *p2end;
4359 enc = rb_enc_compatible(str1, str2);
4364 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4365 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4366 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4367 while (p1 < p1end && p2 < p2end) {
4369 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4370 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4372 return INT2FIX(c1 < c2 ? -1 : 1);
4379 while (p1 < p1end && p2 < p2end) {
4380 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4381 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4383 if (0 <= c1 && 0 <= c2) {
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4391 l1 = rb_enc_mbclen(p1, p1end, enc);
4392 l2 = rb_enc_mbclen(p2, p2end, enc);
4393 len = l1 < l2 ? l1 : l2;
4394 r = memcmp(p1, p2,
len);
4396 return INT2FIX(r < 0 ? -1 : 1);
4398 return INT2FIX(l1 < l2 ? -1 : 1);
4404 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4405 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4439 return str_casecmp_p(str1, s);
4446 VALUE folded_str1, folded_str2;
4447 VALUE fold_opt = sym_fold;
4449 enc = rb_enc_compatible(str1, str2);
4454 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4455 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4457 return rb_str_eql(folded_str1, folded_str2);
4461strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4462 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4464 const char *search_start = str_ptr;
4465 long pos, search_len = str_len - offset;
4469 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4470 if (pos < 0)
return pos;
4472 if (t == search_start + pos)
break;
4473 search_len -= t - search_start;
4474 if (search_len <= 0)
return -1;
4475 offset += t - search_start;
4478 return pos + offset;
4482#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4483#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4486rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4488 const char *str_ptr, *str_ptr_end, *sub_ptr;
4489 long str_len, sub_len;
4492 enc = rb_enc_check(str, sub);
4493 if (is_broken_string(sub))
return -1;
4495 str_ptr = RSTRING_PTR(str);
4497 str_len = RSTRING_LEN(str);
4498 sub_ptr = RSTRING_PTR(sub);
4499 sub_len = RSTRING_LEN(sub);
4501 if (str_len < sub_len)
return -1;
4504 long str_len_char, sub_len_char;
4505 int single_byte = single_byte_optimizable(str);
4506 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4507 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4509 offset += str_len_char;
4510 if (offset < 0)
return -1;
4512 if (str_len_char - offset < sub_len_char)
return -1;
4513 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4516 if (sub_len == 0)
return offset;
4519 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4533rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4540 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4541 long slen = str_strlen(str, enc);
4543 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4555 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4556 enc, single_byte_optimizable(str));
4567 pos = rb_str_index(str, sub, pos);
4581str_ensure_byte_pos(
VALUE str,
long pos)
4583 if (!single_byte_optimizable(str)) {
4584 const char *s = RSTRING_PTR(str);
4586 const char *p = s + pos;
4587 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4589 "offset %ld does not land on character boundary", pos);
4662rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4668 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4669 long slen = RSTRING_LEN(str);
4671 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4682 str_ensure_byte_pos(str, pos);
4694 pos = rb_str_byteindex(str, sub, pos);
4695 if (pos >= 0)
return LONG2NUM(pos);
4702memrchr(
const char *search_str,
int chr,
long search_len)
4704 const char *ptr = search_str + search_len;
4705 while (ptr > search_str) {
4706 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4716 char *hit, *adjusted;
4718 long slen, searchlen;
4721 sbeg = RSTRING_PTR(str);
4722 slen = RSTRING_LEN(sub);
4723 if (slen == 0)
return s - sbeg;
4725 t = RSTRING_PTR(sub);
4727 searchlen = s - sbeg + 1;
4729 if (memcmp(s, t, slen) == 0) {
4734 hit = memrchr(sbeg, c, searchlen);
4737 if (hit != adjusted) {
4738 searchlen = adjusted - sbeg;
4741 if (memcmp(hit, t, slen) == 0)
4743 searchlen = adjusted - sbeg;
4744 }
while (searchlen > 0);
4758 enc = rb_enc_check(str, sub);
4759 if (is_broken_string(sub))
return -1;
4760 singlebyte = single_byte_optimizable(str);
4761 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4762 slen = str_strlen(sub, enc);
4765 if (
len < slen)
return -1;
4766 if (
len - pos < slen) pos =
len - slen;
4767 if (
len == 0)
return pos;
4769 sbeg = RSTRING_PTR(str);
4772 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4778 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4779 return str_rindex(str, sub, s, enc);
4840rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4845 long pos,
len = str_strlen(str, enc);
4847 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4849 if (pos < 0 && (pos +=
len) < 0) {
4855 if (pos >
len) pos =
len;
4863 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4864 enc, single_byte_optimizable(str));
4875 pos = rb_str_rindex(str, sub, pos);
4885rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4891 enc = rb_enc_check(str, sub);
4892 if (is_broken_string(sub))
return -1;
4893 len = RSTRING_LEN(str);
4894 slen = RSTRING_LEN(sub);
4897 if (
len < slen)
return -1;
4898 if (
len - pos < slen) pos =
len - slen;
4899 if (
len == 0)
return pos;
4901 sbeg = RSTRING_PTR(str);
4904 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4911 return str_rindex(str, sub, s, enc);
5001rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
5005 long pos,
len = RSTRING_LEN(str);
5007 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
5009 if (pos < 0 && (pos +=
len) < 0) {
5015 if (pos >
len) pos =
len;
5021 str_ensure_byte_pos(str, pos);
5033 pos = rb_str_byterindex(str, sub, pos);
5034 if (pos >= 0)
return LONG2NUM(pos);
5073 switch (OBJ_BUILTIN_TYPE(y)) {
5125rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5132 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5164rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5168 re = get_pat(argv[0]);
5169 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5178static enum neighbor_char
5184 if (rb_enc_mbminlen(enc) > 1) {
5186 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5188 return NEIGHBOR_NOT_CHAR;
5190 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5192 if (!l)
return NEIGHBOR_NOT_CHAR;
5193 if (l !=
len)
return NEIGHBOR_WRAPPED;
5194 rb_enc_mbcput(c, p, enc);
5195 r = rb_enc_precise_mbclen(p, p +
len, enc);
5197 return NEIGHBOR_NOT_CHAR;
5199 return NEIGHBOR_FOUND;
5202 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5205 return NEIGHBOR_WRAPPED;
5206 ++((
unsigned char*)p)[i];
5207 l = rb_enc_precise_mbclen(p, p+
len, enc);
5211 return NEIGHBOR_FOUND;
5214 memset(p+l, 0xff,
len-l);
5220 for (len2 =
len-1; 0 < len2; len2--) {
5221 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5225 memset(p+len2+1, 0xff,
len-(len2+1));
5230static enum neighbor_char
5235 if (rb_enc_mbminlen(enc) > 1) {
5237 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5239 return NEIGHBOR_NOT_CHAR;
5241 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5242 if (!c)
return NEIGHBOR_NOT_CHAR;
5245 if (!l)
return NEIGHBOR_NOT_CHAR;
5246 if (l !=
len)
return NEIGHBOR_WRAPPED;
5247 rb_enc_mbcput(c, p, enc);
5248 r = rb_enc_precise_mbclen(p, p +
len, enc);
5250 return NEIGHBOR_NOT_CHAR;
5252 return NEIGHBOR_FOUND;
5255 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5258 return NEIGHBOR_WRAPPED;
5259 --((
unsigned char*)p)[i];
5260 l = rb_enc_precise_mbclen(p, p+
len, enc);
5264 return NEIGHBOR_FOUND;
5267 memset(p+l, 0,
len-l);
5273 for (len2 =
len-1; 0 < len2; len2--) {
5274 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5278 memset(p+len2+1, 0,
len-(len2+1));
5292static enum neighbor_char
5293enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5295 enum neighbor_char ret;
5299 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5303 const int max_gaps = 1;
5305 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5307 ctype = ONIGENC_CTYPE_DIGIT;
5309 ctype = ONIGENC_CTYPE_ALPHA;
5311 return NEIGHBOR_NOT_CHAR;
5314 for (
try = 0;
try <= max_gaps; ++
try) {
5315 ret = enc_succ_char(p,
len, enc);
5316 if (ret == NEIGHBOR_FOUND) {
5317 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5319 return NEIGHBOR_FOUND;
5326 ret = enc_pred_char(p,
len, enc);
5327 if (ret == NEIGHBOR_FOUND) {
5328 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5341 return NEIGHBOR_NOT_CHAR;
5344 if (ctype != ONIGENC_CTYPE_DIGIT) {
5346 return NEIGHBOR_WRAPPED;
5350 enc_succ_char(carry,
len, enc);
5351 return NEIGHBOR_WRAPPED;
5419 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5420 rb_enc_cr_str_copy_for_substr(str, orig);
5421 return str_succ(str);
5428 char *sbeg, *s, *e, *last_alnum = 0;
5429 int found_alnum = 0;
5431 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5432 long carry_pos = 0, carry_len = 1;
5433 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5435 slen = RSTRING_LEN(str);
5436 if (slen == 0)
return str;
5438 enc = STR_ENC_GET(str);
5439 sbeg = RSTRING_PTR(str);
5440 s = e = sbeg + slen;
5442 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5443 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5449 l = rb_enc_precise_mbclen(s, e, enc);
5450 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5451 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5452 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5454 case NEIGHBOR_NOT_CHAR:
5456 case NEIGHBOR_FOUND:
5458 case NEIGHBOR_WRAPPED:
5463 carry_pos = s - sbeg;
5468 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5469 enum neighbor_char neighbor;
5470 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5471 l = rb_enc_precise_mbclen(s, e, enc);
5472 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5473 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5475 neighbor = enc_succ_char(tmp, l, enc);
5477 case NEIGHBOR_FOUND:
5481 case NEIGHBOR_WRAPPED:
5484 case NEIGHBOR_NOT_CHAR:
5487 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5489 enc_succ_char(s, l, enc);
5491 if (!rb_enc_asciicompat(enc)) {
5492 MEMCPY(carry, s,
char, l);
5495 carry_pos = s - sbeg;
5499 RESIZE_CAPA(str, slen + carry_len);
5500 sbeg = RSTRING_PTR(str);
5501 s = sbeg + carry_pos;
5502 memmove(s + carry_len, s, slen - carry_pos);
5503 memmove(s, carry, carry_len);
5505 STR_SET_LEN(str, slen);
5506 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5520rb_str_succ_bang(
VALUE str)
5528all_digits_p(
const char *s,
long len)
5582 VALUE end, exclusive;
5586 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5592 VALUE current, after_end;
5599 enc = rb_enc_check(beg, end);
5600 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5602 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5603 char c = RSTRING_PTR(beg)[0];
5604 char e = RSTRING_PTR(end)[0];
5606 if (c > e || (excl && c == e))
return beg;
5608 VALUE str = rb_enc_str_new(&c, 1, enc);
5610 if ((*each)(str, arg))
break;
5611 if (!excl && c == e)
break;
5613 if (excl && c == e)
break;
5618 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5619 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5620 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5625 b = rb_str_to_inum(beg, 10, FALSE);
5626 e = rb_str_to_inum(end, 10, FALSE);
5633 if (excl && bi == ei)
break;
5634 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5639 ID op = excl ?
'<' : idLE;
5640 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5645 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5646 b = rb_funcallv(b, succ, 0, 0);
5653 if (n > 0 || (excl && n == 0))
return beg;
5655 after_end = rb_funcallv(end, succ, 0, 0);
5660 next = rb_funcallv(current, succ, 0, 0);
5661 if ((*each)(current, arg))
break;
5662 if (
NIL_P(next))
break;
5666 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5681 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5682 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5683 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5685 b = rb_str_to_inum(beg, 10, FALSE);
5691 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5699 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5700 b = rb_funcallv(b, succ, 0, 0);
5706 VALUE next = rb_funcallv(current, succ, 0, 0);
5707 if ((*each)(current, arg))
break;
5710 if (RSTRING_LEN(current) == 0)
5721 if (!
rb_equal(str, *argp))
return 0;
5735 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5736 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5737 rb_enc_asciicompat(STR_ENC_GET(val))) {
5738 const char *bp = RSTRING_PTR(beg);
5739 const char *ep = RSTRING_PTR(end);
5740 const char *vp = RSTRING_PTR(val);
5741 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5742 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5750 if (b <= v && v < e)
return Qtrue;
5751 return RBOOL(!
RTEST(exclusive) && v == e);
5758 all_digits_p(bp, RSTRING_LEN(beg)) &&
5759 all_digits_p(ep, RSTRING_LEN(end))) {
5764 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5766 return RBOOL(
NIL_P(val));
5789 return rb_str_subpat(str, indx,
INT2FIX(0));
5792 if (rb_str_index(str, indx, 0) != -1)
5798 long beg,
len = str_strlen(str, NULL);
5810 return str_substr(str, idx, 1, FALSE);
5829rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5833 return rb_str_subpat(str, argv[0], argv[1]);
5836 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5840 return rb_str_aref(str, argv[0]);
5846 char *ptr = RSTRING_PTR(str);
5847 long olen = RSTRING_LEN(str), nlen;
5849 str_modifiable(str);
5850 if (
len > olen)
len = olen;
5852 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5854 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5856 ptr =
RSTRING(str)->as.embed.ary;
5857 memmove(ptr, oldptr +
len, nlen);
5858 if (fl == STR_NOEMBED)
xfree(oldptr);
5861 if (!STR_SHARED_P(str)) {
5863 rb_enc_cr_str_exact_copy(shared, str);
5868 STR_SET_LEN(str, nlen);
5870 if (!SHARABLE_MIDDLE_SUBSTRING) {
5871 TERM_FILL(ptr + nlen, TERM_LEN(str));
5878rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5884 if (beg == 0 && vlen == 0) {
5889 str_modify_keep_cr(str);
5893 RESIZE_CAPA(str, slen + vlen -
len);
5894 sptr = RSTRING_PTR(str);
5903 memmove(sptr + beg + vlen,
5905 slen - (beg +
len));
5907 if (vlen < beg &&
len < 0) {
5911 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5914 STR_SET_LEN(str, slen);
5915 TERM_FILL(&sptr[slen], TERM_LEN(str));
5922 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5931 int singlebyte = single_byte_optimizable(str);
5937 enc = rb_enc_check(str, val);
5938 slen = str_strlen(str, enc);
5940 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5949 if (
len > slen - beg) {
5952 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5957 beg = p - RSTRING_PTR(str);
5959 rb_str_update_0(str, beg,
len, val);
5960 rb_enc_associate(str, enc);
5971 long start, end,
len;
5981 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5985 nth += regs->num_regs;
5995 enc = rb_enc_check_str(str, val);
5996 rb_str_update_0(str, start,
len, val);
5997 rb_enc_associate(str, enc);
6005 switch (
TYPE(indx)) {
6007 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
6011 beg = rb_str_index(str, indx, 0);
6066rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
6070 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6078 return rb_str_aset(str, argv[0], argv[1]);
6138rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6146 str_modify_keep_cr(str);
6154 if ((nth += regs->num_regs) <= 0)
return Qnil;
6156 else if (nth >= regs->num_regs)
return Qnil;
6158 len = END(nth) - beg;
6161 else if (argc == 2) {
6170 beg = p - RSTRING_PTR(str);
6174 beg = rb_str_index(str, indx, 0);
6175 if (beg == -1)
return Qnil;
6176 len = RSTRING_LEN(indx);
6188 beg = p - RSTRING_PTR(str);
6197 beg = p - RSTRING_PTR(str);
6201 rb_enc_cr_str_copy_for_substr(result, str);
6209 char *sptr = RSTRING_PTR(str);
6210 long slen = RSTRING_LEN(str);
6211 if (beg +
len > slen)
6215 slen - (beg +
len));
6217 STR_SET_LEN(str, slen);
6218 TERM_FILL(&sptr[slen], TERM_LEN(str));
6229 switch (OBJ_BUILTIN_TYPE(pat)) {
6248get_pat_quoted(
VALUE pat,
int check)
6252 switch (OBJ_BUILTIN_TYPE(pat)) {
6266 if (check && is_broken_string(pat)) {
6273rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6276 pos = rb_str_byteindex(str, pat, pos);
6277 if (set_backref_str) {
6279 str = rb_str_new_frozen_String(str);
6280 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6282 *match = match_data;
6292 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6297rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6299 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6318rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6332 hash = rb_check_hash_type(argv[1]);
6338 pat = get_pat_quoted(argv[0], 1);
6340 str_modifiable(str);
6341 beg = rb_pat_search(pat, str, 0, 1);
6355 end0 = beg0 + RSTRING_LEN(pat);
6364 if (iter || !
NIL_P(hash)) {
6365 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6371 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6374 str_mod_check(str, p,
len);
6375 rb_check_frozen(str);
6381 enc = rb_enc_compatible(str, repl);
6384 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6388 rb_enc_inspect_name(str_enc),
6389 rb_enc_inspect_name(STR_ENC_GET(repl)));
6391 enc = STR_ENC_GET(repl);
6394 rb_enc_associate(str, enc);
6404 rlen = RSTRING_LEN(repl);
6405 len = RSTRING_LEN(str);
6407 RESIZE_CAPA(str,
len + rlen - plen);
6409 p = RSTRING_PTR(str);
6411 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6413 rp = RSTRING_PTR(repl);
6414 memmove(p + beg0, rp, rlen);
6416 STR_SET_LEN(str,
len);
6417 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6446 rb_str_sub_bang(argc, argv, str);
6451str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6454 long beg, beg0, end0;
6455 long offset, blen, slen,
len, last;
6456 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6458 int need_backref_str = -1;
6468 hash = rb_check_hash_type(argv[1]);
6472 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6480 rb_error_arity(argc, 1, 2);
6483 pat = get_pat_quoted(argv[0], 1);
6484 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6487 if (bang)
return Qnil;
6492 blen = RSTRING_LEN(str) + 30;
6494 sp = RSTRING_PTR(str);
6495 slen = RSTRING_LEN(str);
6497 str_enc = STR_ENC_GET(str);
6498 rb_enc_associate(dest, str_enc);
6505 end0 = beg0 + RSTRING_LEN(pat);
6521 if (mode == FAST_MAP) {
6530 val = rb_hash_aref(hash, key);
6533 str_mod_check(str, sp, slen);
6538 else if (need_backref_str) {
6540 if (need_backref_str < 0) {
6541 need_backref_str = val != repl;
6548 len = beg0 - offset;
6562 if (RSTRING_LEN(str) <= end0)
break;
6563 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6565 offset = end0 +
len;
6567 cp = RSTRING_PTR(str) + offset;
6568 if (offset > RSTRING_LEN(str))
break;
6571 if (mode != FAST_MAP && mode != STR) {
6574 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6579 if (RSTRING_LEN(str) > offset) {
6582 rb_pat_search0(pat, str, last, 1, &match);
6584 str_shared_replace(str, dest);
6612rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6614 str_modify_keep_cr(str);
6615 return str_gsub(argc, argv, str, 1);
6638 return str_gsub(argc, argv, str, 0);
6656 str_modifiable(str);
6657 if (str == str2)
return str;
6661 return str_replace(str, str2);
6676rb_str_clear(
VALUE str)
6680 STR_SET_LEN(str, 0);
6681 RSTRING_PTR(str)[0] = 0;
6682 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6701rb_str_chr(
VALUE str)
6725 pos += RSTRING_LEN(str);
6726 if (pos < 0 || RSTRING_LEN(str) <= pos)
6729 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6748 long len = RSTRING_LEN(str);
6749 char *
ptr, *head, *left = 0;
6753 if (pos < -
len ||
len <= pos)
6760 char byte = (char)(
NUM2INT(w) & 0xFF);
6762 if (!str_independent(str))
6763 str_make_independent(str);
6764 enc = STR_ENC_GET(str);
6765 head = RSTRING_PTR(str);
6767 if (!STR_EMBED_P(str)) {
6774 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6782 width = rb_enc_precise_mbclen(left, head+
len, enc);
6784 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6800str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6802 long n = RSTRING_LEN(str);
6804 if (beg > n ||
len < 0)
return Qnil;
6807 if (beg < 0)
return Qnil;
6812 if (!empty)
return Qnil;
6816 VALUE str2 = str_subseq(str, beg,
len);
6818 str_enc_copy_direct(str2, str);
6820 if (RSTRING_LEN(str2) == 0) {
6821 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6855 long beg,
len = RSTRING_LEN(str);
6863 return str_byte_substr(str, beg,
len, TRUE);
6868 return str_byte_substr(str, idx, 1, FALSE);
6880rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6885 return str_byte_substr(str, beg,
len, TRUE);
6888 return str_byte_aref(str, argv[0]);
6892str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6894 long end, slen = RSTRING_LEN(str);
6897 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6906 if (*
len > slen - *beg) {
6910 str_ensure_byte_pos(str, *beg);
6911 str_ensure_byte_pos(str, end);
6925rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6927 long beg,
len, vbeg, vlen;
6932 if (!(argc == 2 || argc == 3 || argc == 5)) {
6933 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6937 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6938 rb_builtin_class_name(argv[0]));
6945 vlen = RSTRING_LEN(val);
6950 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6951 rb_builtin_class_name(argv[2]));
6963 vlen = RSTRING_LEN(val);
6971 str_check_beg_len(str, &beg, &
len);
6972 str_check_beg_len(val, &vbeg, &vlen);
6973 str_modify_keep_cr(str);
6976 rb_enc_associate(str, rb_enc_check(str, val));
6979 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6997rb_str_reverse(
VALUE str)
7004 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
7005 enc = STR_ENC_GET(str);
7011 if (RSTRING_LEN(str) > 1) {
7012 if (single_byte_optimizable(str)) {
7019 int clen = rb_enc_fast_mbclen(s, e, enc);
7027 cr = rb_enc_asciicompat(enc) ?
7030 int clen = rb_enc_mbclen(s, e, enc);
7039 STR_SET_LEN(rev, RSTRING_LEN(str));
7040 str_enc_copy_direct(rev, str);
7060rb_str_reverse_bang(
VALUE str)
7062 if (RSTRING_LEN(str) > 1) {
7063 if (single_byte_optimizable(str)) {
7066 str_modify_keep_cr(str);
7067 s = RSTRING_PTR(str);
7076 str_shared_replace(str, rb_str_reverse(str));
7080 str_modify_keep_cr(str);
7105 i = rb_str_index(str, arg, 0);
7107 return RBOOL(i != -1);
7149 rb_raise(rb_eArgError,
"invalid radix %d", base);
7151 return rb_str_to_inum(str, base, FALSE);
7175rb_str_to_f(
VALUE str)
7190rb_str_to_s(
VALUE str)
7202 char s[RUBY_MAX_CHAR_LEN];
7203 int n = rb_enc_codelen(c, enc);
7205 rb_enc_mbcput(c, s, enc);
7210#define CHAR_ESC_LEN 13
7213rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7215 char buf[CHAR_ESC_LEN + 1];
7223 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7225 else if (c < 0x10000) {
7226 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7229 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7234 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7237 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7240 l = (int)strlen(buf);
7246ruby_escaped_char(
int c)
7249 case '\0':
return "\\0";
7250 case '\n':
return "\\n";
7251 case '\r':
return "\\r";
7252 case '\t':
return "\\t";
7253 case '\f':
return "\\f";
7254 case '\013':
return "\\v";
7255 case '\010':
return "\\b";
7256 case '\007':
return "\\a";
7257 case '\033':
return "\\e";
7258 case '\x7f':
return "\\c?";
7264rb_str_escape(
VALUE str)
7268 const char *p = RSTRING_PTR(str);
7270 const char *prev = p;
7271 char buf[CHAR_ESC_LEN + 1];
7273 int unicode_p = rb_enc_unicode_p(enc);
7274 int asciicompat = rb_enc_asciicompat(enc);
7279 int n = rb_enc_precise_mbclen(p, pend, enc);
7281 if (p > prev) str_buf_cat(result, prev, p - prev);
7282 n = rb_enc_mbminlen(enc);
7284 n = (int)(pend - p);
7286 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7287 str_buf_cat(result, buf, strlen(buf));
7293 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7295 cc = ruby_escaped_char(c);
7297 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7298 str_buf_cat(result, cc, strlen(cc));
7301 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7304 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7305 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7309 if (p > prev) str_buf_cat(result, prev, p - prev);
7333 const char *p, *pend, *prev;
7334 char buf[CHAR_ESC_LEN + 1];
7336 rb_encoding *resenc = rb_default_internal_encoding();
7337 int unicode_p = rb_enc_unicode_p(enc);
7338 int asciicompat = rb_enc_asciicompat(enc);
7340 if (resenc == NULL) resenc = rb_default_external_encoding();
7341 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7342 rb_enc_associate(result, resenc);
7343 str_buf_cat2(result,
"\"");
7351 n = rb_enc_precise_mbclen(p, pend, enc);
7353 if (p > prev) str_buf_cat(result, prev, p - prev);
7354 n = rb_enc_mbminlen(enc);
7356 n = (int)(pend - p);
7358 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7359 str_buf_cat(result, buf, strlen(buf));
7365 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7367 if ((asciicompat || unicode_p) &&
7368 (c ==
'"'|| c ==
'\\' ||
7373 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7374 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7375 str_buf_cat2(result,
"\\");
7376 if (asciicompat || enc == resenc) {
7382 case '\n': cc =
'n';
break;
7383 case '\r': cc =
'r';
break;
7384 case '\t': cc =
't';
break;
7385 case '\f': cc =
'f';
break;
7386 case '\013': cc =
'v';
break;
7387 case '\010': cc =
'b';
break;
7388 case '\007': cc =
'a';
break;
7389 case 033: cc =
'e';
break;
7390 default: cc = 0;
break;
7393 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7396 str_buf_cat(result, buf, 2);
7409 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7413 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7414 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7419 if (p > prev) str_buf_cat(result, prev, p - prev);
7420 str_buf_cat2(result,
"\"");
7425#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7445 int encidx = rb_enc_get_index(str);
7448 const char *p, *pend;
7451 int u8 = (encidx == rb_utf8_encindex());
7452 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7455 if (!rb_enc_asciicompat(enc)) {
7457 len += strlen(enc->name);
7460 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7463 unsigned char c = *p++;
7466 case '"':
case '\\':
7467 case '\n':
case '\r':
7468 case '\t':
case '\f':
7469 case '\013':
case '\010':
case '\007':
case '\033':
7474 clen = IS_EVSTR(p, pend) ? 2 : 1;
7482 if (u8 && c > 0x7F) {
7483 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7485 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7488 else if (cc <= 0xFFFFF)
7501 if (clen > LONG_MAX -
len) {
7508 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7509 q = RSTRING_PTR(result); qend = q +
len + 1;
7513 unsigned char c = *p++;
7515 if (c ==
'"' || c ==
'\\') {
7519 else if (c ==
'#') {
7520 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7523 else if (c ==
'\n') {
7527 else if (c ==
'\r') {
7531 else if (c ==
'\t') {
7535 else if (c ==
'\f') {
7539 else if (c ==
'\013') {
7543 else if (c ==
'\010') {
7547 else if (c ==
'\007') {
7551 else if (c ==
'\033') {
7561 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7563 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7566 snprintf(q, qend-q,
"u%04X", cc);
7568 snprintf(q, qend-q,
"u{%X}", cc);
7573 snprintf(q, qend-q,
"x%02X", c);
7579 if (!rb_enc_asciicompat(enc)) {
7580 snprintf(q, qend-q, nonascii_suffix, enc->name);
7581 encidx = rb_ascii8bit_encindex();
7584 rb_enc_associate_index(result, encidx);
7590unescape_ascii(
unsigned int c)
7614undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7616 const char *s = *ss;
7620 unsigned char buf[6];
7638 *buf = unescape_ascii(*s);
7650 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7651 if (*penc != enc_utf8) {
7653 rb_enc_associate(undumped, enc_utf8);
7670 if (hexlen == 0 || hexlen > 6) {
7676 if (0xd800 <= c && c <= 0xdfff) {
7679 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7689 if (0xd800 <= c && c <= 0xdfff) {
7692 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7720static VALUE rb_str_is_ascii_only_p(
VALUE str);
7738str_undump(
VALUE str)
7740 const char *s = RSTRING_PTR(str);
7743 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7745 bool binary =
false;
7749 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7752 if (!str_null_check(str, &w)) {
7755 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7756 if (*s !=
'"')
goto invalid_format;
7774 static const char force_encoding_suffix[] =
".force_encoding(\"";
7775 static const char dup_suffix[] =
".dup";
7776 const char *encname;
7781 size =
sizeof(dup_suffix) - 1;
7782 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7784 size =
sizeof(force_encoding_suffix) - 1;
7785 if (s_end - s <= size)
goto invalid_format;
7786 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7790 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7794 s = memchr(s,
'"', s_end-s);
7796 if (!s)
goto invalid_format;
7797 if (s_end - s != 2)
goto invalid_format;
7798 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7800 encidx = rb_enc_find_index2(encname, (
long)size);
7804 rb_enc_associate_index(undumped, encidx);
7814 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7825 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7831 if (rb_enc_dummy_p(enc)) {
7838str_true_enc(
VALUE str)
7841 rb_str_check_dummy_enc(enc);
7845static OnigCaseFoldType
7846check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7851 rb_raise(rb_eArgError,
"too many options");
7852 if (argv[0]==sym_turkic) {
7853 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7855 if (argv[1]==sym_lithuanian)
7856 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7858 rb_raise(rb_eArgError,
"invalid second option");
7861 else if (argv[0]==sym_lithuanian) {
7862 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7864 if (argv[1]==sym_turkic)
7865 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7867 rb_raise(rb_eArgError,
"invalid second option");
7871 rb_raise(rb_eArgError,
"too many options");
7872 else if (argv[0]==sym_ascii)
7873 flags |= ONIGENC_CASE_ASCII_ONLY;
7874 else if (argv[0]==sym_fold) {
7875 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7876 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7878 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7881 rb_raise(rb_eArgError,
"invalid option");
7888 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7894#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7895#ifndef CASEMAP_DEBUG
7896# define CASEMAP_DEBUG 0
7904 OnigUChar space[FLEX_ARY_LEN];
7908mapping_buffer_free(
void *p)
7912 while (current_buffer) {
7913 previous_buffer = current_buffer;
7914 current_buffer = current_buffer->next;
7915 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7921 {0, mapping_buffer_free,},
7922 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7930 const OnigUChar *source_current, *source_end;
7931 int target_length = 0;
7932 VALUE buffer_anchor;
7935 size_t buffer_count = 0;
7936 int buffer_length_or_invalid;
7938 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7940 source_current = (OnigUChar*)RSTRING_PTR(source);
7945 while (source_current < source_end) {
7947 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7948 if (CASEMAP_DEBUG) {
7949 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7952 *pre_buffer = current_buffer;
7953 pre_buffer = ¤t_buffer->next;
7954 current_buffer->next = NULL;
7955 current_buffer->capa =
capa;
7956 buffer_length_or_invalid = enc->case_map(flags,
7957 &source_current, source_end,
7958 current_buffer->space,
7959 current_buffer->space+current_buffer->capa,
7961 if (buffer_length_or_invalid < 0) {
7962 current_buffer =
DATA_PTR(buffer_anchor);
7964 mapping_buffer_free(current_buffer);
7965 rb_raise(rb_eArgError,
"input string invalid");
7967 target_length += current_buffer->used = buffer_length_or_invalid;
7969 if (CASEMAP_DEBUG) {
7970 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7973 if (buffer_count==1) {
7974 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7977 char *target_current;
7980 target_current = RSTRING_PTR(target);
7981 current_buffer =
DATA_PTR(buffer_anchor);
7982 while (current_buffer) {
7983 memcpy(target_current, current_buffer->space, current_buffer->used);
7984 target_current += current_buffer->used;
7985 current_buffer = current_buffer->next;
7988 current_buffer =
DATA_PTR(buffer_anchor);
7990 mapping_buffer_free(current_buffer);
7995 str_enc_copy_direct(target, source);
8004 const OnigUChar *source_current, *source_end;
8005 OnigUChar *target_current, *target_end;
8006 long old_length = RSTRING_LEN(source);
8007 int length_or_invalid;
8009 if (old_length == 0)
return Qnil;
8011 source_current = (OnigUChar*)RSTRING_PTR(source);
8013 if (source == target) {
8014 target_current = (OnigUChar*)source_current;
8015 target_end = (OnigUChar*)source_end;
8018 target_current = (OnigUChar*)RSTRING_PTR(target);
8022 length_or_invalid = onigenc_ascii_only_case_map(flags,
8023 &source_current, source_end,
8024 target_current, target_end, enc);
8025 if (length_or_invalid < 0)
8026 rb_raise(rb_eArgError,
"input string invalid");
8027 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8028 fprintf(stderr,
"problem with rb_str_ascii_casemap"
8029 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8030 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
8031 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8034 str_enc_copy(target, source);
8040upcase_single(
VALUE str)
8042 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8043 bool modified =
false;
8046 unsigned int c = *(
unsigned char*)s;
8048 if (
'a' <= c && c <=
'z') {
8049 *s =
'A' + (c -
'a');
8077rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8080 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8082 flags = check_case_options(argc, argv, flags);
8083 str_modify_keep_cr(str);
8084 enc = str_true_enc(str);
8085 if (case_option_single_p(flags, enc, str)) {
8086 if (upcase_single(str))
8087 flags |= ONIGENC_CASE_MODIFIED;
8089 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8090 rb_str_ascii_casemap(str, str, &flags, enc);
8092 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8094 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8116rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8119 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8122 flags = check_case_options(argc, argv, flags);
8123 enc = str_true_enc(str);
8124 if (case_option_single_p(flags, enc, str)) {
8125 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8126 str_enc_copy_direct(ret, str);
8129 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8131 rb_str_ascii_casemap(str, ret, &flags, enc);
8134 ret = rb_str_casemap(str, &flags, enc);
8141downcase_single(
VALUE str)
8143 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8144 bool modified =
false;
8147 unsigned int c = *(
unsigned char*)s;
8149 if (
'A' <= c && c <=
'Z') {
8150 *s =
'a' + (c -
'A');
8179rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8182 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8184 flags = check_case_options(argc, argv, flags);
8185 str_modify_keep_cr(str);
8186 enc = str_true_enc(str);
8187 if (case_option_single_p(flags, enc, str)) {
8188 if (downcase_single(str))
8189 flags |= ONIGENC_CASE_MODIFIED;
8191 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8192 rb_str_ascii_casemap(str, str, &flags, enc);
8194 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8196 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8218rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8221 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8224 flags = check_case_options(argc, argv, flags);
8225 enc = str_true_enc(str);
8226 if (case_option_single_p(flags, enc, str)) {
8227 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8228 str_enc_copy_direct(ret, str);
8229 downcase_single(ret);
8231 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8233 rb_str_ascii_casemap(str, ret, &flags, enc);
8236 ret = rb_str_casemap(str, &flags, enc);
8264rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8267 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8269 flags = check_case_options(argc, argv, flags);
8270 str_modify_keep_cr(str);
8271 enc = str_true_enc(str);
8272 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8273 if (flags&ONIGENC_CASE_ASCII_ONLY)
8274 rb_str_ascii_casemap(str, str, &flags, enc);
8276 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8278 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8302rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8305 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8308 flags = check_case_options(argc, argv, flags);
8309 enc = str_true_enc(str);
8310 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8311 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8313 rb_str_ascii_casemap(str, ret, &flags, enc);
8316 ret = rb_str_casemap(str, &flags, enc);
8343rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8346 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8348 flags = check_case_options(argc, argv, flags);
8349 str_modify_keep_cr(str);
8350 enc = str_true_enc(str);
8351 if (flags&ONIGENC_CASE_ASCII_ONLY)
8352 rb_str_ascii_casemap(str, str, &flags, enc);
8354 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8356 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8380rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8383 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8386 flags = check_case_options(argc, argv, flags);
8387 enc = str_true_enc(str);
8388 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8389 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8391 rb_str_ascii_casemap(str, ret, &flags, enc);
8394 ret = rb_str_casemap(str, &flags, enc);
8399typedef unsigned char *USTR;
8403 unsigned int now, max;
8415 if (t->p == t->pend)
return -1;
8416 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8419 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8421 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8423 if (t->p < t->pend) {
8424 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8427 if (t->now < 0x80 && c < 0x80) {
8428 rb_raise(rb_eArgError,
8429 "invalid range \"%c-%c\" in string transliteration",
8433 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8437 else if (t->now < c) {
8446 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8447 if (t->now == t->max) {
8452 if (t->now < t->max) {
8468 const unsigned int errc = -1;
8469 unsigned int trans[256];
8471 struct tr trsrc, trrepl;
8473 unsigned int c, c0, last = 0;
8474 int modify = 0, i, l;
8475 unsigned char *s, *send;
8477 int singlebyte = single_byte_optimizable(str);
8481#define CHECK_IF_ASCII(c) \
8482 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8483 (cr = ENC_CODERANGE_VALID) : 0)
8487 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8488 if (RSTRING_LEN(repl) == 0) {
8489 return rb_str_delete_bang(1, &src, str);
8493 e1 = rb_enc_check(str, src);
8494 e2 = rb_enc_check(str, repl);
8499 enc = rb_enc_check(src, repl);
8501 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8502 if (RSTRING_LEN(src) > 1 &&
8503 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8504 trsrc.p + l < trsrc.pend) {
8508 trrepl.p = RSTRING_PTR(repl);
8509 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8510 trsrc.gen = trrepl.gen = 0;
8511 trsrc.now = trrepl.now = 0;
8512 trsrc.max = trrepl.max = 0;
8515 for (i=0; i<256; i++) {
8518 while ((c = trnext(&trsrc, enc)) != errc) {
8523 if (!hash) hash = rb_hash_new();
8527 while ((c = trnext(&trrepl, enc)) != errc)
8530 for (i=0; i<256; i++) {
8531 if (trans[i] != errc) {
8539 for (i=0; i<256; i++) {
8542 while ((c = trnext(&trsrc, enc)) != errc) {
8543 r = trnext(&trrepl, enc);
8544 if (r == errc) r = trrepl.now;
8547 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8550 if (!hash) hash = rb_hash_new();
8558 str_modify_keep_cr(str);
8559 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8560 termlen = rb_enc_mbminlen(enc);
8563 long offset, max = RSTRING_LEN(str);
8564 unsigned int save = -1;
8565 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8570 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8573 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8576 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8578 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8587 if (cflag) c = last;
8590 else if (cflag) c = errc;
8596 if (c != (
unsigned int)-1) {
8602 tlen = rb_enc_codelen(c, enc);
8608 if (enc != e1) may_modify = 1;
8610 if ((offset = t - buf) + tlen > max) {
8611 size_t MAYBE_UNUSED(old) = max + termlen;
8612 max = offset + tlen + (send - s);
8613 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8616 rb_enc_mbcput(c, t, enc);
8617 if (may_modify && memcmp(s, t, tlen) != 0) {
8623 if (!STR_EMBED_P(str)) {
8624 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8626 TERM_FILL((
char *)t, termlen);
8627 RSTRING(str)->as.heap.ptr = (
char *)buf;
8628 STR_SET_LEN(str, t - buf);
8629 STR_SET_NOEMBED(str);
8630 RSTRING(str)->as.heap.aux.capa = max;
8634 c = (
unsigned char)*s;
8635 if (trans[c] != errc) {
8652 long offset, max = (long)((send - s) * 1.2);
8653 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8658 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8661 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8664 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8666 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8674 if (cflag) c = last;
8677 else if (cflag) c = errc;
8681 c = cflag ? last : errc;
8684 tlen = rb_enc_codelen(c, enc);
8689 if (enc != e1) may_modify = 1;
8691 if ((offset = t - buf) + tlen > max) {
8692 size_t MAYBE_UNUSED(old) = max + termlen;
8693 max = offset + tlen + (long)((send - s) * 1.2);
8694 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8698 rb_enc_mbcput(c, t, enc);
8699 if (may_modify && memcmp(s, t, tlen) != 0) {
8707 if (!STR_EMBED_P(str)) {
8708 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8710 TERM_FILL((
char *)t, termlen);
8711 RSTRING(str)->as.heap.ptr = (
char *)buf;
8712 STR_SET_LEN(str, t - buf);
8713 STR_SET_NOEMBED(str);
8714 RSTRING(str)->as.heap.aux.capa = max;
8720 rb_enc_associate(str, enc);
8739 return tr_trans(str, src, repl, 0);
8786 tr_trans(str, src, repl, 0);
8790#define TR_TABLE_MAX (UCHAR_MAX+1)
8791#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8793tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8796 const unsigned int errc = -1;
8797 char buf[TR_TABLE_MAX];
8800 VALUE table = 0, ptable = 0;
8801 int i, l, cflag = 0;
8803 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8804 tr.gen =
tr.now =
tr.max = 0;
8806 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8811 for (i=0; i<TR_TABLE_MAX; i++) {
8814 stable[TR_TABLE_MAX] = cflag;
8816 else if (stable[TR_TABLE_MAX] && !cflag) {
8817 stable[TR_TABLE_MAX] = 0;
8819 for (i=0; i<TR_TABLE_MAX; i++) {
8823 while ((c = trnext(&
tr, enc)) != errc) {
8824 if (c < TR_TABLE_MAX) {
8825 buf[(
unsigned char)c] = !cflag;
8830 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8833 table = ptable ? ptable : rb_hash_new();
8837 table = rb_hash_new();
8842 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8843 rb_hash_aset(table, key,
Qtrue);
8847 for (i=0; i<TR_TABLE_MAX; i++) {
8848 stable[i] = stable[i] && buf[i];
8850 if (!table && !cflag) {
8857tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8859 if (c < TR_TABLE_MAX) {
8860 return table[c] != 0;
8866 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8867 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8871 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8874 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8888rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8890 char squeez[TR_TABLE_SIZE];
8893 VALUE del = 0, nodel = 0;
8895 int i, ascompat, cr;
8897 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8899 for (i=0; i<argc; i++) {
8903 enc = rb_enc_check(str, s);
8904 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8907 str_modify_keep_cr(str);
8908 ascompat = rb_enc_asciicompat(enc);
8909 s = t = RSTRING_PTR(str);
8916 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8927 c = rb_enc_codepoint_len(s, send, &clen, enc);
8929 if (tr_find(c, squeez, del, nodel)) {
8933 if (t != s) rb_enc_mbcput(c, t, enc);
8940 TERM_FILL(t, TERM_LEN(str));
8941 STR_SET_LEN(str, t - RSTRING_PTR(str));
8944 if (modify)
return str;
8964rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8967 rb_str_delete_bang(argc, argv, str);
8981rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8983 char squeez[TR_TABLE_SIZE];
8985 VALUE del = 0, nodel = 0;
8986 unsigned char *s, *send, *t;
8988 int ascompat, singlebyte = single_byte_optimizable(str);
8992 enc = STR_ENC_GET(str);
8995 for (i=0; i<argc; i++) {
8999 enc = rb_enc_check(str, s);
9000 if (singlebyte && !single_byte_optimizable(s))
9002 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9006 str_modify_keep_cr(str);
9007 s = t = (
unsigned char *)RSTRING_PTR(str);
9008 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
9011 ascompat = rb_enc_asciicompat(enc);
9015 unsigned int c = *s++;
9016 if (c != save || (argc > 0 && !squeez[c])) {
9026 if (ascompat && (c = *s) < 0x80) {
9027 if (c != save || (argc > 0 && !squeez[c])) {
9033 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
9035 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9036 if (t != s) rb_enc_mbcput(c, t, enc);
9045 TERM_FILL((
char *)t, TERM_LEN(str));
9046 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9047 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
9051 if (modify)
return str;
9074rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
9077 rb_str_squeeze_bang(argc, argv, str);
9095 return tr_trans(str, src, repl, 1);
9118 tr_trans(str, src, repl, 1);
9147rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9149 char table[TR_TABLE_SIZE];
9151 VALUE del = 0, nodel = 0, tstr;
9161 enc = rb_enc_check(str, tstr);
9164 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9165 (ptstr = RSTRING_PTR(tstr),
9166 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9167 !is_broken_string(str)) {
9169 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9171 s = RSTRING_PTR(str);
9172 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9175 if (*(
unsigned char*)s++ == c) n++;
9181 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9182 for (i=1; i<argc; i++) {
9185 enc = rb_enc_check(str, tstr);
9186 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9189 s = RSTRING_PTR(str);
9190 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9192 ascompat = rb_enc_asciicompat(enc);
9196 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9204 c = rb_enc_codepoint_len(s, send, &clen, enc);
9205 if (tr_find(c, table, del, nodel)) {
9216rb_fs_check(
VALUE val)
9220 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table with 256 entries (16 rows of 16 values).
 * The only non-zero entries are indices 9..13 (in the first row) and
 * index 32 (first entry of the third row) -- in ASCII these are
 * '\t', '\n', '\v', '\f', '\r' and ' '.  Every other byte value maps to 0.
 */
9225static const char isspacetable[256] = {
9226 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9228 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9233 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9236 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9237 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9239 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9241 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Looks up c in isspacetable and yields that entry (non-zero or 0).
 * The (unsigned char) cast converts the argument to a non-negative index,
 * so a negative plain-char value cannot index before the start of the table.
 */
9244#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9247split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9249 if (empty_count >= 0 &&
len == 0) {
9250 return empty_count + 1;
9252 if (empty_count > 0) {
9257 }
while (--empty_count > 0);
9261 rb_yield(str_new_empty_String(str));
9262 }
while (--empty_count > 0);
9276 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9280literal_split_pattern(
VALUE spat, split_type_t default_type)
9288 return SPLIT_TYPE_CHARS;
9290 else if (rb_enc_asciicompat(enc)) {
9291 if (
len == 1 && ptr[0] ==
' ') {
9292 return SPLIT_TYPE_AWK;
9297 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9298 return SPLIT_TYPE_AWK;
9301 return default_type;
9314rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9319 split_type_t split_type;
9320 long beg, end, i = 0, empty_count = -1;
9325 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9327 if (lim <= 0) limit =
Qnil;
9328 else if (lim == 1) {
9329 if (RSTRING_LEN(str) == 0)
9340 if (
NIL_P(limit) && !lim) empty_count = 0;
9342 enc = STR_ENC_GET(str);
9343 split_type = SPLIT_TYPE_REGEXP;
9345 spat = get_pat_quoted(spat, 0);
9347 else if (
NIL_P(spat = rb_fs)) {
9348 split_type = SPLIT_TYPE_AWK;
9350 else if (!(spat = rb_fs_check(spat))) {
9351 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9356 if (split_type != SPLIT_TYPE_AWK) {
9361 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9362 if (split_type == SPLIT_TYPE_AWK) {
9364 split_type = SPLIT_TYPE_STRING;
9369 mustnot_broken(spat);
9370 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Comma expression used by the split loops that follow.  It first calls
 * split_string(result, str, beg, len, empty_count) and stores the return
 * value back into empty_count, then evaluates
 * str_mod_check(str, str_start, str_len).
 * The expansion refers to result, str, empty_count, str_start and str_len
 * by name, so those must be in scope wherever the macro is used.
 */
9378#define SPLIT_STR(beg, len) ( \
9379 empty_count = split_string(result, str, beg, len, empty_count), \
9380 str_mod_check(str, str_start, str_len))
9383 char *ptr = RSTRING_PTR(str);
9384 char *
const str_start = ptr;
9385 const long str_len = RSTRING_LEN(str);
9386 char *
const eptr = str_start + str_len;
9387 if (split_type == SPLIT_TYPE_AWK) {
9394 if (is_ascii_string(str)) {
9395 while (ptr < eptr) {
9396 c = (
unsigned char)*ptr++;
9398 if (ascii_isspace(c)) {
9404 if (!
NIL_P(limit) && lim <= i)
break;
9407 else if (ascii_isspace(c)) {
9408 SPLIT_STR(beg, end-beg);
9411 if (!
NIL_P(limit)) ++i;
9419 while (ptr < eptr) {
9422 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9431 if (!
NIL_P(limit) && lim <= i)
break;
9435 SPLIT_STR(beg, end-beg);
9438 if (!
NIL_P(limit)) ++i;
9446 else if (split_type == SPLIT_TYPE_STRING) {
9447 char *substr_start = ptr;
9448 char *sptr = RSTRING_PTR(spat);
9449 long slen = RSTRING_LEN(spat);
9452 mustnot_broken(str);
9453 enc = rb_enc_check(str, spat);
9454 while (ptr < eptr &&
9455 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9458 if (t != ptr + end) {
9462 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9463 str_mod_check(spat, sptr, slen);
9466 if (!
NIL_P(limit) && lim <= ++i)
break;
9468 beg = ptr - str_start;
9470 else if (split_type == SPLIT_TYPE_CHARS) {
9474 mustnot_broken(str);
9475 enc = rb_enc_get(str);
9476 while (ptr < eptr &&
9477 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9478 SPLIT_STR(ptr - str_start, n);
9480 if (!
NIL_P(limit) && lim <= ++i)
break;
9482 beg = ptr - str_start;
9486 long len = RSTRING_LEN(str);
9494 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9499 if (start == end && BEG(0) == END(0)) {
9504 else if (last_null == 1) {
9505 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9512 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9518 SPLIT_STR(beg, end-beg);
9519 beg = start = END(0);
9523 for (idx=1; idx < regs->num_regs; idx++) {
9524 if (BEG(idx) == -1)
continue;
9525 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9527 if (!
NIL_P(limit) && lim <= ++i)
break;
9529 if (match) rb_match_unbusy(match);
9531 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9532 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9535 return result ? result : str;
9545 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to rb_ary_new_capa(size) when rb_block_given_p() is false,
 * and to 0 otherwise.  The first argument (m) does not appear in the
 * expansion.
 */
9548#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/* Thin alias: forwards (ary, e) unchanged to enumerator_element(). */
9563#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9566chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9568 const char *prev = rb_enc_prev_char(p, e, e, enc);
9571 prev = rb_enc_prev_char(p, e, e, enc);
9572 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9584 RSTRING_LEN(rs) != 1 ||
9585 RSTRING_PTR(rs)[0] !=
'\n')) {
/* From this point on, the token rb_rs expands to the call get_rs(). */
9591#define rb_rs get_rs()
9598 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9599 long pos,
len, rslen;
9605 static ID keywords[1];
9610 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9614 if (!ENUM_ELEM(ary, str)) {
9622 if (!RSTRING_LEN(str))
goto end;
9624 ptr = subptr = RSTRING_PTR(str);
9626 len = RSTRING_LEN(str);
9628 rslen = RSTRING_LEN(rs);
9631 enc = rb_enc_get(str);
9633 enc = rb_enc_check(str, rs);
9638 const char *eol = NULL;
9640 while (subend < pend) {
9641 long chomp_rslen = 0;
9643 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9645 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9647 if (eol == subend)
break;
9651 chomp_rslen = -rslen;
9655 if (!subptr) subptr = subend;
9659 }
while (subend < pend);
9661 if (rslen == 0) chomp_rslen = 0;
9663 subend - subptr + (chomp ? chomp_rslen : rslen));
9664 if (ENUM_ELEM(ary, line)) {
9665 str_mod_check(str, ptr,
len);
9667 subptr = eol = NULL;
9672 rsptr = RSTRING_PTR(rs);
9673 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9682 rsptr = RSTRING_PTR(rs);
9683 rslen = RSTRING_LEN(rs);
9686 while (subptr < pend) {
9687 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9691 if (hit != adjusted) {
9695 subend = hit += rslen;
9698 subend = chomp_newline(subptr, subend, enc);
9705 if (ENUM_ELEM(ary, line)) {
9706 str_mod_check(str, ptr,
len);
9711 if (subptr != pend) {
9714 pend = chomp_newline(subptr, pend, enc);
9716 else if (pend - subptr >= rslen &&
9717 memcmp(pend - rslen, rsptr, rslen) == 0) {
9722 ENUM_ELEM(ary, line);
9743rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9746 return rb_str_enumerate_lines(argc, argv, str, 0);
9759rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9761 VALUE ary = WANTARRAY(
"lines", 0);
9762 return rb_str_enumerate_lines(argc, argv, str, ary);
9776 for (i=0; i<RSTRING_LEN(str); i++) {
9777 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9795rb_str_each_byte(
VALUE str)
9798 return rb_str_enumerate_bytes(str, 0);
9810rb_str_bytes(
VALUE str)
9812 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9813 return rb_str_enumerate_bytes(str, ary);
9831 ptr = RSTRING_PTR(str);
9832 len = RSTRING_LEN(str);
9833 enc = rb_enc_get(str);
9836 for (i = 0; i <
len; i += n) {
9837 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9842 for (i = 0; i <
len; i += n) {
9843 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9864rb_str_each_char(
VALUE str)
9867 return rb_str_enumerate_chars(str, 0);
9879rb_str_chars(
VALUE str)
9882 return rb_str_enumerate_chars(str, ary);
9886rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9891 const char *ptr, *end;
9894 if (single_byte_optimizable(str))
9895 return rb_str_enumerate_bytes(str, ary);
9898 ptr = RSTRING_PTR(str);
9900 enc = STR_ENC_GET(str);
9903 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9924rb_str_each_codepoint(
VALUE str)
9927 return rb_str_enumerate_codepoints(str, 0);
9939rb_str_codepoints(
VALUE str)
9942 return rb_str_enumerate_codepoints(str, ary);
9948 int encidx = rb_enc_to_index(enc);
9950 const OnigUChar source_ascii[] =
"\\X";
9951 const OnigUChar *source = source_ascii;
9952 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that expand one value into a comma-separated list of OnigUChar
 * casts, intended for use inside an array initializer (see CASE_UTF).
 *   CHARS_16BE(x): (x)>>8 first, then x      -- two elements
 *   CHARS_16LE(x): x first, then (x)>>8      -- two elements
 *   CHARS_32BE(x): CHARS_16BE of (x)>>16, then CHARS_16BE of x -- four elements
 *   CHARS_32LE(x): CHARS_16LE of x, then CHARS_16LE of (x)>>16 -- four elements
 * Because the expansion is a bare comma list, these are not usable as
 * ordinary expressions.
 */
9955#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9956#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9957#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9958#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9959#define CASE_UTF(e) \
9960 case ENCINDEX_UTF_##e: { \
9961 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9962 source = source_UTF_##e; \
9963 source_len = sizeof(source_UTF_##e); \
9966 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9974 regex_t *reg_grapheme_cluster;
9976 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9977 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9979 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9980 onig_error_code_to_str(message, r, &einfo);
9981 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9984 return reg_grapheme_cluster;
9990 int encidx = rb_enc_to_index(enc);
9991 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9993 if (encidx == rb_utf8_encindex()) {
9994 if (!reg_grapheme_cluster_utf8) {
9995 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9998 return reg_grapheme_cluster_utf8;
10007 size_t grapheme_cluster_count = 0;
10009 const char *ptr, *end;
10011 if (!rb_enc_unicode_p(enc)) {
10015 bool cached_reg_grapheme_cluster =
true;
10016 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10017 if (!reg_grapheme_cluster) {
10018 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10019 cached_reg_grapheme_cluster =
false;
10022 ptr = RSTRING_PTR(str);
10025 while (ptr < end) {
10026 OnigPosition
len = onig_match(reg_grapheme_cluster,
10027 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10028 (
const OnigUChar *)ptr, NULL, 0);
10029 if (
len <= 0)
break;
10030 grapheme_cluster_count++;
10034 if (!cached_reg_grapheme_cluster) {
10035 onig_free(reg_grapheme_cluster);
10038 return SIZET2NUM(grapheme_cluster_count);
10042rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
10046 const char *ptr0, *ptr, *end;
10048 if (!rb_enc_unicode_p(enc)) {
10049 return rb_str_enumerate_chars(str, ary);
10054 bool cached_reg_grapheme_cluster =
true;
10055 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10056 if (!reg_grapheme_cluster) {
10057 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10058 cached_reg_grapheme_cluster =
false;
10061 ptr0 = ptr = RSTRING_PTR(str);
10064 while (ptr < end) {
10065 OnigPosition
len = onig_match(reg_grapheme_cluster,
10066 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10067 (
const OnigUChar *)ptr, NULL, 0);
10068 if (
len <= 0)
break;
10073 if (!cached_reg_grapheme_cluster) {
10074 onig_free(reg_grapheme_cluster);
10094rb_str_each_grapheme_cluster(
VALUE str)
10097 return rb_str_enumerate_grapheme_clusters(str, 0);
10109rb_str_grapheme_clusters(
VALUE str)
10112 return rb_str_enumerate_grapheme_clusters(str, ary);
10116chopped_length(
VALUE str)
10119 const char *p, *p2, *beg, *end;
10121 beg = RSTRING_PTR(str);
10122 end = beg + RSTRING_LEN(str);
10123 if (beg >= end)
return 0;
10124 p = rb_enc_prev_char(beg, end, end, enc);
10126 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10127 p2 = rb_enc_prev_char(beg, p, end, enc);
10128 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10144rb_str_chop_bang(
VALUE str)
10146 str_modify_keep_cr(str);
10147 if (RSTRING_LEN(str) > 0) {
10149 len = chopped_length(str);
10150 STR_SET_LEN(str,
len);
10151 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10170rb_str_chop(
VALUE str)
10176smart_chomp(
VALUE str,
const char *e,
const char *p)
10179 if (rb_enc_mbminlen(enc) > 1) {
10184 pp = e - rb_enc_mbminlen(enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10195 if (--e > p && *(e-1) ==
'\r') {
10212 char *pp, *e, *rsptr;
10214 char *
const p = RSTRING_PTR(str);
10215 long len = RSTRING_LEN(str);
10217 if (
len == 0)
return 0;
10220 return smart_chomp(str, e, p);
10223 enc = rb_enc_get(str);
10226 if (rb_enc_mbminlen(enc) > 1) {
10231 pp -= rb_enc_mbminlen(enc);
10234 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10241 while (e > p && *(e-1) ==
'\n') {
10243 if (e > p && *(e-1) ==
'\r')
10249 if (rslen >
len)
return len;
10251 enc = rb_enc_get(rs);
10252 newline = rsptr[rslen-1];
10253 if (rslen == rb_enc_mbminlen(enc)) {
10255 if (newline ==
'\n')
10256 return smart_chomp(str, e, p);
10260 return smart_chomp(str, e, p);
10264 enc = rb_enc_check(str, rs);
10265 if (is_broken_string(rs)) {
10269 if (p[
len-1] == newline &&
10271 memcmp(rsptr, pp, rslen) == 0)) {
10272 if (at_char_boundary(p, pp, e, enc))
10273 return len - rslen;
10285chomp_rs(
int argc,
const VALUE *argv)
10289 VALUE rs = argv[0];
10301 long olen = RSTRING_LEN(str);
10302 long len = chompped_length(str, rs);
10303 if (
len >= olen)
return Qnil;
10304 str_modify_keep_cr(str);
10305 STR_SET_LEN(str,
len);
10306 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10323rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10326 str_modifiable(str);
10327 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10328 rs = chomp_rs(argc, argv);
10330 return rb_str_chomp_string(str, rs);
10343rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10345 VALUE rs = chomp_rs(argc, argv);
10353 const char *
const start = s;
10355 if (!s || s >= e)
return 0;
10358 if (single_byte_optimizable(str)) {
10359 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10364 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10384rb_str_lstrip_bang(
VALUE str)
10388 long olen, loffset;
10390 str_modify_keep_cr(str);
10391 enc = STR_ENC_GET(str);
10393 loffset = lstrip_offset(str, start, start+olen, enc);
10395 long len = olen-loffset;
10396 s = start + loffset;
10397 memmove(start, s,
len);
10398 STR_SET_LEN(str,
len);
10399 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10422rb_str_lstrip(
VALUE str)
10427 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10428 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10437 rb_str_check_dummy_enc(enc);
10441 if (!s || s >= e)
return 0;
10445 if (single_byte_optimizable(str)) {
10447 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10452 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10472rb_str_rstrip_bang(
VALUE str)
10476 long olen, roffset;
10478 str_modify_keep_cr(str);
10479 enc = STR_ENC_GET(str);
10481 roffset = rstrip_offset(str, start, start+olen, enc);
10483 long len = olen - roffset;
10485 STR_SET_LEN(str,
len);
10486 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10509rb_str_rstrip(
VALUE str)
10513 long olen, roffset;
10515 enc = STR_ENC_GET(str);
10517 roffset = rstrip_offset(str, start, start+olen, enc);
10519 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10535rb_str_strip_bang(
VALUE str)
10538 long olen, loffset, roffset;
10541 str_modify_keep_cr(str);
10542 enc = STR_ENC_GET(str);
10544 loffset = lstrip_offset(str, start, start+olen, enc);
10545 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10547 if (loffset > 0 || roffset > 0) {
10548 long len = olen-roffset;
10551 memmove(start, start + loffset,
len);
10553 STR_SET_LEN(str,
len);
10554 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10577rb_str_strip(
VALUE str)
10580 long olen, loffset, roffset;
10584 loffset = lstrip_offset(str, start, start+olen, enc);
10585 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10587 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10592scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10595 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10601 end = pos + RSTRING_LEN(pat);
10615 if (RSTRING_LEN(str) > end)
10616 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10625 if (!regs || regs->num_regs == 1) {
10631 for (
int i = 1; i < regs->num_regs; i++) {
10692 long last = -1, prev = 0;
10693 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10695 pat = get_pat_quoted(pat, 1);
10696 mustnot_broken(str);
10700 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10705 if (last >= 0) rb_pat_search(pat, str, last, 1);
10710 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10714 str_mod_check(str, p,
len);
10716 if (last >= 0) rb_pat_search(pat, str, last, 1);
10740rb_str_hex(
VALUE str)
10742 return rb_str_to_inum(str, 16, FALSE);
10767rb_str_oct(
VALUE str)
10769 return rb_str_to_inum(str, -8, FALSE);
10772#ifndef HAVE_CRYPT_R
10777 rb_nativethread_lock_t lock;
10778} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10847# define CRYPT_END() ALLOCV_END(databuf)
10850 extern char *crypt(
const char *,
const char *);
10851# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10854 const char *s, *saltp;
10857 char salt_8bit_clean[3];
10861 mustnot_wchar(str);
10862 mustnot_wchar(salt);
10864 saltp = RSTRING_PTR(salt);
10865 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10866 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10870 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10871 salt_8bit_clean[0] = saltp[0] & 0x7f;
10872 salt_8bit_clean[1] = saltp[1] & 0x7f;
10873 salt_8bit_clean[2] =
'\0';
10874 saltp = salt_8bit_clean;
10879# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10880 data->initialized = 0;
10882 res = crypt_r(s, saltp, data);
10885 res = crypt(s, saltp);
10900 size_t res_size = strlen(res)+1;
10901 tmp_buf =
ALLOCA_N(
char, res_size);
10902 memcpy(tmp_buf, res, res_size);
10939 char *ptr, *p, *pend;
10942 unsigned long sum0 = 0;
10947 ptr = p = RSTRING_PTR(str);
10948 len = RSTRING_LEN(str);
10954 str_mod_check(str, ptr,
len);
10957 sum0 += (
unsigned char)*p;
10968 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10969 sum0 &= (((
unsigned long)1)<<bits)-1;
10989rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10993 long width,
len, flen = 1, fclen = 1;
10996 const char *f =
" ";
10997 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10999 int singlebyte = 1, cr;
11003 enc = STR_ENC_GET(str);
11004 termlen = rb_enc_mbminlen(enc);
11008 enc = rb_enc_check(str, pad);
11009 f = RSTRING_PTR(pad);
11010 flen = RSTRING_LEN(pad);
11011 fclen = str_strlen(pad, enc);
11012 singlebyte = single_byte_optimizable(pad);
11013 if (flen == 0 || fclen == 0) {
11014 rb_raise(rb_eArgError,
"zero width padding");
11017 len = str_strlen(str, enc);
11018 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11020 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11024 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11025 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11027 size = RSTRING_LEN(str);
11028 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11029 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11030 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11031 rb_raise(rb_eArgError,
"argument too big");
11035 p = RSTRING_PTR(res);
11037 memset(p, *f, llen);
11041 while (llen >= fclen) {
11047 memcpy(p, f, llen2);
11051 memcpy(p, RSTRING_PTR(str), size);
11054 memset(p, *f, rlen);
11058 while (rlen >= fclen) {
11064 memcpy(p, f, rlen2);
11068 TERM_FILL(p, termlen);
11069 STR_SET_LEN(res, p-RSTRING_PTR(res));
11092rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11094 return rb_str_justify(argc, argv, str,
'l');
11108rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11110 return rb_str_justify(argc, argv, str,
'r');
11125rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11127 return rb_str_justify(argc, argv, str,
'c');
11143 sep = get_pat_quoted(sep, 0);
11155 pos = rb_str_index(str, sep, 0);
11156 if (pos < 0)
goto failed;
11161 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11164 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11178 long pos = RSTRING_LEN(str);
11180 sep = get_pat_quoted(sep, 0);
11193 pos = rb_str_rindex(str, sep, pos);
11202 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11204 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11216rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11220 for (i=0; i<argc; i++) {
11221 VALUE tmp = argv[i];
11223 if (rb_reg_start_with_p(tmp, str))
11227 const char *p, *s, *e;
11232 enc = rb_enc_check(str, tmp);
11233 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11234 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11235 p = RSTRING_PTR(str);
11238 if (!at_char_right_boundary(p, s, e, enc))
11240 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11256rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11260 for (i=0; i<argc; i++) {
11261 VALUE tmp = argv[i];
11262 const char *p, *s, *e;
11267 enc = rb_enc_check(str, tmp);
11268 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11269 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11270 p = RSTRING_PTR(str);
11273 if (!at_char_boundary(p, s, e, enc))
11275 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11291deleted_prefix_length(
VALUE str,
VALUE prefix)
11293 const char *strptr, *prefixptr;
11294 long olen, prefixlen;
11299 if (!is_broken_string(prefix) ||
11300 !rb_enc_asciicompat(enc) ||
11301 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11302 enc = rb_enc_check(str, prefix);
11306 prefixlen = RSTRING_LEN(prefix);
11307 if (prefixlen <= 0)
return 0;
11308 olen = RSTRING_LEN(str);
11309 if (olen < prefixlen)
return 0;
11310 strptr = RSTRING_PTR(str);
11311 prefixptr = RSTRING_PTR(prefix);
11312 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11313 if (is_broken_string(prefix)) {
11314 if (!is_broken_string(str)) {
11318 const char *strend = strptr + olen;
11319 const char *after_prefix = strptr + prefixlen;
11320 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11340rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11343 str_modify_keep_cr(str);
11345 prefixlen = deleted_prefix_length(str, prefix);
11346 if (prefixlen <= 0)
return Qnil;
11360rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11364 prefixlen = deleted_prefix_length(str, prefix);
11365 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11367 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11380deleted_suffix_length(
VALUE str,
VALUE suffix)
11382 const char *strptr, *suffixptr;
11383 long olen, suffixlen;
11387 if (is_broken_string(suffix))
return 0;
11388 enc = rb_enc_check(str, suffix);
11391 suffixlen = RSTRING_LEN(suffix);
11392 if (suffixlen <= 0)
return 0;
11393 olen = RSTRING_LEN(str);
11394 if (olen < suffixlen)
return 0;
11395 strptr = RSTRING_PTR(str);
11396 suffixptr = RSTRING_PTR(suffix);
11397 const char *strend = strptr + olen;
11398 const char *before_suffix = strend - suffixlen;
11399 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11400 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11415rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11417 long olen, suffixlen,
len;
11418 str_modifiable(str);
11420 suffixlen = deleted_suffix_length(str, suffix);
11421 if (suffixlen <= 0)
return Qnil;
11423 olen = RSTRING_LEN(str);
11424 str_modify_keep_cr(str);
11425 len = olen - suffixlen;
11426 STR_SET_LEN(str,
len);
11427 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11443rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11447 suffixlen = deleted_suffix_length(str, suffix);
11448 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11450 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11457 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11465 val = rb_fs_check(val);
11468 "value of %"PRIsVALUE
" must be String or Regexp",
11472 rb_warn_deprecated(
"'$;'", NULL);
11489 str_modifiable(str);
11492 int idx = rb_enc_to_index(encoding);
11499 rb_enc_associate_index(str, idx);
11523 if (STR_EMBED_P(str)) {
11524 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11529 str_replace_shared_without_enc(str2, str);
11531 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11564rb_str_valid_encoding_p(
VALUE str)
11584rb_str_is_ascii_only_p(
VALUE str)
11594 static const char ellipsis[] =
"...";
11595 const long ellipsislen =
sizeof(ellipsis) - 1;
11597 const long blen = RSTRING_LEN(str);
11598 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11599 VALUE estr, ret = 0;
11602 if (
len * rb_enc_mbminlen(enc) >= blen ||
11606 else if (
len <= ellipsislen ||
11608 if (rb_enc_asciicompat(enc)) {
11610 rb_enc_associate(ret, enc);
11617 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11622 rb_enc_from_encoding(enc), 0,
Qnil);
11635 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11641 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11660 if (enc == STR_ENC_GET(str)) {
11665 return enc_str_scrub(enc, str, repl, cr);
11673 const char *rep, *p, *e, *p1, *sp;
11679 rb_raise(rb_eArgError,
"both of block and replacement given");
11686 if (!
NIL_P(repl)) {
11687 repl = str_compat_and_valid(repl, enc);
11690 if (rb_enc_dummy_p(enc)) {
11693 encidx = rb_enc_to_index(enc);
11695#define DEFAULT_REPLACE_CHAR(str) do { \
11696 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11697 rep = replace; replen = (int)sizeof(replace); \
11700 slen = RSTRING_LEN(str);
11701 p = RSTRING_PTR(str);
11706 if (rb_enc_asciicompat(enc)) {
11712 else if (!
NIL_P(repl)) {
11713 rep = RSTRING_PTR(repl);
11714 replen = RSTRING_LEN(repl);
11717 else if (encidx == rb_utf8_encindex()) {
11718 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11722 DEFAULT_REPLACE_CHAR(
"?");
11727 p = search_nonascii(p, e);
11732 int ret = rb_enc_precise_mbclen(p, e, enc);
11751 if (e - p < clen) clen = e - p;
11758 for (; clen > 1; clen--) {
11759 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11770 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11771 str_mod_check(str, sp, slen);
11772 repl = str_compat_and_valid(repl, enc);
11779 p = search_nonascii(p, e);
11805 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11806 str_mod_check(str, sp, slen);
11807 repl = str_compat_and_valid(repl, enc);
11816 long mbminlen = rb_enc_mbminlen(enc);
11820 else if (!
NIL_P(repl)) {
11821 rep = RSTRING_PTR(repl);
11822 replen = RSTRING_LEN(repl);
11824 else if (encidx == ENCINDEX_UTF_16BE) {
11825 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11827 else if (encidx == ENCINDEX_UTF_16LE) {
11828 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11830 else if (encidx == ENCINDEX_UTF_32BE) {
11831 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11833 else if (encidx == ENCINDEX_UTF_32LE) {
11834 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11837 DEFAULT_REPLACE_CHAR(
"?");
11841 int ret = rb_enc_precise_mbclen(p, e, enc);
11854 if (e - p < clen) clen = e - p;
11855 if (clen <= mbminlen * 2) {
11860 for (; clen > mbminlen; clen-=mbminlen) {
11861 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11871 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11872 str_mod_check(str, sp, slen);
11873 repl = str_compat_and_valid(repl, enc);
11898 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11899 str_mod_check(str, sp, slen);
11900 repl = str_compat_and_valid(repl, enc);
11936str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11944static ID id_normalize;
11945static ID id_normalized_p;
11946static VALUE mUnicodeNormalize;
11949unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11951 static int UnicodeNormalizeRequired = 0;
11954 if (!UnicodeNormalizeRequired) {
11955 rb_require(
"unicode_normalize/normalize.rb");
11956 UnicodeNormalizeRequired = 1;
11960 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11997rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11999 return unicode_normalize_common(argc, argv, str, id_normalize);
12013rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12015 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12042rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12044 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/* sym_equal is a plain alias: every use expands to rb_obj_equal. */
12176#define sym_equal rb_obj_equal
12179sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12183 int c = rb_enc_precise_mbclen(s, send, enc);
12187 c = rb_enc_mbc_to_codepoint(s, send, enc);
12195rb_str_symname_p(
VALUE sym)
12200 rb_encoding *resenc = rb_default_internal_encoding();
12202 if (resenc == NULL) resenc = rb_default_external_encoding();
12203 enc = STR_ENC_GET(sym);
12204 ptr = RSTRING_PTR(sym);
12205 len = RSTRING_LEN(sym);
12206 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12214rb_str_quote_unprintable(
VALUE str)
12222 resenc = rb_default_internal_encoding();
12223 if (resenc == NULL) resenc = rb_default_external_encoding();
12224 enc = STR_ENC_GET(str);
12225 ptr = RSTRING_PTR(str);
12226 len = RSTRING_LEN(str);
12227 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12228 !sym_printable(ptr, ptr +
len, enc)) {
12229 return rb_str_escape(str);
12235rb_id_quote_unprintable(
ID id)
12237 VALUE str = rb_id2str(
id);
12238 if (!rb_str_symname_p(str)) {
12239 return rb_str_escape(str);
12257sym_inspect(
VALUE sym)
12264 if (!rb_str_symname_p(str)) {
12266 len = RSTRING_LEN(str);
12267 rb_str_resize(str,
len + 1);
12268 dest = RSTRING_PTR(str);
12269 memmove(dest + 1, dest,
len);
12273 VALUE orig_str = str;
12275 len = RSTRING_LEN(orig_str);
12276 str = rb_enc_str_new(0,
len + 1, enc);
12279 ptr = RSTRING_PTR(orig_str);
12280 dest = RSTRING_PTR(str);
12281 memcpy(dest + 1, ptr,
len);
12301rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12306 rb_raise(rb_eArgError,
"no receiver given");
12403 return rb_str_match(
rb_sym2str(sym), other);
12418sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12420 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12433sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12435 return rb_str_match_m_p(argc, argv, sym);
12453 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12464sym_length(
VALUE sym)
12478sym_empty(
VALUE sym)
12512sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12528sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12544sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12558sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12560 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12573sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12575 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12587sym_encoding(
VALUE sym)
12593string_for_symbol(
VALUE name)
12598 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12612 name = string_for_symbol(name);
12613 return rb_intern_str(name);
12622 name = string_for_symbol(name);
12646 return rb_fstring(str);
12653 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12665 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12666 rb_enc_autoload(enc);
12670 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12676 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12677 rb_enc_autoload(enc);
12681 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12692rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12697 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12698 rb_str_buf_cat_byte(str, (
char) code);
12708fstring_set_class_i(
VALUE *str,
void *data)
12712 return ST_CONTINUE;
12720 rb_ractor_safe_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12887 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not depend on RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy; the check is performed if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.