14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
48#include "ruby_assert.h"
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
66#undef rb_usascii_str_new
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
130#define RUBY_MAX_CHAR_LEN 16
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
/* STR_SET_NOEMBED(str): sets the STR_NOEMBED flag on str and clears
 * STR_SHARED, STR_SHARED_ROOT and STR_BORROWED.
 * The closing line of this do-block macro is not part of this listing. */
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* STR_SET_EMBED(str): clears STR_NOEMBED on str, and with it STR_SHARED and
 * STR_NOFREE, in a single FL_UNSET. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
/* STR_SET_LEN(str, n): stores n into RSTRING(str)->len.
 * Any further statements of the macro are elided from this listing. */
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
/* str_encindex_fastpath: takes an encoding index. The visible case labels
 * are ENCINDEX_ASCII_8BIT and ENCINDEX_US_ASCII; the switch head, any other
 * labels and the returned values are elided from this listing. */
149str_encindex_fastpath(
int encindex)
153 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_US_ASCII:
163str_enc_fastpath(
VALUE str)
/* TERM_LEN(str): 1 when str_enc_fastpath(str) holds, otherwise the
 * rb_enc_mbminlen() of the encoding looked up from ENCODING_GET(str).
 * Note that str appears twice in the expansion, so it may be evaluated
 * more than once. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* TERM_FILL(ptr, termlen): writes a single '\0' at ptr and, when termlen is
 * greater than 1, zero-fills termlen bytes starting at ptr. Both arguments
 * are captured in locals first, so each is evaluated once.
 * The closing line of the macro is elided from this listing. */
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
/* RESIZE_CAPA(str, capacity): computes TERM_LEN(str) and forwards to
 * RESIZE_CAPA_TERM with that terminator length. */
177#define RESIZE_CAPA(str,capacity) do {\
178 const int termlen = TERM_LEN(str);\
179 RESIZE_CAPA_TERM(str,capacity,termlen);\
/* RESIZE_CAPA_TERM(str, capacity, termlen): changes the buffer capacity.
 *
 * Embedded string whose embed capacity is smaller than capacity + termlen:
 * a buffer of capacity + termlen chars is allocated, the current
 * RSTRING_LEN bytes are copied into it, and the string is switched to heap
 * storage (pointer, length, STR_SET_NOEMBED, aux.capa).
 *
 * The statements from original line 194 on assert the string is not
 * STR_SHARED, reallocate the heap buffer to capacity + termlen (old size
 * taken from STR_HEAP_SIZE) and store the new aux.capa; the branch keyword
 * that selects them is elided from this listing (presumably the
 * non-embedded case).
 *
 * NOTE(review): in the str_embed_capa comparison the macro arguments
 * capacity and termlen are used without parentheses, unlike every other use
 * in this macro; an argument expression with a low-precedence operator
 * would be parsed differently there. */
181#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
182 if (STR_EMBED_P(str)) {\
183 if (str_embed_capa(str) < capacity + termlen) {\
184 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
185 const long tlen = RSTRING_LEN(str);\
186 memcpy(tmp, RSTRING_PTR(str), tlen);\
187 RSTRING(str)->as.heap.ptr = tmp;\
188 RSTRING(str)->len = tlen;\
189 STR_SET_NOEMBED(str);\
190 RSTRING(str)->as.heap.aux.capa = (capacity);\
194 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
195 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
196 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
197 RSTRING(str)->as.heap.aux.capa = (capacity);\
/* STR_SET_SHARED(str, shared_str): makes str reference shared_str.
 * Does nothing when str carries STR_FAKESTR. Otherwise it asserts that
 * str's pointer lies within [ptr, ptr + len] of shared_str, stores
 * shared_str into str's aux.shared slot through RB_OBJ_WRITE, sets
 * STR_SHARED on str and STR_SHARED_ROOT on shared_str, and additionally
 * sets STR_BORROWED on shared_str when its class is 0.
 * The closing lines of the macro are elided from this listing. */
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
/* STR_HEAP_PTR(str): the heap buffer pointer field of str (as.heap.ptr). */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* STR_HEAP_SIZE(str): aux.capa plus TERM_LEN(str), as a size_t. This is the
 * size passed as the old size to the sized realloc/free calls in this file. */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
217#define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 when not defined by the build.
 * With it off, SHARABLE_SUBSTRING_P(beg, len, end) is true only when
 * beg + len equals end. The second definition always yields 1; the
 * #else/#endif lines separating the two are elided from this listing. */
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
/* str_embed_capa: number of bytes available to the embedded byte array of
 * str, computed as the GC slot size of the object minus the offset of
 * as.embed.ary inside struct RString. */
230str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/* rb_str_reembeddable_p: true when none of STR_NOFREE, STR_SHARED_ROOT and
 * STR_SHARED is set on str. */
236rb_str_reembeddable_p(
VALUE str)
238 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242rb_str_embed_size(
long capa)
/* rb_str_size_as_embedded: computes real_size for str.
 *  - embedded string:      rb_str_embed_size(len) + TERM_LEN(str)
 *  - re-embeddable string: rb_str_embed_size(aux.capa) + TERM_LEN(str)
 *  - the remaining assignment uses sizeof(struct RString)
 * sizeof(st_index_t) is then added; the condition guarding that addition,
 * the declaration of real_size and the return are elided from this listing. */
248rb_str_size_as_embedded(
VALUE str)
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
260 real_size =
sizeof(
struct RString);
264 real_size +=
sizeof(st_index_t);
/* STR_EMBEDDABLE_P: whether len + termlen bytes fit an embedded string,
 * i.e. whether the GC can allocate a slot of
 * rb_str_embed_size(len + termlen) bytes. */
271STR_EMBEDDABLE_P(
long len,
long termlen)
273 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
278static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
279static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
281static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
282static inline void str_modifiable(
VALUE str);
/* str_make_independent: calls str_make_independent_expand with the string's
 * current length, an expansion of 0 and its current terminator length. */
287str_make_independent(
VALUE str)
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str),
len, 0L, termlen);
294static inline int str_dependent_p(
VALUE str);
/* rb_str_make_independent: calls str_make_independent only when
 * str_dependent_p(str) reports the string as dependent. */
297rb_str_make_independent(
VALUE str)
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
305rb_str_make_embedded(
VALUE str)
310 char *buf =
RSTRING(str)->as.heap.ptr;
314 STR_SET_LEN(str,
len);
317 memcpy(RSTRING_PTR(str), buf,
len);
321 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
/* rb_debug_rstring_null_ptr: prints a diagnostic to stderr saying that the
 * named function is about to return NULL. The argument list of the fprintf
 * call is elided from this listing (presumably func supplies the %s). */
325rb_debug_rstring_null_ptr(
const char *func)
327 fprintf(stderr,
"%s is returning NULL!! "
328 "SIGSEGV is highly expected to follow immediately.\n"
329 "If you could reproduce, attach your debugger here, "
330 "and look at the passed string.\n",
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
338get_encoding(
VALUE str)
/* mustnot_broken: raises ArgumentError ("invalid byte sequence in <enc>")
 * when is_broken_string(str) is true; the encoding name comes from
 * STR_ENC_GET(str). */
344mustnot_broken(
VALUE str)
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
/* mustnot_wchar: raises ArgumentError ("wide char encoding: <enc>") when the
 * encoding's minimum character length exceeds one byte. The line that
 * obtains enc is elided from this listing. */
352mustnot_wchar(
VALUE str)
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
360static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
368BARE_STRING_P(
VALUE str)
373static inline st_index_t
374str_do_hash(
VALUE str)
376 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 if (e && !is_ascii_string(str)) {
/* str_store_precomputed_hash: copies hash into the bytes that follow the
 * string's terminator (RSTRING_END(str) + TERM_LEN(str)) and sets
 * STR_PRECOMPUTED_HASH on str.
 * used_bytes / free_bytes measure the embed area taken by content plus
 * terminator and what is left of str_embed_capa(str); the statements that
 * consume free_bytes are elided from this listing (presumably a check that
 * the hash fits - TODO confirm). */
385str_store_precomputed_hash(
VALUE str, st_index_t hash)
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
398 FL_SET(str, STR_PRECOMPUTED_HASH);
411 if (
FL_TEST(str, RSTRING_FSTR))
414 bare = BARE_STRING_P(str);
416 if (STR_EMBED_P(str)) {
421 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 rb_str_resize(str, RSTRING_LEN(str));
430 fstr = register_fstring(str,
false,
false);
433 str_replace_shared_without_enc(str, fstr);
440static VALUE fstring_table_obj;
/* fstring_concurrent_set_hash: hash callback for the fstring table.
 * When PRECOMPUTED_FAKESTR_HASH is defined, the visible assignment reads the
 * hash out of the string's aux.capa field; the condition under which that
 * happens and the other hash paths are elided from this listing. */
443fstring_concurrent_set_hash(
VALUE str)
445#ifdef PRECOMPUTED_FAKESTR_HASH
449 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
466 const char *aptr, *bptr;
473 return (alen == blen &&
475 memcmp(aptr, bptr, alen) == 0);
480 bool force_precompute_hash;
484fstring_concurrent_set_create(
VALUE str,
void *data)
494 long len = RSTRING_LEN(str);
495 long capa =
len +
sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
525 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
528 if (STR_SHARED_P(str)) {
530 str_make_independent(str);
533 if (!BARE_STRING_P(str)) {
539 RBASIC(str)->flags |= RSTRING_FSTR;
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
/* Init_fstring_table: creates the fstring table as a concurrent set driven
 * by fstring_concurrent_set_funcs, passing 8192 as the second argument
 * (presumably an initial capacity - confirm against rb_concurrent_set_new),
 * and registers the address of fstring_table_obj with the GC. */
558Init_fstring_table(
void)
560 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
561 rb_gc_register_address(&fstring_table_obj);
/* register_fstring: finds str in the fstring table or inserts it, via
 * rb_concurrent_set_find_or_insert, and asserts the result is not a garbage
 * object. force_precompute_hash is forwarded through the args structure
 * (its other members are elided from this listing).
 * Where pointers and longs have the same size, the visible assignment
 * stashes str_do_hash(str) in the string's aux.capa field; the condition
 * guarding it is elided (presumably fake strings only - TODO confirm). */
565register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
569 .force_precompute_hash = force_precompute_hash
572#if SIZEOF_VOIDP == SIZEOF_LONG
576 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
580 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
582 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
/* rb_obj_is_fstring_table: whether obj is the fstring table object itself. */
592rb_obj_is_fstring_table(
VALUE obj)
596 return obj == fstring_table_obj;
/* rb_gc_free_fstring: removes obj from the fstring table by identity and
 * bumps the obj_str_fstr debug counter. */
600rb_gc_free_fstring(
VALUE obj)
605 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
607 RB_DEBUG_COUNTER_INC(obj_str_fstr);
/* rb_fstring_foreach_with_replace: if the fstring table exists, iterates it
 * with rb_concurrent_set_foreach_with_replace, handing each entry to
 * callback together with data. Does nothing before the table is created. */
613rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
615 if (fstring_table_obj) {
616 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
621setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
624 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
637 return (
VALUE)fake_str;
646 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
/* rb_fstring_new: wraps (ptr, len) in a fake string tagged with
 * ENCINDEX_US_ASCII and passes it to register_fstring with copy = false and
 * force_precompute_hash = false. The declaration of fake_str is elided from
 * this listing. */
655rb_fstring_new(
const char *ptr,
long len)
658 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
665 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
/* rb_fstring_cstr: rb_fstring_new for a NUL-terminated C string; the length
 * is taken with strlen(ptr), so ptr must not be NULL. */
669rb_fstring_cstr(
const char *
ptr)
671 return rb_fstring_new(
ptr, strlen(
ptr));
675single_byte_optimizable(
VALUE str)
679 case ENCINDEX_ASCII_8BIT:
680 case ENCINDEX_US_ASCII:
/* search_nonascii: scans the byte range [p, e) for a byte with the high bit
 * (0x80) set; every visible return yields a pointer to such a byte. What is
 * returned when no such byte exists is elided from this listing.
 *
 * NONASCII_MASK is 0x80 replicated into every byte of a uintptr_t (8- or
 * 4-byte variants, built with UINT64_C/UINT32_C under C99 and with
 * shifts/UL literals otherwise).
 *
 * Visible structure:
 *  - when unaligned word access is not available and p is misaligned, l is
 *    the distance to the next word boundary, and the case 7..1 labels test
 *    the bytes just before p one at a time (the switch head and the pointer
 *    adjustment are elided);
 *  - whole words are then tested against NONASCII_MASK; on a hit the byte
 *    index inside the word is derived from the leading-zero count
 *    (big-endian) or trailing-zero count (otherwise) shifted right by 3;
 *  - a second case 7..1 ladder tests the bytes just before e. */
702static inline const char *
703search_nonascii(
const char *p,
const char *e)
705 const uintptr_t *s, *t;
707#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
708# if SIZEOF_UINTPTR_T == 8
709# define NONASCII_MASK UINT64_C(0x8080808080808080)
710# elif SIZEOF_UINTPTR_T == 4
711# define NONASCII_MASK UINT32_C(0x80808080)
713# error "don't know what to do."
716# if SIZEOF_UINTPTR_T == 8
717# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
718# elif SIZEOF_UINTPTR_T == 4
719# define NONASCII_MASK 0x80808080UL
721# error "don't know what to do."
725 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
726#if !UNALIGNED_WORD_ACCESS
727 if ((uintptr_t)p % SIZEOF_VOIDP) {
728 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
733 case 7:
if (p[-7]&0x80)
return p-7;
734 case 6:
if (p[-6]&0x80)
return p-6;
735 case 5:
if (p[-5]&0x80)
return p-5;
736 case 4:
if (p[-4]&0x80)
return p-4;
738 case 3:
if (p[-3]&0x80)
return p-3;
739 case 2:
if (p[-2]&0x80)
return p-2;
740 case 1:
if (p[-1]&0x80)
return p-1;
745#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
746#define aligned_ptr(value) \
747 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#define aligned_ptr(value) (uintptr_t *)(value)
752 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
755 if (*s & NONASCII_MASK) {
756#ifdef WORDS_BIGENDIAN
757 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
769 case 7:
if (e[-7]&0x80)
return e-7;
770 case 6:
if (e[-6]&0x80)
return e-6;
771 case 5:
if (e[-5]&0x80)
return e-5;
772 case 4:
if (e[-4]&0x80)
return e-4;
774 case 3:
if (e[-3]&0x80)
return e-3;
775 case 2:
if (e[-2]&0x80)
return e-2;
776 case 1:
if (e[-1]&0x80)
return e-1;
784 const char *e = p +
len;
786 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 p = search_nonascii(p, e);
792 if (rb_enc_asciicompat(enc)) {
793 p = search_nonascii(p, e);
796 int ret = rb_enc_precise_mbclen(p, e, enc);
800 p = search_nonascii(p, e);
806 int ret = rb_enc_precise_mbclen(p, e, enc);
822 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
825 p = search_nonascii(p, e);
829 else if (rb_enc_asciicompat(enc)) {
830 p = search_nonascii(p, e);
836 int ret = rb_enc_precise_mbclen(p, e, enc);
843 p = search_nonascii(p, e);
849 int ret = rb_enc_precise_mbclen(p, e, enc);
874 rb_enc_set_index(str1, rb_enc_get_index(str2));
/* rb_enc_cr_str_copy_for_substr: copies src's encoding onto dest
 * (str_enc_copy) and then inspects dest: an empty dest is handled
 * separately depending on whether src's encoding is ASCII-compatible, and a
 * later check asks whether the encoding is ASCII-incompatible or dest
 * contains a non-ASCII byte. The statements selected by these conditions
 * are elided from this listing (presumably code-range updates, given the
 * function name - TODO confirm). */
882rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
887 str_enc_copy(dest, src);
888 if (RSTRING_LEN(dest) == 0) {
889 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
901 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
912rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
914 str_enc_copy(dest, src);
921 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
927 return enc_coderange_scan(str, enc);
936 cr = enc_coderange_scan(str, get_encoding(str));
/* rb_enc_str_asciicompat: true when the encoding index passes
 * str_encindex_fastpath, or otherwise when the encoding looked up from that
 * index is ASCII-compatible. The line that obtains encindex is elided from
 * this listing. */
943rb_enc_str_asciicompat(
VALUE str)
946 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
954 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
/* str_mod_check: detects that s no longer has the pointer p or the length
 * len the caller captured earlier. The action taken when they differ is
 * elided from this listing. */
963str_mod_check(
VALUE s,
const char *p,
long len)
965 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/* str_capacity: capacity of str excluding a terminator of termlen bytes.
 *  - embedded string: str_embed_capa(str) - termlen
 *  - STR_SHARED or STR_NOFREE string: result elided from this listing
 *  - the remaining visible return yields aux.capa */
971str_capacity(
VALUE str,
const int termlen)
973 if (STR_EMBED_P(str)) {
974 return str_embed_capa(str) - termlen;
976 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
980 return RSTRING(str)->as.heap.aux.capa;
987 return str_capacity(str, TERM_LEN(str));
991must_not_null(
const char *
ptr)
994 rb_raise(rb_eArgError,
"NULL pointer given");
1001 size_t size = rb_str_embed_size(
capa);
1005 NEWOBJ_OF(str,
struct RString, klass,
1012str_alloc_heap(
VALUE klass)
1014 NEWOBJ_OF(str,
struct RString, klass,
/* empty_str_alloc: fires the STRING creation DTrace hook with length 0,
 * allocates an embedded string of capacity 0 for klass and zero-fills its
 * whole embedded area (str_embed_capa(str) bytes). */
1021empty_str_alloc(
VALUE klass)
1023 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1024 VALUE str = str_alloc_embed(klass, 0);
1025 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1036 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1040 enc = rb_ascii8bit_encoding();
1043 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1045 int termlen = rb_enc_mbminlen(enc);
1047 if (STR_EMBEDDABLE_P(
len, termlen)) {
1048 str = str_alloc_embed(klass,
len + termlen);
1054 str = str_alloc_heap(klass);
1060 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1063 rb_enc_raw_set(str, enc);
1066 memcpy(RSTRING_PTR(str),
ptr,
len);
1069 STR_SET_LEN(str,
len);
1070 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1077 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1112 __msan_unpoison_string(
ptr);
1132 if (rb_enc_mbminlen(enc) != 1) {
1133 rb_raise(rb_eArgError,
"wchar encoding given");
1135 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
/* str_new_static: builds a string of class klass over (ptr, len) with the
 * encoding given by encindex.
 * Visible behaviour: raises ArgumentError "negative string size (or size
 * too big)" under an elided condition; one path creates the string through
 * str_enc_new; another path fires the DTrace hook, allocates a heap string,
 * marks it STR_NOFREE and associates the encoding index. The conditions
 * choosing between the paths and the pointer/length assignments are elided
 * from this listing. */
1139str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1144 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1148 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1151 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1152 str = str_alloc_heap(klass);
1156 RBASIC(str)->flags |= STR_NOFREE;
1157 rb_enc_associate_index(str, encindex);
1186static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1188 int ecflags,
VALUE ecopts);
1193 int encidx = rb_enc_to_index(enc);
1194 if (rb_enc_get_index(str) == encidx)
1195 return is_ascii_string(str);
1206 if (!to)
return str;
1207 if (!from) from = rb_enc_get(str);
1208 if (from == to)
return str;
1209 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1210 rb_is_ascii8bit_enc(to)) {
1211 if (STR_ENC_GET(str) != to) {
1213 rb_enc_associate(str, to);
1220 from, to, ecflags, ecopts);
1221 if (
NIL_P(newstr)) {
1229rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1234 olen = RSTRING_LEN(newstr);
1235 if (ofs < -olen || olen < ofs)
1237 if (ofs < 0) ofs += olen;
1239 STR_SET_LEN(newstr, ofs);
1243 rb_str_modify(newstr);
1244 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1252 STR_SET_LEN(str, 0);
1253 rb_enc_associate(str, enc);
1259str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1261 int ecflags,
VALUE ecopts)
1266 VALUE econv_wrapper;
1267 const unsigned char *start, *sp;
1268 unsigned char *dest, *dp;
1269 size_t converted_output = (size_t)ofs;
1274 RBASIC_CLEAR_CLASS(econv_wrapper);
1276 if (!ec)
return Qnil;
1279 sp = (
unsigned char*)
ptr;
1281 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1282 (dp = dest + converted_output),
1286 size_t converted_input = sp - start;
1287 size_t rest =
len - converted_input;
1288 converted_output = dp - dest;
1290 if (converted_input && converted_output &&
1291 rest < (LONG_MAX / converted_output)) {
1292 rest = (rest * converted_output) / converted_input;
1297 olen += rest < 2 ? 2 : rest;
1298 rb_str_resize(newstr, olen);
1305 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1307 rb_enc_associate(newstr, to);
1326 const int eidx = rb_enc_to_index(eenc);
1329 return rb_enc_str_new(
ptr,
len, eenc);
1333 if ((eidx == rb_ascii8bit_encindex()) ||
1334 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1338 ienc = rb_default_internal_encoding();
1339 if (!ienc || eenc == ienc) {
1340 return rb_enc_str_new(
ptr,
len, eenc);
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex()) ||
1346 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1347 return rb_enc_str_new(
ptr,
len, ienc);
1350 str = rb_enc_str_new(NULL, 0, ienc);
1353 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1354 rb_str_initialize(str,
ptr,
len, eenc);
1362 int eidx = rb_enc_to_index(eenc);
1363 if (eidx == rb_usascii_encindex() &&
1364 !is_ascii_string(str)) {
1365 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1368 rb_enc_associate_index(str, eidx);
1427str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1429 const int termlen = TERM_LEN(str);
1434 if (str_embed_capa(str2) >=
len + termlen) {
1435 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1436 STR_SET_EMBED(str2);
1437 memcpy(ptr2, RSTRING_PTR(str),
len);
1438 TERM_FILL(ptr2+
len, termlen);
1442 if (STR_SHARED_P(str)) {
1443 root =
RSTRING(str)->as.heap.aux.shared;
1452 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1454 rb_fatal(
"about to free a possible shared root");
1456 char *ptr2 = STR_HEAP_PTR(str2);
1458 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1461 FL_SET(str2, STR_NOEMBED);
1463 STR_SET_SHARED(str2, root);
1466 STR_SET_LEN(str2,
len);
1474 str_replace_shared_without_enc(str2, str);
1475 rb_enc_cr_str_exact_copy(str2, str);
1482 return str_replace_shared(str_alloc_heap(klass), str);
1499rb_str_new_frozen_String(
VALUE orig)
1507rb_str_frozen_bare_string(
VALUE orig)
1509 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1514rb_str_tmp_frozen_acquire(
VALUE orig)
1517 return str_new_frozen_buffer(0, orig, FALSE);
1521rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1523 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1524 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1526 VALUE str = str_alloc_heap(0);
1529 FL_SET(str, STR_SHARED_ROOT);
1531 size_t capa = str_capacity(orig, TERM_LEN(orig));
1537 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1538 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1545 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1546 RBASIC(orig)->flags &= ~STR_NOFREE;
1547 STR_SET_SHARED(orig, str);
1557rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1562 if (STR_EMBED_P(tmp)) {
1565 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1571 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1575 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1576 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1581 STR_SET_LEN(tmp, 0);
1589 return str_new_frozen_buffer(klass, orig, TRUE);
1598 VALUE str = str_alloc_heap(klass);
1599 STR_SET_LEN(str, RSTRING_LEN(orig));
1600 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1601 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1602 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1603 RBASIC(orig)->flags &= ~STR_NOFREE;
1604 STR_SET_SHARED(orig, str);
1611str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1615 long len = RSTRING_LEN(orig);
1616 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1617 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1619 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1620 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1626 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1627 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1633 if ((ofs > 0) || (rest > 0) ||
1636 str = str_new_shared(klass,
shared);
1638 RSTRING(str)->as.heap.ptr += ofs;
1639 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1647 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1648 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1650 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1651 STR_SET_LEN(str, RSTRING_LEN(orig));
1656 str = heap_str_make_shared(klass, orig);
1660 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1672str_new_empty_String(
VALUE str)
1675 rb_enc_copy(v, str);
1679#define STR_BUF_MIN_SIZE 63
1684 if (STR_EMBEDDABLE_P(
capa, 1)) {
1692 RSTRING(str)->as.heap.ptr[0] =
'\0';
1712 return str_new(0, 0,
len);
/* Fragment of a string-free routine whose header is elided from this
 * listing. Embedded strings only bump obj_str_embed; strings flagged
 * STR_SHARED or STR_NOFREE only bump counters; the remaining statements
 * bump obj_str_ptr and release the heap buffer with its recorded size.
 *
 * NOTE(review): both RB_DEBUG_COUNTER_INC_IF calls increment
 * obj_str_shared, once for STR_SHARED and once for STR_NOFREE. The second
 * looks like it was meant for a separate NOFREE counter - confirm against
 * the debug counter list before changing. */
1718 if (STR_EMBED_P(str)) {
1719 RB_DEBUG_COUNTER_INC(obj_str_embed);
1721 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1722 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1726 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1727 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/* rb_str_memsize: for a string that has STR_NOEMBED set and neither
 * STR_SHARED nor STR_NOFREE, reports STR_HEAP_SIZE(str). The value returned
 * in every other case is elided from this listing. */
1732rb_str_memsize(
VALUE str)
1734 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1735 return STR_HEAP_SIZE(str);
1745 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1748static inline void str_discard(
VALUE str);
1749static void str_shared_replace(
VALUE str,
VALUE str2);
1754 if (str != str2) str_shared_replace(str, str2);
1765 enc = STR_ENC_GET(str2);
1768 termlen = rb_enc_mbminlen(enc);
1770 STR_SET_LEN(str, RSTRING_LEN(str2));
1772 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1774 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1775 rb_enc_associate(str, enc);
1779 if (STR_EMBED_P(str2)) {
1781 long len = RSTRING_LEN(str2);
1784 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1785 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1786 RSTRING(str2)->as.heap.ptr = new_ptr;
1787 STR_SET_LEN(str2,
len);
1789 STR_SET_NOEMBED(str2);
1792 STR_SET_NOEMBED(str);
1794 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1796 if (
FL_TEST(str2, STR_SHARED)) {
1798 STR_SET_SHARED(str,
shared);
1801 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1805 STR_SET_EMBED(str2);
1806 RSTRING_PTR(str2)[0] = 0;
1807 STR_SET_LEN(str2, 0);
1808 rb_enc_associate(str, enc);
1822 return rb_obj_as_string_result(str, obj);
1838 len = RSTRING_LEN(str2);
1839 if (STR_SHARED_P(str2)) {
1842 STR_SET_NOEMBED(str);
1843 STR_SET_LEN(str,
len);
1844 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1845 STR_SET_SHARED(str,
shared);
1846 rb_enc_cr_str_exact_copy(str, str2);
1849 str_replace_shared(str, str2);
1858 size_t size = rb_str_embed_size(
capa);
1862 NEWOBJ_OF(str,
struct RString, klass,
1871 NEWOBJ_OF(str,
struct RString, klass,
1882 encidx = rb_enc_get_index(str);
1883 flags &= ~ENCODING_MASK;
1886 if (encidx) rb_enc_associate_index(dup, encidx);
1896 long len = RSTRING_LEN(str);
1901 STR_SET_LEN(dup, RSTRING_LEN(str));
1902 return str_duplicate_setup_encoding(str, dup, flags);
1911 root =
RSTRING(str)->as.heap.aux.shared;
1914 root = str = str_new_frozen(klass, str);
1920 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1921 FL_SET(root, STR_SHARED_ROOT);
1923 flags |= RSTRING_NOEMBED | STR_SHARED;
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1932 if (STR_EMBED_P(str)) {
1933 return str_duplicate_setup_embed(klass, str, dup);
1936 return str_duplicate_setup_heap(klass, str, dup);
1944 if (STR_EMBED_P(str)) {
1945 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1948 dup = str_alloc_heap(klass);
1951 return str_duplicate_setup(klass, str, dup);
1962rb_str_dup_m(
VALUE str)
1964 if (LIKELY(BARE_STRING_P(str))) {
1975 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1982 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1986 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 str_duplicate_setup_embed(klass, str, new_str);
1990 new_str = ec_str_alloc_heap(ec, klass);
1991 str_duplicate_setup_heap(klass, str, new_str);
2000rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2002 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2026 static ID keyword_ids[2];
2027 VALUE orig, opt, venc, vcapa;
2032 if (!keyword_ids[0]) {
2033 keyword_ids[0] = rb_id_encoding();
2034 CONST_ID(keyword_ids[1],
"capacity");
2042 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2043 enc = rb_to_encoding(venc);
2045 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2048 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2050 if (
capa < STR_BUF_MIN_SIZE) {
2051 capa = STR_BUF_MIN_SIZE;
2055 len = RSTRING_LEN(orig);
2059 if (orig == str) n = 0;
2061 str_modifiable(str);
2062 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2064 const size_t size = (size_t)
capa + termlen;
2065 const char *
const old_ptr = RSTRING_PTR(str);
2066 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2067 char *new_ptr =
ALLOC_N(
char, size);
2068 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2069 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2071 RSTRING(str)->as.heap.ptr = new_ptr;
2073 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2074 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2075 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2077 STR_SET_LEN(str,
len);
2080 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2081 rb_enc_cr_str_exact_copy(str, orig);
2083 FL_SET(str, STR_NOEMBED);
2090 rb_enc_associate(str, enc);
2102rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2108 static ID keyword_ids[2];
2118 keyword_ids[0] = rb_id_encoding();
2119 CONST_ID(keyword_ids[1],
"capacity");
2121 encoding = kwargs[0];
2122 capacity = kwargs[1];
2131 if (UNDEF_P(encoding)) {
2133 encoding = rb_obj_encoding(orig);
2137 if (!UNDEF_P(encoding)) {
2138 enc = rb_to_encoding(encoding);
2142 if (UNDEF_P(capacity)) {
2144 VALUE empty_str = str_new(klass,
"", 0);
2146 rb_enc_associate(empty_str, enc);
2150 VALUE copy = str_duplicate(klass, orig);
2151 rb_enc_associate(copy, enc);
2164 if (orig_capa >
capa) {
2169 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2170 STR_SET_LEN(str, 0);
/* is_utf8_lead_byte(c): true unless the top two bits of c are 10, i.e.
 * unless c is a UTF-8 continuation byte (0x80..0xBF). */
2181#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/* count_utf8_lead_bytes_with_word: counts, for one machine word, the bytes
 * that are not UTF-8 continuation bytes.
 * d is initialised on a line elided from this listing (presumably from *s).
 * (d>>6) | (~d>>7) puts "bit 6 set OR bit 7 clear" into the lowest bit of
 * each byte, and the mask NONASCII_MASK >> 7 keeps only those lowest bits,
 * so the population count of d is the number of such bytes. With a popcount
 * builtin available the count is returned directly; the fallback
 * arithmetic is elided. */
2196static inline uintptr_t
2197count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2202 d = (d>>6) | (~d>>7);
2203 d &= NONASCII_MASK >> 7;
2206#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2208 return rb_popcount_intptr(d);
2212# if SIZEOF_VOIDP == 8
/* enc_strlen: number of characters in the byte range [p, e) for encoding
 * enc, given code range cr. The conditions selecting each strategy are
 * largely elided from this listing; the visible strategies are:
 *  - a division path: byte count divided by rb_enc_mbminlen(enc), rounded
 *    up when there is a remainder;
 *  - a lead-byte counting path (uses is_utf8_lead_byte): for inputs longer
 *    than two machine words, bytes before the first word boundary are
 *    counted one by one, aligned words are counted with
 *    count_utf8_lead_bytes_with_word, and the tail is counted byte-wise;
 *  - an ASCII-compatible path that alternates search_nonascii with
 *    rb_enc_fast_mbclen or rb_enc_mbclen steps;
 *  - a generic loop advancing by rb_enc_mbclen and counting iterations. */
2221enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2227 long diff = (long)(e - p);
2228 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2233 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2234 const uintptr_t *s, *t;
2235 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2236 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2237 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2238 while (p < (
const char *)s) {
2239 if (is_utf8_lead_byte(*p))
len++;
2243 len += count_utf8_lead_bytes_with_word(s);
2246 p = (
const char *)s;
2249 if (is_utf8_lead_byte(*p))
len++;
2255 else if (rb_enc_asciicompat(enc)) {
2260 q = search_nonascii(p, e);
2266 p += rb_enc_fast_mbclen(p, e, enc);
2273 q = search_nonascii(p, e);
2279 p += rb_enc_mbclen(p, e, enc);
2286 for (c=0; p<e; c++) {
2287 p += rb_enc_mbclen(p, e, enc);
2302rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2310 long diff = (long)(e - p);
2311 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2313 else if (rb_enc_asciicompat(enc)) {
2317 q = search_nonascii(p, e);
2325 ret = rb_enc_precise_mbclen(p, e, enc);
2340 for (c=0; p<e; c++) {
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2348 if (p + rb_enc_mbminlen(enc) <= e)
2349 p += rb_enc_mbminlen(enc);
2365 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2366 if (!enc) enc = STR_ENC_GET(str);
2367 p = RSTRING_PTR(str);
2372 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2377 return enc_strlen(p, e, enc, cr);
2384 return str_strlen(str, NULL);
2398 return LONG2NUM(str_strlen(str, NULL));
2410rb_str_bytesize(
VALUE str)
/* rb_str_empty: RBOOL of "the byte length of str is zero". */
2428rb_str_empty(
VALUE str)
2430 return RBOOL(RSTRING_LEN(str) == 0);
2449 char *ptr1, *ptr2, *ptr3;
2454 enc = rb_enc_check_str(str1, str2);
2457 termlen = rb_enc_mbminlen(enc);
2458 if (len1 > LONG_MAX - len2) {
2459 rb_raise(rb_eArgError,
"string size too big");
2461 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2462 ptr3 = RSTRING_PTR(str3);
2463 memcpy(ptr3, ptr1, len1);
2464 memcpy(ptr3+len1, ptr2, len2);
2465 TERM_FILL(&ptr3[len1+len2], termlen);
2481 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2484 int enc1 = rb_enc_get_index(str1);
2485 int enc2 = rb_enc_get_index(str2);
2490 else if (enc2 < 0) {
2493 else if (enc1 != enc2) {
2496 else if (len1 > LONG_MAX - len2) {
2530 rb_enc_copy(str2, str);
2535 rb_raise(rb_eArgError,
"negative argument");
2537 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2538 if (STR_EMBEDDABLE_P(
len, 1)) {
2540 memset(RSTRING_PTR(str2), 0,
len + 1);
2547 STR_SET_LEN(str2,
len);
2548 rb_enc_copy(str2, str);
2551 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2552 rb_raise(rb_eArgError,
"argument too big");
2555 len *= RSTRING_LEN(str);
2556 termlen = TERM_LEN(str);
2558 ptr2 = RSTRING_PTR(str2);
2560 n = RSTRING_LEN(str);
2561 memcpy(ptr2, RSTRING_PTR(str), n);
2562 while (n <=
len/2) {
2563 memcpy(ptr2 + n, ptr2, n);
2566 memcpy(ptr2 + n, ptr2,
len-n);
2568 STR_SET_LEN(str2,
len);
2569 TERM_FILL(&ptr2[
len], termlen);
2570 rb_enc_cr_str_copy_for_substr(str2, str);
/* rb_check_lockedtmp: tests whether str carries STR_TMPLOCK. The action
 * taken for a locked string is elided from this listing. */
2607rb_check_lockedtmp(
VALUE str)
2609 if (
FL_TEST(str, STR_TMPLOCK)) {
2616#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
/* str_modifiable: guard run before mutating str. Only when one of the
 * STR_UNMODIFIABLE_MASK flags is set does it do any work: a chilled string
 * is reported through CHILLED_STRING_MUTATED, then rb_check_lockedtmp and
 * rb_check_frozen are applied. Whether the latter two sit on an else branch
 * of the chilled test is not visible in this listing. */
2618str_modifiable(
VALUE str)
2622 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2623 if (CHILLED_STRING_P(str)) {
2624 CHILLED_STRING_MUTATED(str);
2626 rb_check_lockedtmp(str);
2627 rb_check_frozen(str);
/* str_dependent_p: the visible test singles out strings that are embedded
 * or carry neither STR_SHARED nor STR_NOFREE. The values returned on each
 * side are elided from this listing. */
2632str_dependent_p(
VALUE str)
2634 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2644#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
/* str_independent: when any STR_DEPENDANT_MASK flag is set, first runs the
 * str_modifiable guard and then returns the negation of str_dependent_p.
 * The result for strings with none of those flags is elided from this
 * listing. */
2646str_independent(
VALUE str)
2650 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2651 str_modifiable(str);
2652 return !str_dependent_p(str);
2658str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2668 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2673 STR_SET_LEN(str,
len);
2678 oldptr = RSTRING_PTR(str);
2680 memcpy(
ptr, oldptr,
len);
2682 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2685 STR_SET_NOEMBED(str);
2686 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2687 TERM_FILL(
ptr +
len, termlen);
2689 STR_SET_LEN(str,
len);
2696 if (!str_independent(str))
2697 str_make_independent(str);
2706 int termlen = TERM_LEN(str);
2707 long len = RSTRING_LEN(str);
2710 rb_raise(rb_eArgError,
"negative expanding string size");
2712 if (expand >= LONG_MAX -
len) {
2713 rb_raise(rb_eArgError,
"string size too big");
2716 if (!str_independent(str)) {
2717 str_make_independent_expand(str,
len, expand, termlen);
2719 else if (expand > 0) {
2720 RESIZE_CAPA_TERM(str,
len + expand, termlen);
/* str_modify_keep_cr: makes str independent when str_independent reports it
 * is not already. Any further statements are elided from this listing. */
2727str_modify_keep_cr(
VALUE str)
2729 if (!str_independent(str))
2730 str_make_independent(str);
/* str_discard: after the str_modifiable guard, a string that is not
 * embedded and has neither STR_SHARED nor STR_NOFREE gets its heap buffer
 * released (with its recorded size), its pointer cleared and its length set
 * to 0. */
2737str_discard(
VALUE str)
2739 str_modifiable(str);
2740 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2741 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2742 RSTRING(str)->as.heap.ptr = 0;
2743 STR_SET_LEN(str, 0);
2750 int encindex = rb_enc_get_index(str);
2752 if (RB_UNLIKELY(encindex == -1)) {
2756 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2761 if (!rb_enc_asciicompat(enc)) {
2783 return RSTRING_PTR(str);
2787zero_filled(
const char *s,
int n)
2789 for (; n > 0; --n) {
2796str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2798 const char *e = s +
len;
2800 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2801 if (zero_filled(s, minlen))
return s;
2807str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2812 if (str_dependent_p(str)) {
2813 if (!zero_filled(s +
len, termlen))
2814 str_make_independent_expand(str,
len, 0L, termlen);
2817 TERM_FILL(s +
len, termlen);
2820 return RSTRING_PTR(str);
2824rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2826 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2827 long len = RSTRING_LEN(str);
2831 rb_check_lockedtmp(str);
2832 str_make_independent_expand(str,
len, 0L, termlen);
2834 else if (str_dependent_p(str)) {
2835 if (termlen > oldtermlen)
2836 str_make_independent_expand(str,
len, 0L, termlen);
2839 if (!STR_EMBED_P(str)) {
2844 if (termlen > oldtermlen) {
2845 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2853str_null_check(
VALUE str,
int *w)
2855 char *s = RSTRING_PTR(str);
2856 long len = RSTRING_LEN(str);
2858 const int minlen = rb_enc_mbminlen(enc);
2862 if (str_null_char(s,
len, minlen, enc)) {
2865 return str_fill_term(str, s,
len, minlen);
2868 if (!s || memchr(s, 0,
len)) {
2872 s = str_fill_term(str, s,
len, minlen);
2878rb_str_to_cstr(
VALUE str)
2881 return str_null_check(str, &w);
2889 char *s = str_null_check(str, &w);
2892 rb_raise(rb_eArgError,
"string contains null char");
2894 rb_raise(rb_eArgError,
"string contains null byte");
2900rb_str_fill_terminator(
VALUE str,
const int newminlen)
2902 char *s = RSTRING_PTR(str);
2903 long len = RSTRING_LEN(str);
2904 return str_fill_term(str, s,
len, newminlen);
2910 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2936str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2945 else if (rb_enc_asciicompat(enc)) {
2946 const char *p2, *e2;
2949 while (p < e && 0 < nth) {
2956 p2 = search_nonascii(p, e2);
2965 n = rb_enc_mbclen(p, e, enc);
2976 while (p < e && nth--) {
2977 p += rb_enc_mbclen(p, e, enc);
2988 return str_nth_len(p, e, &nth, enc);
2992str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2997 p = str_nth_len(p, e, &nth, enc);
3006str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3008 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3009 if (!pp)
return e - p;
3016 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3017 STR_ENC_GET(str), single_byte_optimizable(str));
3022str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3025 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3026 const uintptr_t *s, *t;
3027 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3028 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3029 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3030 while (p < (
const char *)s) {
3031 if (is_utf8_lead_byte(*p)) nth--;
3035 nth -= count_utf8_lead_bytes_with_word(s);
3037 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3041 if (is_utf8_lead_byte(*p)) {
3042 if (nth == 0)
break;
3052str_utf8_offset(
const char *p,
const char *e,
long nth)
3054 const char *pp = str_utf8_nth(p, e, &nth);
3063 if (single_byte_optimizable(str) || pos < 0)
3066 char *p = RSTRING_PTR(str);
3067 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3072str_subseq(
VALUE str,
long beg,
long len)
3080 const int termlen = TERM_LEN(str);
3081 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3088 if (str_embed_capa(str2) >=
len + termlen) {
3089 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3090 STR_SET_EMBED(str2);
3091 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3092 TERM_FILL(ptr2+
len, termlen);
3094 STR_SET_LEN(str2,
len);
3098 str_replace_shared(str2, str);
3101 RSTRING(str2)->as.heap.ptr += beg;
3102 if (RSTRING_LEN(str2) >
len) {
3103 STR_SET_LEN(str2,
len);
3113 VALUE str2 = str_subseq(str, beg,
len);
3114 rb_enc_cr_str_copy_for_substr(str2, str);
3123 const long blen = RSTRING_LEN(str);
3125 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3127 if (
len < 0)
return 0;
3128 if (beg < 0 && -beg < 0)
return 0;
3132 if (single_byte_optimizable(str)) {
3133 if (beg > blen)
return 0;
3136 if (beg < 0)
return 0;
3138 if (
len > blen - beg)
3140 if (
len < 0)
return 0;
3145 if (
len > -beg)
len = -beg;
3149 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3152 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3158 slen = str_strlen(str, enc);
3160 if (beg < 0)
return 0;
3162 if (
len == 0)
goto end;
3165 else if (beg > 0 && beg > blen) {
3169 if (beg > str_strlen(str, enc))
return 0;
3174 enc == rb_utf8_encoding()) {
3175 p = str_utf8_nth(s, e, &beg);
3176 if (beg > 0)
return 0;
3177 len = str_utf8_offset(p, e,
len);
3183 p = s + beg * char_sz;
3187 else if (
len * char_sz > e - p)
3192 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3193 if (beg > 0)
return 0;
3197 len = str_offset(p, e,
len, enc, 0);
3205static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3210 return str_substr(str, beg,
len, TRUE);
3220str_substr(
VALUE str,
long beg,
long len,
int empty)
3224 if (!p)
return Qnil;
3225 if (!
len && !empty)
return Qnil;
3227 beg = p - RSTRING_PTR(str);
3229 VALUE str2 = str_subseq(str, beg,
len);
3230 rb_enc_cr_str_copy_for_substr(str2, str);
3238 if (CHILLED_STRING_P(str)) {
3243 rb_str_resize(str, RSTRING_LEN(str));
3261 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3304str_uminus(
VALUE str)
3309 return rb_fstring(str);
3313#define rb_str_dup_frozen rb_str_new_frozen
3318 rb_check_frozen(str);
3319 if (
FL_TEST(str, STR_TMPLOCK)) {
3322 FL_SET(str, STR_TMPLOCK);
3329 rb_check_frozen(str);
3330 if (!
FL_TEST(str, STR_TMPLOCK)) {
3350 const int termlen = TERM_LEN(str);
3352 str_modifiable(str);
3353 if (STR_SHARED_P(str)) {
3356 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3357 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3368 else if (
len > RSTRING_LEN(str)) {
3372 const char *
const new_end = RSTRING_PTR(str) +
len;
3382 else if (
len < RSTRING_LEN(str)) {
3390 STR_SET_LEN(str,
len);
3391 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3398 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3401 int independent = str_independent(str);
3402 long slen = RSTRING_LEN(str);
3403 const int termlen = TERM_LEN(str);
3405 if (slen >
len || (termlen != 1 && slen <
len)) {
3411 if (STR_EMBED_P(str)) {
3412 if (
len == slen)
return str;
3413 if (str_embed_capa(str) >=
len + termlen) {
3414 STR_SET_LEN(str,
len);
3418 str_make_independent_expand(str, slen,
len - slen, termlen);
3420 else if (str_embed_capa(str) >=
len + termlen) {
3421 char *
ptr = STR_HEAP_PTR(str);
3423 if (slen >
len) slen =
len;
3426 STR_SET_LEN(str,
len);
3427 if (independent) ruby_xfree(
ptr);
3430 else if (!independent) {
3431 if (
len == slen)
return str;
3432 str_make_independent_expand(str, slen,
len - slen, termlen);
3436 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3437 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3440 else if (
len == slen)
return str;
3441 STR_SET_LEN(str,
len);
3448str_ensure_available_capa(
VALUE str,
long len)
3450 str_modify_keep_cr(str);
3452 const int termlen = TERM_LEN(str);
3453 long olen = RSTRING_LEN(str);
3455 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3456 rb_raise(rb_eArgError,
"string sizes too big");
3459 long total = olen +
len;
3460 long capa = str_capacity(str, termlen);
3463 if (total >= LONG_MAX / 2) {
3466 while (total >
capa) {
3469 RESIZE_CAPA_TERM(str,
capa, termlen);
3474str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3477 str_modify_keep_cr(str);
3482 if (
len == 0)
return 0;
3484 long total, olen,
off = -1;
3486 const int termlen = TERM_LEN(str);
3489 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3493 long capa = str_capacity(str, termlen);
3495 if (olen > LONG_MAX -
len) {
3496 rb_raise(rb_eArgError,
"string sizes too big");
3500 if (total >= LONG_MAX / 2) {
3503 while (total >
capa) {
3506 RESIZE_CAPA_TERM(str,
capa, termlen);
3507 sptr = RSTRING_PTR(str);
3512 memcpy(sptr + olen,
ptr,
len);
3513 STR_SET_LEN(str, total);
3514 TERM_FILL(sptr + total, termlen);
3519#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3520#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3525 if (
len == 0)
return str;
3527 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3529 return str_buf_cat(str,
ptr,
len);
3540rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3545 if (UNLIKELY(!str_independent(str))) {
3546 str_make_independent(str);
3549 long string_length = -1;
3550 const int null_terminator_length = 1;
3555 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3556 rb_raise(rb_eArgError,
"string sizes too big");
3559 long string_capacity = str_capacity(str, null_terminator_length);
3565 if (LIKELY(string_capacity >= string_length + 1)) {
3567 sptr[string_length] = byte;
3568 STR_SET_LEN(str, string_length + 1);
3569 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3573 str_buf_cat(str, (
char *)&
byte, 1);
3589 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3600rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3601 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3610 if (str_encindex == ptr_encindex) {
3612 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3616 str_enc = rb_enc_from_index(str_encindex);
3617 ptr_enc = rb_enc_from_index(ptr_encindex);
3618 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3621 if (RSTRING_LEN(str) == 0) {
3624 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3630 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3639 *ptr_cr_ret = ptr_cr;
3641 if (str_encindex != ptr_encindex &&
3644 str_enc = rb_enc_from_index(str_encindex);
3645 ptr_enc = rb_enc_from_index(ptr_encindex);
3650 res_encindex = str_encindex;
3655 res_encindex = str_encindex;
3659 res_encindex = ptr_encindex;
3664 res_encindex = str_encindex;
3671 res_encindex = str_encindex;
3677 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3679 str_buf_cat(str,
ptr,
len);
3685 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3692 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3702 if (rb_enc_asciicompat(enc)) {
3703 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3709 unsigned int c = (
unsigned char)*
ptr;
3710 int len = rb_enc_codelen(c, enc);
3711 rb_enc_mbcput(c, buf, enc);
3712 rb_enc_cr_str_buf_cat(str, buf,
len,
3725 if (str_enc_fastpath(str)) {
3729 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3735 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3746 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3762rb_str_concat_literals(
size_t num,
const VALUE *strary)
3766 unsigned long len = 1;
3771 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3773 str_enc_copy_direct(str, strary[0]);
3775 for (i = s; i < num; ++i) {
3776 const VALUE v = strary[i];
3780 if (encidx != ENCINDEX_US_ASCII) {
3782 rb_enc_set_index(str, encidx);
3795rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3797 str_modifiable(str);
3802 else if (argc > 1) {
3805 rb_enc_copy(arg_str, str);
3806 for (i = 0; i < argc; i++) {
3841rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3843 long needed_capacity = 0;
3847 for (
int index = 0; index < argc; index++) {
3848 VALUE obj = argv[index];
3856 needed_capacity += RSTRING_LEN(obj);
3861 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3868 str_ensure_available_capa(str, needed_capacity);
3871 for (
int index = 0; index < argc; index++) {
3872 VALUE obj = argv[index];
3877 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3878 char byte = (char)(
NUM2INT(obj) & 0xFF);
3892 rb_bug(
"append_as_bytes arguments should have been validated");
3896 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3897 TERM_FILL(sptr, TERM_LEN(str));
3902 for (
int index = 0; index < argc; index++) {
3903 VALUE obj = argv[index];
3920 rb_bug(
"append_as_bytes arguments should have been validated");
3999 if (rb_num_to_uint(str2, &code) == 0) {
4012 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4015 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4018 long pos = RSTRING_LEN(str1);
4023 switch (
len = rb_enc_codelen(code, enc)) {
4024 case ONIGERR_INVALID_CODE_POINT_VALUE:
4025 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4027 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4033 rb_enc_mbcput(code, buf, enc);
4034 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4035 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4037 rb_str_resize(str1, pos+
len);
4038 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4051rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4053 int encidx = rb_enc_to_index(enc);
4055 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4060 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4061 return ENCINDEX_ASCII_8BIT;
4084rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4086 str_modifiable(str);
4091 else if (argc > 1) {
4094 rb_enc_copy(arg_str, str);
4095 for (i = 0; i < argc; i++) {
4108 st_index_t precomputed_hash;
4109 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4111 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4112 return precomputed_hash;
4115 return str_do_hash(str);
4122 const char *ptr1, *ptr2;
4125 return (len1 != len2 ||
4127 memcmp(ptr1, ptr2, len1) != 0);
4141rb_str_hash_m(
VALUE str)
4147#define lesser(a,b) (((a)>(b))?(b):(a))
4155 if (RSTRING_LEN(str1) == 0)
return TRUE;
4156 if (RSTRING_LEN(str2) == 0)
return TRUE;
4159 if (idx1 == idx2)
return TRUE;
4164 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4168 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4178 const char *ptr1, *ptr2;
4181 if (str1 == str2)
return 0;
4184 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4193 if (len1 > len2)
return 1;
4196 if (retval > 0)
return 1;
4230 if (str1 == str2)
return Qtrue;
4237 return rb_str_eql_internal(str1, str2);
4261 if (str1 == str2)
return Qtrue;
4263 return rb_str_eql_internal(str1, str2);
4295 return rb_invcmp(str1, str2);
4337 return str_casecmp(str1, s);
4345 const char *p1, *p1end, *p2, *p2end;
4347 enc = rb_enc_compatible(str1, str2);
4352 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4353 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4354 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4355 while (p1 < p1end && p2 < p2end) {
4357 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4358 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4360 return INT2FIX(c1 < c2 ? -1 : 1);
4367 while (p1 < p1end && p2 < p2end) {
4368 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4369 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4371 if (0 <= c1 && 0 <= c2) {
4375 return INT2FIX(c1 < c2 ? -1 : 1);
4379 l1 = rb_enc_mbclen(p1, p1end, enc);
4380 l2 = rb_enc_mbclen(p2, p2end, enc);
4381 len = l1 < l2 ? l1 : l2;
4382 r = memcmp(p1, p2,
len);
4384 return INT2FIX(r < 0 ? -1 : 1);
4386 return INT2FIX(l1 < l2 ? -1 : 1);
4392 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4393 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4426 return str_casecmp_p(str1, s);
4433 VALUE folded_str1, folded_str2;
4434 VALUE fold_opt = sym_fold;
4436 enc = rb_enc_compatible(str1, str2);
4441 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4442 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4444 return rb_str_eql(folded_str1, folded_str2);
4448strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4449 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4451 const char *search_start = str_ptr;
4452 long pos, search_len = str_len - offset;
4456 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4457 if (pos < 0)
return pos;
4459 if (t == search_start + pos)
break;
4460 search_len -= t - search_start;
4461 if (search_len <= 0)
return -1;
4462 offset += t - search_start;
4465 return pos + offset;
4469#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4470#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4473rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4475 const char *str_ptr, *str_ptr_end, *sub_ptr;
4476 long str_len, sub_len;
4479 enc = rb_enc_check(str, sub);
4480 if (is_broken_string(sub))
return -1;
4482 str_ptr = RSTRING_PTR(str);
4484 str_len = RSTRING_LEN(str);
4485 sub_ptr = RSTRING_PTR(sub);
4486 sub_len = RSTRING_LEN(sub);
4488 if (str_len < sub_len)
return -1;
4491 long str_len_char, sub_len_char;
4492 int single_byte = single_byte_optimizable(str);
4493 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4494 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4496 offset += str_len_char;
4497 if (offset < 0)
return -1;
4499 if (str_len_char - offset < sub_len_char)
return -1;
4500 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4503 if (sub_len == 0)
return offset;
4506 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4520rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4527 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4528 long slen = str_strlen(str, enc);
4530 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4542 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4543 enc, single_byte_optimizable(str));
4554 pos = rb_str_index(str, sub, pos);
4568str_ensure_byte_pos(
VALUE str,
long pos)
4570 if (!single_byte_optimizable(str)) {
4571 const char *s = RSTRING_PTR(str);
4573 const char *p = s + pos;
4574 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4576 "offset %ld does not land on character boundary", pos);
4649rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4655 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4656 long slen = RSTRING_LEN(str);
4658 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4669 str_ensure_byte_pos(str, pos);
4681 pos = rb_str_byteindex(str, sub, pos);
4682 if (pos >= 0)
return LONG2NUM(pos);
4689memrchr(
const char *search_str,
int chr,
long search_len)
4691 const char *ptr = search_str + search_len;
4692 while (ptr > search_str) {
4693 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4703 char *hit, *adjusted;
4705 long slen, searchlen;
4708 sbeg = RSTRING_PTR(str);
4709 slen = RSTRING_LEN(sub);
4710 if (slen == 0)
return s - sbeg;
4712 t = RSTRING_PTR(sub);
4714 searchlen = s - sbeg + 1;
4716 if (memcmp(s, t, slen) == 0) {
4721 hit = memrchr(sbeg, c, searchlen);
4724 if (hit != adjusted) {
4725 searchlen = adjusted - sbeg;
4728 if (memcmp(hit, t, slen) == 0)
4730 searchlen = adjusted - sbeg;
4731 }
while (searchlen > 0);
4745 enc = rb_enc_check(str, sub);
4746 if (is_broken_string(sub))
return -1;
4747 singlebyte = single_byte_optimizable(str);
4748 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4749 slen = str_strlen(sub, enc);
4752 if (
len < slen)
return -1;
4753 if (
len - pos < slen) pos =
len - slen;
4754 if (
len == 0)
return pos;
4756 sbeg = RSTRING_PTR(str);
4759 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4765 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4766 return str_rindex(str, sub, s, enc);
4827rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4832 long pos,
len = str_strlen(str, enc);
4834 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4836 if (pos < 0 && (pos +=
len) < 0) {
4842 if (pos >
len) pos =
len;
4850 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4851 enc, single_byte_optimizable(str));
4862 pos = rb_str_rindex(str, sub, pos);
4872rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4878 enc = rb_enc_check(str, sub);
4879 if (is_broken_string(sub))
return -1;
4880 len = RSTRING_LEN(str);
4881 slen = RSTRING_LEN(sub);
4884 if (
len < slen)
return -1;
4885 if (
len - pos < slen) pos =
len - slen;
4886 if (
len == 0)
return pos;
4888 sbeg = RSTRING_PTR(str);
4891 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4898 return str_rindex(str, sub, s, enc);
4988rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4992 long pos,
len = RSTRING_LEN(str);
4994 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4996 if (pos < 0 && (pos +=
len) < 0) {
5002 if (pos >
len) pos =
len;
5008 str_ensure_byte_pos(str, pos);
5020 pos = rb_str_byterindex(str, sub, pos);
5021 if (pos >= 0)
return LONG2NUM(pos);
5060 switch (OBJ_BUILTIN_TYPE(y)) {
5112rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5119 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5151rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5155 re = get_pat(argv[0]);
5156 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5165static enum neighbor_char
5171 if (rb_enc_mbminlen(enc) > 1) {
5173 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5175 return NEIGHBOR_NOT_CHAR;
5177 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5179 if (!l)
return NEIGHBOR_NOT_CHAR;
5180 if (l !=
len)
return NEIGHBOR_WRAPPED;
5181 rb_enc_mbcput(c, p, enc);
5182 r = rb_enc_precise_mbclen(p, p +
len, enc);
5184 return NEIGHBOR_NOT_CHAR;
5186 return NEIGHBOR_FOUND;
5189 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5192 return NEIGHBOR_WRAPPED;
5193 ++((
unsigned char*)p)[i];
5194 l = rb_enc_precise_mbclen(p, p+
len, enc);
5198 return NEIGHBOR_FOUND;
5201 memset(p+l, 0xff,
len-l);
5207 for (len2 =
len-1; 0 < len2; len2--) {
5208 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5212 memset(p+len2+1, 0xff,
len-(len2+1));
5217static enum neighbor_char
5222 if (rb_enc_mbminlen(enc) > 1) {
5224 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5226 return NEIGHBOR_NOT_CHAR;
5228 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5229 if (!c)
return NEIGHBOR_NOT_CHAR;
5232 if (!l)
return NEIGHBOR_NOT_CHAR;
5233 if (l !=
len)
return NEIGHBOR_WRAPPED;
5234 rb_enc_mbcput(c, p, enc);
5235 r = rb_enc_precise_mbclen(p, p +
len, enc);
5237 return NEIGHBOR_NOT_CHAR;
5239 return NEIGHBOR_FOUND;
5242 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5245 return NEIGHBOR_WRAPPED;
5246 --((
unsigned char*)p)[i];
5247 l = rb_enc_precise_mbclen(p, p+
len, enc);
5251 return NEIGHBOR_FOUND;
5254 memset(p+l, 0,
len-l);
5260 for (len2 =
len-1; 0 < len2; len2--) {
5261 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5265 memset(p+len2+1, 0,
len-(len2+1));
5279static enum neighbor_char
5280enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5282 enum neighbor_char ret;
5286 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5290 const int max_gaps = 1;
5292 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5294 ctype = ONIGENC_CTYPE_DIGIT;
5296 ctype = ONIGENC_CTYPE_ALPHA;
5298 return NEIGHBOR_NOT_CHAR;
5301 for (
try = 0;
try <= max_gaps; ++
try) {
5302 ret = enc_succ_char(p,
len, enc);
5303 if (ret == NEIGHBOR_FOUND) {
5304 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5306 return NEIGHBOR_FOUND;
5313 ret = enc_pred_char(p,
len, enc);
5314 if (ret == NEIGHBOR_FOUND) {
5315 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5328 return NEIGHBOR_NOT_CHAR;
5331 if (ctype != ONIGENC_CTYPE_DIGIT) {
5333 return NEIGHBOR_WRAPPED;
5337 enc_succ_char(carry,
len, enc);
5338 return NEIGHBOR_WRAPPED;
5406 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5407 rb_enc_cr_str_copy_for_substr(str, orig);
5408 return str_succ(str);
5415 char *sbeg, *s, *e, *last_alnum = 0;
5416 int found_alnum = 0;
5418 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5419 long carry_pos = 0, carry_len = 1;
5420 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5422 slen = RSTRING_LEN(str);
5423 if (slen == 0)
return str;
5425 enc = STR_ENC_GET(str);
5426 sbeg = RSTRING_PTR(str);
5427 s = e = sbeg + slen;
5429 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5430 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5436 l = rb_enc_precise_mbclen(s, e, enc);
5437 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5438 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5439 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5441 case NEIGHBOR_NOT_CHAR:
5443 case NEIGHBOR_FOUND:
5445 case NEIGHBOR_WRAPPED:
5450 carry_pos = s - sbeg;
5455 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5456 enum neighbor_char neighbor;
5457 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5458 l = rb_enc_precise_mbclen(s, e, enc);
5459 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5460 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5462 neighbor = enc_succ_char(tmp, l, enc);
5464 case NEIGHBOR_FOUND:
5468 case NEIGHBOR_WRAPPED:
5471 case NEIGHBOR_NOT_CHAR:
5474 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5476 enc_succ_char(s, l, enc);
5478 if (!rb_enc_asciicompat(enc)) {
5479 MEMCPY(carry, s,
char, l);
5482 carry_pos = s - sbeg;
5486 RESIZE_CAPA(str, slen + carry_len);
5487 sbeg = RSTRING_PTR(str);
5488 s = sbeg + carry_pos;
5489 memmove(s + carry_len, s, slen - carry_pos);
5490 memmove(s, carry, carry_len);
5492 STR_SET_LEN(str, slen);
5493 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5507rb_str_succ_bang(
VALUE str)
5515all_digits_p(
const char *s,
long len)
5569 VALUE end, exclusive;
5573 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5579 VALUE current, after_end;
5586 enc = rb_enc_check(beg, end);
5587 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5589 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5590 char c = RSTRING_PTR(beg)[0];
5591 char e = RSTRING_PTR(end)[0];
5593 if (c > e || (excl && c == e))
return beg;
5595 VALUE str = rb_enc_str_new(&c, 1, enc);
5597 if ((*each)(str, arg))
break;
5598 if (!excl && c == e)
break;
5600 if (excl && c == e)
break;
5605 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5606 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5607 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5612 b = rb_str_to_inum(beg, 10, FALSE);
5613 e = rb_str_to_inum(end, 10, FALSE);
5620 if (excl && bi == ei)
break;
5621 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5626 ID op = excl ?
'<' : idLE;
5627 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5632 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5633 b = rb_funcallv(b, succ, 0, 0);
5640 if (n > 0 || (excl && n == 0))
return beg;
5642 after_end = rb_funcallv(end, succ, 0, 0);
5647 next = rb_funcallv(current, succ, 0, 0);
5648 if ((*each)(current, arg))
break;
5649 if (
NIL_P(next))
break;
5653 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5668 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5669 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5670 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5672 b = rb_str_to_inum(beg, 10, FALSE);
5678 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5686 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5687 b = rb_funcallv(b, succ, 0, 0);
5693 VALUE next = rb_funcallv(current, succ, 0, 0);
5694 if ((*each)(current, arg))
break;
5697 if (RSTRING_LEN(current) == 0)
5708 if (!
rb_equal(str, *argp))
return 0;
5722 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5723 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5724 rb_enc_asciicompat(STR_ENC_GET(val))) {
5725 const char *bp = RSTRING_PTR(beg);
5726 const char *ep = RSTRING_PTR(end);
5727 const char *vp = RSTRING_PTR(val);
5728 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5729 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5737 if (b <= v && v < e)
return Qtrue;
5738 return RBOOL(!
RTEST(exclusive) && v == e);
5745 all_digits_p(bp, RSTRING_LEN(beg)) &&
5746 all_digits_p(ep, RSTRING_LEN(end))) {
5751 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5753 return RBOOL(
NIL_P(val));
5776 return rb_str_subpat(str, indx,
INT2FIX(0));
5779 if (rb_str_index(str, indx, 0) != -1)
5785 long beg,
len = str_strlen(str, NULL);
5797 return str_substr(str, idx, 1, FALSE);
5816rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5820 return rb_str_subpat(str, argv[0], argv[1]);
5823 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5827 return rb_str_aref(str, argv[0]);
5833 char *ptr = RSTRING_PTR(str);
5834 long olen = RSTRING_LEN(str), nlen;
5836 str_modifiable(str);
5837 if (
len > olen)
len = olen;
5839 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5841 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5843 ptr =
RSTRING(str)->as.embed.ary;
5844 memmove(ptr, oldptr +
len, nlen);
5845 if (fl == STR_NOEMBED)
xfree(oldptr);
5848 if (!STR_SHARED_P(str)) {
5850 rb_enc_cr_str_exact_copy(shared, str);
5855 STR_SET_LEN(str, nlen);
5857 if (!SHARABLE_MIDDLE_SUBSTRING) {
5858 TERM_FILL(ptr + nlen, TERM_LEN(str));
5865rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5871 if (beg == 0 && vlen == 0) {
5876 str_modify_keep_cr(str);
5880 RESIZE_CAPA(str, slen + vlen -
len);
5881 sptr = RSTRING_PTR(str);
5890 memmove(sptr + beg + vlen,
5892 slen - (beg +
len));
5894 if (vlen < beg &&
len < 0) {
5898 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5901 STR_SET_LEN(str, slen);
5902 TERM_FILL(&sptr[slen], TERM_LEN(str));
5909 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5918 int singlebyte = single_byte_optimizable(str);
5924 enc = rb_enc_check(str, val);
5925 slen = str_strlen(str, enc);
5927 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5936 if (
len > slen - beg) {
5939 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5944 beg = p - RSTRING_PTR(str);
5946 rb_str_update_0(str, beg,
len, val);
5947 rb_enc_associate(str, enc);
5958 long start, end,
len;
5968 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5972 nth += regs->num_regs;
5982 enc = rb_enc_check_str(str, val);
5983 rb_str_update_0(str, start,
len, val);
5984 rb_enc_associate(str, enc);
5992 switch (
TYPE(indx)) {
5994 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5998 beg = rb_str_index(str, indx, 0);
6053rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
6057 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6065 return rb_str_aset(str, argv[0], argv[1]);
6125rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6133 str_modify_keep_cr(str);
6141 if ((nth += regs->num_regs) <= 0)
return Qnil;
6143 else if (nth >= regs->num_regs)
return Qnil;
6145 len = END(nth) - beg;
6148 else if (argc == 2) {
6157 beg = p - RSTRING_PTR(str);
6161 beg = rb_str_index(str, indx, 0);
6162 if (beg == -1)
return Qnil;
6163 len = RSTRING_LEN(indx);
6175 beg = p - RSTRING_PTR(str);
6184 beg = p - RSTRING_PTR(str);
6188 rb_enc_cr_str_copy_for_substr(result, str);
6196 char *sptr = RSTRING_PTR(str);
6197 long slen = RSTRING_LEN(str);
6198 if (beg +
len > slen)
6202 slen - (beg +
len));
6204 STR_SET_LEN(str, slen);
6205 TERM_FILL(&sptr[slen], TERM_LEN(str));
6216 switch (OBJ_BUILTIN_TYPE(pat)) {
6235get_pat_quoted(
VALUE pat,
int check)
6239 switch (OBJ_BUILTIN_TYPE(pat)) {
6253 if (check && is_broken_string(pat)) {
6260rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6263 pos = rb_str_byteindex(str, pat, pos);
6264 if (set_backref_str) {
6266 str = rb_str_new_frozen_String(str);
6267 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6269 *match = match_data;
6279 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6284rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6286 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6305rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6319 hash = rb_check_hash_type(argv[1]);
6325 pat = get_pat_quoted(argv[0], 1);
6327 str_modifiable(str);
6328 beg = rb_pat_search(pat, str, 0, 1);
6342 end0 = beg0 + RSTRING_LEN(pat);
6351 if (iter || !
NIL_P(hash)) {
6352 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6358 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6361 str_mod_check(str, p,
len);
6362 rb_check_frozen(str);
6368 enc = rb_enc_compatible(str, repl);
6371 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6375 rb_enc_inspect_name(str_enc),
6376 rb_enc_inspect_name(STR_ENC_GET(repl)));
6378 enc = STR_ENC_GET(repl);
6381 rb_enc_associate(str, enc);
6391 rlen = RSTRING_LEN(repl);
6392 len = RSTRING_LEN(str);
6394 RESIZE_CAPA(str,
len + rlen - plen);
6396 p = RSTRING_PTR(str);
6398 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6400 rp = RSTRING_PTR(repl);
6401 memmove(p + beg0, rp, rlen);
6403 STR_SET_LEN(str,
len);
6404 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6433 rb_str_sub_bang(argc, argv, str);
6438str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6441 long beg, beg0, end0;
6442 long offset, blen, slen,
len, last;
6443 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6445 int need_backref_str = -1;
6455 hash = rb_check_hash_type(argv[1]);
6459 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6467 rb_error_arity(argc, 1, 2);
6470 pat = get_pat_quoted(argv[0], 1);
6471 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6474 if (bang)
return Qnil;
6479 blen = RSTRING_LEN(str) + 30;
6481 sp = RSTRING_PTR(str);
6482 slen = RSTRING_LEN(str);
6484 str_enc = STR_ENC_GET(str);
6485 rb_enc_associate(dest, str_enc);
6492 end0 = beg0 + RSTRING_LEN(pat);
6508 if (mode == FAST_MAP) {
6517 val = rb_hash_aref(hash, key);
6520 str_mod_check(str, sp, slen);
6525 else if (need_backref_str) {
6527 if (need_backref_str < 0) {
6528 need_backref_str = val != repl;
6535 len = beg0 - offset;
6549 if (RSTRING_LEN(str) <= end0)
break;
6550 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6552 offset = end0 +
len;
6554 cp = RSTRING_PTR(str) + offset;
6555 if (offset > RSTRING_LEN(str))
break;
6558 if (mode != FAST_MAP && mode != STR) {
6561 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6566 if (RSTRING_LEN(str) > offset) {
6569 rb_pat_search0(pat, str, last, 1, &match);
6571 str_shared_replace(str, dest);
6599rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6601 str_modify_keep_cr(str);
6602 return str_gsub(argc, argv, str, 1);
6625 return str_gsub(argc, argv, str, 0);
6643 str_modifiable(str);
6644 if (str == str2)
return str;
6648 return str_replace(str, str2);
6665rb_str_clear(
VALUE str)
6669 STR_SET_LEN(str, 0);
6670 RSTRING_PTR(str)[0] = 0;
6671 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6687rb_str_chr(
VALUE str)
6711 pos += RSTRING_LEN(str);
6712 if (pos < 0 || RSTRING_LEN(str) <= pos)
6715 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6734 long len = RSTRING_LEN(str);
6735 char *
ptr, *head, *left = 0;
6739 if (pos < -
len ||
len <= pos)
6746 char byte = (char)(
NUM2INT(w) & 0xFF);
6748 if (!str_independent(str))
6749 str_make_independent(str);
6750 enc = STR_ENC_GET(str);
6751 head = RSTRING_PTR(str);
6753 if (!STR_EMBED_P(str)) {
6760 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6768 width = rb_enc_precise_mbclen(left, head+
len, enc);
6770 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6786str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6788 long n = RSTRING_LEN(str);
6790 if (beg > n ||
len < 0)
return Qnil;
6793 if (beg < 0)
return Qnil;
6798 if (!empty)
return Qnil;
6802 VALUE str2 = str_subseq(str, beg,
len);
6804 str_enc_copy_direct(str2, str);
6806 if (RSTRING_LEN(str2) == 0) {
6807 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6841 long beg,
len = RSTRING_LEN(str);
6849 return str_byte_substr(str, beg,
len, TRUE);
6854 return str_byte_substr(str, idx, 1, FALSE);
6866rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6871 return str_byte_substr(str, beg,
len, TRUE);
6874 return str_byte_aref(str, argv[0]);
6878str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6880 long end, slen = RSTRING_LEN(str);
6883 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6892 if (*
len > slen - *beg) {
6896 str_ensure_byte_pos(str, *beg);
6897 str_ensure_byte_pos(str, end);
6911rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6913 long beg,
len, vbeg, vlen;
6918 if (!(argc == 2 || argc == 3 || argc == 5)) {
6919 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6923 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6924 rb_builtin_class_name(argv[0]));
6931 vlen = RSTRING_LEN(val);
6936 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6937 rb_builtin_class_name(argv[2]));
6949 vlen = RSTRING_LEN(val);
6957 str_check_beg_len(str, &beg, &
len);
6958 str_check_beg_len(val, &vbeg, &vlen);
6959 str_modify_keep_cr(str);
6962 rb_enc_associate(str, rb_enc_check(str, val));
6965 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6983rb_str_reverse(
VALUE str)
6990 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6991 enc = STR_ENC_GET(str);
6997 if (RSTRING_LEN(str) > 1) {
6998 if (single_byte_optimizable(str)) {
7005 int clen = rb_enc_fast_mbclen(s, e, enc);
7013 cr = rb_enc_asciicompat(enc) ?
7016 int clen = rb_enc_mbclen(s, e, enc);
7025 STR_SET_LEN(rev, RSTRING_LEN(str));
7026 str_enc_copy_direct(rev, str);
7046rb_str_reverse_bang(
VALUE str)
7048 if (RSTRING_LEN(str) > 1) {
7049 if (single_byte_optimizable(str)) {
7052 str_modify_keep_cr(str);
7053 s = RSTRING_PTR(str);
7062 str_shared_replace(str, rb_str_reverse(str));
7066 str_modify_keep_cr(str);
7091 i = rb_str_index(str, arg, 0);
7093 return RBOOL(i != -1);
7135 rb_raise(rb_eArgError,
"invalid radix %d", base);
7137 return rb_str_to_inum(str, base, FALSE);
7161rb_str_to_f(
VALUE str)
7176rb_str_to_s(
VALUE str)
7188 char s[RUBY_MAX_CHAR_LEN];
7189 int n = rb_enc_codelen(c, enc);
7191 rb_enc_mbcput(c, s, enc);
7196#define CHAR_ESC_LEN 13
7199rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7201 char buf[CHAR_ESC_LEN + 1];
7209 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7211 else if (c < 0x10000) {
7212 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7215 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7220 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7223 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7226 l = (int)strlen(buf);
7232ruby_escaped_char(
int c)
7235 case '\0':
return "\\0";
7236 case '\n':
return "\\n";
7237 case '\r':
return "\\r";
7238 case '\t':
return "\\t";
7239 case '\f':
return "\\f";
7240 case '\013':
return "\\v";
7241 case '\010':
return "\\b";
7242 case '\007':
return "\\a";
7243 case '\033':
return "\\e";
7244 case '\x7f':
return "\\c?";
7250rb_str_escape(
VALUE str)
7254 const char *p = RSTRING_PTR(str);
7256 const char *prev = p;
7257 char buf[CHAR_ESC_LEN + 1];
7259 int unicode_p = rb_enc_unicode_p(enc);
7260 int asciicompat = rb_enc_asciicompat(enc);
7265 int n = rb_enc_precise_mbclen(p, pend, enc);
7267 if (p > prev) str_buf_cat(result, prev, p - prev);
7268 n = rb_enc_mbminlen(enc);
7270 n = (int)(pend - p);
7272 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7273 str_buf_cat(result, buf, strlen(buf));
7279 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7281 cc = ruby_escaped_char(c);
7283 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7284 str_buf_cat(result, cc, strlen(cc));
7287 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7295 if (p > prev) str_buf_cat(result, prev, p - prev);
7319 const char *p, *pend, *prev;
7320 char buf[CHAR_ESC_LEN + 1];
7322 rb_encoding *resenc = rb_default_internal_encoding();
7323 int unicode_p = rb_enc_unicode_p(enc);
7324 int asciicompat = rb_enc_asciicompat(enc);
7326 if (resenc == NULL) resenc = rb_default_external_encoding();
7327 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7328 rb_enc_associate(result, resenc);
7329 str_buf_cat2(result,
"\"");
7337 n = rb_enc_precise_mbclen(p, pend, enc);
7339 if (p > prev) str_buf_cat(result, prev, p - prev);
7340 n = rb_enc_mbminlen(enc);
7342 n = (int)(pend - p);
7344 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7345 str_buf_cat(result, buf, strlen(buf));
7351 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7353 if ((asciicompat || unicode_p) &&
7354 (c ==
'"'|| c ==
'\\' ||
7359 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7360 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7361 str_buf_cat2(result,
"\\");
7362 if (asciicompat || enc == resenc) {
7368 case '\n': cc =
'n';
break;
7369 case '\r': cc =
'r';
break;
7370 case '\t': cc =
't';
break;
7371 case '\f': cc =
'f';
break;
7372 case '\013': cc =
'v';
break;
7373 case '\010': cc =
'b';
break;
7374 case '\007': cc =
'a';
break;
7375 case 033: cc =
'e';
break;
7376 default: cc = 0;
break;
7379 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7382 str_buf_cat(result, buf, 2);
7395 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7399 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7400 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7405 if (p > prev) str_buf_cat(result, prev, p - prev);
7406 str_buf_cat2(result,
"\"");
7411#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7431 int encidx = rb_enc_get_index(str);
7434 const char *p, *pend;
7437 int u8 = (encidx == rb_utf8_encindex());
7438 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7441 if (!rb_enc_asciicompat(enc)) {
7443 len += strlen(enc->name);
7446 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7449 unsigned char c = *p++;
7452 case '"':
case '\\':
7453 case '\n':
case '\r':
7454 case '\t':
case '\f':
7455 case '\013':
case '\010':
case '\007':
case '\033':
7460 clen = IS_EVSTR(p, pend) ? 2 : 1;
7468 if (u8 && c > 0x7F) {
7469 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7471 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7474 else if (cc <= 0xFFFFF)
7487 if (clen > LONG_MAX -
len) {
7494 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7495 q = RSTRING_PTR(result); qend = q +
len + 1;
7499 unsigned char c = *p++;
7501 if (c ==
'"' || c ==
'\\') {
7505 else if (c ==
'#') {
7506 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7509 else if (c ==
'\n') {
7513 else if (c ==
'\r') {
7517 else if (c ==
'\t') {
7521 else if (c ==
'\f') {
7525 else if (c ==
'\013') {
7529 else if (c ==
'\010') {
7533 else if (c ==
'\007') {
7537 else if (c ==
'\033') {
7547 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7549 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7552 snprintf(q, qend-q,
"u%04X", cc);
7554 snprintf(q, qend-q,
"u{%X}", cc);
7559 snprintf(q, qend-q,
"x%02X", c);
7565 if (!rb_enc_asciicompat(enc)) {
7566 snprintf(q, qend-q, nonascii_suffix, enc->name);
7567 encidx = rb_ascii8bit_encindex();
7570 rb_enc_associate_index(result, encidx);
7576unescape_ascii(
unsigned int c)
7600undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7602 const char *s = *ss;
7606 unsigned char buf[6];
7624 *buf = unescape_ascii(*s);
7636 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7637 if (*penc != enc_utf8) {
7639 rb_enc_associate(undumped, enc_utf8);
7656 if (hexlen == 0 || hexlen > 6) {
7662 if (0xd800 <= c && c <= 0xdfff) {
7665 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7675 if (0xd800 <= c && c <= 0xdfff) {
7678 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7706static VALUE rb_str_is_ascii_only_p(
VALUE str);
7724str_undump(
VALUE str)
7726 const char *s = RSTRING_PTR(str);
7729 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7731 bool binary =
false;
7735 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7738 if (!str_null_check(str, &w)) {
7741 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7742 if (*s !=
'"')
goto invalid_format;
7760 static const char force_encoding_suffix[] =
".force_encoding(\"";
7761 static const char dup_suffix[] =
".dup";
7762 const char *encname;
7767 size =
sizeof(dup_suffix) - 1;
7768 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7770 size =
sizeof(force_encoding_suffix) - 1;
7771 if (s_end - s <= size)
goto invalid_format;
7772 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7776 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7780 s = memchr(s,
'"', s_end-s);
7782 if (!s)
goto invalid_format;
7783 if (s_end - s != 2)
goto invalid_format;
7784 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7786 encidx = rb_enc_find_index2(encname, (
long)size);
7790 rb_enc_associate_index(undumped, encidx);
7800 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7811 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7817 if (rb_enc_dummy_p(enc)) {
7824str_true_enc(
VALUE str)
7827 rb_str_check_dummy_enc(enc);
7831static OnigCaseFoldType
7832check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7837 rb_raise(rb_eArgError,
"too many options");
7838 if (argv[0]==sym_turkic) {
7839 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7841 if (argv[1]==sym_lithuanian)
7842 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7844 rb_raise(rb_eArgError,
"invalid second option");
7847 else if (argv[0]==sym_lithuanian) {
7848 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7850 if (argv[1]==sym_turkic)
7851 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7853 rb_raise(rb_eArgError,
"invalid second option");
7857 rb_raise(rb_eArgError,
"too many options");
7858 else if (argv[0]==sym_ascii)
7859 flags |= ONIGENC_CASE_ASCII_ONLY;
7860 else if (argv[0]==sym_fold) {
7861 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7862 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7864 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7867 rb_raise(rb_eArgError,
"invalid option");
7874 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7880#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7881#ifndef CASEMAP_DEBUG
7882# define CASEMAP_DEBUG 0
7890 OnigUChar space[FLEX_ARY_LEN];
7894mapping_buffer_free(
void *p)
7898 while (current_buffer) {
7899 previous_buffer = current_buffer;
7900 current_buffer = current_buffer->next;
7901 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7907 {0, mapping_buffer_free,},
7908 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7916 const OnigUChar *source_current, *source_end;
7917 int target_length = 0;
7918 VALUE buffer_anchor;
7921 size_t buffer_count = 0;
7922 int buffer_length_or_invalid;
7924 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7926 source_current = (OnigUChar*)RSTRING_PTR(source);
7931 while (source_current < source_end) {
7933 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7934 if (CASEMAP_DEBUG) {
7935 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7938 *pre_buffer = current_buffer;
7939 pre_buffer = ¤t_buffer->next;
7940 current_buffer->next = NULL;
7941 current_buffer->capa =
capa;
7942 buffer_length_or_invalid = enc->case_map(flags,
7943 &source_current, source_end,
7944 current_buffer->space,
7945 current_buffer->space+current_buffer->capa,
7947 if (buffer_length_or_invalid < 0) {
7948 current_buffer =
DATA_PTR(buffer_anchor);
7950 mapping_buffer_free(current_buffer);
7951 rb_raise(rb_eArgError,
"input string invalid");
7953 target_length += current_buffer->used = buffer_length_or_invalid;
7955 if (CASEMAP_DEBUG) {
7956 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7959 if (buffer_count==1) {
7960 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7963 char *target_current;
7966 target_current = RSTRING_PTR(target);
7967 current_buffer =
DATA_PTR(buffer_anchor);
7968 while (current_buffer) {
7969 memcpy(target_current, current_buffer->space, current_buffer->used);
7970 target_current += current_buffer->used;
7971 current_buffer = current_buffer->next;
7974 current_buffer =
DATA_PTR(buffer_anchor);
7976 mapping_buffer_free(current_buffer);
7981 str_enc_copy_direct(target, source);
7990 const OnigUChar *source_current, *source_end;
7991 OnigUChar *target_current, *target_end;
7992 long old_length = RSTRING_LEN(source);
7993 int length_or_invalid;
7995 if (old_length == 0)
return Qnil;
7997 source_current = (OnigUChar*)RSTRING_PTR(source);
7999 if (source == target) {
8000 target_current = (OnigUChar*)source_current;
8001 target_end = (OnigUChar*)source_end;
8004 target_current = (OnigUChar*)RSTRING_PTR(target);
8008 length_or_invalid = onigenc_ascii_only_case_map(flags,
8009 &source_current, source_end,
8010 target_current, target_end, enc);
8011 if (length_or_invalid < 0)
8012 rb_raise(rb_eArgError,
"input string invalid");
8013 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8014 fprintf(stderr,
"problem with rb_str_ascii_casemap"
8015 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8016 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
8017 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8020 str_enc_copy(target, source);
8026upcase_single(
VALUE str)
8028 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8029 bool modified =
false;
8032 unsigned int c = *(
unsigned char*)s;
8034 if (
'a' <= c && c <=
'z') {
8035 *s =
'A' + (c -
'a');
8063rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8066 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8068 flags = check_case_options(argc, argv, flags);
8069 str_modify_keep_cr(str);
8070 enc = str_true_enc(str);
8071 if (case_option_single_p(flags, enc, str)) {
8072 if (upcase_single(str))
8073 flags |= ONIGENC_CASE_MODIFIED;
8075 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8076 rb_str_ascii_casemap(str, str, &flags, enc);
8078 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8080 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8102rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8105 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8108 flags = check_case_options(argc, argv, flags);
8109 enc = str_true_enc(str);
8110 if (case_option_single_p(flags, enc, str)) {
8111 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8112 str_enc_copy_direct(ret, str);
8115 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8117 rb_str_ascii_casemap(str, ret, &flags, enc);
8120 ret = rb_str_casemap(str, &flags, enc);
8127downcase_single(
VALUE str)
8129 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8130 bool modified =
false;
8133 unsigned int c = *(
unsigned char*)s;
8135 if (
'A' <= c && c <=
'Z') {
8136 *s =
'a' + (c -
'A');
8158rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8161 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8163 flags = check_case_options(argc, argv, flags);
8164 str_modify_keep_cr(str);
8165 enc = str_true_enc(str);
8166 if (case_option_single_p(flags, enc, str)) {
8167 if (downcase_single(str))
8168 flags |= ONIGENC_CASE_MODIFIED;
8170 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8171 rb_str_ascii_casemap(str, str, &flags, enc);
8173 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8175 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8189rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8192 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8195 flags = check_case_options(argc, argv, flags);
8196 enc = str_true_enc(str);
8197 if (case_option_single_p(flags, enc, str)) {
8198 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8199 str_enc_copy_direct(ret, str);
8200 downcase_single(ret);
8202 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8204 rb_str_ascii_casemap(str, ret, &flags, enc);
8207 ret = rb_str_casemap(str, &flags, enc);
8227rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8230 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8232 flags = check_case_options(argc, argv, flags);
8233 str_modify_keep_cr(str);
8234 enc = str_true_enc(str);
8235 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8236 if (flags&ONIGENC_CASE_ASCII_ONLY)
8237 rb_str_ascii_casemap(str, str, &flags, enc);
8239 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8241 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8274rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8277 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8280 flags = check_case_options(argc, argv, flags);
8281 enc = str_true_enc(str);
8282 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8283 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8285 rb_str_ascii_casemap(str, ret, &flags, enc);
8288 ret = rb_str_casemap(str, &flags, enc);
8315rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8318 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8320 flags = check_case_options(argc, argv, flags);
8321 str_modify_keep_cr(str);
8322 enc = str_true_enc(str);
8323 if (flags&ONIGENC_CASE_ASCII_ONLY)
8324 rb_str_ascii_casemap(str, str, &flags, enc);
8326 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8328 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8352rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8355 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8358 flags = check_case_options(argc, argv, flags);
8359 enc = str_true_enc(str);
8360 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8361 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8363 rb_str_ascii_casemap(str, ret, &flags, enc);
8366 ret = rb_str_casemap(str, &flags, enc);
8371typedef unsigned char *USTR;
8375 unsigned int now, max;
8387 if (t->p == t->pend)
return -1;
8388 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8391 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8393 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8395 if (t->p < t->pend) {
8396 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8399 if (t->now < 0x80 && c < 0x80) {
8400 rb_raise(rb_eArgError,
8401 "invalid range \"%c-%c\" in string transliteration",
8405 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8409 else if (t->now < c) {
8418 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8419 if (t->now == t->max) {
8424 if (t->now < t->max) {
8440 const unsigned int errc = -1;
8441 unsigned int trans[256];
8443 struct tr trsrc, trrepl;
8445 unsigned int c, c0, last = 0;
8446 int modify = 0, i, l;
8447 unsigned char *s, *send;
8449 int singlebyte = single_byte_optimizable(str);
8453#define CHECK_IF_ASCII(c) \
8454 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8455 (cr = ENC_CODERANGE_VALID) : 0)
8459 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8460 if (RSTRING_LEN(repl) == 0) {
8461 return rb_str_delete_bang(1, &src, str);
8465 e1 = rb_enc_check(str, src);
8466 e2 = rb_enc_check(str, repl);
8471 enc = rb_enc_check(src, repl);
8473 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8474 if (RSTRING_LEN(src) > 1 &&
8475 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8476 trsrc.p + l < trsrc.pend) {
8480 trrepl.p = RSTRING_PTR(repl);
8481 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8482 trsrc.gen = trrepl.gen = 0;
8483 trsrc.now = trrepl.now = 0;
8484 trsrc.max = trrepl.max = 0;
8487 for (i=0; i<256; i++) {
8490 while ((c = trnext(&trsrc, enc)) != errc) {
8495 if (!hash) hash = rb_hash_new();
8499 while ((c = trnext(&trrepl, enc)) != errc)
8502 for (i=0; i<256; i++) {
8503 if (trans[i] != errc) {
8511 for (i=0; i<256; i++) {
8514 while ((c = trnext(&trsrc, enc)) != errc) {
8515 r = trnext(&trrepl, enc);
8516 if (r == errc) r = trrepl.now;
8519 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8522 if (!hash) hash = rb_hash_new();
8530 str_modify_keep_cr(str);
8531 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8532 termlen = rb_enc_mbminlen(enc);
8535 long offset, max = RSTRING_LEN(str);
8536 unsigned int save = -1;
8537 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8542 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8545 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8548 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8550 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8559 if (cflag) c = last;
8562 else if (cflag) c = errc;
8568 if (c != (
unsigned int)-1) {
8574 tlen = rb_enc_codelen(c, enc);
8580 if (enc != e1) may_modify = 1;
8582 if ((offset = t - buf) + tlen > max) {
8583 size_t MAYBE_UNUSED(old) = max + termlen;
8584 max = offset + tlen + (send - s);
8585 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8588 rb_enc_mbcput(c, t, enc);
8589 if (may_modify && memcmp(s, t, tlen) != 0) {
8595 if (!STR_EMBED_P(str)) {
8596 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8598 TERM_FILL((
char *)t, termlen);
8599 RSTRING(str)->as.heap.ptr = (
char *)buf;
8600 STR_SET_LEN(str, t - buf);
8601 STR_SET_NOEMBED(str);
8602 RSTRING(str)->as.heap.aux.capa = max;
8606 c = (
unsigned char)*s;
8607 if (trans[c] != errc) {
8624 long offset, max = (long)((send - s) * 1.2);
8625 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8630 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8633 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8636 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8638 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8646 if (cflag) c = last;
8649 else if (cflag) c = errc;
8653 c = cflag ? last : errc;
8656 tlen = rb_enc_codelen(c, enc);
8661 if (enc != e1) may_modify = 1;
8663 if ((offset = t - buf) + tlen > max) {
8664 size_t MAYBE_UNUSED(old) = max + termlen;
8665 max = offset + tlen + (long)((send - s) * 1.2);
8666 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8670 rb_enc_mbcput(c, t, enc);
8671 if (may_modify && memcmp(s, t, tlen) != 0) {
8679 if (!STR_EMBED_P(str)) {
8680 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8682 TERM_FILL((
char *)t, termlen);
8683 RSTRING(str)->as.heap.ptr = (
char *)buf;
8684 STR_SET_LEN(str, t - buf);
8685 STR_SET_NOEMBED(str);
8686 RSTRING(str)->as.heap.aux.capa = max;
8692 rb_enc_associate(str, enc);
8711 return tr_trans(str, src, repl, 0);
8758 tr_trans(str, src, repl, 0);
8762#define TR_TABLE_MAX (UCHAR_MAX+1)
8763#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8765tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8768 const unsigned int errc = -1;
8769 char buf[TR_TABLE_MAX];
8772 VALUE table = 0, ptable = 0;
8773 int i, l, cflag = 0;
8775 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8776 tr.gen =
tr.now =
tr.max = 0;
8778 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8783 for (i=0; i<TR_TABLE_MAX; i++) {
8786 stable[TR_TABLE_MAX] = cflag;
8788 else if (stable[TR_TABLE_MAX] && !cflag) {
8789 stable[TR_TABLE_MAX] = 0;
8791 for (i=0; i<TR_TABLE_MAX; i++) {
8795 while ((c = trnext(&
tr, enc)) != errc) {
8796 if (c < TR_TABLE_MAX) {
8797 buf[(
unsigned char)c] = !cflag;
8802 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8805 table = ptable ? ptable : rb_hash_new();
8809 table = rb_hash_new();
8814 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8815 rb_hash_aset(table, key,
Qtrue);
8819 for (i=0; i<TR_TABLE_MAX; i++) {
8820 stable[i] = stable[i] && buf[i];
8822 if (!table && !cflag) {
8829tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8831 if (c < TR_TABLE_MAX) {
8832 return table[c] != 0;
8838 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8839 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8843 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8846 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8861rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8863 char squeez[TR_TABLE_SIZE];
8866 VALUE del = 0, nodel = 0;
8868 int i, ascompat, cr;
8870 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8872 for (i=0; i<argc; i++) {
8876 enc = rb_enc_check(str, s);
8877 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8880 str_modify_keep_cr(str);
8881 ascompat = rb_enc_asciicompat(enc);
8882 s = t = RSTRING_PTR(str);
8889 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8900 c = rb_enc_codepoint_len(s, send, &clen, enc);
8902 if (tr_find(c, squeez, del, nodel)) {
8906 if (t != s) rb_enc_mbcput(c, t, enc);
8913 TERM_FILL(t, TERM_LEN(str));
8914 STR_SET_LEN(str, t - RSTRING_PTR(str));
8917 if (modify)
return str;
8931rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8934 rb_str_delete_bang(argc, argv, str);
8948rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8950 char squeez[TR_TABLE_SIZE];
8952 VALUE del = 0, nodel = 0;
8953 unsigned char *s, *send, *t;
8955 int ascompat, singlebyte = single_byte_optimizable(str);
8959 enc = STR_ENC_GET(str);
8962 for (i=0; i<argc; i++) {
8966 enc = rb_enc_check(str, s);
8967 if (singlebyte && !single_byte_optimizable(s))
8969 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8973 str_modify_keep_cr(str);
8974 s = t = (
unsigned char *)RSTRING_PTR(str);
8975 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8978 ascompat = rb_enc_asciicompat(enc);
8982 unsigned int c = *s++;
8983 if (c != save || (argc > 0 && !squeez[c])) {
8993 if (ascompat && (c = *s) < 0x80) {
8994 if (c != save || (argc > 0 && !squeez[c])) {
9000 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
9002 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9003 if (t != s) rb_enc_mbcput(c, t, enc);
9012 TERM_FILL((
char *)t, TERM_LEN(str));
9013 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9014 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
9018 if (modify)
return str;
9041rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
9044 rb_str_squeeze_bang(argc, argv, str);
9062 return tr_trans(str, src, repl, 1);
9085 tr_trans(str, src, repl, 1);
9098rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9100 char table[TR_TABLE_SIZE];
9102 VALUE del = 0, nodel = 0, tstr;
9112 enc = rb_enc_check(str, tstr);
9115 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9116 (ptstr = RSTRING_PTR(tstr),
9117 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9118 !is_broken_string(str)) {
9120 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9122 s = RSTRING_PTR(str);
9123 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9126 if (*(
unsigned char*)s++ == c) n++;
9132 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9133 for (i=1; i<argc; i++) {
9136 enc = rb_enc_check(str, tstr);
9137 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9140 s = RSTRING_PTR(str);
9141 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9143 ascompat = rb_enc_asciicompat(enc);
9147 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9155 c = rb_enc_codepoint_len(s, send, &clen, enc);
9156 if (tr_find(c, table, del, nodel)) {
9167rb_fs_check(
VALUE val)
9171 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table with 256 entries (16 rows of 16).
 * The only non-zero entries are indices 9 through 13 (TAB, LF, VT, FF, CR
 * in ASCII) and index 32 (space); every other byte value, including all
 * bytes >= 0x80, maps to 0.
 */
9176static const char isspacetable[256] = {
9177 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9179 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9182 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Looks up c in isspacetable and yields that table entry (non-zero or 0).
 * The argument is cast to unsigned char first, so a negative plain-char
 * value cannot produce a negative array index.
 */
9195#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9198split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9200 if (empty_count >= 0 &&
len == 0) {
9201 return empty_count + 1;
9203 if (empty_count > 0) {
9208 }
while (--empty_count > 0);
9212 rb_yield(str_new_empty_String(str));
9213 }
while (--empty_count > 0);
9227 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9231literal_split_pattern(
VALUE spat, split_type_t default_type)
9239 return SPLIT_TYPE_CHARS;
9241 else if (rb_enc_asciicompat(enc)) {
9242 if (
len == 1 && ptr[0] ==
' ') {
9243 return SPLIT_TYPE_AWK;
9248 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9249 return SPLIT_TYPE_AWK;
9252 return default_type;
9265rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9270 split_type_t split_type;
9271 long beg, end, i = 0, empty_count = -1;
9276 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9278 if (lim <= 0) limit =
Qnil;
9279 else if (lim == 1) {
9280 if (RSTRING_LEN(str) == 0)
9291 if (
NIL_P(limit) && !lim) empty_count = 0;
9293 enc = STR_ENC_GET(str);
9294 split_type = SPLIT_TYPE_REGEXP;
9296 spat = get_pat_quoted(spat, 0);
9298 else if (
NIL_P(spat = rb_fs)) {
9299 split_type = SPLIT_TYPE_AWK;
9301 else if (!(spat = rb_fs_check(spat))) {
9302 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9307 if (split_type != SPLIT_TYPE_AWK) {
9312 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9313 if (split_type == SPLIT_TYPE_AWK) {
9315 split_type = SPLIT_TYPE_STRING;
9320 mustnot_broken(spat);
9321 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * SPLIT_STR(beg, len): comma expression that
 *   1. calls split_string(result, str, beg, len, empty_count) and stores the
 *      return value back into empty_count, then
 *   2. calls str_mod_check(str, str_start, str_len).
 * The value of the macro is the value of the second call.
 * Not hygienic: it relies on result, str, empty_count, str_start and str_len
 * being variables in the scope where it is expanded.
 * NOTE(review): str_mod_check is presumably a guard against str being
 * modified while the split is in progress -- confirm against its definition.
 */
9329#define SPLIT_STR(beg, len) ( \
9330 empty_count = split_string(result, str, beg, len, empty_count), \
9331 str_mod_check(str, str_start, str_len))
9334 char *ptr = RSTRING_PTR(str);
9335 char *
const str_start = ptr;
9336 const long str_len = RSTRING_LEN(str);
9337 char *
const eptr = str_start + str_len;
9338 if (split_type == SPLIT_TYPE_AWK) {
9345 if (is_ascii_string(str)) {
9346 while (ptr < eptr) {
9347 c = (
unsigned char)*ptr++;
9349 if (ascii_isspace(c)) {
9355 if (!
NIL_P(limit) && lim <= i)
break;
9358 else if (ascii_isspace(c)) {
9359 SPLIT_STR(beg, end-beg);
9362 if (!
NIL_P(limit)) ++i;
9370 while (ptr < eptr) {
9373 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9382 if (!
NIL_P(limit) && lim <= i)
break;
9386 SPLIT_STR(beg, end-beg);
9389 if (!
NIL_P(limit)) ++i;
9397 else if (split_type == SPLIT_TYPE_STRING) {
9398 char *substr_start = ptr;
9399 char *sptr = RSTRING_PTR(spat);
9400 long slen = RSTRING_LEN(spat);
9403 mustnot_broken(str);
9404 enc = rb_enc_check(str, spat);
9405 while (ptr < eptr &&
9406 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9409 if (t != ptr + end) {
9413 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9414 str_mod_check(spat, sptr, slen);
9417 if (!
NIL_P(limit) && lim <= ++i)
break;
9419 beg = ptr - str_start;
9421 else if (split_type == SPLIT_TYPE_CHARS) {
9425 mustnot_broken(str);
9426 enc = rb_enc_get(str);
9427 while (ptr < eptr &&
9428 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9429 SPLIT_STR(ptr - str_start, n);
9431 if (!
NIL_P(limit) && lim <= ++i)
break;
9433 beg = ptr - str_start;
9437 long len = RSTRING_LEN(str);
9445 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9450 if (start == end && BEG(0) == END(0)) {
9455 else if (last_null == 1) {
9456 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9463 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9469 SPLIT_STR(beg, end-beg);
9470 beg = start = END(0);
9474 for (idx=1; idx < regs->num_regs; idx++) {
9475 if (BEG(idx) == -1)
continue;
9476 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9478 if (!
NIL_P(limit) && lim <= ++i)
break;
9480 if (match) rb_match_unbusy(match);
9482 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9483 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9486 return result ? result : str;
9496 return rb_str_split_m(1, &sep, str);
/*
 * WANTARRAY(m, size): when rb_block_given_p() is false, yields
 * rb_ary_new_capa(size); otherwise yields 0.
 * The first argument m is accepted but does not appear in the expansion.
 */
9499#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/*
 * ENUM_ELEM(ary, e): shorthand that forwards both arguments unchanged to
 * enumerator_element(ary, e) and yields its result.
 * NOTE(review): the meaning of that result is defined by
 * enumerator_element, which is not shown here -- confirm there.
 */
9514#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9517chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9519 const char *prev = rb_enc_prev_char(p, e, e, enc);
9522 prev = rb_enc_prev_char(p, e, e, enc);
9523 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9535 RSTRING_LEN(rs) != 1 ||
9536 RSTRING_PTR(rs)[0] !=
'\n')) {
/*
 * From this point on, every use of the identifier rb_rs expands to a call
 * to get_rs(), so the value is obtained through that function on each use.
 * NOTE(review): what get_rs() does beyond returning the separator is not
 * visible here -- confirm against its definition.
 */
9542#define rb_rs get_rs()
9549 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9550 long pos,
len, rslen;
9556 static ID keywords[1];
9561 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9565 if (!ENUM_ELEM(ary, str)) {
9573 if (!RSTRING_LEN(str))
goto end;
9575 ptr = subptr = RSTRING_PTR(str);
9577 len = RSTRING_LEN(str);
9579 rslen = RSTRING_LEN(rs);
9582 enc = rb_enc_get(str);
9584 enc = rb_enc_check(str, rs);
9589 const char *eol = NULL;
9591 while (subend < pend) {
9592 long chomp_rslen = 0;
9594 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9596 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9598 if (eol == subend)
break;
9602 chomp_rslen = -rslen;
9606 if (!subptr) subptr = subend;
9610 }
while (subend < pend);
9612 if (rslen == 0) chomp_rslen = 0;
9614 subend - subptr + (chomp ? chomp_rslen : rslen));
9615 if (ENUM_ELEM(ary, line)) {
9616 str_mod_check(str, ptr,
len);
9618 subptr = eol = NULL;
9623 rsptr = RSTRING_PTR(rs);
9624 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9633 rsptr = RSTRING_PTR(rs);
9634 rslen = RSTRING_LEN(rs);
9637 while (subptr < pend) {
9638 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9642 if (hit != adjusted) {
9646 subend = hit += rslen;
9649 subend = chomp_newline(subptr, subend, enc);
9656 if (ENUM_ELEM(ary, line)) {
9657 str_mod_check(str, ptr,
len);
9662 if (subptr != pend) {
9665 pend = chomp_newline(subptr, pend, enc);
9667 else if (pend - subptr >= rslen &&
9668 memcmp(pend - rslen, rsptr, rslen) == 0) {
9673 ENUM_ELEM(ary, line);
9694rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9697 return rb_str_enumerate_lines(argc, argv, str, 0);
9710rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9712 VALUE ary = WANTARRAY(
"lines", 0);
9713 return rb_str_enumerate_lines(argc, argv, str, ary);
9727 for (i=0; i<RSTRING_LEN(str); i++) {
9728 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9746rb_str_each_byte(
VALUE str)
9749 return rb_str_enumerate_bytes(str, 0);
9761rb_str_bytes(
VALUE str)
9763 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9764 return rb_str_enumerate_bytes(str, ary);
9782 ptr = RSTRING_PTR(str);
9783 len = RSTRING_LEN(str);
9784 enc = rb_enc_get(str);
9787 for (i = 0; i <
len; i += n) {
9788 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9793 for (i = 0; i <
len; i += n) {
9794 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9815rb_str_each_char(
VALUE str)
9818 return rb_str_enumerate_chars(str, 0);
9830rb_str_chars(
VALUE str)
9833 return rb_str_enumerate_chars(str, ary);
9837rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9842 const char *ptr, *end;
9845 if (single_byte_optimizable(str))
9846 return rb_str_enumerate_bytes(str, ary);
9849 ptr = RSTRING_PTR(str);
9851 enc = STR_ENC_GET(str);
9854 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9875rb_str_each_codepoint(
VALUE str)
9878 return rb_str_enumerate_codepoints(str, 0);
9890rb_str_codepoints(
VALUE str)
9893 return rb_str_enumerate_codepoints(str, ary);
9899 int encidx = rb_enc_to_index(enc);
9901 const OnigUChar source_ascii[] =
"\\X";
9902 const OnigUChar *source = source_ascii;
9903 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that expand one code unit x into a comma-separated list of
 * OnigUChar values, suitable for use inside an array initializer.
 *   CHARS_16BE(x): (x)>>8 first, then x          (high part, then low part)
 *   CHARS_16LE(x): x first, then (x)>>8          (low part, then high part)
 *   CHARS_32BE(x): CHARS_16BE of (x)>>16, then CHARS_16BE of x
 *   CHARS_32LE(x): CHARS_16LE of x, then CHARS_16LE of (x)>>16
 * Each element is cast to OnigUChar.
 * NOTE(review): assumes OnigUChar is an 8-bit unsigned type so that the cast
 * keeps only the low byte -- confirm against the Onigmo headers.
 * x is evaluated more than once; pass side-effect-free expressions only.
 */
9906#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9907#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9908#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9909#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9910#define CASE_UTF(e) \
9911 case ENCINDEX_UTF_##e: { \
9912 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9913 source = source_UTF_##e; \
9914 source_len = sizeof(source_UTF_##e); \
9917 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9925 regex_t *reg_grapheme_cluster;
9927 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9928 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9930 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9931 onig_error_code_to_str(message, r, &einfo);
9932 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9935 return reg_grapheme_cluster;
9941 int encidx = rb_enc_to_index(enc);
9942 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9944 if (encidx == rb_utf8_encindex()) {
9945 if (!reg_grapheme_cluster_utf8) {
9946 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9949 return reg_grapheme_cluster_utf8;
9958 size_t grapheme_cluster_count = 0;
9960 const char *ptr, *end;
9962 if (!rb_enc_unicode_p(enc)) {
9966 bool cached_reg_grapheme_cluster =
true;
9967 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9968 if (!reg_grapheme_cluster) {
9969 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9970 cached_reg_grapheme_cluster =
false;
9973 ptr = RSTRING_PTR(str);
9977 OnigPosition
len = onig_match(reg_grapheme_cluster,
9978 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9979 (
const OnigUChar *)ptr, NULL, 0);
9980 if (
len <= 0)
break;
9981 grapheme_cluster_count++;
9985 if (!cached_reg_grapheme_cluster) {
9986 onig_free(reg_grapheme_cluster);
9989 return SIZET2NUM(grapheme_cluster_count);
9993rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9997 const char *ptr0, *ptr, *end;
9999 if (!rb_enc_unicode_p(enc)) {
10000 return rb_str_enumerate_chars(str, ary);
10005 bool cached_reg_grapheme_cluster =
true;
10006 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10007 if (!reg_grapheme_cluster) {
10008 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10009 cached_reg_grapheme_cluster =
false;
10012 ptr0 = ptr = RSTRING_PTR(str);
10015 while (ptr < end) {
10016 OnigPosition
len = onig_match(reg_grapheme_cluster,
10017 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10018 (
const OnigUChar *)ptr, NULL, 0);
10019 if (
len <= 0)
break;
10024 if (!cached_reg_grapheme_cluster) {
10025 onig_free(reg_grapheme_cluster);
10045rb_str_each_grapheme_cluster(
VALUE str)
10048 return rb_str_enumerate_grapheme_clusters(str, 0);
10060rb_str_grapheme_clusters(
VALUE str)
10063 return rb_str_enumerate_grapheme_clusters(str, ary);
10067chopped_length(
VALUE str)
10070 const char *p, *p2, *beg, *end;
10072 beg = RSTRING_PTR(str);
10073 end = beg + RSTRING_LEN(str);
10074 if (beg >= end)
return 0;
10075 p = rb_enc_prev_char(beg, end, end, enc);
10077 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10078 p2 = rb_enc_prev_char(beg, p, end, enc);
10079 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10097rb_str_chop_bang(
VALUE str)
10099 str_modify_keep_cr(str);
10100 if (RSTRING_LEN(str) > 0) {
10102 len = chopped_length(str);
10103 STR_SET_LEN(str,
len);
10104 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10123rb_str_chop(
VALUE str)
10129smart_chomp(
VALUE str,
const char *e,
const char *p)
10132 if (rb_enc_mbminlen(enc) > 1) {
10137 pp = e - rb_enc_mbminlen(enc);
10140 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10148 if (--e > p && *(e-1) ==
'\r') {
10165 char *pp, *e, *rsptr;
10167 char *
const p = RSTRING_PTR(str);
10168 long len = RSTRING_LEN(str);
10170 if (
len == 0)
return 0;
10173 return smart_chomp(str, e, p);
10176 enc = rb_enc_get(str);
10179 if (rb_enc_mbminlen(enc) > 1) {
10184 pp -= rb_enc_mbminlen(enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10194 while (e > p && *(e-1) ==
'\n') {
10196 if (e > p && *(e-1) ==
'\r')
10202 if (rslen >
len)
return len;
10204 enc = rb_enc_get(rs);
10205 newline = rsptr[rslen-1];
10206 if (rslen == rb_enc_mbminlen(enc)) {
10208 if (newline ==
'\n')
10209 return smart_chomp(str, e, p);
10213 return smart_chomp(str, e, p);
10217 enc = rb_enc_check(str, rs);
10218 if (is_broken_string(rs)) {
10222 if (p[
len-1] == newline &&
10224 memcmp(rsptr, pp, rslen) == 0)) {
10225 if (at_char_boundary(p, pp, e, enc))
10226 return len - rslen;
10238chomp_rs(
int argc,
const VALUE *argv)
10242 VALUE rs = argv[0];
10254 long olen = RSTRING_LEN(str);
10255 long len = chompped_length(str, rs);
10256 if (
len >= olen)
return Qnil;
10257 str_modify_keep_cr(str);
10258 STR_SET_LEN(str,
len);
10259 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10279rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10282 str_modifiable(str);
10283 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10284 rs = chomp_rs(argc, argv);
10286 return rb_str_chomp_string(str, rs);
10299rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10301 VALUE rs = chomp_rs(argc, argv);
10309 const char *
const start = s;
10311 if (!s || s >= e)
return 0;
10314 if (single_byte_optimizable(str)) {
10315 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10320 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10340rb_str_lstrip_bang(
VALUE str)
10344 long olen, loffset;
10346 str_modify_keep_cr(str);
10347 enc = STR_ENC_GET(str);
10349 loffset = lstrip_offset(str, start, start+olen, enc);
10351 long len = olen-loffset;
10352 s = start + loffset;
10353 memmove(start, s,
len);
10354 STR_SET_LEN(str,
len);
10355 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10378rb_str_lstrip(
VALUE str)
10383 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10384 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10393 rb_str_check_dummy_enc(enc);
10397 if (!s || s >= e)
return 0;
10401 if (single_byte_optimizable(str)) {
10403 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10408 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10428rb_str_rstrip_bang(
VALUE str)
10432 long olen, roffset;
10434 str_modify_keep_cr(str);
10435 enc = STR_ENC_GET(str);
10437 roffset = rstrip_offset(str, start, start+olen, enc);
10439 long len = olen - roffset;
10441 STR_SET_LEN(str,
len);
10442 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10465rb_str_rstrip(
VALUE str)
10469 long olen, roffset;
10471 enc = STR_ENC_GET(str);
10473 roffset = rstrip_offset(str, start, start+olen, enc);
10475 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10491rb_str_strip_bang(
VALUE str)
10494 long olen, loffset, roffset;
10497 str_modify_keep_cr(str);
10498 enc = STR_ENC_GET(str);
10500 loffset = lstrip_offset(str, start, start+olen, enc);
10501 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10503 if (loffset > 0 || roffset > 0) {
10504 long len = olen-roffset;
10507 memmove(start, start + loffset,
len);
10509 STR_SET_LEN(str,
len);
10510 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10533rb_str_strip(
VALUE str)
10536 long olen, loffset, roffset;
10540 loffset = lstrip_offset(str, start, start+olen, enc);
10541 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10543 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10548scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10551 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10557 end = pos + RSTRING_LEN(pat);
10571 if (RSTRING_LEN(str) > end)
10572 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10581 if (!regs || regs->num_regs == 1) {
10587 for (
int i = 1; i < regs->num_regs; i++) {
10648 long last = -1, prev = 0;
10649 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10651 pat = get_pat_quoted(pat, 1);
10652 mustnot_broken(str);
10656 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10661 if (last >= 0) rb_pat_search(pat, str, last, 1);
10666 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10670 str_mod_check(str, p,
len);
10672 if (last >= 0) rb_pat_search(pat, str, last, 1);
10696rb_str_hex(
VALUE str)
10698 return rb_str_to_inum(str, 16, FALSE);
10723rb_str_oct(
VALUE str)
10725 return rb_str_to_inum(str, -8, FALSE);
10728#ifndef HAVE_CRYPT_R
10733 rb_nativethread_lock_t lock;
10734} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10803# define CRYPT_END() ALLOCV_END(databuf)
10806 extern char *crypt(
const char *,
const char *);
10807# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10810 const char *s, *saltp;
10813 char salt_8bit_clean[3];
10817 mustnot_wchar(str);
10818 mustnot_wchar(salt);
10820 saltp = RSTRING_PTR(salt);
10821 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10822 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10826 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10827 salt_8bit_clean[0] = saltp[0] & 0x7f;
10828 salt_8bit_clean[1] = saltp[1] & 0x7f;
10829 salt_8bit_clean[2] =
'\0';
10830 saltp = salt_8bit_clean;
10835# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10836 data->initialized = 0;
10838 res = crypt_r(s, saltp, data);
10841 res = crypt(s, saltp);
10856 size_t res_size = strlen(res)+1;
10857 tmp_buf =
ALLOCA_N(
char, res_size);
10858 memcpy(tmp_buf, res, res_size);
10895 char *ptr, *p, *pend;
10898 unsigned long sum0 = 0;
10903 ptr = p = RSTRING_PTR(str);
10904 len = RSTRING_LEN(str);
10910 str_mod_check(str, ptr,
len);
10913 sum0 += (
unsigned char)*p;
10924 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10925 sum0 &= (((
unsigned long)1)<<bits)-1;
10945rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10949 long width,
len, flen = 1, fclen = 1;
10952 const char *f =
" ";
10953 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10955 int singlebyte = 1, cr;
10959 enc = STR_ENC_GET(str);
10960 termlen = rb_enc_mbminlen(enc);
10964 enc = rb_enc_check(str, pad);
10965 f = RSTRING_PTR(pad);
10966 flen = RSTRING_LEN(pad);
10967 fclen = str_strlen(pad, enc);
10968 singlebyte = single_byte_optimizable(pad);
10969 if (flen == 0 || fclen == 0) {
10970 rb_raise(rb_eArgError,
"zero width padding");
10973 len = str_strlen(str, enc);
10974 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10976 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10980 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10981 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10983 size = RSTRING_LEN(str);
10984 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10985 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10986 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10987 rb_raise(rb_eArgError,
"argument too big");
10991 p = RSTRING_PTR(res);
10993 memset(p, *f, llen);
10997 while (llen >= fclen) {
11003 memcpy(p, f, llen2);
11007 memcpy(p, RSTRING_PTR(str), size);
11010 memset(p, *f, rlen);
11014 while (rlen >= fclen) {
11020 memcpy(p, f, rlen2);
11024 TERM_FILL(p, termlen);
11025 STR_SET_LEN(res, p-RSTRING_PTR(res));
11048rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11050 return rb_str_justify(argc, argv, str,
'l');
11064rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11066 return rb_str_justify(argc, argv, str,
'r');
11079rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11081 return rb_str_justify(argc, argv, str,
'c');
11097 sep = get_pat_quoted(sep, 0);
11109 pos = rb_str_index(str, sep, 0);
11110 if (pos < 0)
goto failed;
11115 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11118 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11132 long pos = RSTRING_LEN(str);
11134 sep = get_pat_quoted(sep, 0);
11147 pos = rb_str_rindex(str, sep, pos);
11156 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11158 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11170rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11174 for (i=0; i<argc; i++) {
11175 VALUE tmp = argv[i];
11177 if (rb_reg_start_with_p(tmp, str))
11181 const char *p, *s, *e;
11186 enc = rb_enc_check(str, tmp);
11187 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11188 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11189 p = RSTRING_PTR(str);
11192 if (!at_char_right_boundary(p, s, e, enc))
11194 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11210rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11214 for (i=0; i<argc; i++) {
11215 VALUE tmp = argv[i];
11216 const char *p, *s, *e;
11221 enc = rb_enc_check(str, tmp);
11222 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11223 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11224 p = RSTRING_PTR(str);
11227 if (!at_char_boundary(p, s, e, enc))
11229 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11245deleted_prefix_length(
VALUE str,
VALUE prefix)
11247 const char *strptr, *prefixptr;
11248 long olen, prefixlen;
11253 if (!is_broken_string(prefix) ||
11254 !rb_enc_asciicompat(enc) ||
11255 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11256 enc = rb_enc_check(str, prefix);
11260 prefixlen = RSTRING_LEN(prefix);
11261 if (prefixlen <= 0)
return 0;
11262 olen = RSTRING_LEN(str);
11263 if (olen < prefixlen)
return 0;
11264 strptr = RSTRING_PTR(str);
11265 prefixptr = RSTRING_PTR(prefix);
11266 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11267 if (is_broken_string(prefix)) {
11268 if (!is_broken_string(str)) {
11272 const char *strend = strptr + olen;
11273 const char *after_prefix = strptr + prefixlen;
11274 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11295rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11298 str_modify_keep_cr(str);
11300 prefixlen = deleted_prefix_length(str, prefix);
11301 if (prefixlen <= 0)
return Qnil;
11315rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11319 prefixlen = deleted_prefix_length(str, prefix);
11320 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11322 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11335deleted_suffix_length(
VALUE str,
VALUE suffix)
11337 const char *strptr, *suffixptr;
11338 long olen, suffixlen;
11342 if (is_broken_string(suffix))
return 0;
11343 enc = rb_enc_check(str, suffix);
11346 suffixlen = RSTRING_LEN(suffix);
11347 if (suffixlen <= 0)
return 0;
11348 olen = RSTRING_LEN(str);
11349 if (olen < suffixlen)
return 0;
11350 strptr = RSTRING_PTR(str);
11351 suffixptr = RSTRING_PTR(suffix);
11352 const char *strend = strptr + olen;
11353 const char *before_suffix = strend - suffixlen;
11354 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11355 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11371rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11373 long olen, suffixlen,
len;
11374 str_modifiable(str);
11376 suffixlen = deleted_suffix_length(str, suffix);
11377 if (suffixlen <= 0)
return Qnil;
11379 olen = RSTRING_LEN(str);
11380 str_modify_keep_cr(str);
11381 len = olen - suffixlen;
11382 STR_SET_LEN(str,
len);
11383 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11399rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11403 suffixlen = deleted_suffix_length(str, suffix);
11404 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11406 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11413 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11421 val = rb_fs_check(val);
11424 "value of %"PRIsVALUE
" must be String or Regexp",
11428 rb_warn_deprecated(
"'$;'", NULL);
11445 str_modifiable(str);
11448 int idx = rb_enc_to_index(encoding);
11455 rb_enc_associate_index(str, idx);
11479 if (STR_EMBED_P(str)) {
11480 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11485 str_replace_shared_without_enc(str2, str);
11487 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11520rb_str_valid_encoding_p(
VALUE str)
11540rb_str_is_ascii_only_p(
VALUE str)
11550 static const char ellipsis[] =
"...";
11551 const long ellipsislen =
sizeof(ellipsis) - 1;
11553 const long blen = RSTRING_LEN(str);
11554 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11555 VALUE estr, ret = 0;
11558 if (
len * rb_enc_mbminlen(enc) >= blen ||
11562 else if (
len <= ellipsislen ||
11564 if (rb_enc_asciicompat(enc)) {
11566 rb_enc_associate(ret, enc);
11573 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11578 rb_enc_from_encoding(enc), 0,
Qnil);
11591 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11597 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11616 if (enc == STR_ENC_GET(str)) {
11621 return enc_str_scrub(enc, str, repl, cr);
11629 const char *rep, *p, *e, *p1, *sp;
11635 rb_raise(rb_eArgError,
"both of block and replacement given");
11642 if (!
NIL_P(repl)) {
11643 repl = str_compat_and_valid(repl, enc);
11646 if (rb_enc_dummy_p(enc)) {
11649 encidx = rb_enc_to_index(enc);
11651#define DEFAULT_REPLACE_CHAR(str) do { \
11652 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11653 rep = replace; replen = (int)sizeof(replace); \
11656 slen = RSTRING_LEN(str);
11657 p = RSTRING_PTR(str);
11662 if (rb_enc_asciicompat(enc)) {
11668 else if (!
NIL_P(repl)) {
11669 rep = RSTRING_PTR(repl);
11670 replen = RSTRING_LEN(repl);
11673 else if (encidx == rb_utf8_encindex()) {
11674 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11678 DEFAULT_REPLACE_CHAR(
"?");
11683 p = search_nonascii(p, e);
11688 int ret = rb_enc_precise_mbclen(p, e, enc);
11707 if (e - p < clen) clen = e - p;
11714 for (; clen > 1; clen--) {
11715 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11726 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11727 str_mod_check(str, sp, slen);
11728 repl = str_compat_and_valid(repl, enc);
11735 p = search_nonascii(p, e);
11761 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11762 str_mod_check(str, sp, slen);
11763 repl = str_compat_and_valid(repl, enc);
11772 long mbminlen = rb_enc_mbminlen(enc);
11776 else if (!
NIL_P(repl)) {
11777 rep = RSTRING_PTR(repl);
11778 replen = RSTRING_LEN(repl);
11780 else if (encidx == ENCINDEX_UTF_16BE) {
11781 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11783 else if (encidx == ENCINDEX_UTF_16LE) {
11784 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11786 else if (encidx == ENCINDEX_UTF_32BE) {
11787 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11789 else if (encidx == ENCINDEX_UTF_32LE) {
11790 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11793 DEFAULT_REPLACE_CHAR(
"?");
11797 int ret = rb_enc_precise_mbclen(p, e, enc);
11810 if (e - p < clen) clen = e - p;
11811 if (clen <= mbminlen * 2) {
11816 for (; clen > mbminlen; clen-=mbminlen) {
11817 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11827 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11828 str_mod_check(str, sp, slen);
11829 repl = str_compat_and_valid(repl, enc);
11854 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11855 str_mod_check(str, sp, slen);
11856 repl = str_compat_and_valid(repl, enc);
11892str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11900static ID id_normalize;
11901static ID id_normalized_p;
11902static VALUE mUnicodeNormalize;
11905unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11907 static int UnicodeNormalizeRequired = 0;
11910 if (!UnicodeNormalizeRequired) {
11911 rb_require(
"unicode_normalize/normalize.rb");
11912 UnicodeNormalizeRequired = 1;
11916 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11953rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11955 return unicode_normalize_common(argc, argv, str, id_normalize);
11969rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11971 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11998rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12000 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/* sym_equal is a plain alias: every use of the name expands to rb_obj_equal. */
12132#define sym_equal rb_obj_equal
12135sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12139 int c = rb_enc_precise_mbclen(s, send, enc);
12143 c = rb_enc_mbc_to_codepoint(s, send, enc);
12151rb_str_symname_p(
VALUE sym)
12156 rb_encoding *resenc = rb_default_internal_encoding();
12158 if (resenc == NULL) resenc = rb_default_external_encoding();
12159 enc = STR_ENC_GET(sym);
12160 ptr = RSTRING_PTR(sym);
12161 len = RSTRING_LEN(sym);
12162 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12170rb_str_quote_unprintable(
VALUE str)
12178 resenc = rb_default_internal_encoding();
12179 if (resenc == NULL) resenc = rb_default_external_encoding();
12180 enc = STR_ENC_GET(str);
12181 ptr = RSTRING_PTR(str);
12182 len = RSTRING_LEN(str);
12183 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12184 !sym_printable(ptr, ptr +
len, enc)) {
12185 return rb_str_escape(str);
12191rb_id_quote_unprintable(
ID id)
12193 VALUE str = rb_id2str(
id);
12194 if (!rb_str_symname_p(str)) {
12195 return rb_str_escape(str);
12213sym_inspect(
VALUE sym)
12220 if (!rb_str_symname_p(str)) {
12222 len = RSTRING_LEN(str);
12223 rb_str_resize(str,
len + 1);
12224 dest = RSTRING_PTR(str);
12225 memmove(dest + 1, dest,
len);
12229 VALUE orig_str = str;
12231 len = RSTRING_LEN(orig_str);
12232 str = rb_enc_str_new(0,
len + 1, enc);
12235 ptr = RSTRING_PTR(orig_str);
12236 dest = RSTRING_PTR(str);
12237 memcpy(dest + 1, ptr,
len);
12257rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12262 rb_raise(rb_eArgError,
"no receiver given");
12359 return rb_str_match(
rb_sym2str(sym), other);
12374sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12376 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12389sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12391 return rb_str_match_m_p(argc, argv, sym);
12409 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12420sym_length(
VALUE sym)
12434sym_empty(
VALUE sym)
12468sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12484sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12500sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12514sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12516 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12529sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12531 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12543sym_encoding(
VALUE sym)
12549string_for_symbol(
VALUE name)
12554 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12568 name = string_for_symbol(name);
12569 return rb_intern_str(name);
12578 name = string_for_symbol(name);
12602 return rb_fstring(str);
12609 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12621 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12622 rb_enc_autoload(enc);
12626 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12632 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12633 rb_enc_autoload(enc);
12637 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12648rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12653 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12654 rb_str_buf_cat_byte(str, (
char) code);
12664fstring_set_class_i(
VALUE *str,
void *data)
12668 return ST_CONTINUE;
12676 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12843 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.