14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
48#include "ruby_assert.h"
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
66#undef rb_usascii_str_new
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
130#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits: each STR_* name below is an alias for one of the
 * generic FL_USERn object-flag slots. */
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
/* Marks str as non-embedded: sets STR_NOEMBED and clears the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags. */
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clears STR_NOEMBED together with STR_SHARED and STR_NOFREE on str. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
149str_encindex_fastpath(
int encindex)
153 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_US_ASCII:
163str_enc_fastpath(
VALUE str)
/* Terminator length for str: 1 when str_enc_fastpath(str) holds, otherwise
 * the rb_enc_mbminlen() of the encoding looked up from ENCODING_GET(str). */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Writes the terminator at ptr: always stores a single '\0' byte, and when
 * termlen is greater than 1 zero-fills termlen bytes starting at ptr.
 * Both macro arguments are evaluated exactly once, into const locals. */
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
177#define RESIZE_CAPA(str,capacity) do {\
178 const int termlen = TERM_LEN(str);\
179 RESIZE_CAPA_TERM(str,capacity,termlen);\
/* Resizes the buffer of str so it can hold `capacity` bytes plus a
 * `termlen`-byte terminator.
 *
 * Embedded string whose embed capacity is smaller than capacity + termlen:
 * allocates a heap buffer of capacity + termlen bytes, copies the current
 * contents into it, installs it as as.heap.ptr, restores the length, and
 * switches the string to the non-embedded representation.
 *
 * The remaining visible statements assert that str is not STR_SHARED and
 * reallocate as.heap.ptr to capacity + termlen bytes (the old size passed
 * to SIZED_REALLOC_N is STR_HEAP_SIZE(str)).
 * NOTE(review): the branch structure between the two paths is on lines not
 * shown here -- presumably the realloc is the non-embedded arm; confirm.
 *
 * Both paths store `capacity` into as.heap.aux.capa. */
181#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
182 if (STR_EMBED_P(str)) {\
183 if (str_embed_capa(str) < capacity + termlen) {\
184 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
185 const long tlen = RSTRING_LEN(str);\
186 memcpy(tmp, RSTRING_PTR(str), tlen);\
187 RSTRING(str)->as.heap.ptr = tmp;\
188 RSTRING(str)->len = tlen;\
189 STR_SET_NOEMBED(str);\
190 RSTRING(str)->as.heap.aux.capa = (capacity);\
194 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
195 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
196 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
197 RSTRING(str)->as.heap.aux.capa = (capacity);\
/* Makes str share the buffer owned by shared_str.
 *
 * Does nothing when str carries STR_FAKESTR. Otherwise it asserts that
 * str's pointer lies within shared_str's buffer (from its start up to and
 * including its end), stores shared_str into str's as.heap.aux.shared via
 * RB_OBJ_WRITE, flags str as STR_SHARED and shared_str as STR_SHARED_ROOT.
 * A shared_str whose class is 0 is additionally flagged STR_BORROWED. */
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
/* STR_HEAP_PTR: the as.heap.ptr field of str.
 * STR_HEAP_SIZE: as.heap.aux.capa plus the terminator length, as a size_t. */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
217#define STR_ENC_GET(str) get_encoding(str)
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
/* Returns the GC slot size of str minus the offset of as.embed.ary inside
 * struct RString, i.e. the number of bytes available from as.embed.ary to
 * the end of the slot. */
230str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/* Returns true when str has none of the STR_NOFREE, STR_SHARED_ROOT or
 * STR_SHARED flags set. */
236rb_str_reembeddable_p(
VALUE str)
238 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242rb_str_embed_size(
long capa)
248rb_str_size_as_embedded(
VALUE str)
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
260 real_size =
sizeof(
struct RString);
264 real_size +=
sizeof(st_index_t);
/* Returns whether the embed size computed by rb_str_embed_size() for
 * len + termlen bytes is a size the GC can allocate
 * (rb_gc_size_allocatable_p). */
271STR_EMBEDDABLE_P(
long len,
long termlen)
273 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
278static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
279static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
281static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
282static inline void str_modifiable(
VALUE str);
/* Convenience wrapper: calls str_make_independent_expand() with the
 * string's current length, an expansion of 0, and its terminator length. */
287str_make_independent(
VALUE str)
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str),
len, 0L, termlen);
294static inline int str_dependent_p(
VALUE str);
/* Calls str_make_independent() on str only when str_dependent_p(str)
 * reports it as dependent; otherwise leaves it untouched. */
297rb_str_make_independent(
VALUE str)
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
305rb_str_make_embedded(
VALUE str)
310 char *buf =
RSTRING(str)->as.heap.ptr;
314 STR_SET_LEN(str,
len);
317 memcpy(RSTRING_PTR(str), buf,
len);
321 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
325rb_debug_rstring_null_ptr(
const char *func)
327 fprintf(stderr,
"%s is returning NULL!! "
328 "SIGSEGV is highly expected to follow immediately.\n"
329 "If you could reproduce, attach your debugger here, "
330 "and look at the passed string.\n",
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
338get_encoding(
VALUE str)
/* Raises ArgumentError ("invalid byte sequence in <encoding name>") when
 * is_broken_string(str) is true. */
344mustnot_broken(
VALUE str)
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
352mustnot_wchar(
VALUE str)
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
360static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
368BARE_STRING_P(
VALUE str)
373static inline st_index_t
374str_do_hash(
VALUE str)
376 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 if (e && !is_ascii_string(str)) {
385str_store_precomputed_hash(
VALUE str, st_index_t hash)
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
398 FL_SET(str, STR_PRECOMPUTED_HASH);
411 if (
FL_TEST(str, RSTRING_FSTR))
414 bare = BARE_STRING_P(str);
416 if (STR_EMBED_P(str)) {
421 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 rb_str_resize(str, RSTRING_LEN(str));
430 fstr = register_fstring(str,
false,
false);
433 str_replace_shared_without_enc(str, fstr);
440static VALUE fstring_table_obj;
443fstring_concurrent_set_hash(
VALUE str)
445#ifdef PRECOMPUTED_FAKESTR_HASH
449 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
466 const char *aptr, *bptr;
473 return (alen == blen &&
475 memcmp(aptr, bptr, alen) == 0);
480 bool force_precompute_hash;
484fstring_concurrent_set_create(
VALUE str,
void *data)
494 long len = RSTRING_LEN(str);
495 long capa =
len +
sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
525 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
528 if (STR_SHARED_P(str)) {
530 str_make_independent(str);
533 if (!BARE_STRING_P(str)) {
539 RBASIC(str)->flags |= RSTRING_FSTR;
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
/* Creates the global fstring table as a concurrent set driven by
 * fstring_concurrent_set_funcs (second argument 8192 -- presumably the
 * initial capacity; confirm against rb_concurrent_set_new) and registers
 * the address of fstring_table_obj with the GC. */
559Init_fstring_table(
void)
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
566register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
570 .force_precompute_hash = force_precompute_hash
573#if SIZEOF_VOIDP == SIZEOF_LONG
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593rb_obj_is_fstring_table(
VALUE obj)
597 return obj == fstring_table_obj;
601rb_gc_free_fstring(
VALUE obj)
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
/* Forwards callback and data to rb_concurrent_set_foreach_with_replace()
 * on the fstring table. Does nothing while fstring_table_obj is still
 * unset (zero). */
614rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
622setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
625 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
638 return (
VALUE)fake_str;
647 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
656rb_fstring_new(
const char *ptr,
long len)
659 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
666 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
/* Returns rb_fstring_new() for the NUL-terminated C string ptr, using
 * strlen(ptr) as the length. ptr must therefore be non-NULL. */
670rb_fstring_cstr(
const char *
ptr)
672 return rb_fstring_new(
ptr, strlen(
ptr));
676single_byte_optimizable(
VALUE str)
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
/* Scans the byte range delimited by p and e for a byte whose high bit
 * (0x80) is set.
 *
 * Every return statement visible here yields the address of such a byte:
 *  - byte-by-byte checks (the `case 7` .. `case 1` ladders) for the
 *    bytes before the first word boundary and for the tail before e;
 *  - a word-at-a-time test against NONASCII_MASK in between, where the
 *    offset of the first flagged byte inside the word comes from
 *    nlz_intptr() on big-endian targets and ntz_intptr() otherwise
 *    (bit index >> 3 gives the byte index).
 *
 * NONASCII_MASK has 0x80 in every byte of a uintptr_t and is defined for
 * 4- and 8-byte uintptr_t only; other sizes hit #error.
 * The word path is taken when UNALIGNED_WORD_ACCESS is set or at least
 * SIZEOF_VOIDP bytes remain.
 * NOTE(review): the loop, switch headers and the no-match return value are
 * on lines not shown here. */
703static inline const char *
704search_nonascii(
const char *p,
const char *e)
706 const uintptr_t *s, *t;
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
714# error "don't know what to do."
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL
722# error "don't know what to do."
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
734 case 7:
if (p[-7]&0x80)
return p-7;
735 case 6:
if (p[-6]&0x80)
return p-6;
736 case 5:
if (p[-5]&0x80)
return p-5;
737 case 4:
if (p[-4]&0x80)
return p-4;
739 case 3:
if (p[-3]&0x80)
return p-3;
740 case 2:
if (p[-2]&0x80)
return p-2;
741 case 1:
if (p[-1]&0x80)
return p-1;
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
750#define aligned_ptr(value) (uintptr_t *)(value)
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
756 if (*s & NONASCII_MASK) {
757#ifdef WORDS_BIGENDIAN
758 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
760 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
770 case 7:
if (e[-7]&0x80)
return e-7;
771 case 6:
if (e[-6]&0x80)
return e-6;
772 case 5:
if (e[-5]&0x80)
return e-5;
773 case 4:
if (e[-4]&0x80)
return e-4;
775 case 3:
if (e[-3]&0x80)
return e-3;
776 case 2:
if (e[-2]&0x80)
return e-2;
777 case 1:
if (e[-1]&0x80)
return e-1;
785 const char *e = p +
len;
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
789 p = search_nonascii(p, e);
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
797 int ret = rb_enc_precise_mbclen(p, e, enc);
801 p = search_nonascii(p, e);
807 int ret = rb_enc_precise_mbclen(p, e, enc);
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
826 p = search_nonascii(p, e);
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
837 int ret = rb_enc_precise_mbclen(p, e, enc);
844 p = search_nonascii(p, e);
850 int ret = rb_enc_precise_mbclen(p, e, enc);
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
883rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
888 str_enc_copy(dest, src);
889 if (RSTRING_LEN(dest) == 0) {
890 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
901 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
902 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
913rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
915 str_enc_copy(dest, src);
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
928 return enc_coderange_scan(str, enc);
937 cr = enc_coderange_scan(str, get_encoding(str));
944rb_enc_str_asciicompat(
VALUE str)
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
964str_mod_check(
VALUE s,
const char *p,
long len)
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/* Returns the capacity of str, excluding the terminator.
 *  - Embedded string: the embed capacity minus termlen.
 *  - STR_SHARED or STR_NOFREE string: handled by a branch whose body is on
 *    lines not shown here.
 *  - Otherwise: the as.heap.aux.capa field. */
972str_capacity(
VALUE str,
const int termlen)
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
977 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
981 return RSTRING(str)->as.heap.aux.capa;
988 return str_capacity(str, TERM_LEN(str));
992must_not_null(
const char *
ptr)
995 rb_raise(rb_eArgError,
"NULL pointer given");
1000str_alloc_embed(
VALUE klass,
size_t capa)
1002 size_t size = rb_str_embed_size(
capa);
1006 NEWOBJ_OF(str,
struct RString, klass,
1013str_alloc_heap(
VALUE klass)
1015 NEWOBJ_OF(str,
struct RString, klass,
/* Allocates an embedded string of class klass with requested capacity 0,
 * after firing the DTrace string-create hook with length 0, and zero-fills
 * its entire embed area (str_embed_capa(str) bytes from as.embed.ary). */
1022empty_str_alloc(
VALUE klass)
1024 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1025 VALUE str = str_alloc_embed(klass, 0);
1026 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1037 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1041 enc = rb_ascii8bit_encoding();
1044 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1046 int termlen = rb_enc_mbminlen(enc);
1048 if (STR_EMBEDDABLE_P(
len, termlen)) {
1049 str = str_alloc_embed(klass,
len + termlen);
1055 str = str_alloc_heap(klass);
1061 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1064 rb_enc_raw_set(str, enc);
1067 memcpy(RSTRING_PTR(str),
ptr,
len);
1070 STR_SET_LEN(str,
len);
1071 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1078 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1113 __msan_unpoison_string(
ptr);
1133 if (rb_enc_mbminlen(enc) != 1) {
1134 rb_raise(rb_eArgError,
"wchar encoding given");
1136 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1140str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1145 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1149 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1152 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1153 str = str_alloc_heap(klass);
1157 RBASIC(str)->flags |= STR_NOFREE;
1158 rb_enc_associate_index(str, encindex);
1187static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1189 int ecflags,
VALUE ecopts);
1194 int encidx = rb_enc_to_index(enc);
1195 if (rb_enc_get_index(str) == encidx)
1196 return is_ascii_string(str);
1207 if (!to)
return str;
1208 if (!from) from = rb_enc_get(str);
1209 if (from == to)
return str;
1210 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1211 rb_is_ascii8bit_enc(to)) {
1212 if (STR_ENC_GET(str) != to) {
1214 rb_enc_associate(str, to);
1221 from, to, ecflags, ecopts);
1222 if (
NIL_P(newstr)) {
1230rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1235 olen = RSTRING_LEN(newstr);
1236 if (ofs < -olen || olen < ofs)
1238 if (ofs < 0) ofs += olen;
1240 STR_SET_LEN(newstr, ofs);
1244 rb_str_modify(newstr);
1245 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1253 STR_SET_LEN(str, 0);
1254 rb_enc_associate(str, enc);
1260str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1262 int ecflags,
VALUE ecopts)
1267 VALUE econv_wrapper;
1268 const unsigned char *start, *sp;
1269 unsigned char *dest, *dp;
1270 size_t converted_output = (size_t)ofs;
1275 RBASIC_CLEAR_CLASS(econv_wrapper);
1277 if (!ec)
return Qnil;
1280 sp = (
unsigned char*)
ptr;
1282 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1283 (dp = dest + converted_output),
1287 size_t converted_input = sp - start;
1288 size_t rest =
len - converted_input;
1289 converted_output = dp - dest;
1291 if (converted_input && converted_output &&
1292 rest < (LONG_MAX / converted_output)) {
1293 rest = (rest * converted_output) / converted_input;
1298 olen += rest < 2 ? 2 : rest;
1299 rb_str_resize(newstr, olen);
1306 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1308 rb_enc_associate(newstr, to);
1327 const int eidx = rb_enc_to_index(eenc);
1330 return rb_enc_str_new(
ptr,
len, eenc);
1334 if ((eidx == rb_ascii8bit_encindex()) ||
1335 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1339 ienc = rb_default_internal_encoding();
1340 if (!ienc || eenc == ienc) {
1341 return rb_enc_str_new(
ptr,
len, eenc);
1345 if ((eidx == rb_ascii8bit_encindex()) ||
1346 (eidx == rb_usascii_encindex()) ||
1347 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1348 return rb_enc_str_new(
ptr,
len, ienc);
1351 str = rb_enc_str_new(NULL, 0, ienc);
1354 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1355 rb_str_initialize(str,
ptr,
len, eenc);
1363 int eidx = rb_enc_to_index(eenc);
1364 if (eidx == rb_usascii_encindex() &&
1365 !is_ascii_string(str)) {
1366 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1369 rb_enc_associate_index(str, eidx);
1428str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1430 const int termlen = TERM_LEN(str);
1435 if (str_embed_capa(str2) >=
len + termlen) {
1436 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1437 STR_SET_EMBED(str2);
1438 memcpy(ptr2, RSTRING_PTR(str),
len);
1439 TERM_FILL(ptr2+
len, termlen);
1443 if (STR_SHARED_P(str)) {
1444 root =
RSTRING(str)->as.heap.aux.shared;
1453 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1455 rb_fatal(
"about to free a possible shared root");
1457 char *ptr2 = STR_HEAP_PTR(str2);
1459 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1462 FL_SET(str2, STR_NOEMBED);
1464 STR_SET_SHARED(str2, root);
1467 STR_SET_LEN(str2,
len);
1475 str_replace_shared_without_enc(str2, str);
1476 rb_enc_cr_str_exact_copy(str2, str);
1483 return str_replace_shared(str_alloc_heap(klass), str);
1500rb_str_new_frozen_String(
VALUE orig)
1508rb_str_frozen_bare_string(
VALUE orig)
1510 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1515rb_str_tmp_frozen_acquire(
VALUE orig)
1518 return str_new_frozen_buffer(0, orig, FALSE);
1522rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1524 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1525 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1527 VALUE str = str_alloc_heap(0);
1530 FL_SET(str, STR_SHARED_ROOT);
1532 size_t capa = str_capacity(orig, TERM_LEN(orig));
1538 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1539 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1546 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1547 RBASIC(orig)->flags &= ~STR_NOFREE;
1548 STR_SET_SHARED(orig, str);
1558rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1563 if (STR_EMBED_P(tmp)) {
1566 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1572 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1576 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1577 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1582 STR_SET_LEN(tmp, 0);
1590 return str_new_frozen_buffer(klass, orig, TRUE);
1599 VALUE str = str_alloc_heap(klass);
1600 STR_SET_LEN(str, RSTRING_LEN(orig));
1601 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1602 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1603 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1604 RBASIC(orig)->flags &= ~STR_NOFREE;
1605 STR_SET_SHARED(orig, str);
1612str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1616 long len = RSTRING_LEN(orig);
1617 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1618 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1620 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1621 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1627 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1628 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1634 if ((ofs > 0) || (rest > 0) ||
1637 str = str_new_shared(klass,
shared);
1639 RSTRING(str)->as.heap.ptr += ofs;
1640 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1648 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1649 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1651 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1652 STR_SET_LEN(str, RSTRING_LEN(orig));
1657 str = heap_str_make_shared(klass, orig);
1661 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1673str_new_empty_String(
VALUE str)
1676 rb_enc_copy(v, str);
1680#define STR_BUF_MIN_SIZE 63
1685 if (STR_EMBEDDABLE_P(
capa, 1)) {
1693 RSTRING(str)->as.heap.ptr[0] =
'\0';
1713 return str_new(0, 0,
len);
1719 if (STR_EMBED_P(str)) {
1720 RB_DEBUG_COUNTER_INC(obj_str_embed);
1722 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1724 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1727 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1728 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1733rb_str_memsize(
VALUE str)
1735 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1736 return STR_HEAP_SIZE(str);
1746 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1749static inline void str_discard(
VALUE str);
1750static void str_shared_replace(
VALUE str,
VALUE str2);
1755 if (str != str2) str_shared_replace(str, str2);
1766 enc = STR_ENC_GET(str2);
1769 termlen = rb_enc_mbminlen(enc);
1771 STR_SET_LEN(str, RSTRING_LEN(str2));
1773 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1775 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1776 rb_enc_associate(str, enc);
1780 if (STR_EMBED_P(str2)) {
1782 long len = RSTRING_LEN(str2);
1785 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1786 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1787 RSTRING(str2)->as.heap.ptr = new_ptr;
1788 STR_SET_LEN(str2,
len);
1790 STR_SET_NOEMBED(str2);
1793 STR_SET_NOEMBED(str);
1795 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1797 if (
FL_TEST(str2, STR_SHARED)) {
1799 STR_SET_SHARED(str,
shared);
1802 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1806 STR_SET_EMBED(str2);
1807 RSTRING_PTR(str2)[0] = 0;
1808 STR_SET_LEN(str2, 0);
1809 rb_enc_associate(str, enc);
1823 return rb_obj_as_string_result(str, obj);
1839 len = RSTRING_LEN(str2);
1840 if (STR_SHARED_P(str2)) {
1843 STR_SET_NOEMBED(str);
1844 STR_SET_LEN(str,
len);
1845 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1846 STR_SET_SHARED(str,
shared);
1847 rb_enc_cr_str_exact_copy(str, str2);
1850 str_replace_shared(str, str2);
1859 size_t size = rb_str_embed_size(
capa);
1863 NEWOBJ_OF(str,
struct RString, klass,
1872 NEWOBJ_OF(str,
struct RString, klass,
1883 encidx = rb_enc_get_index(str);
1884 flags &= ~ENCODING_MASK;
1887 if (encidx) rb_enc_associate_index(dup, encidx);
1897 long len = RSTRING_LEN(str);
1902 STR_SET_LEN(dup, RSTRING_LEN(str));
1903 return str_duplicate_setup_encoding(str, dup, flags);
1912 root =
RSTRING(str)->as.heap.aux.shared;
1915 root = str = str_new_frozen(klass, str);
1921 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1922 FL_SET(root, STR_SHARED_ROOT);
1924 flags |= RSTRING_NOEMBED | STR_SHARED;
1926 STR_SET_LEN(dup, RSTRING_LEN(str));
1927 return str_duplicate_setup_encoding(str, dup, flags);
1933 if (STR_EMBED_P(str)) {
1934 return str_duplicate_setup_embed(klass, str, dup);
1937 return str_duplicate_setup_heap(klass, str, dup);
1945 if (STR_EMBED_P(str)) {
1946 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1949 dup = str_alloc_heap(klass);
1952 return str_duplicate_setup(klass, str, dup);
1963rb_str_dup_m(
VALUE str)
1965 if (LIKELY(BARE_STRING_P(str))) {
1976 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1983 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1987 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 str_duplicate_setup_embed(klass, str, new_str);
1991 new_str = ec_str_alloc_heap(ec, klass);
1992 str_duplicate_setup_heap(klass, str, new_str);
2001rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2003 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2027 static ID keyword_ids[2];
2028 VALUE orig, opt, venc, vcapa;
2033 if (!keyword_ids[0]) {
2034 keyword_ids[0] = rb_id_encoding();
2035 CONST_ID(keyword_ids[1],
"capacity");
2043 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2044 enc = rb_to_encoding(venc);
2046 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2049 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2051 if (
capa < STR_BUF_MIN_SIZE) {
2052 capa = STR_BUF_MIN_SIZE;
2056 len = RSTRING_LEN(orig);
2060 if (orig == str) n = 0;
2062 str_modifiable(str);
2063 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2065 const size_t size = (size_t)
capa + termlen;
2066 const char *
const old_ptr = RSTRING_PTR(str);
2067 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2068 char *new_ptr =
ALLOC_N(
char, size);
2069 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2070 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2072 RSTRING(str)->as.heap.ptr = new_ptr;
2074 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2075 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2076 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2078 STR_SET_LEN(str,
len);
2081 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2082 rb_enc_cr_str_exact_copy(str, orig);
2084 FL_SET(str, STR_NOEMBED);
2091 rb_enc_associate(str, enc);
2103rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2109 static ID keyword_ids[2];
2119 keyword_ids[0] = rb_id_encoding();
2120 CONST_ID(keyword_ids[1],
"capacity");
2122 encoding = kwargs[0];
2123 capacity = kwargs[1];
2132 if (UNDEF_P(encoding)) {
2134 encoding = rb_obj_encoding(orig);
2138 if (!UNDEF_P(encoding)) {
2139 enc = rb_to_encoding(encoding);
2143 if (UNDEF_P(capacity)) {
2145 VALUE empty_str = str_new(klass,
"", 0);
2147 rb_enc_associate(empty_str, enc);
2151 VALUE copy = str_duplicate(klass, orig);
2152 rb_enc_associate(copy, enc);
2165 if (orig_capa >
capa) {
2170 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2171 STR_SET_LEN(str, 0);
/* True unless the top two bits of c are 10 (bit pattern 10xxxxxx). */
2182#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2197static inline uintptr_t
2198count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2203 d = (d>>6) | (~d>>7);
2204 d &= NONASCII_MASK >> 7;
2207#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2209 return rb_popcount_intptr(d);
2213# if SIZEOF_VOIDP == 8
2222enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2228 long diff = (long)(e - p);
2229 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2234 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2235 const uintptr_t *s, *t;
2236 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2237 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2238 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2239 while (p < (
const char *)s) {
2240 if (is_utf8_lead_byte(*p))
len++;
2244 len += count_utf8_lead_bytes_with_word(s);
2247 p = (
const char *)s;
2250 if (is_utf8_lead_byte(*p))
len++;
2256 else if (rb_enc_asciicompat(enc)) {
2261 q = search_nonascii(p, e);
2267 p += rb_enc_fast_mbclen(p, e, enc);
2274 q = search_nonascii(p, e);
2280 p += rb_enc_mbclen(p, e, enc);
2287 for (c=0; p<e; c++) {
2288 p += rb_enc_mbclen(p, e, enc);
2303rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2311 long diff = (long)(e - p);
2312 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2314 else if (rb_enc_asciicompat(enc)) {
2318 q = search_nonascii(p, e);
2326 ret = rb_enc_precise_mbclen(p, e, enc);
2341 for (c=0; p<e; c++) {
2342 ret = rb_enc_precise_mbclen(p, e, enc);
2349 if (p + rb_enc_mbminlen(enc) <= e)
2350 p += rb_enc_mbminlen(enc);
2366 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2367 if (!enc) enc = STR_ENC_GET(str);
2368 p = RSTRING_PTR(str);
2373 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2378 return enc_strlen(p, e, enc, cr);
2385 return str_strlen(str, NULL);
2399 return LONG2NUM(str_strlen(str, NULL));
2411rb_str_bytesize(
VALUE str)
/* Returns the Ruby boolean (via RBOOL) for whether the byte length of str
 * is zero. */
2430rb_str_empty(
VALUE str)
2432 return RBOOL(RSTRING_LEN(str) == 0);
2451 char *ptr1, *ptr2, *ptr3;
2456 enc = rb_enc_check_str(str1, str2);
2459 termlen = rb_enc_mbminlen(enc);
2460 if (len1 > LONG_MAX - len2) {
2461 rb_raise(rb_eArgError,
"string size too big");
2463 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2464 ptr3 = RSTRING_PTR(str3);
2465 memcpy(ptr3, ptr1, len1);
2466 memcpy(ptr3+len1, ptr2, len2);
2467 TERM_FILL(&ptr3[len1+len2], termlen);
2483 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2486 int enc1 = rb_enc_get_index(str1);
2487 int enc2 = rb_enc_get_index(str2);
2492 else if (enc2 < 0) {
2495 else if (enc1 != enc2) {
2498 else if (len1 > LONG_MAX - len2) {
2532 rb_enc_copy(str2, str);
2537 rb_raise(rb_eArgError,
"negative argument");
2539 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2540 if (STR_EMBEDDABLE_P(
len, 1)) {
2542 memset(RSTRING_PTR(str2), 0,
len + 1);
2549 STR_SET_LEN(str2,
len);
2550 rb_enc_copy(str2, str);
2553 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2554 rb_raise(rb_eArgError,
"argument too big");
2557 len *= RSTRING_LEN(str);
2558 termlen = TERM_LEN(str);
2560 ptr2 = RSTRING_PTR(str2);
2562 n = RSTRING_LEN(str);
2563 memcpy(ptr2, RSTRING_PTR(str), n);
2564 while (n <=
len/2) {
2565 memcpy(ptr2 + n, ptr2, n);
2568 memcpy(ptr2 + n, ptr2,
len-n);
2570 STR_SET_LEN(str2,
len);
2571 TERM_FILL(&ptr2[
len], termlen);
2572 rb_enc_cr_str_copy_for_substr(str2, str);
2609rb_check_lockedtmp(
VALUE str)
2611 if (
FL_TEST(str, STR_TMPLOCK)) {
/* STR_UNMODIFIABLE_MASK: the flags that block modification -- frozen
 * (FL_FREEZE), temporarily locked (STR_TMPLOCK) or chilled (STR_CHILLED).
 *
 * str_modifiable: when any of those flags is set on str (marked as the
 * unlikely case), reports the mutation of a chilled string through
 * CHILLED_STRING_MUTATED, then runs rb_check_lockedtmp() and
 * rb_check_frozen(). */
2618#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2620str_modifiable(
VALUE str)
2624 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2625 if (CHILLED_STRING_P(str)) {
2626 CHILLED_STRING_MUTATED(str);
2628 rb_check_lockedtmp(str);
2629 rb_check_frozen(str);
2634str_dependent_p(
VALUE str)
2636 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_DEPENDANT_MASK: STR_UNMODIFIABLE_MASK plus STR_SHARED and STR_NOFREE.
 *
 * str_independent: when any flag of that mask is set on str (marked as the
 * unlikely case), runs the str_modifiable() checks and returns whether
 * str_dependent_p(str) is false.
 * NOTE(review): the return value for the case where no flag is set is on a
 * line not shown here. */
2646#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2648str_independent(
VALUE str)
2652 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2653 str_modifiable(str);
2654 return !str_dependent_p(str);
2660str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2670 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2675 STR_SET_LEN(str,
len);
2680 oldptr = RSTRING_PTR(str);
2682 memcpy(
ptr, oldptr,
len);
2684 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2687 STR_SET_NOEMBED(str);
2688 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2689 TERM_FILL(
ptr +
len, termlen);
2691 STR_SET_LEN(str,
len);
2698 if (!str_independent(str))
2699 str_make_independent(str);
2708 int termlen = TERM_LEN(str);
2709 long len = RSTRING_LEN(str);
2712 rb_raise(rb_eArgError,
"negative expanding string size");
2714 if (expand >= LONG_MAX -
len) {
2715 rb_raise(rb_eArgError,
"string size too big");
2718 if (!str_independent(str)) {
2719 str_make_independent_expand(str,
len, expand, termlen);
2721 else if (expand > 0) {
2722 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2729str_modify_keep_cr(
VALUE str)
2731 if (!str_independent(str))
2732 str_make_independent(str);
2739str_discard(
VALUE str)
2741 str_modifiable(str);
2742 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2743 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2744 RSTRING(str)->as.heap.ptr = 0;
2745 STR_SET_LEN(str, 0);
2752 int encindex = rb_enc_get_index(str);
2754 if (RB_UNLIKELY(encindex == -1)) {
2758 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2763 if (!rb_enc_asciicompat(enc)) {
2785 return RSTRING_PTR(str);
2789zero_filled(
const char *s,
int n)
2791 for (; n > 0; --n) {
2798str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2800 const char *e = s +
len;
2802 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2803 if (zero_filled(s, minlen))
return s;
2809str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2814 if (str_dependent_p(str)) {
2815 if (!zero_filled(s +
len, termlen))
2816 str_make_independent_expand(str,
len, 0L, termlen);
2819 TERM_FILL(s +
len, termlen);
2822 return RSTRING_PTR(str);
2826rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2828 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2829 long len = RSTRING_LEN(str);
2833 rb_check_lockedtmp(str);
2834 str_make_independent_expand(str,
len, 0L, termlen);
2836 else if (str_dependent_p(str)) {
2837 if (termlen > oldtermlen)
2838 str_make_independent_expand(str,
len, 0L, termlen);
2841 if (!STR_EMBED_P(str)) {
2846 if (termlen > oldtermlen) {
2847 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2855str_null_check(
VALUE str,
int *w)
2857 char *s = RSTRING_PTR(str);
2858 long len = RSTRING_LEN(str);
2860 const int minlen = rb_enc_mbminlen(enc);
2864 if (str_null_char(s,
len, minlen, enc)) {
2867 return str_fill_term(str, s,
len, minlen);
2870 if (!s || memchr(s, 0,
len)) {
2874 s = str_fill_term(str, s,
len, minlen);
2880rb_str_to_cstr(
VALUE str)
2883 return str_null_check(str, &w);
2891 char *s = str_null_check(str, &w);
2894 rb_raise(rb_eArgError,
"string contains null char");
2896 rb_raise(rb_eArgError,
"string contains null byte");
2902rb_str_fill_terminator(
VALUE str,
const int newminlen)
2904 char *s = RSTRING_PTR(str);
2905 long len = RSTRING_LEN(str);
2906 return str_fill_term(str, s,
len, newminlen);
2912 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2938str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2947 else if (rb_enc_asciicompat(enc)) {
2948 const char *p2, *e2;
2951 while (p < e && 0 < nth) {
2958 p2 = search_nonascii(p, e2);
2967 n = rb_enc_mbclen(p, e, enc);
2978 while (p < e && nth--) {
2979 p += rb_enc_mbclen(p, e, enc);
2990 return str_nth_len(p, e, &nth, enc);
2994str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2999 p = str_nth_len(p, e, &nth, enc);
3008str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3010 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3011 if (!pp)
return e - p;
3018 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3019 STR_ENC_GET(str), single_byte_optimizable(str));
3024str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3027 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3028 const uintptr_t *s, *t;
3029 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3030 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3031 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3032 while (p < (
const char *)s) {
3033 if (is_utf8_lead_byte(*p)) nth--;
3037 nth -= count_utf8_lead_bytes_with_word(s);
3039 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3043 if (is_utf8_lead_byte(*p)) {
3044 if (nth == 0)
break;
3054str_utf8_offset(
const char *p,
const char *e,
long nth)
3056 const char *pp = str_utf8_nth(p, e, &nth);
3065 if (single_byte_optimizable(str) || pos < 0)
3068 char *p = RSTRING_PTR(str);
3069 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3074str_subseq(
VALUE str,
long beg,
long len)
3082 const int termlen = TERM_LEN(str);
3083 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3090 if (str_embed_capa(str2) >=
len + termlen) {
3091 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3092 STR_SET_EMBED(str2);
3093 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3094 TERM_FILL(ptr2+
len, termlen);
3096 STR_SET_LEN(str2,
len);
3100 str_replace_shared(str2, str);
3103 RSTRING(str2)->as.heap.ptr += beg;
3104 if (RSTRING_LEN(str2) >
len) {
3105 STR_SET_LEN(str2,
len);
3115 VALUE str2 = str_subseq(str, beg,
len);
3116 rb_enc_cr_str_copy_for_substr(str2, str);
3125 const long blen = RSTRING_LEN(str);
3127 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3129 if (
len < 0)
return 0;
3130 if (beg < 0 && -beg < 0)
return 0;
3134 if (single_byte_optimizable(str)) {
3135 if (beg > blen)
return 0;
3138 if (beg < 0)
return 0;
3140 if (
len > blen - beg)
3142 if (
len < 0)
return 0;
3147 if (
len > -beg)
len = -beg;
3151 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3154 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3160 slen = str_strlen(str, enc);
3162 if (beg < 0)
return 0;
3164 if (
len == 0)
goto end;
3167 else if (beg > 0 && beg > blen) {
3171 if (beg > str_strlen(str, enc))
return 0;
3176 enc == rb_utf8_encoding()) {
3177 p = str_utf8_nth(s, e, &beg);
3178 if (beg > 0)
return 0;
3179 len = str_utf8_offset(p, e,
len);
3185 p = s + beg * char_sz;
3189 else if (
len * char_sz > e - p)
3194 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3195 if (beg > 0)
return 0;
3199 len = str_offset(p, e,
len, enc, 0);
3207static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3212 return str_substr(str, beg,
len, TRUE);
3222str_substr(
VALUE str,
long beg,
long len,
int empty)
3226 if (!p)
return Qnil;
3227 if (!
len && !empty)
return Qnil;
3229 beg = p - RSTRING_PTR(str);
3231 VALUE str2 = str_subseq(str, beg,
len);
3232 rb_enc_cr_str_copy_for_substr(str2, str);
3240 if (CHILLED_STRING_P(str)) {
3245 rb_str_resize(str, RSTRING_LEN(str));
3263 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3306str_uminus(
VALUE str)
3311 return rb_fstring(str);
3315#define rb_str_dup_frozen rb_str_new_frozen
3320 rb_check_frozen(str);
3321 if (
FL_TEST(str, STR_TMPLOCK)) {
3324 FL_SET(str, STR_TMPLOCK);
3331 rb_check_frozen(str);
3332 if (!
FL_TEST(str, STR_TMPLOCK)) {
3352 const int termlen = TERM_LEN(str);
3354 str_modifiable(str);
3355 if (STR_SHARED_P(str)) {
3358 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3359 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3370 else if (
len > RSTRING_LEN(str)) {
3374 const char *
const new_end = RSTRING_PTR(str) +
len;
3384 else if (
len < RSTRING_LEN(str)) {
3392 STR_SET_LEN(str,
len);
3393 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3400 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3403 int independent = str_independent(str);
3404 long slen = RSTRING_LEN(str);
3405 const int termlen = TERM_LEN(str);
3407 if (slen >
len || (termlen != 1 && slen <
len)) {
3413 if (STR_EMBED_P(str)) {
3414 if (
len == slen)
return str;
3415 if (str_embed_capa(str) >=
len + termlen) {
3416 STR_SET_LEN(str,
len);
3420 str_make_independent_expand(str, slen,
len - slen, termlen);
3422 else if (str_embed_capa(str) >=
len + termlen) {
3423 char *
ptr = STR_HEAP_PTR(str);
3425 if (slen >
len) slen =
len;
3428 STR_SET_LEN(str,
len);
3429 if (independent) ruby_xfree(
ptr);
3432 else if (!independent) {
3433 if (
len == slen)
return str;
3434 str_make_independent_expand(str, slen,
len - slen, termlen);
3438 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3439 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3442 else if (
len == slen)
return str;
3443 STR_SET_LEN(str,
len);
3450str_ensure_available_capa(
VALUE str,
long len)
3452 str_modify_keep_cr(str);
3454 const int termlen = TERM_LEN(str);
3455 long olen = RSTRING_LEN(str);
3457 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3458 rb_raise(rb_eArgError,
"string sizes too big");
3461 long total = olen +
len;
3462 long capa = str_capacity(str, termlen);
3465 if (total >= LONG_MAX / 2) {
3468 while (total >
capa) {
3471 RESIZE_CAPA_TERM(str,
capa, termlen);
3476str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3479 str_modify_keep_cr(str);
3484 if (
len == 0)
return 0;
3486 long total, olen,
off = -1;
3488 const int termlen = TERM_LEN(str);
3491 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3495 long capa = str_capacity(str, termlen);
3497 if (olen > LONG_MAX -
len) {
3498 rb_raise(rb_eArgError,
"string sizes too big");
3502 if (total >= LONG_MAX / 2) {
3505 while (total >
capa) {
3508 RESIZE_CAPA_TERM(str,
capa, termlen);
3509 sptr = RSTRING_PTR(str);
3514 memcpy(sptr + olen,
ptr,
len);
3515 STR_SET_LEN(str, total);
3516 TERM_FILL(sptr + total, termlen);
3521#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3522#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3527 if (
len == 0)
return str;
3529 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3531 return str_buf_cat(str,
ptr,
len);
3542rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3547 if (UNLIKELY(!str_independent(str))) {
3548 str_make_independent(str);
3551 long string_length = -1;
3552 const int null_terminator_length = 1;
3557 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3558 rb_raise(rb_eArgError,
"string sizes too big");
3561 long string_capacity = str_capacity(str, null_terminator_length);
3567 if (LIKELY(string_capacity >= string_length + 1)) {
3569 sptr[string_length] = byte;
3570 STR_SET_LEN(str, string_length + 1);
3571 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3575 str_buf_cat(str, (
char *)&
byte, 1);
3591 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3602rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3603 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3612 if (str_encindex == ptr_encindex) {
3614 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3618 str_enc = rb_enc_from_index(str_encindex);
3619 ptr_enc = rb_enc_from_index(ptr_encindex);
3620 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3623 if (RSTRING_LEN(str) == 0) {
3626 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3632 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3641 *ptr_cr_ret = ptr_cr;
3643 if (str_encindex != ptr_encindex &&
3646 str_enc = rb_enc_from_index(str_encindex);
3647 ptr_enc = rb_enc_from_index(ptr_encindex);
3652 res_encindex = str_encindex;
3657 res_encindex = str_encindex;
3661 res_encindex = ptr_encindex;
3666 res_encindex = str_encindex;
3673 res_encindex = str_encindex;
3679 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3681 str_buf_cat(str,
ptr,
len);
3687 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3694 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3704 if (rb_enc_asciicompat(enc)) {
3705 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3711 unsigned int c = (
unsigned char)*
ptr;
3712 int len = rb_enc_codelen(c, enc);
3713 rb_enc_mbcput(c, buf, enc);
3714 rb_enc_cr_str_buf_cat(str, buf,
len,
3727 if (str_enc_fastpath(str)) {
3731 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3737 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3748 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3764rb_str_concat_literals(
size_t num,
const VALUE *strary)
3768 unsigned long len = 1;
3773 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3775 str_enc_copy_direct(str, strary[0]);
3777 for (i = s; i < num; ++i) {
3778 const VALUE v = strary[i];
3782 if (encidx != ENCINDEX_US_ASCII) {
3784 rb_enc_set_index(str, encidx);
3797rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3799 str_modifiable(str);
3804 else if (argc > 1) {
3807 rb_enc_copy(arg_str, str);
3808 for (i = 0; i < argc; i++) {
3843rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3845 long needed_capacity = 0;
3849 for (
int index = 0; index < argc; index++) {
3850 VALUE obj = argv[index];
3858 needed_capacity += RSTRING_LEN(obj);
3863 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3870 str_ensure_available_capa(str, needed_capacity);
3873 for (
int index = 0; index < argc; index++) {
3874 VALUE obj = argv[index];
3879 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3880 char byte = (char)(
NUM2INT(obj) & 0xFF);
3894 rb_bug(
"append_as_bytes arguments should have been validated");
3898 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3899 TERM_FILL(sptr, TERM_LEN(str));
3904 for (
int index = 0; index < argc; index++) {
3905 VALUE obj = argv[index];
3922 rb_bug(
"append_as_bytes arguments should have been validated");
4001 if (rb_num_to_uint(str2, &code) == 0) {
4014 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4017 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4020 long pos = RSTRING_LEN(str1);
4025 switch (
len = rb_enc_codelen(code, enc)) {
4026 case ONIGERR_INVALID_CODE_POINT_VALUE:
4027 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4029 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4035 rb_enc_mbcput(code, buf, enc);
4036 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4037 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4039 rb_str_resize(str1, pos+
len);
4040 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4053rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4055 int encidx = rb_enc_to_index(enc);
4057 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4062 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4063 return ENCINDEX_ASCII_8BIT;
4086rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4088 str_modifiable(str);
4093 else if (argc > 1) {
4096 rb_enc_copy(arg_str, str);
4097 for (i = 0; i < argc; i++) {
4110 st_index_t precomputed_hash;
4111 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4113 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4114 return precomputed_hash;
4117 return str_do_hash(str);
4124 const char *ptr1, *ptr2;
4127 return (len1 != len2 ||
4129 memcmp(ptr1, ptr2, len1) != 0);
4141rb_str_hash_m(
VALUE str)
4147#define lesser(a,b) (((a)>(b))?(b):(a))
4155 if (RSTRING_LEN(str1) == 0)
return TRUE;
4156 if (RSTRING_LEN(str2) == 0)
return TRUE;
4159 if (idx1 == idx2)
return TRUE;
4164 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4168 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4178 const char *ptr1, *ptr2;
4181 if (str1 == str2)
return 0;
4184 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4193 if (len1 > len2)
return 1;
4196 if (retval > 0)
return 1;
4230 if (str1 == str2)
return Qtrue;
4237 return rb_str_eql_internal(str1, str2);
4251 if (str1 == str2)
return Qtrue;
4253 return rb_str_eql_internal(str1, str2);
4285 return rb_invcmp(str1, str2);
4327 return str_casecmp(str1, s);
4335 const char *p1, *p1end, *p2, *p2end;
4337 enc = rb_enc_compatible(str1, str2);
4342 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4343 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4344 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4345 while (p1 < p1end && p2 < p2end) {
4347 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4348 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4350 return INT2FIX(c1 < c2 ? -1 : 1);
4357 while (p1 < p1end && p2 < p2end) {
4358 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4359 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4361 if (0 <= c1 && 0 <= c2) {
4365 return INT2FIX(c1 < c2 ? -1 : 1);
4369 l1 = rb_enc_mbclen(p1, p1end, enc);
4370 l2 = rb_enc_mbclen(p2, p2end, enc);
4371 len = l1 < l2 ? l1 : l2;
4372 r = memcmp(p1, p2,
len);
4374 return INT2FIX(r < 0 ? -1 : 1);
4376 return INT2FIX(l1 < l2 ? -1 : 1);
4382 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4383 if (p1 == p1end)
return INT2FIX(-1);
4416 return str_casecmp_p(str1, s);
4423 VALUE folded_str1, folded_str2;
4424 VALUE fold_opt = sym_fold;
4426 enc = rb_enc_compatible(str1, str2);
4431 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4432 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4434 return rb_str_eql(folded_str1, folded_str2);
4438strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4439 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4441 const char *search_start = str_ptr;
4442 long pos, search_len = str_len - offset;
4446 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4447 if (pos < 0)
return pos;
4449 if (t == search_start + pos)
break;
4450 search_len -= t - search_start;
4451 if (search_len <= 0)
return -1;
4452 offset += t - search_start;
4455 return pos + offset;
4459#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4460#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4463rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4465 const char *str_ptr, *str_ptr_end, *sub_ptr;
4466 long str_len, sub_len;
4469 enc = rb_enc_check(str, sub);
4470 if (is_broken_string(sub))
return -1;
4472 str_ptr = RSTRING_PTR(str);
4474 str_len = RSTRING_LEN(str);
4475 sub_ptr = RSTRING_PTR(sub);
4476 sub_len = RSTRING_LEN(sub);
4478 if (str_len < sub_len)
return -1;
4481 long str_len_char, sub_len_char;
4482 int single_byte = single_byte_optimizable(str);
4483 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4484 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4486 offset += str_len_char;
4487 if (offset < 0)
return -1;
4489 if (str_len_char - offset < sub_len_char)
return -1;
4490 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4493 if (sub_len == 0)
return offset;
4496 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4509rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4516 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4517 long slen = str_strlen(str, enc);
4519 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4531 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4532 enc, single_byte_optimizable(str));
4543 pos = rb_str_index(str, sub, pos);
4557str_ensure_byte_pos(
VALUE str,
long pos)
4559 if (!single_byte_optimizable(str)) {
4560 const char *s = RSTRING_PTR(str);
4562 const char *p = s + pos;
4563 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4565 "offset %ld does not land on character boundary", pos);
4638rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4644 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4645 long slen = RSTRING_LEN(str);
4647 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4658 str_ensure_byte_pos(str, pos);
4670 pos = rb_str_byteindex(str, sub, pos);
4671 if (pos >= 0)
return LONG2NUM(pos);
4678memrchr(
const char *search_str,
int chr,
long search_len)
4680 const char *ptr = search_str + search_len;
4681 while (ptr > search_str) {
4682 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4692 char *hit, *adjusted;
4694 long slen, searchlen;
4697 sbeg = RSTRING_PTR(str);
4698 slen = RSTRING_LEN(sub);
4699 if (slen == 0)
return s - sbeg;
4701 t = RSTRING_PTR(sub);
4703 searchlen = s - sbeg + 1;
4705 if (memcmp(s, t, slen) == 0) {
4710 hit = memrchr(sbeg, c, searchlen);
4713 if (hit != adjusted) {
4714 searchlen = adjusted - sbeg;
4717 if (memcmp(hit, t, slen) == 0)
4719 searchlen = adjusted - sbeg;
4720 }
while (searchlen > 0);
4734 enc = rb_enc_check(str, sub);
4735 if (is_broken_string(sub))
return -1;
4736 singlebyte = single_byte_optimizable(str);
4737 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4738 slen = str_strlen(sub, enc);
4741 if (
len < slen)
return -1;
4742 if (
len - pos < slen) pos =
len - slen;
4743 if (
len == 0)
return pos;
4745 sbeg = RSTRING_PTR(str);
4748 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4754 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4755 return str_rindex(str, sub, s, enc);
4816rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4821 long pos,
len = str_strlen(str, enc);
4823 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4825 if (pos < 0 && (pos +=
len) < 0) {
4831 if (pos >
len) pos =
len;
4839 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4840 enc, single_byte_optimizable(str));
4851 pos = rb_str_rindex(str, sub, pos);
4861rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4867 enc = rb_enc_check(str, sub);
4868 if (is_broken_string(sub))
return -1;
4869 len = RSTRING_LEN(str);
4870 slen = RSTRING_LEN(sub);
4873 if (
len < slen)
return -1;
4874 if (
len - pos < slen) pos =
len - slen;
4875 if (
len == 0)
return pos;
4877 sbeg = RSTRING_PTR(str);
4880 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4887 return str_rindex(str, sub, s, enc);
4977rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4981 long pos,
len = RSTRING_LEN(str);
4983 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4985 if (pos < 0 && (pos +=
len) < 0) {
4991 if (pos >
len) pos =
len;
4997 str_ensure_byte_pos(str, pos);
5009 pos = rb_str_byterindex(str, sub, pos);
5010 if (pos >= 0)
return LONG2NUM(pos);
5049 switch (OBJ_BUILTIN_TYPE(y)) {
5101rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5108 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5140rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5144 re = get_pat(argv[0]);
5145 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5154static enum neighbor_char
5160 if (rb_enc_mbminlen(enc) > 1) {
5162 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5164 return NEIGHBOR_NOT_CHAR;
5166 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5168 if (!l)
return NEIGHBOR_NOT_CHAR;
5169 if (l !=
len)
return NEIGHBOR_WRAPPED;
5170 rb_enc_mbcput(c, p, enc);
5171 r = rb_enc_precise_mbclen(p, p +
len, enc);
5173 return NEIGHBOR_NOT_CHAR;
5175 return NEIGHBOR_FOUND;
5178 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5181 return NEIGHBOR_WRAPPED;
5182 ++((
unsigned char*)p)[i];
5183 l = rb_enc_precise_mbclen(p, p+
len, enc);
5187 return NEIGHBOR_FOUND;
5190 memset(p+l, 0xff,
len-l);
5196 for (len2 =
len-1; 0 < len2; len2--) {
5197 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5201 memset(p+len2+1, 0xff,
len-(len2+1));
5206static enum neighbor_char
5211 if (rb_enc_mbminlen(enc) > 1) {
5213 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5215 return NEIGHBOR_NOT_CHAR;
5217 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5218 if (!c)
return NEIGHBOR_NOT_CHAR;
5221 if (!l)
return NEIGHBOR_NOT_CHAR;
5222 if (l !=
len)
return NEIGHBOR_WRAPPED;
5223 rb_enc_mbcput(c, p, enc);
5224 r = rb_enc_precise_mbclen(p, p +
len, enc);
5226 return NEIGHBOR_NOT_CHAR;
5228 return NEIGHBOR_FOUND;
5231 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5234 return NEIGHBOR_WRAPPED;
5235 --((
unsigned char*)p)[i];
5236 l = rb_enc_precise_mbclen(p, p+
len, enc);
5240 return NEIGHBOR_FOUND;
5243 memset(p+l, 0,
len-l);
5249 for (len2 =
len-1; 0 < len2; len2--) {
5250 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5254 memset(p+len2+1, 0,
len-(len2+1));
5268static enum neighbor_char
5269enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5271 enum neighbor_char ret;
5275 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5279 const int max_gaps = 1;
5281 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5283 ctype = ONIGENC_CTYPE_DIGIT;
5285 ctype = ONIGENC_CTYPE_ALPHA;
5287 return NEIGHBOR_NOT_CHAR;
5290 for (
try = 0;
try <= max_gaps; ++
try) {
5291 ret = enc_succ_char(p,
len, enc);
5292 if (ret == NEIGHBOR_FOUND) {
5293 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5295 return NEIGHBOR_FOUND;
5302 ret = enc_pred_char(p,
len, enc);
5303 if (ret == NEIGHBOR_FOUND) {
5304 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5317 return NEIGHBOR_NOT_CHAR;
5320 if (ctype != ONIGENC_CTYPE_DIGIT) {
5322 return NEIGHBOR_WRAPPED;
5326 enc_succ_char(carry,
len, enc);
5327 return NEIGHBOR_WRAPPED;
5395 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5396 rb_enc_cr_str_copy_for_substr(str, orig);
5397 return str_succ(str);
5404 char *sbeg, *s, *e, *last_alnum = 0;
5405 int found_alnum = 0;
5407 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5408 long carry_pos = 0, carry_len = 1;
5409 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5411 slen = RSTRING_LEN(str);
5412 if (slen == 0)
return str;
5414 enc = STR_ENC_GET(str);
5415 sbeg = RSTRING_PTR(str);
5416 s = e = sbeg + slen;
5418 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5419 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5425 l = rb_enc_precise_mbclen(s, e, enc);
5426 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5427 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5428 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5430 case NEIGHBOR_NOT_CHAR:
5432 case NEIGHBOR_FOUND:
5434 case NEIGHBOR_WRAPPED:
5439 carry_pos = s - sbeg;
5444 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5445 enum neighbor_char neighbor;
5446 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5447 l = rb_enc_precise_mbclen(s, e, enc);
5448 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5449 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5451 neighbor = enc_succ_char(tmp, l, enc);
5453 case NEIGHBOR_FOUND:
5457 case NEIGHBOR_WRAPPED:
5460 case NEIGHBOR_NOT_CHAR:
5463 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5465 enc_succ_char(s, l, enc);
5467 if (!rb_enc_asciicompat(enc)) {
5468 MEMCPY(carry, s,
char, l);
5471 carry_pos = s - sbeg;
5475 RESIZE_CAPA(str, slen + carry_len);
5476 sbeg = RSTRING_PTR(str);
5477 s = sbeg + carry_pos;
5478 memmove(s + carry_len, s, slen - carry_pos);
5479 memmove(s, carry, carry_len);
5481 STR_SET_LEN(str, slen);
5482 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5496rb_str_succ_bang(
VALUE str)
5504all_digits_p(
const char *s,
long len)
5558 VALUE end, exclusive;
5562 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5568 VALUE current, after_end;
5575 enc = rb_enc_check(beg, end);
5576 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5578 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5579 char c = RSTRING_PTR(beg)[0];
5580 char e = RSTRING_PTR(end)[0];
5582 if (c > e || (excl && c == e))
return beg;
5584 VALUE str = rb_enc_str_new(&c, 1, enc);
5586 if ((*each)(str, arg))
break;
5587 if (!excl && c == e)
break;
5589 if (excl && c == e)
break;
5594 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5595 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5596 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5601 b = rb_str_to_inum(beg, 10, FALSE);
5602 e = rb_str_to_inum(end, 10, FALSE);
5609 if (excl && bi == ei)
break;
5610 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5615 ID op = excl ?
'<' : idLE;
5616 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5621 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5622 b = rb_funcallv(b, succ, 0, 0);
5629 if (n > 0 || (excl && n == 0))
return beg;
5631 after_end = rb_funcallv(end, succ, 0, 0);
5636 next = rb_funcallv(current, succ, 0, 0);
5637 if ((*each)(current, arg))
break;
5638 if (
NIL_P(next))
break;
5642 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5657 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5658 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5659 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5661 b = rb_str_to_inum(beg, 10, FALSE);
5667 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5675 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5676 b = rb_funcallv(b, succ, 0, 0);
5682 VALUE next = rb_funcallv(current, succ, 0, 0);
5683 if ((*each)(current, arg))
break;
5686 if (RSTRING_LEN(current) == 0)
5697 if (!
rb_equal(str, *argp))
return 0;
5711 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5712 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5713 rb_enc_asciicompat(STR_ENC_GET(val))) {
5714 const char *bp = RSTRING_PTR(beg);
5715 const char *ep = RSTRING_PTR(end);
5716 const char *vp = RSTRING_PTR(val);
5717 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5718 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5726 if (b <= v && v < e)
return Qtrue;
5727 return RBOOL(!
RTEST(exclusive) && v == e);
5734 all_digits_p(bp, RSTRING_LEN(beg)) &&
5735 all_digits_p(ep, RSTRING_LEN(end))) {
5740 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5742 return RBOOL(
NIL_P(val));
5765 return rb_str_subpat(str, indx,
INT2FIX(0));
5768 if (rb_str_index(str, indx, 0) != -1)
5774 long beg,
len = str_strlen(str, NULL);
5786 return str_substr(str, idx, 1, FALSE);
5805rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5809 return rb_str_subpat(str, argv[0], argv[1]);
5812 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5816 return rb_str_aref(str, argv[0]);
5822 char *ptr = RSTRING_PTR(str);
5823 long olen = RSTRING_LEN(str), nlen;
5825 str_modifiable(str);
5826 if (
len > olen)
len = olen;
5828 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5830 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5832 ptr =
RSTRING(str)->as.embed.ary;
5833 memmove(ptr, oldptr +
len, nlen);
5834 if (fl == STR_NOEMBED)
xfree(oldptr);
5837 if (!STR_SHARED_P(str)) {
5839 rb_enc_cr_str_exact_copy(shared, str);
5844 STR_SET_LEN(str, nlen);
5846 if (!SHARABLE_MIDDLE_SUBSTRING) {
5847 TERM_FILL(ptr + nlen, TERM_LEN(str));
5854rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5860 if (beg == 0 && vlen == 0) {
5865 str_modify_keep_cr(str);
5869 RESIZE_CAPA(str, slen + vlen -
len);
5870 sptr = RSTRING_PTR(str);
5879 memmove(sptr + beg + vlen,
5881 slen - (beg +
len));
5883 if (vlen < beg &&
len < 0) {
5887 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5890 STR_SET_LEN(str, slen);
5891 TERM_FILL(&sptr[slen], TERM_LEN(str));
5898 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5907 int singlebyte = single_byte_optimizable(str);
5913 enc = rb_enc_check(str, val);
5914 slen = str_strlen(str, enc);
5916 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5925 if (
len > slen - beg) {
5928 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5933 beg = p - RSTRING_PTR(str);
5935 rb_str_update_0(str, beg,
len, val);
5936 rb_enc_associate(str, enc);
5947 long start, end,
len;
5957 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5961 nth += regs->num_regs;
5971 enc = rb_enc_check_str(str, val);
5972 rb_str_update_0(str, start,
len, val);
5973 rb_enc_associate(str, enc);
5981 switch (
TYPE(indx)) {
5983 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5987 beg = rb_str_index(str, indx, 0);
6042rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
6046 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6054 return rb_str_aset(str, argv[0], argv[1]);
6104rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6112 str_modify_keep_cr(str);
6120 if ((nth += regs->num_regs) <= 0)
return Qnil;
6122 else if (nth >= regs->num_regs)
return Qnil;
6124 len = END(nth) - beg;
6127 else if (argc == 2) {
6136 beg = p - RSTRING_PTR(str);
6140 beg = rb_str_index(str, indx, 0);
6141 if (beg == -1)
return Qnil;
6142 len = RSTRING_LEN(indx);
6154 beg = p - RSTRING_PTR(str);
6163 beg = p - RSTRING_PTR(str);
6167 rb_enc_cr_str_copy_for_substr(result, str);
6175 char *sptr = RSTRING_PTR(str);
6176 long slen = RSTRING_LEN(str);
6177 if (beg +
len > slen)
6181 slen - (beg +
len));
6183 STR_SET_LEN(str, slen);
6184 TERM_FILL(&sptr[slen], TERM_LEN(str));
6195 switch (OBJ_BUILTIN_TYPE(pat)) {
6214get_pat_quoted(
VALUE pat,
int check)
6218 switch (OBJ_BUILTIN_TYPE(pat)) {
6232 if (check && is_broken_string(pat)) {
6239rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6242 pos = rb_str_byteindex(str, pat, pos);
6243 if (set_backref_str) {
6245 str = rb_str_new_frozen_String(str);
6246 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6248 *match = match_data;
6258 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6263rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6265 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
/*
 * In-place replacement of one match of argv[0] in str.
 * Visible steps: optionally coerce argv[1] to a hash, quote the pattern,
 * require str to be modifiable, search from offset 0 (setting the backref),
 * obtain the replacement (a hash lookup keyed by the matched substring is one
 * visible source), reconcile encodings, then splice the replacement over the
 * match with memmove.
 * NOTE(review): many lines are absent from this listing, including the
 * no-match path and the return statements.
 */
6284rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6298 hash = rb_check_hash_type(argv[1]);
6304 pat = get_pat_quoted(argv[0], 1);
6306 str_modifiable(str);
6307 beg = rb_pat_search(pat, str, 0, 1);
6321 end0 = beg0 + RSTRING_LEN(pat);
6330 if (iter || !
NIL_P(hash)) {
/* Snapshot pointer and length before running user-visible code; they are
 * handed to str_mod_check() afterwards (presumably to detect mutation). */
6331 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6337 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6340 str_mod_check(str, p,
len);
6341 rb_check_frozen(str);
6347 enc = rb_enc_compatible(str, repl);
6350 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6354 rb_enc_inspect_name(str_enc),
6355 rb_enc_inspect_name(STR_ENC_GET(repl)));
6357 enc = STR_ENC_GET(repl);
6360 rb_enc_associate(str, enc);
6370 rlen = RSTRING_LEN(repl);
6371 len = RSTRING_LEN(str);
/* Capacity for the string after swapping plen matched bytes for rlen
 * replacement bytes. */
6373 RESIZE_CAPA(str,
len + rlen - plen);
6375 p = RSTRING_PTR(str);
/* Move the tail that follows the match so it starts right after where the
 * replacement will end, then copy the replacement into the gap. */
6377 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6379 rp = RSTRING_PTR(repl);
6380 memmove(p + beg0, rp, rlen);
6382 STR_SET_LEN(str,
len);
6383 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6412 rb_str_sub_bang(argc, argv, str);
/*
 * Shared worker for global substitution; bang selects the in-place flavour
 * (the two visible callers pass 1 and 0).
 * The replacement source is tracked by mode: STR, ITER, FAST_MAP or MAP.
 * Visible flow: quote the pattern, run a first search, return Qnil early for
 * the bang flavour, build the output in dest with the encoding of str,
 * loop searching again from an advancing offset, and finally move dest into
 * str with str_shared_replace().
 * NOTE(review): many lines are absent from this listing, including the loop
 * header, the appends into dest and the return statements.
 */
6417str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6420 long beg, beg0, end0;
6421 long offset, blen, slen,
len, last;
6422 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
/* -1 means undecided; it is resolved further down to (val != repl). */
6424 int need_backref_str = -1;
6434 hash = rb_check_hash_type(argv[1]);
6438 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6446 rb_error_arity(argc, 1, 2);
6449 pat = get_pat_quoted(argv[0], 1);
6450 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6453 if (bang)
return Qnil;
/* Initial size estimate: source length plus 30 bytes of slack. */
6458 blen = RSTRING_LEN(str) + 30;
6460 sp = RSTRING_PTR(str);
6461 slen = RSTRING_LEN(str);
6463 str_enc = STR_ENC_GET(str);
6464 rb_enc_associate(dest, str_enc);
6471 end0 = beg0 + RSTRING_LEN(pat);
6487 if (mode == FAST_MAP) {
6496 val = rb_hash_aref(hash, key);
6499 str_mod_check(str, sp, slen);
6504 else if (need_backref_str) {
6506 if (need_backref_str < 0) {
6507 need_backref_str = val != repl;
6514 len = beg0 - offset;
/* Stop when end0 reaches the end of str; otherwise step the offset forward
 * by the byte length of the character found at end0. */
6528 if (RSTRING_LEN(str) <= end0)
break;
6529 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6531 offset = end0 +
len;
6533 cp = RSTRING_PTR(str) + offset;
6534 if (offset > RSTRING_LEN(str))
break;
6537 if (mode != FAST_MAP && mode != STR) {
6540 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6545 if (RSTRING_LEN(str) > offset) {
/* Search once more from last with backref recording forced on. */
6548 rb_pat_search0(pat, str, last, 1, &match);
6550 str_shared_replace(str, dest);
/*
 * In-place global substitution entry point: prepares str for modification
 * with str_modify_keep_cr() and returns whatever str_gsub() yields when
 * called with bang = 1.
 */
6575rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6577 str_modify_keep_cr(str);
6578 return str_gsub(argc, argv, str, 1);
6628 return str_gsub(argc, argv, str, 0);
6646 str_modifiable(str);
6647 if (str == str2)
return str;
6651 return str_replace(str, str2);
/*
 * Empties str: sets its length to 0 and writes a zero byte at the start of
 * the buffer, then branches on whether the encoding is ASCII-compatible.
 * NOTE(review): the preparation before the length reset and the bodies of
 * the final branch are not in this listing.
 */
6668rb_str_clear(
VALUE str)
6672 STR_SET_LEN(str, 0);
6673 RSTRING_PTR(str)[0] = 0;
6674 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6690rb_str_chr(
VALUE str)
6708 pos += RSTRING_LEN(str);
6709 if (pos < 0 || RSTRING_LEN(str) <= pos)
6712 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6731 long len = RSTRING_LEN(str);
6732 char *
ptr, *head, *left = 0;
6736 if (pos < -
len ||
len <= pos)
6743 char byte = (char)(
NUM2INT(w) & 0xFF);
6745 if (!str_independent(str))
6746 str_make_independent(str);
6747 enc = STR_ENC_GET(str);
6748 head = RSTRING_PTR(str);
6750 if (!STR_EMBED_P(str)) {
6757 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6765 width = rb_enc_precise_mbclen(left, head+
len, enc);
6767 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
/*
 * Returns the byte range (beg, len) of str as a new string that carries the
 * encoding of str, or Qnil when the range is rejected.
 * Visible rejections: beg past the end, negative len, a beg that is still
 * negative at the second test, and a case that is only accepted when empty
 * is non-zero.
 * NOTE(review): adjustments to beg/len between these tests, the handling of
 * an empty result and the return are not in this listing.
 */
6783str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6785 long n = RSTRING_LEN(str);
6787 if (beg > n ||
len < 0)
return Qnil;
6790 if (beg < 0)
return Qnil;
6795 if (!empty)
return Qnil;
6799 VALUE str2 = str_subseq(str, beg,
len);
6801 str_enc_copy_direct(str2, str);
6803 if (RSTRING_LEN(str2) == 0) {
6804 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6838 long beg,
len = RSTRING_LEN(str);
6846 return str_byte_substr(str, beg,
len, TRUE);
6851 return str_byte_substr(str, idx, 1, FALSE);
/*
 * Byte-slice entry point taking (argc, argv, str).
 * One visible path returns str_byte_substr(str, beg, len, TRUE); the other
 * delegates a single argument to str_byte_aref().
 * NOTE(review): how beg/len are derived and which argc selects each path are
 * not in this listing.
 */
6863rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6868 return str_byte_substr(str, beg,
len, TRUE);
6871 return str_byte_aref(str, argv[0]);
/*
 * Validates a byte range given through the in/out pointers beg and len
 * against the byte length of str.
 * Visible checks: *beg beyond the end, or negative and still negative after
 * adding the length; *len larger than what remains after *beg; finally both
 * the start and the end offset are passed to str_ensure_byte_pos().
 * NOTE(review): the bodies of the two if statements and the computation of
 * end are not in this listing.
 */
6875str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6877 long end, slen = RSTRING_LEN(str);
6880 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6889 if (*
len > slen - *beg) {
6893 str_ensure_byte_pos(str, *beg);
6894 str_ensure_byte_pos(str, end);
/*
 * Replaces the byte range (beg, len) of str with the byte range (vbeg, vlen)
 * of val.
 * Accepts exactly 2, 3 or 5 arguments, otherwise raises ArgumentError.
 * Raises TypeError when argv[0] (and, in another form, argv[2]) is expected
 * to be a Range but is not.
 * Both ranges are validated with str_check_beg_len() before str is made
 * modifiable, its encoding is reconciled with val, and the splice is done by
 * rb_str_update_1().
 * NOTE(review): argument decoding per argc and the return value are not in
 * this listing.
 */
6908rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6910 long beg,
len, vbeg, vlen;
6915 if (!(argc == 2 || argc == 3 || argc == 5)) {
6916 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6920 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6921 rb_builtin_class_name(argv[0]));
6928 vlen = RSTRING_LEN(val);
6933 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6934 rb_builtin_class_name(argv[2]));
6946 vlen = RSTRING_LEN(val);
/* Validate both ranges before touching the receiver. */
6954 str_check_beg_len(str, &beg, &
len);
6955 str_check_beg_len(val, &vbeg, &vlen);
6956 str_modify_keep_cr(str);
6959 rb_enc_associate(str, rb_enc_check(str, val));
6962 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
/*
 * Builds a reversed copy of str.
 * Strings of at most one byte are simply duplicated.
 * Longer strings take one of three visible routes: a single-byte route when
 * single_byte_optimizable() holds, and two character-wise routes that
 * measure each character with rb_enc_fast_mbclen() or rb_enc_mbclen().
 * The result rev gets the same byte length and encoding as str.
 * NOTE(review): the copy loops, the coderange handling and the return are
 * not in this listing.
 */
6980rb_str_reverse(
VALUE str)
6987 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6988 enc = STR_ENC_GET(str);
6994 if (RSTRING_LEN(str) > 1) {
6995 if (single_byte_optimizable(str)) {
7002 int clen = rb_enc_fast_mbclen(s, e, enc);
7010 cr = rb_enc_asciicompat(enc) ?
7013 int clen = rb_enc_mbclen(s, e, enc);
7022 STR_SET_LEN(rev, RSTRING_LEN(str));
7023 str_enc_copy_direct(rev, str);
/*
 * In-place reversal of str.
 * For strings longer than one byte: the single-byte case is handled directly
 * on the buffer after str_modify_keep_cr(); otherwise str is replaced by the
 * result of rb_str_reverse().
 * A further str_modify_keep_cr() call is visible outside those routes.
 * NOTE(review): the swap loop and the return are not in this listing.
 */
7043rb_str_reverse_bang(
VALUE str)
7045 if (RSTRING_LEN(str) > 1) {
7046 if (single_byte_optimizable(str)) {
7049 str_modify_keep_cr(str);
7050 s = RSTRING_PTR(str);
7059 str_shared_replace(str, rb_str_reverse(str));
7063 str_modify_keep_cr(str);
7092 i = rb_str_index(str, arg, 0);
7094 return RBOOL(i != -1);
7136 rb_raise(rb_eArgError,
"invalid radix %d", base);
7138 return rb_str_to_inum(str, base, FALSE);
7162rb_str_to_f(
VALUE str)
7177rb_str_to_s(
VALUE str)
7189 char s[RUBY_MAX_CHAR_LEN];
7190 int n = rb_enc_codelen(c, enc);
7192 rb_enc_mbcput(c, s, enc);
7197#define CHAR_ESC_LEN 13
/*
 * Formats code point c as text into a local buffer and measures it with
 * strlen().
 * Visible output shapes: the raw character, backslash-u with four hex
 * digits (used when c < 0x10000), backslash-u with braces, backslash-x with
 * two hex digits, and backslash-x with braces.
 * Every snprintf is bounded by CHAR_ESC_LEN while the buffer holds
 * CHAR_ESC_LEN + 1 bytes.
 * NOTE(review): the conditions selecting most shapes (including the role of
 * unicode_p), the append to result and the return are not in this listing.
 */
7200rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7202 char buf[CHAR_ESC_LEN + 1];
7210 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7212 else if (c < 0x10000) {
7213 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7216 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7221 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7224 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7227 l = (int)strlen(buf);
/*
 * Maps a control character to a static string holding its backslash escape.
 * Covered values: NUL, newline, carriage return, tab, form feed, vertical
 * tab (013), backspace (010), bell (007), escape (033) and DEL (0x7f, which
 * maps to the backslash-c-question-mark form).
 * NOTE(review): the result for any other value is not in this listing.
 */
7233ruby_escaped_char(
int c)
7236 case '\0':
return "\\0";
7237 case '\n':
return "\\n";
7238 case '\r':
return "\\r";
7239 case '\t':
return "\\t";
7240 case '\f':
return "\\f";
7241 case '\013':
return "\\v";
7242 case '\010':
return "\\b";
7243 case '\007':
return "\\a";
7244 case '\033':
return "\\e";
7245 case '\x7f':
return "\\c?";
/*
 * Produces an escaped rendering of str in result.
 * The scan keeps two cursors: p (current position) and prev (start of the
 * pending run that can be copied verbatim). Before anything escaped is
 * emitted, the pending run is flushed with str_buf_cat().
 * Visible cases per character: an undecodable sequence is emitted bytewise
 * in backslash-x two-hex-digit form; a character with a named escape uses
 * ruby_escaped_char(); a printable ASCII character gets its own branch;
 * other characters go through rb_str_buf_cat_escaped_char().
 * A last flush copies whatever run remains.
 * NOTE(review): the loop header, cursor updates and the return are not in
 * this listing.
 */
7251rb_str_escape(
VALUE str)
7255 const char *p = RSTRING_PTR(str);
7257 const char *prev = p;
7258 char buf[CHAR_ESC_LEN + 1];
7260 int unicode_p = rb_enc_unicode_p(enc);
7261 int asciicompat = rb_enc_asciicompat(enc);
7266 int n = rb_enc_precise_mbclen(p, pend, enc);
7268 if (p > prev) str_buf_cat(result, prev, p - prev);
7269 n = rb_enc_mbminlen(enc);
7271 n = (int)(pend - p);
7273 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7274 str_buf_cat(result, buf, strlen(buf));
7280 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7282 cc = ruby_escaped_char(c);
7284 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7285 str_buf_cat(result, cc, strlen(cc));
7288 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7291 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7292 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7296 if (p > prev) str_buf_cat(result, prev, p - prev);
7315 const char *p, *pend, *prev;
7316 char buf[CHAR_ESC_LEN + 1];
7318 rb_encoding *resenc = rb_default_internal_encoding();
7319 int unicode_p = rb_enc_unicode_p(enc);
7320 int asciicompat = rb_enc_asciicompat(enc);
7322 if (resenc == NULL) resenc = rb_default_external_encoding();
7323 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7324 rb_enc_associate(result, resenc);
7325 str_buf_cat2(result,
"\"");
7333 n = rb_enc_precise_mbclen(p, pend, enc);
7335 if (p > prev) str_buf_cat(result, prev, p - prev);
7336 n = rb_enc_mbminlen(enc);
7338 n = (int)(pend - p);
7340 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7341 str_buf_cat(result, buf, strlen(buf));
7347 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7349 if ((asciicompat || unicode_p) &&
7350 (c ==
'"'|| c ==
'\\' ||
7355 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7356 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7357 str_buf_cat2(result,
"\\");
7358 if (asciicompat || enc == resenc) {
7364 case '\n': cc =
'n';
break;
7365 case '\r': cc =
'r';
break;
7366 case '\t': cc =
't';
break;
7367 case '\f': cc =
'f';
break;
7368 case '\013': cc =
'v';
break;
7369 case '\010': cc =
'b';
break;
7370 case '\007': cc =
'a';
break;
7371 case 033: cc =
'e';
break;
7372 default: cc = 0;
break;
7375 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7378 str_buf_cat(result, buf, 2);
7391 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7395 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7396 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7401 if (p > prev) str_buf_cat(result, prev, p - prev);
7402 str_buf_cat2(result,
"\"");
7407#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7420 int encidx = rb_enc_get_index(str);
7423 const char *p, *pend;
7426 int u8 = (encidx == rb_utf8_encindex());
7427 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7430 if (!rb_enc_asciicompat(enc)) {
7432 len += strlen(enc->name);
7435 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7438 unsigned char c = *p++;
7441 case '"':
case '\\':
7442 case '\n':
case '\r':
7443 case '\t':
case '\f':
7444 case '\013':
case '\010':
case '\007':
case '\033':
7449 clen = IS_EVSTR(p, pend) ? 2 : 1;
7457 if (u8 && c > 0x7F) {
7458 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7460 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7463 else if (cc <= 0xFFFFF)
7476 if (clen > LONG_MAX -
len) {
7483 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7484 q = RSTRING_PTR(result); qend = q +
len + 1;
7488 unsigned char c = *p++;
7490 if (c ==
'"' || c ==
'\\') {
7494 else if (c ==
'#') {
7495 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7498 else if (c ==
'\n') {
7502 else if (c ==
'\r') {
7506 else if (c ==
'\t') {
7510 else if (c ==
'\f') {
7514 else if (c ==
'\013') {
7518 else if (c ==
'\010') {
7522 else if (c ==
'\007') {
7526 else if (c ==
'\033') {
7536 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7538 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7541 snprintf(q, qend-q,
"u%04X", cc);
7543 snprintf(q, qend-q,
"u{%X}", cc);
7548 snprintf(q, qend-q,
"x%02X", c);
7554 if (!rb_enc_asciicompat(enc)) {
7555 snprintf(q, qend-q, nonascii_suffix, enc->name);
7556 encidx = rb_ascii8bit_encindex();
7559 rb_enc_associate_index(result, encidx);
7565unescape_ascii(
unsigned int c)
/*
 * Decodes one backslash escape while undumping, reading from the cursor *ss
 * (bounded by s_end) and working on the string undumped.
 * Visible behaviour: simple escapes are translated with unescape_ascii();
 * a path that needs UTF-8 lazily fetches the UTF-8 encoding and, if *penc
 * differs, associates undumped with it; a hex run must be 1..6 digits long;
 * code points in 0xd800..0xdfff (the Unicode surrogate range) are singled
 * out in two places; accepted code points are encoded into a 6-byte scratch
 * buffer with rb_enc_mbcput().
 * NOTE(review): the switch over escape kinds, the error raised for rejected
 * input, the use of the utf8/binary flags and the cursor write-back are not
 * in this listing.
 */
7589undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7591 const char *s = *ss;
7595 unsigned char buf[6];
7613 *buf = unescape_ascii(*s);
7625 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7626 if (*penc != enc_utf8) {
7628 rb_enc_associate(undumped, enc_utf8);
7645 if (hexlen == 0 || hexlen > 6) {
7651 if (0xd800 <= c && c <= 0xdfff) {
7654 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7664 if (0xd800 <= c && c <= 0xdfff) {
7667 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7695static VALUE rb_str_is_ascii_only_p(
VALUE str);
/*
 * Parses the dumped form of a string back into a new string (undumped),
 * which starts empty in the encoding enc.
 * Visible validation: the input is tested for being ASCII-only and for
 * embedded NULs, must be at least two bytes long and must begin with a
 * double quote; failures jump to invalid_format, which raises RuntimeError.
 * After the quoted body an optional .dup suffix is skipped and a
 * force_encoding suffix is required to match exactly, end in a quote plus a
 * closing parenthesis, and name an encoding that is then looked up and
 * associated with the result. Mixing a Unicode escape with force_encoding
 * raises RuntimeError.
 * Backslash escapes inside the body are delegated to
 * undump_after_backslash().
 * NOTE(review): the main scan loop, the reaction to a non-ASCII or
 * NUL-containing input, the dummy-encoding branch body and the return are
 * not in this listing.
 */
7713str_undump(
VALUE str)
7715 const char *s = RSTRING_PTR(str);
7718 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7720 bool binary =
false;
7724 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7727 if (!str_null_check(str, &w)) {
7730 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7731 if (*s !=
'"')
goto invalid_format;
7749 static const char force_encoding_suffix[] =
".force_encoding(\"";
7750 static const char dup_suffix[] =
".dup";
7751 const char *encname;
7756 size =
sizeof(dup_suffix) - 1;
7757 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7759 size =
sizeof(force_encoding_suffix) - 1;
7760 if (s_end - s <= size)
goto invalid_format;
7761 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7765 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7769 s = memchr(s,
'"', s_end-s);
7771 if (!s)
goto invalid_format;
7772 if (s_end - s != 2)
goto invalid_format;
7773 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7775 encidx = rb_enc_find_index2(encname, (
long)size);
7779 rb_enc_associate_index(undumped, encidx);
7789 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7800 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7806 if (rb_enc_dummy_p(enc)) {
/*
 * Obtains the encoding of str and passes it to rb_str_check_dummy_enc().
 * NOTE(review): how enc is obtained and what is returned are not in this
 * listing; the callers in this file assign the result to an encoding
 * variable.
 */
7813str_true_enc(
VALUE str)
7816 rb_str_check_dummy_enc(enc);
/*
 * Folds the symbol options in argv into the case-mapping flags and returns
 * the updated flag word.
 * Recognised options:
 *   sym_turkic      sets ONIGENC_CASE_FOLD_TURKISH_AZERI and may be followed
 *                   by sym_lithuanian
 *   sym_lithuanian  sets ONIGENC_CASE_FOLD_LITHUANIAN and may be followed by
 *                   sym_turkic
 *   sym_ascii       sets ONIGENC_CASE_ASCII_ONLY
 *   sym_fold        only valid when the flags request downcasing
 * Raises ArgumentError for too many options, an invalid second option, a
 * fold request outside downcasing, or an unknown option.
 * NOTE(review): the argc tests guarding each raise and the return statement
 * are not in this listing.
 */
7820static OnigCaseFoldType
7821check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7826 rb_raise(rb_eArgError,
"too many options");
7827 if (argv[0]==sym_turkic) {
7828 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7830 if (argv[1]==sym_lithuanian)
7831 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7833 rb_raise(rb_eArgError,
"invalid second option");
7836 else if (argv[0]==sym_lithuanian) {
7837 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7839 if (argv[1]==sym_turkic)
7840 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7842 rb_raise(rb_eArgError,
"invalid second option");
7846 rb_raise(rb_eArgError,
"too many options");
7847 else if (argv[0]==sym_ascii)
7848 flags |= ONIGENC_CASE_ASCII_ONLY;
7849 else if (argv[0]==sym_fold) {
/* Accept fold only when DOWNCASE is set and UPCASE is not; the XOR then
 * clears DOWNCASE (known to be set here) and toggles FOLD. */
7850 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7851 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7853 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7856 rb_raise(rb_eArgError,
"invalid option");
7863 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7869#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7870#ifndef CASEMAP_DEBUG
7871# define CASEMAP_DEBUG 0
7879 OnigUChar space[FLEX_ARY_LEN];
/*
 * Releases a singly linked chain of mapping buffers.
 * Each node is remembered, the cursor is advanced through its next link,
 * and only then is the remembered node freed with ruby_sized_xfree(), using
 * the capa field of the node as the size argument.
 * NOTE(review): the declarations that derive current_buffer from p are not
 * in this listing.
 */
7883mapping_buffer_free(
void *p)
7887 while (current_buffer) {
/* Read next before freeing the node that holds it. */
7888 previous_buffer = current_buffer;
7889 current_buffer = current_buffer->next;
7890 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7896 {0, mapping_buffer_free,},
7897 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
/*
 * Body of the general (non ASCII-only) case-mapping routine; presumably
 * rb_str_casemap, whose header line is not in this listing.
 *
 * An empty source is simply duplicated. Otherwise the source is converted
 * piecewise: while input remains, a new buffer is linked onto a chain and
 * enc->case_map() fills it, advancing source_current. A negative result from
 * case_map frees the chain and raises ArgumentError. With a single buffer
 * the target string is created straight from it; with several, the used
 * parts are concatenated in chain order. The chain is then freed and the
 * target gets the encoding of the source.
 *
 * NOTE(review): buffer allocation, target creation in the multi-buffer case
 * and the return are not in this listing.
 */
7905 const OnigUChar *source_current, *source_end;
7906 int target_length = 0;
7907 VALUE buffer_anchor;
7910 size_t buffer_count = 0;
7911 int buffer_length_or_invalid;
7913 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7915 source_current = (OnigUChar*)RSTRING_PTR(source);
7920 while (source_current < source_end) {
/* Capacity grows with each round: remaining input bytes times the
 * (pre-incremented) buffer count, plus a fixed allowance. */
7922 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7923 if (CASEMAP_DEBUG) {
7924 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
/* Append the new buffer to the chain: store it through the pending link,
 * then make its own next field the pending link. */
7927 *pre_buffer = current_buffer;
7928 pre_buffer = &current_buffer->next;
7929 current_buffer->next = NULL;
7930 current_buffer->capa =
capa;
7931 buffer_length_or_invalid = enc->case_map(flags,
7932 &source_current, source_end,
7933 current_buffer->space,
7934 current_buffer->space+current_buffer->capa,
7936 if (buffer_length_or_invalid < 0) {
/* Free the whole chain, starting from the anchor, before raising. */
7937 current_buffer =
DATA_PTR(buffer_anchor);
7939 mapping_buffer_free(current_buffer);
7940 rb_raise(rb_eArgError,
"input string invalid");
7942 target_length += current_buffer->used = buffer_length_or_invalid;
7944 if (CASEMAP_DEBUG) {
7945 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7948 if (buffer_count==1) {
7949 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7952 char *target_current;
7955 target_current = RSTRING_PTR(target);
7956 current_buffer =
DATA_PTR(buffer_anchor);
/* Concatenate the used part of every buffer in chain order. */
7957 while (current_buffer) {
7958 memcpy(target_current, current_buffer->space, current_buffer->used);
7959 target_current += current_buffer->used;
7960 current_buffer = current_buffer->next;
7963 current_buffer =
DATA_PTR(buffer_anchor);
7965 mapping_buffer_free(current_buffer);
7970 str_enc_copy_direct(target, source);
7979 const OnigUChar *source_current, *source_end;
7980 OnigUChar *target_current, *target_end;
7981 long old_length = RSTRING_LEN(source);
7982 int length_or_invalid;
7984 if (old_length == 0)
return Qnil;
7986 source_current = (OnigUChar*)RSTRING_PTR(source);
7988 if (source == target) {
7989 target_current = (OnigUChar*)source_current;
7990 target_end = (OnigUChar*)source_end;
7993 target_current = (OnigUChar*)RSTRING_PTR(target);
7997 length_or_invalid = onigenc_ascii_only_case_map(flags,
7998 &source_current, source_end,
7999 target_current, target_end, enc);
8000 if (length_or_invalid < 0)
8001 rb_raise(rb_eArgError,
"input string invalid");
8002 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8003 fprintf(stderr,
"problem with rb_str_ascii_casemap"
8004 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8005 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
8006 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8009 str_enc_copy(target, source);
/*
 * Byte-wise ASCII upcasing of str in place: each byte is read as unsigned
 * and, when it lies in a..z, is overwritten with the matching A..Z letter.
 * A modified flag starts out false.
 * NOTE(review): the loop header, the update of modified and the return are
 * not in this listing; rb_str_upcase_bang tests the result as a boolean.
 */
8015upcase_single(
VALUE str)
8017 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8018 bool modified =
false;
8021 unsigned int c = *(
unsigned char*)s;
8023 if (
'a' <= c && c <=
'z') {
8024 *s =
'A' + (c -
'a');
/*
 * In-place upcasing of str.
 * Starts from ONIGENC_CASE_UPCASE, merges the caller options through
 * check_case_options(), makes str modifiable and resolves its encoding.
 * Three routes:
 *   - single-byte route (case_option_single_p): upcase_single(), setting
 *     ONIGENC_CASE_MODIFIED when it reports a change;
 *   - ASCII-only route: rb_str_ascii_casemap() with str as source and target;
 *   - general route: str is replaced by the result of rb_str_casemap().
 * Returns str when ONIGENC_CASE_MODIFIED ended up set.
 * NOTE(review): the return for the unmodified case is not in this listing.
 */
8052rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8055 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8057 flags = check_case_options(argc, argv, flags);
8058 str_modify_keep_cr(str);
8059 enc = str_true_enc(str);
8060 if (case_option_single_p(flags, enc, str)) {
8061 if (upcase_single(str))
8062 flags |= ONIGENC_CASE_MODIFIED;
8064 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8065 rb_str_ascii_casemap(str, str, &flags, enc);
8067 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8069 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Non-destructive upcasing: computes the result in ret.
 * Starts from ONIGENC_CASE_UPCASE, merges options via check_case_options()
 * and resolves the encoding.
 * Routes: the single-byte route copies the bytes and encoding of str into a
 * fresh string; the ASCII-only route maps from str into ret with
 * rb_str_ascii_casemap(); the general route takes the result of
 * rb_str_casemap().
 * NOTE(review): the upcasing step of the single-byte route, the creation of
 * ret in the ASCII-only route and the return are not in this listing.
 */
8091rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8094 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8097 flags = check_case_options(argc, argv, flags);
8098 enc = str_true_enc(str);
8099 if (case_option_single_p(flags, enc, str)) {
8100 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8101 str_enc_copy_direct(ret, str);
8104 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8106 rb_str_ascii_casemap(str, ret, &flags, enc);
8109 ret = rb_str_casemap(str, &flags, enc);
/*
 * Byte-wise ASCII downcasing of str in place: each byte is read as unsigned
 * and, when it lies in A..Z, is overwritten with the matching a..z letter.
 * A modified flag starts out false.
 * NOTE(review): the loop header, the update of modified and the return are
 * not in this listing; rb_str_downcase_bang tests the result as a boolean.
 */
8116downcase_single(
VALUE str)
8118 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8119 bool modified =
false;
8122 unsigned int c = *(
unsigned char*)s;
8124 if (
'A' <= c && c <=
'Z') {
8125 *s =
'a' + (c -
'A');
/*
 * In-place downcasing of str.
 * Starts from ONIGENC_CASE_DOWNCASE, merges the caller options through
 * check_case_options(), makes str modifiable and resolves its encoding.
 * Three routes:
 *   - single-byte route (case_option_single_p): downcase_single(), setting
 *     ONIGENC_CASE_MODIFIED when it reports a change;
 *   - ASCII-only route: rb_str_ascii_casemap() with str as source and target;
 *   - general route: str is replaced by the result of rb_str_casemap().
 * Returns str when ONIGENC_CASE_MODIFIED ended up set.
 * NOTE(review): the return for the unmodified case is not in this listing.
 */
8147rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8150 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8152 flags = check_case_options(argc, argv, flags);
8153 str_modify_keep_cr(str);
8154 enc = str_true_enc(str);
8155 if (case_option_single_p(flags, enc, str)) {
8156 if (downcase_single(str))
8157 flags |= ONIGENC_CASE_MODIFIED;
8159 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8160 rb_str_ascii_casemap(str, str, &flags, enc);
8162 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8164 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Non-destructive downcasing: computes the result in ret.
 * Starts from ONIGENC_CASE_DOWNCASE, merges options via check_case_options()
 * and resolves the encoding.
 * Routes: the single-byte route copies the bytes and encoding of str into a
 * fresh string and downcases that copy with downcase_single(); the
 * ASCII-only route maps from str into ret with rb_str_ascii_casemap(); the
 * general route takes the result of rb_str_casemap().
 * NOTE(review): the creation of ret in the ASCII-only route and the return
 * are not in this listing.
 */
8178rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8181 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8184 flags = check_case_options(argc, argv, flags);
8185 enc = str_true_enc(str);
8186 if (case_option_single_p(flags, enc, str)) {
8187 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8188 str_enc_copy_direct(ret, str);
8189 downcase_single(ret);
8191 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8193 rb_str_ascii_casemap(str, ret, &flags, enc);
8196 ret = rb_str_casemap(str, &flags, enc);
/*
 * In-place capitalisation of str.
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE, merges options
 * via check_case_options(), makes str modifiable and resolves its encoding.
 * An empty string (or one without a buffer) yields Qnil.
 * The ASCII-only route maps in place with rb_str_ascii_casemap(); otherwise
 * str is replaced by the result of rb_str_casemap().
 * Returns str when ONIGENC_CASE_MODIFIED ended up set.
 * NOTE(review): the return for the unmodified case is not in this listing.
 */
8216rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8219 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8221 flags = check_case_options(argc, argv, flags);
8222 str_modify_keep_cr(str);
8223 enc = str_true_enc(str);
8224 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8225 if (flags&ONIGENC_CASE_ASCII_ONLY)
8226 rb_str_ascii_casemap(str, str, &flags, enc);
8228 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8230 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Non-destructive capitalisation: computes the result in ret.
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE, merges options
 * via check_case_options() and resolves the encoding.
 * The ASCII-only route maps from str into ret with rb_str_ascii_casemap();
 * otherwise ret is the result of rb_str_casemap().
 * NOTE(review): for an empty string this returns the receiver itself, whereas
 * rb_str_swapcase in this file returns str_duplicate(rb_cString, str) for
 * the same condition; confirm the difference is intended.
 * NOTE(review): the creation of ret in the ASCII-only route and the final
 * return are not in this listing.
 */
8263rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8266 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8269 flags = check_case_options(argc, argv, flags);
8270 enc = str_true_enc(str);
8271 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8272 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8274 rb_str_ascii_casemap(str, ret, &flags, enc);
8277 ret = rb_str_casemap(str, &flags, enc);
/*
 * In-place case swap of str.
 * Requests both directions at once (ONIGENC_CASE_UPCASE |
 * ONIGENC_CASE_DOWNCASE), merges options via check_case_options(), makes str
 * modifiable and resolves its encoding.
 * The ASCII-only route maps in place with rb_str_ascii_casemap(); otherwise
 * str is replaced by the result of rb_str_casemap().
 * Returns str when ONIGENC_CASE_MODIFIED ended up set.
 * NOTE(review): the return for the unmodified case is not in this listing.
 */
8304rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8307 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8309 flags = check_case_options(argc, argv, flags);
8310 str_modify_keep_cr(str);
8311 enc = str_true_enc(str);
8312 if (flags&ONIGENC_CASE_ASCII_ONLY)
8313 rb_str_ascii_casemap(str, str, &flags, enc);
8315 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8317 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Non-destructive case swap: computes the result in ret.
 * Requests both directions at once (ONIGENC_CASE_UPCASE |
 * ONIGENC_CASE_DOWNCASE), merges options via check_case_options() and
 * resolves the encoding.
 * An empty string (or one without a buffer) is returned as a duplicate.
 * The ASCII-only route maps from str into ret with rb_str_ascii_casemap();
 * otherwise ret is the result of rb_str_casemap().
 * NOTE(review): the creation of ret in the ASCII-only route and the final
 * return are not in this listing.
 */
8341rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8344 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8347 flags = check_case_options(argc, argv, flags);
8348 enc = str_true_enc(str);
8349 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8350 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8352 rb_str_ascii_casemap(str, ret, &flags, enc);
8355 ret = rb_str_casemap(str, &flags, enc);
8360typedef unsigned char *USTR;
8364 unsigned int now, max;
8376 if (t->p == t->pend)
return -1;
8377 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8380 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8382 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8384 if (t->p < t->pend) {
8385 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8388 if (t->now < 0x80 && c < 0x80) {
8389 rb_raise(rb_eArgError,
8390 "invalid range \"%c-%c\" in string transliteration",
8394 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8398 else if (t->now < c) {
8407 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8408 if (t->now == t->max) {
8413 if (t->now < t->max) {
8429 const unsigned int errc = -1;
8430 unsigned int trans[256];
8432 struct tr trsrc, trrepl;
8434 unsigned int c, c0, last = 0;
8435 int modify = 0, i, l;
8436 unsigned char *s, *send;
8438 int singlebyte = single_byte_optimizable(str);
8442#define CHECK_IF_ASCII(c) \
8443 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8444 (cr = ENC_CODERANGE_VALID) : 0)
8448 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8449 if (RSTRING_LEN(repl) == 0) {
8450 return rb_str_delete_bang(1, &src, str);
8454 e1 = rb_enc_check(str, src);
8455 e2 = rb_enc_check(str, repl);
8460 enc = rb_enc_check(src, repl);
8462 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8463 if (RSTRING_LEN(src) > 1 &&
8464 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8465 trsrc.p + l < trsrc.pend) {
8469 trrepl.p = RSTRING_PTR(repl);
8470 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8471 trsrc.gen = trrepl.gen = 0;
8472 trsrc.now = trrepl.now = 0;
8473 trsrc.max = trrepl.max = 0;
8476 for (i=0; i<256; i++) {
8479 while ((c = trnext(&trsrc, enc)) != errc) {
8484 if (!hash) hash = rb_hash_new();
8488 while ((c = trnext(&trrepl, enc)) != errc)
8491 for (i=0; i<256; i++) {
8492 if (trans[i] != errc) {
8500 for (i=0; i<256; i++) {
8503 while ((c = trnext(&trsrc, enc)) != errc) {
8504 r = trnext(&trrepl, enc);
8505 if (r == errc) r = trrepl.now;
8508 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8511 if (!hash) hash = rb_hash_new();
8519 str_modify_keep_cr(str);
8520 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8521 termlen = rb_enc_mbminlen(enc);
8524 long offset, max = RSTRING_LEN(str);
8525 unsigned int save = -1;
8526 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8531 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8534 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8537 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8539 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8548 if (cflag) c = last;
8551 else if (cflag) c = errc;
8557 if (c != (
unsigned int)-1) {
8563 tlen = rb_enc_codelen(c, enc);
8569 if (enc != e1) may_modify = 1;
8571 if ((offset = t - buf) + tlen > max) {
8572 size_t MAYBE_UNUSED(old) = max + termlen;
8573 max = offset + tlen + (send - s);
8574 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8577 rb_enc_mbcput(c, t, enc);
8578 if (may_modify && memcmp(s, t, tlen) != 0) {
8584 if (!STR_EMBED_P(str)) {
8585 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8587 TERM_FILL((
char *)t, termlen);
8588 RSTRING(str)->as.heap.ptr = (
char *)buf;
8589 STR_SET_LEN(str, t - buf);
8590 STR_SET_NOEMBED(str);
8591 RSTRING(str)->as.heap.aux.capa = max;
8595 c = (
unsigned char)*s;
8596 if (trans[c] != errc) {
8613 long offset, max = (long)((send - s) * 1.2);
8614 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8619 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8622 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8625 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8627 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8635 if (cflag) c = last;
8638 else if (cflag) c = errc;
8642 c = cflag ? last : errc;
8645 tlen = rb_enc_codelen(c, enc);
8650 if (enc != e1) may_modify = 1;
8652 if ((offset = t - buf) + tlen > max) {
8653 size_t MAYBE_UNUSED(old) = max + termlen;
8654 max = offset + tlen + (long)((send - s) * 1.2);
8655 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8659 rb_enc_mbcput(c, t, enc);
8660 if (may_modify && memcmp(s, t, tlen) != 0) {
8668 if (!STR_EMBED_P(str)) {
8669 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8671 TERM_FILL((
char *)t, termlen);
8672 RSTRING(str)->as.heap.ptr = (
char *)buf;
8673 STR_SET_LEN(str, t - buf);
8674 STR_SET_NOEMBED(str);
8675 RSTRING(str)->as.heap.aux.capa = max;
8681 rb_enc_associate(str, enc);
8700 return tr_trans(str, src, repl, 0);
8747 tr_trans(str, src, repl, 0);
8751#define TR_TABLE_MAX (UCHAR_MAX+1)
8752#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one character-set specification (str) into the selection state used
 * by the delete/squeeze/count family.
 * stable has TR_TABLE_MAX per-byte slots plus one extra slot at index
 * TR_TABLE_MAX that carries a flag.
 * Visible steps: detect a leading caret on a spec longer than one byte;
 * on the first spec (or when the extra slot must be cleared) reinitialise
 * the table; walk the spec with trnext(), marking code points below
 * TR_TABLE_MAX in a scratch buffer with the value !cflag and recording
 * larger ones in a hash; finally AND the scratch buffer into stable so that
 * successive specs intersect.
 * NOTE(review): the remaining parameters of the signature, the bodies of the
 * initialisation loops and the hand-over of the hash tables are not in this
 * listing.
 */
8754tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8757 const unsigned int errc = -1;
8758 char buf[TR_TABLE_MAX];
8761 VALUE table = 0, ptable = 0;
8762 int i, l, cflag = 0;
8764 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8765 tr.gen =
tr.now =
tr.max = 0;
8767 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8772 for (i=0; i<TR_TABLE_MAX; i++) {
8775 stable[TR_TABLE_MAX] = cflag;
8777 else if (stable[TR_TABLE_MAX] && !cflag) {
8778 stable[TR_TABLE_MAX] = 0;
8780 for (i=0; i<TR_TABLE_MAX; i++) {
8784 while ((c = trnext(&
tr, enc)) != errc) {
8785 if (c < TR_TABLE_MAX) {
8786 buf[(
unsigned char)c] = !cflag;
8791 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8794 table = ptable ? ptable : rb_hash_new();
8798 table = rb_hash_new();
8803 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8804 rb_hash_aset(table, key,
Qtrue);
/* Intersect: a byte stays selected only if this spec selects it too. */
8808 for (i=0; i<TR_TABLE_MAX; i++) {
8809 stable[i] = stable[i] && buf[i];
8811 if (!table && !cflag) {
/*
 * Tests whether code point c is selected by the state built with
 * tr_setup_table().
 * Code points below TR_TABLE_MAX are answered directly from the byte table.
 * Larger ones consult the del and nodel hashes; when neither decides, the
 * flag stored in the extra slot table[TR_TABLE_MAX] is returned as TRUE or
 * FALSE.
 * NOTE(review): the results returned inside the two hash branches and the
 * construction of the key v are not in this listing.
 */
8818tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8820 if (c < TR_TABLE_MAX) {
8821 return table[c] != 0;
8827 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8828 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8832 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8835 return table[TR_TABLE_MAX] ? TRUE : FALSE;
/*
 * In-place removal of the characters selected by the argument specs.
 * An empty string (or one without a buffer) yields Qnil at once.
 * Each argument is checked for encoding compatibility and folded into the
 * selection table with tr_setup_table(); only the first call passes a
 * non-zero first flag.
 * The string is then compacted with a read cursor s and a write cursor t:
 * bytes below 0x80 in an ASCII-compatible encoding take a fast route, other
 * characters are decoded with rb_enc_codepoint_len(), tested with tr_find()
 * and, when kept, re-encoded at t if the cursors have diverged.
 * Afterwards the terminator is rewritten at t and the length is set to the
 * distance from the buffer start to t.
 * Returns str when something was removed.
 * NOTE(review): the loop header, the fast-route body, the coderange update
 * and the return for the unmodified case are not in this listing.
 */
8850rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8852 char squeez[TR_TABLE_SIZE];
8855 VALUE del = 0, nodel = 0;
8857 int i, ascompat, cr;
8859 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8861 for (i=0; i<argc; i++) {
8865 enc = rb_enc_check(str, s);
8866 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8869 str_modify_keep_cr(str);
8870 ascompat = rb_enc_asciicompat(enc);
8871 s = t = RSTRING_PTR(str);
8878 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8889 c = rb_enc_codepoint_len(s, send, &clen, enc);
8891 if (tr_find(c, squeez, del, nodel)) {
8895 if (t != s) rb_enc_mbcput(c, t, enc);
8902 TERM_FILL(t, TERM_LEN(str));
8903 STR_SET_LEN(str, t - RSTRING_PTR(str));
8906 if (modify)
return str;
/*
 * Non-destructive counterpart of rb_str_delete_bang(): the visible line
 * applies rb_str_delete_bang() with the same argc/argv.
 * NOTE(review): the duplication of the receiver before this call and the
 * return are not in this listing; as shown, the call targets str.
 */
8920rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8923 rb_str_delete_bang(argc, argv, str);
/*
 * In-place collapsing of runs of identical characters.
 * With arguments, each spec is checked for encoding compatibility and folded
 * into the selection table with tr_setup_table(); a spec that is not
 * single-byte optimisable affects the singlebyte route.
 * An empty string (or one without a buffer) yields Qnil.
 * The string is compacted with a read cursor s and a write cursor t. A
 * character is kept when it differs from the previously seen one (save) or,
 * when arguments were given, when it is not selected by the table. Three
 * routes are visible: pure single-byte, an ASCII fast route for bytes below
 * 0x80, and a general route that decodes with rb_enc_codepoint_len() and
 * tests with tr_find().
 * Afterwards the terminator is rewritten and the length is updated only if
 * it actually changed.
 * Returns str when something was removed.
 * NOTE(review): loop headers, the updates of save and modify, and the return
 * for the unmodified case are not in this listing.
 */
8937rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8939 char squeez[TR_TABLE_SIZE];
8941 VALUE del = 0, nodel = 0;
8942 unsigned char *s, *send, *t;
8944 int ascompat, singlebyte = single_byte_optimizable(str);
8948 enc = STR_ENC_GET(str);
8951 for (i=0; i<argc; i++) {
8955 enc = rb_enc_check(str, s);
8956 if (singlebyte && !single_byte_optimizable(s))
8958 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8962 str_modify_keep_cr(str);
8963 s = t = (
unsigned char *)RSTRING_PTR(str);
8964 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8967 ascompat = rb_enc_asciicompat(enc);
8971 unsigned int c = *s++;
8972 if (c != save || (argc > 0 && !squeez[c])) {
8982 if (ascompat && (c = *s) < 0x80) {
8983 if (c != save || (argc > 0 && !squeez[c])) {
8989 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8991 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8992 if (t != s) rb_enc_mbcput(c, t, enc);
9001 TERM_FILL((
char *)t, TERM_LEN(str));
9002 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9003 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
9007 if (modify)
return str;
/*
 * Non-destructive counterpart of rb_str_squeeze_bang(): the visible line
 * applies rb_str_squeeze_bang() with the same argc/argv.
 * NOTE(review): the duplication of the receiver before this call and the
 * return are not in this listing; as shown, the call targets str.
 */
9030rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
9033 rb_str_squeeze_bang(argc, argv, str);
9051 return tr_trans(str, src, repl, 1);
9074 tr_trans(str, src, repl, 1);
/*
 * Counts the characters of str selected by the argument specs.
 * Fast route: when the first spec is a single byte, the encoding is
 * ASCII-compatible, reverse matching is allowed for that byte and str is not
 * broken, the bytes of str are compared against that one byte and matches
 * are tallied in n.
 * General route: the first spec initialises the selection table (first =
 * TRUE) and every further spec is intersected into it (first = FALSE); str
 * is then scanned, with bytes below 0x80 in an ASCII-compatible encoding
 * taking a fast branch and other characters decoded with
 * rb_enc_codepoint_len() and tested with tr_find().
 * Both routes return INT2FIX(0) for an empty string or one without a buffer.
 * NOTE(review): argument checking, the loop headers, the tally in the
 * general route and the final return are not in this listing.
 */
9087rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9089 char table[TR_TABLE_SIZE];
9091 VALUE del = 0, nodel = 0, tstr;
9101 enc = rb_enc_check(str, tstr);
9104 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9105 (ptstr = RSTRING_PTR(tstr),
9106 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9107 !is_broken_string(str)) {
9109 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9111 s = RSTRING_PTR(str);
9112 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9115 if (*(
unsigned char*)s++ == c) n++;
9121 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9122 for (i=1; i<argc; i++) {
9125 enc = rb_enc_check(str, tstr);
9126 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9129 s = RSTRING_PTR(str);
9130 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9132 ascompat = rb_enc_asciicompat(enc);
9136 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9144 c = rb_enc_codepoint_len(s, send, &clen, enc);
9145 if (tr_find(c, table, del, nodel)) {
9156rb_fs_check(
VALUE val)
9160 if (
NIL_P(val))
return 0;
9165static const char isspacetable[256] = {
9166 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9167 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9168 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9169 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9172 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9173 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Table-driven whitespace test: looks (c), narrowed to unsigned char, up in
 * isspacetable.  The table holds 1 only at indices 9..13 (TAB, LF, VT, FF,
 * CR) and 32 (space); every other byte value, including all bytes >= 0x80,
 * yields 0.  The cast keeps a negative plain-char argument from indexing
 * outside the 256-entry table.
 */
9184#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9187split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9189 if (empty_count >= 0 &&
len == 0) {
9190 return empty_count + 1;
9192 if (empty_count > 0) {
9197 }
while (--empty_count > 0);
9201 rb_yield(str_new_empty_String(str));
9202 }
while (--empty_count > 0);
9216 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9220literal_split_pattern(
VALUE spat, split_type_t default_type)
9228 return SPLIT_TYPE_CHARS;
9230 else if (rb_enc_asciicompat(enc)) {
9231 if (
len == 1 && ptr[0] ==
' ') {
9232 return SPLIT_TYPE_AWK;
9237 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9238 return SPLIT_TYPE_AWK;
9241 return default_type;
9254rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9259 split_type_t split_type;
9260 long beg, end, i = 0, empty_count = -1;
9265 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9267 if (lim <= 0) limit =
Qnil;
9268 else if (lim == 1) {
9269 if (RSTRING_LEN(str) == 0)
9280 if (
NIL_P(limit) && !lim) empty_count = 0;
9282 enc = STR_ENC_GET(str);
9283 split_type = SPLIT_TYPE_REGEXP;
9285 spat = get_pat_quoted(spat, 0);
9287 else if (
NIL_P(spat = rb_fs)) {
9288 split_type = SPLIT_TYPE_AWK;
9290 else if (!(spat = rb_fs_check(spat))) {
9291 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9296 if (split_type != SPLIT_TYPE_AWK) {
9301 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9302 if (split_type == SPLIT_TYPE_AWK) {
9304 split_type = SPLIT_TYPE_STRING;
9309 mustnot_broken(spat);
9310 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Hands the byte range (beg, len) of str to split_string() and stores the
 * value it returns back into empty_count, then calls str_mod_check() with
 * the start pointer and length captured before splitting began.
 * The expansion is a single comma expression and depends on the locals
 * result, str, empty_count, str_start and str_len being in scope where it
 * is used.
 * NOTE(review): str_mod_check() is not visible here; presumably it raises if
 * str was modified while split_string() ran (it can yield to a block) --
 * confirm against its definition.
 */
9318#define SPLIT_STR(beg, len) ( \
9319 empty_count = split_string(result, str, beg, len, empty_count), \
9320 str_mod_check(str, str_start, str_len))
9323 char *ptr = RSTRING_PTR(str);
9324 char *
const str_start = ptr;
9325 const long str_len = RSTRING_LEN(str);
9326 char *
const eptr = str_start + str_len;
9327 if (split_type == SPLIT_TYPE_AWK) {
9334 if (is_ascii_string(str)) {
9335 while (ptr < eptr) {
9336 c = (
unsigned char)*ptr++;
9338 if (ascii_isspace(c)) {
9344 if (!
NIL_P(limit) && lim <= i)
break;
9347 else if (ascii_isspace(c)) {
9348 SPLIT_STR(beg, end-beg);
9351 if (!
NIL_P(limit)) ++i;
9359 while (ptr < eptr) {
9362 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9371 if (!
NIL_P(limit) && lim <= i)
break;
9375 SPLIT_STR(beg, end-beg);
9378 if (!
NIL_P(limit)) ++i;
9386 else if (split_type == SPLIT_TYPE_STRING) {
9387 char *substr_start = ptr;
9388 char *sptr = RSTRING_PTR(spat);
9389 long slen = RSTRING_LEN(spat);
9392 mustnot_broken(str);
9393 enc = rb_enc_check(str, spat);
9394 while (ptr < eptr &&
9395 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9398 if (t != ptr + end) {
9402 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9403 str_mod_check(spat, sptr, slen);
9406 if (!
NIL_P(limit) && lim <= ++i)
break;
9408 beg = ptr - str_start;
9410 else if (split_type == SPLIT_TYPE_CHARS) {
9414 mustnot_broken(str);
9415 enc = rb_enc_get(str);
9416 while (ptr < eptr &&
9417 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9418 SPLIT_STR(ptr - str_start, n);
9420 if (!
NIL_P(limit) && lim <= ++i)
break;
9422 beg = ptr - str_start;
9426 long len = RSTRING_LEN(str);
9434 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9439 if (start == end && BEG(0) == END(0)) {
9444 else if (last_null == 1) {
9445 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9452 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9458 SPLIT_STR(beg, end-beg);
9459 beg = start = END(0);
9463 for (idx=1; idx < regs->num_regs; idx++) {
9464 if (BEG(idx) == -1)
continue;
9465 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9467 if (!
NIL_P(limit) && lim <= ++i)
break;
9469 if (match) rb_match_unbusy(match);
9471 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9472 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9475 return result ? result : str;
9485 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to a new array created with capacity `size` when the current
 * method was called without a block, and to 0 when a block was given.
 * The first argument `m` (callers pass the method name as a string literal)
 * is not used by the expansion.
 */
9488#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/*
 * Shorthand for enumerator_element(ary, e).
 * NOTE(review): enumerator_element() is defined outside this view; callers
 * here test its result as a boolean -- confirm what a true/false result
 * means before relying on it.
 */
9503#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9506chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9508 const char *prev = rb_enc_prev_char(p, e, e, enc);
9511 prev = rb_enc_prev_char(p, e, e, enc);
9512 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9524 RSTRING_LEN(rs) != 1 ||
9525 RSTRING_PTR(rs)[0] !=
'\n')) {
/*
 * From this point on, every use of the identifier rb_rs expands to a call
 * to get_rs(), so the value is obtained afresh at each use site.
 * NOTE(review): get_rs() is defined outside this view -- confirm what it
 * checks or returns.
 */
9531#define rb_rs get_rs()
9538 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9539 long pos,
len, rslen;
9545 static ID keywords[1];
9550 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9554 if (!ENUM_ELEM(ary, str)) {
9562 if (!RSTRING_LEN(str))
goto end;
9564 ptr = subptr = RSTRING_PTR(str);
9566 len = RSTRING_LEN(str);
9568 rslen = RSTRING_LEN(rs);
9571 enc = rb_enc_get(str);
9573 enc = rb_enc_check(str, rs);
9578 const char *eol = NULL;
9580 while (subend < pend) {
9581 long chomp_rslen = 0;
9583 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9585 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9587 if (eol == subend)
break;
9591 chomp_rslen = -rslen;
9595 if (!subptr) subptr = subend;
9599 }
while (subend < pend);
9601 if (rslen == 0) chomp_rslen = 0;
9603 subend - subptr + (chomp ? chomp_rslen : rslen));
9604 if (ENUM_ELEM(ary, line)) {
9605 str_mod_check(str, ptr,
len);
9607 subptr = eol = NULL;
9612 rsptr = RSTRING_PTR(rs);
9613 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9622 rsptr = RSTRING_PTR(rs);
9623 rslen = RSTRING_LEN(rs);
9626 while (subptr < pend) {
9627 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9631 if (hit != adjusted) {
9635 subend = hit += rslen;
9638 subend = chomp_newline(subptr, subend, enc);
9645 if (ENUM_ELEM(ary, line)) {
9646 str_mod_check(str, ptr,
len);
9651 if (subptr != pend) {
9654 pend = chomp_newline(subptr, pend, enc);
9656 else if (pend - subptr >= rslen &&
9657 memcmp(pend - rslen, rsptr, rslen) == 0) {
9662 ENUM_ELEM(ary, line);
9683rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9686 return rb_str_enumerate_lines(argc, argv, str, 0);
9741rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9743 VALUE ary = WANTARRAY(
"lines", 0);
9744 return rb_str_enumerate_lines(argc, argv, str, ary);
9758 for (i=0; i<RSTRING_LEN(str); i++) {
9759 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9777rb_str_each_byte(
VALUE str)
9780 return rb_str_enumerate_bytes(str, 0);
9792rb_str_bytes(
VALUE str)
9794 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9795 return rb_str_enumerate_bytes(str, ary);
9813 ptr = RSTRING_PTR(str);
9814 len = RSTRING_LEN(str);
9815 enc = rb_enc_get(str);
9818 for (i = 0; i <
len; i += n) {
9819 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9824 for (i = 0; i <
len; i += n) {
9825 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9846rb_str_each_char(
VALUE str)
9849 return rb_str_enumerate_chars(str, 0);
9861rb_str_chars(
VALUE str)
9864 return rb_str_enumerate_chars(str, ary);
9868rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9873 const char *ptr, *end;
9876 if (single_byte_optimizable(str))
9877 return rb_str_enumerate_bytes(str, ary);
9880 ptr = RSTRING_PTR(str);
9882 enc = STR_ENC_GET(str);
9885 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9906rb_str_each_codepoint(
VALUE str)
9909 return rb_str_enumerate_codepoints(str, 0);
9921rb_str_codepoints(
VALUE str)
9924 return rb_str_enumerate_codepoints(str, ary);
9930 int encidx = rb_enc_to_index(enc);
9932 const OnigUChar source_ascii[] =
"\\X";
9933 const OnigUChar *source = source_ascii;
9934 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that spell out a character value x as a comma-separated list of
 * OnigUChar bytes, meant to be dropped into an array initializer:
 *   CHARS_16BE -- two bytes, high byte first
 *   CHARS_16LE -- two bytes, low byte first
 *   CHARS_32BE -- four bytes, built from the upper 16-bit half followed by
 *                 the lower half, each in big-endian order
 *   CHARS_32LE -- four bytes, built from the lower 16-bit half followed by
 *                 the upper half, each in little-endian order
 * Each expansion evaluates x more than once, so pass only side-effect-free
 * expressions.
 */
9937#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9938#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9939#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9940#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9941#define CASE_UTF(e) \
9942 case ENCINDEX_UTF_##e: { \
9943 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9944 source = source_UTF_##e; \
9945 source_len = sizeof(source_UTF_##e); \
9948 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9956 regex_t *reg_grapheme_cluster;
9958 int r = onig_new(®_grapheme_cluster, source, source + source_len,
9959 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9961 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9962 onig_error_code_to_str(message, r, &einfo);
9963 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9966 return reg_grapheme_cluster;
9972 int encidx = rb_enc_to_index(enc);
9973 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9975 if (encidx == rb_utf8_encindex()) {
9976 if (!reg_grapheme_cluster_utf8) {
9977 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9980 return reg_grapheme_cluster_utf8;
9989 size_t grapheme_cluster_count = 0;
9991 const char *ptr, *end;
9993 if (!rb_enc_unicode_p(enc)) {
9997 bool cached_reg_grapheme_cluster =
true;
9998 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9999 if (!reg_grapheme_cluster) {
10000 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10001 cached_reg_grapheme_cluster =
false;
10004 ptr = RSTRING_PTR(str);
10007 while (ptr < end) {
10008 OnigPosition
len = onig_match(reg_grapheme_cluster,
10009 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10010 (
const OnigUChar *)ptr, NULL, 0);
10011 if (
len <= 0)
break;
10012 grapheme_cluster_count++;
10016 if (!cached_reg_grapheme_cluster) {
10017 onig_free(reg_grapheme_cluster);
10020 return SIZET2NUM(grapheme_cluster_count);
10024rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
10028 const char *ptr0, *ptr, *end;
10030 if (!rb_enc_unicode_p(enc)) {
10031 return rb_str_enumerate_chars(str, ary);
10036 bool cached_reg_grapheme_cluster =
true;
10037 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10038 if (!reg_grapheme_cluster) {
10039 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10040 cached_reg_grapheme_cluster =
false;
10043 ptr0 = ptr = RSTRING_PTR(str);
10046 while (ptr < end) {
10047 OnigPosition
len = onig_match(reg_grapheme_cluster,
10048 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10049 (
const OnigUChar *)ptr, NULL, 0);
10050 if (
len <= 0)
break;
10055 if (!cached_reg_grapheme_cluster) {
10056 onig_free(reg_grapheme_cluster);
10076rb_str_each_grapheme_cluster(
VALUE str)
10079 return rb_str_enumerate_grapheme_clusters(str, 0);
10091rb_str_grapheme_clusters(
VALUE str)
10094 return rb_str_enumerate_grapheme_clusters(str, ary);
10098chopped_length(
VALUE str)
10101 const char *p, *p2, *beg, *end;
10103 beg = RSTRING_PTR(str);
10104 end = beg + RSTRING_LEN(str);
10105 if (beg >= end)
return 0;
10106 p = rb_enc_prev_char(beg, end, end, enc);
10108 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10109 p2 = rb_enc_prev_char(beg, p, end, enc);
10110 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10128rb_str_chop_bang(
VALUE str)
10130 str_modify_keep_cr(str);
10131 if (RSTRING_LEN(str) > 0) {
10133 len = chopped_length(str);
10134 STR_SET_LEN(str,
len);
10135 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10154rb_str_chop(
VALUE str)
10160smart_chomp(
VALUE str,
const char *e,
const char *p)
10163 if (rb_enc_mbminlen(enc) > 1) {
10168 pp = e - rb_enc_mbminlen(enc);
10171 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10179 if (--e > p && *(e-1) ==
'\r') {
10196 char *pp, *e, *rsptr;
10198 char *
const p = RSTRING_PTR(str);
10199 long len = RSTRING_LEN(str);
10201 if (
len == 0)
return 0;
10204 return smart_chomp(str, e, p);
10207 enc = rb_enc_get(str);
10210 if (rb_enc_mbminlen(enc) > 1) {
10215 pp -= rb_enc_mbminlen(enc);
10218 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10225 while (e > p && *(e-1) ==
'\n') {
10227 if (e > p && *(e-1) ==
'\r')
10233 if (rslen >
len)
return len;
10235 enc = rb_enc_get(rs);
10236 newline = rsptr[rslen-1];
10237 if (rslen == rb_enc_mbminlen(enc)) {
10239 if (newline ==
'\n')
10240 return smart_chomp(str, e, p);
10244 return smart_chomp(str, e, p);
10248 enc = rb_enc_check(str, rs);
10249 if (is_broken_string(rs)) {
10253 if (p[
len-1] == newline &&
10255 memcmp(rsptr, pp, rslen) == 0)) {
10256 if (at_char_boundary(p, pp, e, enc))
10257 return len - rslen;
10269chomp_rs(
int argc,
const VALUE *argv)
10273 VALUE rs = argv[0];
10285 long olen = RSTRING_LEN(str);
10286 long len = chompped_length(str, rs);
10287 if (
len >= olen)
return Qnil;
10288 str_modify_keep_cr(str);
10289 STR_SET_LEN(str,
len);
10290 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10310rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10313 str_modifiable(str);
10314 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10315 rs = chomp_rs(argc, argv);
10317 return rb_str_chomp_string(str, rs);
10330rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10332 VALUE rs = chomp_rs(argc, argv);
10340 const char *
const start = s;
10342 if (!s || s >= e)
return 0;
10345 if (single_byte_optimizable(str)) {
10346 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10351 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10371rb_str_lstrip_bang(
VALUE str)
10375 long olen, loffset;
10377 str_modify_keep_cr(str);
10378 enc = STR_ENC_GET(str);
10380 loffset = lstrip_offset(str, start, start+olen, enc);
10382 long len = olen-loffset;
10383 s = start + loffset;
10384 memmove(start, s,
len);
10385 STR_SET_LEN(str,
len);
10386 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10409rb_str_lstrip(
VALUE str)
10414 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10415 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10424 rb_str_check_dummy_enc(enc);
10428 if (!s || s >= e)
return 0;
10432 if (single_byte_optimizable(str)) {
10434 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10439 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10459rb_str_rstrip_bang(
VALUE str)
10463 long olen, roffset;
10465 str_modify_keep_cr(str);
10466 enc = STR_ENC_GET(str);
10468 roffset = rstrip_offset(str, start, start+olen, enc);
10470 long len = olen - roffset;
10472 STR_SET_LEN(str,
len);
10473 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10496rb_str_rstrip(
VALUE str)
10500 long olen, roffset;
10502 enc = STR_ENC_GET(str);
10504 roffset = rstrip_offset(str, start, start+olen, enc);
10506 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10522rb_str_strip_bang(
VALUE str)
10525 long olen, loffset, roffset;
10528 str_modify_keep_cr(str);
10529 enc = STR_ENC_GET(str);
10531 loffset = lstrip_offset(str, start, start+olen, enc);
10532 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10534 if (loffset > 0 || roffset > 0) {
10535 long len = olen-roffset;
10538 memmove(start, start + loffset,
len);
10540 STR_SET_LEN(str,
len);
10541 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10564rb_str_strip(
VALUE str)
10567 long olen, loffset, roffset;
10571 loffset = lstrip_offset(str, start, start+olen, enc);
10572 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10574 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10579scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10582 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10588 end = pos + RSTRING_LEN(pat);
10602 if (RSTRING_LEN(str) > end)
10603 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10612 if (!regs || regs->num_regs == 1) {
10618 for (
int i = 1; i < regs->num_regs; i++) {
10679 long last = -1, prev = 0;
10680 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10682 pat = get_pat_quoted(pat, 1);
10683 mustnot_broken(str);
10687 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10692 if (last >= 0) rb_pat_search(pat, str, last, 1);
10697 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10701 str_mod_check(str, p,
len);
10703 if (last >= 0) rb_pat_search(pat, str, last, 1);
10730rb_str_hex(
VALUE str)
10732 return rb_str_to_inum(str, 16, FALSE);
10757rb_str_oct(
VALUE str)
10759 return rb_str_to_inum(str, -8, FALSE);
10762#ifndef HAVE_CRYPT_R
10767 rb_nativethread_lock_t lock;
10768} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10837# define CRYPT_END() ALLOCV_END(databuf)
10840 extern char *crypt(
const char *,
const char *);
10841# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10844 const char *s, *saltp;
10847 char salt_8bit_clean[3];
10851 mustnot_wchar(str);
10852 mustnot_wchar(salt);
10854 saltp = RSTRING_PTR(salt);
10855 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10856 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10860 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10861 salt_8bit_clean[0] = saltp[0] & 0x7f;
10862 salt_8bit_clean[1] = saltp[1] & 0x7f;
10863 salt_8bit_clean[2] =
'\0';
10864 saltp = salt_8bit_clean;
10869# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10870 data->initialized = 0;
10872 res = crypt_r(s, saltp, data);
10875 res = crypt(s, saltp);
10890 size_t res_size = strlen(res)+1;
10891 tmp_buf =
ALLOCA_N(
char, res_size);
10892 memcpy(tmp_buf, res, res_size);
10929 char *ptr, *p, *pend;
10932 unsigned long sum0 = 0;
10937 ptr = p = RSTRING_PTR(str);
10938 len = RSTRING_LEN(str);
10944 str_mod_check(str, ptr,
len);
10947 sum0 += (
unsigned char)*p;
10958 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10959 sum0 &= (((
unsigned long)1)<<bits)-1;
10979rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10983 long width,
len, flen = 1, fclen = 1;
10986 const char *f =
" ";
10987 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10989 int singlebyte = 1, cr;
10993 enc = STR_ENC_GET(str);
10994 termlen = rb_enc_mbminlen(enc);
10998 enc = rb_enc_check(str, pad);
10999 f = RSTRING_PTR(pad);
11000 flen = RSTRING_LEN(pad);
11001 fclen = str_strlen(pad, enc);
11002 singlebyte = single_byte_optimizable(pad);
11003 if (flen == 0 || fclen == 0) {
11004 rb_raise(rb_eArgError,
"zero width padding");
11007 len = str_strlen(str, enc);
11008 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11010 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11014 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11015 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11017 size = RSTRING_LEN(str);
11018 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11019 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11020 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11021 rb_raise(rb_eArgError,
"argument too big");
11025 p = RSTRING_PTR(res);
11027 memset(p, *f, llen);
11031 while (llen >= fclen) {
11037 memcpy(p, f, llen2);
11041 memcpy(p, RSTRING_PTR(str), size);
11044 memset(p, *f, rlen);
11048 while (rlen >= fclen) {
11054 memcpy(p, f, rlen2);
11058 TERM_FILL(p, termlen);
11059 STR_SET_LEN(res, p-RSTRING_PTR(res));
11080rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11082 return rb_str_justify(argc, argv, str,
'l');
11096rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11098 return rb_str_justify(argc, argv, str,
'r');
11111rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11113 return rb_str_justify(argc, argv, str,
'c');
11129 sep = get_pat_quoted(sep, 0);
11141 pos = rb_str_index(str, sep, 0);
11142 if (pos < 0)
goto failed;
11147 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11150 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11164 long pos = RSTRING_LEN(str);
11166 sep = get_pat_quoted(sep, 0);
11179 pos = rb_str_rindex(str, sep, pos);
11188 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11190 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11202rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11206 for (i=0; i<argc; i++) {
11207 VALUE tmp = argv[i];
11209 if (rb_reg_start_with_p(tmp, str))
11213 const char *p, *s, *e;
11218 enc = rb_enc_check(str, tmp);
11219 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11220 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11221 p = RSTRING_PTR(str);
11224 if (!at_char_right_boundary(p, s, e, enc))
11226 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11242rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11246 for (i=0; i<argc; i++) {
11247 VALUE tmp = argv[i];
11248 const char *p, *s, *e;
11253 enc = rb_enc_check(str, tmp);
11254 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11255 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11256 p = RSTRING_PTR(str);
11259 if (!at_char_boundary(p, s, e, enc))
11261 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11277deleted_prefix_length(
VALUE str,
VALUE prefix)
11279 const char *strptr, *prefixptr;
11280 long olen, prefixlen;
11285 if (!is_broken_string(prefix) ||
11286 !rb_enc_asciicompat(enc) ||
11287 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11288 enc = rb_enc_check(str, prefix);
11292 prefixlen = RSTRING_LEN(prefix);
11293 if (prefixlen <= 0)
return 0;
11294 olen = RSTRING_LEN(str);
11295 if (olen < prefixlen)
return 0;
11296 strptr = RSTRING_PTR(str);
11297 prefixptr = RSTRING_PTR(prefix);
11298 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11299 if (is_broken_string(prefix)) {
11300 if (!is_broken_string(str)) {
11304 const char *strend = strptr + olen;
11305 const char *after_prefix = strptr + prefixlen;
11306 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11327rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11330 str_modify_keep_cr(str);
11332 prefixlen = deleted_prefix_length(str, prefix);
11333 if (prefixlen <= 0)
return Qnil;
11347rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11351 prefixlen = deleted_prefix_length(str, prefix);
11352 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11354 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11367deleted_suffix_length(
VALUE str,
VALUE suffix)
11369 const char *strptr, *suffixptr;
11370 long olen, suffixlen;
11374 if (is_broken_string(suffix))
return 0;
11375 enc = rb_enc_check(str, suffix);
11378 suffixlen = RSTRING_LEN(suffix);
11379 if (suffixlen <= 0)
return 0;
11380 olen = RSTRING_LEN(str);
11381 if (olen < suffixlen)
return 0;
11382 strptr = RSTRING_PTR(str);
11383 suffixptr = RSTRING_PTR(suffix);
11384 const char *strend = strptr + olen;
11385 const char *before_suffix = strend - suffixlen;
11386 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11387 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11403rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11405 long olen, suffixlen,
len;
11406 str_modifiable(str);
11408 suffixlen = deleted_suffix_length(str, suffix);
11409 if (suffixlen <= 0)
return Qnil;
11411 olen = RSTRING_LEN(str);
11412 str_modify_keep_cr(str);
11413 len = olen - suffixlen;
11414 STR_SET_LEN(str,
len);
11415 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11431rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11435 suffixlen = deleted_suffix_length(str, suffix);
11436 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11438 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11445 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11453 val = rb_fs_check(val);
11456 "value of %"PRIsVALUE
" must be String or Regexp",
11460 rb_warn_deprecated(
"'$;'", NULL);
11477 str_modifiable(str);
11480 int idx = rb_enc_to_index(encoding);
11487 rb_enc_associate_index(str, idx);
11511 if (STR_EMBED_P(str)) {
11512 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11517 str_replace_shared_without_enc(str2, str);
11519 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11552rb_str_valid_encoding_p(
VALUE str)
11572rb_str_is_ascii_only_p(
VALUE str)
11582 static const char ellipsis[] =
"...";
11583 const long ellipsislen =
sizeof(ellipsis) - 1;
11585 const long blen = RSTRING_LEN(str);
11586 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11587 VALUE estr, ret = 0;
11590 if (
len * rb_enc_mbminlen(enc) >= blen ||
11594 else if (
len <= ellipsislen ||
11596 if (rb_enc_asciicompat(enc)) {
11598 rb_enc_associate(ret, enc);
11605 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11610 rb_enc_from_encoding(enc), 0,
Qnil);
11623 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11629 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11648 if (enc == STR_ENC_GET(str)) {
11653 return enc_str_scrub(enc, str, repl, cr);
11661 const char *rep, *p, *e, *p1, *sp;
11667 rb_raise(rb_eArgError,
"both of block and replacement given");
11674 if (!
NIL_P(repl)) {
11675 repl = str_compat_and_valid(repl, enc);
11678 if (rb_enc_dummy_p(enc)) {
11681 encidx = rb_enc_to_index(enc);
11683#define DEFAULT_REPLACE_CHAR(str) do { \
11684 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11685 rep = replace; replen = (int)sizeof(replace); \
11688 slen = RSTRING_LEN(str);
11689 p = RSTRING_PTR(str);
11694 if (rb_enc_asciicompat(enc)) {
11700 else if (!
NIL_P(repl)) {
11701 rep = RSTRING_PTR(repl);
11702 replen = RSTRING_LEN(repl);
11705 else if (encidx == rb_utf8_encindex()) {
11706 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11710 DEFAULT_REPLACE_CHAR(
"?");
11715 p = search_nonascii(p, e);
11720 int ret = rb_enc_precise_mbclen(p, e, enc);
11739 if (e - p < clen) clen = e - p;
11746 for (; clen > 1; clen--) {
11747 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11758 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11759 str_mod_check(str, sp, slen);
11760 repl = str_compat_and_valid(repl, enc);
11767 p = search_nonascii(p, e);
11793 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11794 str_mod_check(str, sp, slen);
11795 repl = str_compat_and_valid(repl, enc);
11804 long mbminlen = rb_enc_mbminlen(enc);
11808 else if (!
NIL_P(repl)) {
11809 rep = RSTRING_PTR(repl);
11810 replen = RSTRING_LEN(repl);
11812 else if (encidx == ENCINDEX_UTF_16BE) {
11813 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11815 else if (encidx == ENCINDEX_UTF_16LE) {
11816 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11818 else if (encidx == ENCINDEX_UTF_32BE) {
11819 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11821 else if (encidx == ENCINDEX_UTF_32LE) {
11822 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11825 DEFAULT_REPLACE_CHAR(
"?");
11829 int ret = rb_enc_precise_mbclen(p, e, enc);
11842 if (e - p < clen) clen = e - p;
11843 if (clen <= mbminlen * 2) {
11848 for (; clen > mbminlen; clen-=mbminlen) {
11849 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11859 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11860 str_mod_check(str, sp, slen);
11861 repl = str_compat_and_valid(repl, enc);
11886 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11887 str_mod_check(str, sp, slen);
11888 repl = str_compat_and_valid(repl, enc);
11924str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11932static ID id_normalize;
11933static ID id_normalized_p;
11934static VALUE mUnicodeNormalize;
11937unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11939 static int UnicodeNormalizeRequired = 0;
11942 if (!UnicodeNormalizeRequired) {
11943 rb_require(
"unicode_normalize/normalize.rb");
11944 UnicodeNormalizeRequired = 1;
11948 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11985rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11987 return unicode_normalize_common(argc, argv, str, id_normalize);
12001rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12003 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12030rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12032 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/* sym_equal is simply another name for rb_obj_equal; no separate function exists. */
12164#define sym_equal rb_obj_equal
12167sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12171 int c = rb_enc_precise_mbclen(s, send, enc);
12175 c = rb_enc_mbc_to_codepoint(s, send, enc);
12183rb_str_symname_p(
VALUE sym)
12188 rb_encoding *resenc = rb_default_internal_encoding();
12190 if (resenc == NULL) resenc = rb_default_external_encoding();
12191 enc = STR_ENC_GET(sym);
12192 ptr = RSTRING_PTR(sym);
12193 len = RSTRING_LEN(sym);
12194 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12202rb_str_quote_unprintable(
VALUE str)
12210 resenc = rb_default_internal_encoding();
12211 if (resenc == NULL) resenc = rb_default_external_encoding();
12212 enc = STR_ENC_GET(str);
12213 ptr = RSTRING_PTR(str);
12214 len = RSTRING_LEN(str);
12215 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12216 !sym_printable(ptr, ptr +
len, enc)) {
12217 return rb_str_escape(str);
12223rb_id_quote_unprintable(
ID id)
12225 VALUE str = rb_id2str(
id);
12226 if (!rb_str_symname_p(str)) {
12227 return rb_str_escape(str);
12245sym_inspect(
VALUE sym)
12252 if (!rb_str_symname_p(str)) {
12254 len = RSTRING_LEN(str);
12255 rb_str_resize(str,
len + 1);
12256 dest = RSTRING_PTR(str);
12257 memmove(dest + 1, dest,
len);
12261 VALUE orig_str = str;
12263 len = RSTRING_LEN(orig_str);
12264 str = rb_enc_str_new(0,
len + 1, enc);
12267 ptr = RSTRING_PTR(orig_str);
12268 dest = RSTRING_PTR(str);
12269 memcpy(dest + 1, ptr,
len);
12289rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12294 rb_raise(rb_eArgError,
"no receiver given");
12391 return rb_str_match(
rb_sym2str(sym), other);
12406sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12408 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12421sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12423 return rb_str_match_m_p(argc, argv, sym);
12441 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12452sym_length(
VALUE sym)
12466sym_empty(
VALUE sym)
12500sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12516sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12532sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12546sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12548 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12561sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12563 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12575sym_encoding(
VALUE sym)
12581string_for_symbol(
VALUE name)
12586 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12600 name = string_for_symbol(name);
12601 return rb_intern_str(name);
12610 name = string_for_symbol(name);
12634 return rb_fstring(str);
12641 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12653 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12654 rb_enc_autoload(enc);
12658 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12664 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12665 rb_enc_autoload(enc);
12669 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12680rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12685 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12686 rb_str_buf_cat_byte(str, (
char) code);
12696fstring_set_class_i(
VALUE *str,
void *data)
12700 return ST_CONTINUE;
12708 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12875 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that is always enabled, regardless of RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the type check fails.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.