14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
48#include "ruby_assert.h"
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
/* Shorthand accessors for entry `no` of regs->beg / regs->end.
 * Both expand to a reference to a variable named `regs`, so they can only be
 * used where such a variable is in scope. */
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
66#undef rb_usascii_str_new
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
/* NOTE(review): presumably an upper bound on the byte length of one
 * character -- confirm against its users. */
130#define RUBY_MAX_CHAR_LEN 16
/* String-specific names for generic FL_USER flag bits. */
/* Set by str_store_precomputed_hash() once a hash value has been copied in
 * just past the string's terminator. */
131#define STR_PRECOMPUTED_HASH FL_USER4
/* Set by STR_SET_SHARED on the string that other strings share from. */
132#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED on the shared string when its class is 0. */
133#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(); also part of STR_UNMODIFIABLE_MASK. */
134#define STR_TMPLOCK FL_USER7
/* Set by str_new_static(); str_discard() does not free the buffer of a
 * string carrying this flag. */
135#define STR_NOFREE FL_USER18
/* Tested by STR_SET_SHARED before it records the shared string. */
136#define STR_FAKESTR FL_USER19
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clears STR_NOEMBED together with STR_SHARED and STR_NOFREE on `str`. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
149str_encindex_fastpath(
int encindex)
153 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_US_ASCII:
163str_enc_fastpath(
VALUE str)
/* Terminator length for `str`: 1 when str_enc_fastpath(str) holds, otherwise
 * rb_enc_mbminlen() of the string's encoding.
 * `str` appears twice in the expansion, so avoid arguments with side effects. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
177#define RESIZE_CAPA(str,capacity) do {\
178 const int termlen = TERM_LEN(str);\
179 RESIZE_CAPA_TERM(str,capacity,termlen);\
181#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
182 if (STR_EMBED_P(str)) {\
183 if (str_embed_capa(str) < capacity + termlen) {\
184 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
185 const long tlen = RSTRING_LEN(str);\
186 memcpy(tmp, RSTRING_PTR(str), tlen);\
187 RSTRING(str)->as.heap.ptr = tmp;\
188 RSTRING(str)->len = tlen;\
189 STR_SET_NOEMBED(str);\
190 RSTRING(str)->as.heap.aux.capa = (capacity);\
194 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
195 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
196 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
197 RSTRING(str)->as.heap.aux.capa = (capacity);\
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
/* The heap buffer pointer stored in `str` (as.heap.ptr). */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size passed to the sized free/realloc calls for the heap buffer:
 * the recorded capacity plus the terminator length. */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Encoding lookup for `str`; forwards to get_encoding() from this file. */
217#define STR_ENC_GET(str) get_encoding(str)
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
230str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
236rb_str_reembeddable_p(
VALUE str)
238 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242rb_str_embed_size(
long capa)
248rb_str_size_as_embedded(
VALUE str)
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
260 real_size =
sizeof(
struct RString);
264 real_size +=
sizeof(st_index_t);
271STR_EMBEDDABLE_P(
long len,
long termlen)
273 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/* Forward declarations of file-local helpers that are used before their
 * definitions appear further down in this file. */

/* When `copy_encoding` is zero the result uses the ASCII-8BIT encoding and a
 * 1-byte terminator instead of those of `orig`. */
278static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
/* Raises ArgumentError for a negative `len`; the resulting string is
 * associated with the encoding index `encindex`. */
279static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
/* Called with `expand` == 0L by str_make_independent(). */
281static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
/* Runs the chilled / tmplock / frozen checks when any bit of
 * STR_UNMODIFIABLE_MASK is set on `str`. */
282static inline void str_modifiable(
VALUE str);
287str_make_independent(
VALUE str)
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str),
len, 0L, termlen);
294static inline int str_dependent_p(
VALUE str);
297rb_str_make_independent(
VALUE str)
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
305rb_str_make_embedded(
VALUE str)
310 char *buf =
RSTRING(str)->as.heap.ptr;
314 STR_SET_LEN(str,
len);
317 memcpy(RSTRING_PTR(str), buf,
len);
321 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
325rb_debug_rstring_null_ptr(
const char *func)
327 fprintf(stderr,
"%s is returning NULL!! "
328 "SIGSEGV is highly expected to follow immediately.\n"
329 "If you could reproduce, attach your debugger here, "
330 "and look at the passed string.\n",
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
338get_encoding(
VALUE str)
344mustnot_broken(
VALUE str)
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
352mustnot_wchar(
VALUE str)
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
/* Forward declaration; the definition appears further down and looks `str` up
 * in (or inserts it into) fstring_table_obj through
 * rb_concurrent_set_find_or_insert(). `force_precompute_hash` is forwarded to
 * the table's create callback. */
360static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
368BARE_STRING_P(
VALUE str)
373static inline st_index_t
374str_do_hash(
VALUE str)
376 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 if (e && !is_ascii_string(str)) {
385str_store_precomputed_hash(
VALUE str, st_index_t hash)
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
398 FL_SET(str, STR_PRECOMPUTED_HASH);
411 if (
FL_TEST(str, RSTRING_FSTR))
414 bare = BARE_STRING_P(str);
416 if (STR_EMBED_P(str)) {
421 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 rb_str_resize(str, RSTRING_LEN(str));
430 fstr = register_fstring(str,
false,
false);
433 str_replace_shared_without_enc(str, fstr);
440static VALUE fstring_table_obj;
443fstring_concurrent_set_hash(
VALUE str)
445#ifdef PRECOMPUTED_FAKESTR_HASH
449 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
466 const char *aptr, *bptr;
473 return (alen == blen &&
475 memcmp(aptr, bptr, alen) == 0);
480 bool force_precompute_hash;
484fstring_concurrent_set_create(
VALUE str,
void *data)
494 long len = RSTRING_LEN(str);
495 long capa =
len +
sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
525 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
528 if (STR_SHARED_P(str)) {
530 str_make_independent(str);
533 if (!BARE_STRING_P(str)) {
539 RBASIC(str)->flags |= RSTRING_FSTR;
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
559Init_fstring_table(
void)
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
566register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
570 .force_precompute_hash = force_precompute_hash
573#if SIZEOF_VOIDP == SIZEOF_LONG
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593rb_obj_is_fstring_table(
VALUE obj)
597 return obj == fstring_table_obj;
601rb_gc_free_fstring(
VALUE obj)
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
614rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
622setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
625 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
638 return (
VALUE)fake_str;
647 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
656rb_fstring_new(
const char *ptr,
long len)
659 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
666 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
670rb_fstring_cstr(
const char *
ptr)
672 return rb_fstring_new(
ptr, strlen(
ptr));
676single_byte_optimizable(
VALUE str)
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
703static inline const char *
704search_nonascii(
const char *p,
const char *e)
706 const uintptr_t *s, *t;
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
714# error "don't know what to do."
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL
722# error "don't know what to do."
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
734 case 7:
if (p[-7]&0x80)
return p-7;
735 case 6:
if (p[-6]&0x80)
return p-6;
736 case 5:
if (p[-5]&0x80)
return p-5;
737 case 4:
if (p[-4]&0x80)
return p-4;
739 case 3:
if (p[-3]&0x80)
return p-3;
740 case 2:
if (p[-2]&0x80)
return p-2;
741 case 1:
if (p[-1]&0x80)
return p-1;
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
750#define aligned_ptr(value) (uintptr_t *)(value)
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
756 if (*s & NONASCII_MASK) {
757#ifdef WORDS_BIGENDIAN
758 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
760 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
770 case 7:
if (e[-7]&0x80)
return e-7;
771 case 6:
if (e[-6]&0x80)
return e-6;
772 case 5:
if (e[-5]&0x80)
return e-5;
773 case 4:
if (e[-4]&0x80)
return e-4;
775 case 3:
if (e[-3]&0x80)
return e-3;
776 case 2:
if (e[-2]&0x80)
return e-2;
777 case 1:
if (e[-1]&0x80)
return e-1;
785 const char *e = p +
len;
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
789 p = search_nonascii(p, e);
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
797 int ret = rb_enc_precise_mbclen(p, e, enc);
801 p = search_nonascii(p, e);
807 int ret = rb_enc_precise_mbclen(p, e, enc);
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
826 p = search_nonascii(p, e);
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
837 int ret = rb_enc_precise_mbclen(p, e, enc);
844 p = search_nonascii(p, e);
850 int ret = rb_enc_precise_mbclen(p, e, enc);
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
883rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
888 str_enc_copy(dest, src);
889 if (RSTRING_LEN(dest) == 0) {
890 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
901 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
902 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
913rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
915 str_enc_copy(dest, src);
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
928 return enc_coderange_scan(str, enc);
937 cr = enc_coderange_scan(str, get_encoding(str));
944rb_enc_str_asciicompat(
VALUE str)
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
964str_mod_check(
VALUE s,
const char *p,
long len)
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
972str_capacity(
VALUE str,
const int termlen)
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
977 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
981 return RSTRING(str)->as.heap.aux.capa;
988 return str_capacity(str, TERM_LEN(str));
992must_not_null(
const char *
ptr)
995 rb_raise(rb_eArgError,
"NULL pointer given");
1000str_alloc_embed(
VALUE klass,
size_t capa)
1002 size_t size = rb_str_embed_size(
capa);
1006 NEWOBJ_OF(str,
struct RString, klass,
1013str_alloc_heap(
VALUE klass)
1015 NEWOBJ_OF(str,
struct RString, klass,
1022empty_str_alloc(
VALUE klass)
1024 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1025 VALUE str = str_alloc_embed(klass, 0);
1026 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1037 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1041 enc = rb_ascii8bit_encoding();
1044 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1046 int termlen = rb_enc_mbminlen(enc);
1048 if (STR_EMBEDDABLE_P(
len, termlen)) {
1049 str = str_alloc_embed(klass,
len + termlen);
1055 str = str_alloc_heap(klass);
1061 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1064 rb_enc_raw_set(str, enc);
1067 memcpy(RSTRING_PTR(str),
ptr,
len);
1070 STR_SET_LEN(str,
len);
1071 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1078 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1113 __msan_unpoison_string(
ptr);
1133 if (rb_enc_mbminlen(enc) != 1) {
1134 rb_raise(rb_eArgError,
"wchar encoding given");
1136 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1140str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1145 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1149 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1152 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1153 str = str_alloc_heap(klass);
1157 RBASIC(str)->flags |= STR_NOFREE;
1158 rb_enc_associate_index(str, encindex);
1187static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1189 int ecflags,
VALUE ecopts);
1194 int encidx = rb_enc_to_index(enc);
1195 if (rb_enc_get_index(str) == encidx)
1196 return is_ascii_string(str);
1207 if (!to)
return str;
1208 if (!from) from = rb_enc_get(str);
1209 if (from == to)
return str;
1210 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1211 rb_is_ascii8bit_enc(to)) {
1212 if (STR_ENC_GET(str) != to) {
1214 rb_enc_associate(str, to);
1221 from, to, ecflags, ecopts);
1222 if (
NIL_P(newstr)) {
1230rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1235 olen = RSTRING_LEN(newstr);
1236 if (ofs < -olen || olen < ofs)
1238 if (ofs < 0) ofs += olen;
1240 STR_SET_LEN(newstr, ofs);
1244 rb_str_modify(newstr);
1245 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1253 STR_SET_LEN(str, 0);
1254 rb_enc_associate(str, enc);
1260str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1262 int ecflags,
VALUE ecopts)
1267 VALUE econv_wrapper;
1268 const unsigned char *start, *sp;
1269 unsigned char *dest, *dp;
1270 size_t converted_output = (size_t)ofs;
1275 RBASIC_CLEAR_CLASS(econv_wrapper);
1277 if (!ec)
return Qnil;
1280 sp = (
unsigned char*)
ptr;
1282 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1283 (dp = dest + converted_output),
1287 size_t converted_input = sp - start;
1288 size_t rest =
len - converted_input;
1289 converted_output = dp - dest;
1291 if (converted_input && converted_output &&
1292 rest < (LONG_MAX / converted_output)) {
1293 rest = (rest * converted_output) / converted_input;
1298 olen += rest < 2 ? 2 : rest;
1299 rb_str_resize(newstr, olen);
1306 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1308 rb_enc_associate(newstr, to);
1327 const int eidx = rb_enc_to_index(eenc);
1330 return rb_enc_str_new(
ptr,
len, eenc);
1334 if ((eidx == rb_ascii8bit_encindex()) ||
1335 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1339 ienc = rb_default_internal_encoding();
1340 if (!ienc || eenc == ienc) {
1341 return rb_enc_str_new(
ptr,
len, eenc);
1345 if ((eidx == rb_ascii8bit_encindex()) ||
1346 (eidx == rb_usascii_encindex()) ||
1347 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1348 return rb_enc_str_new(
ptr,
len, ienc);
1351 str = rb_enc_str_new(NULL, 0, ienc);
1354 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1355 rb_str_initialize(str,
ptr,
len, eenc);
1363 int eidx = rb_enc_to_index(eenc);
1364 if (eidx == rb_usascii_encindex() &&
1365 !is_ascii_string(str)) {
1366 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1369 rb_enc_associate_index(str, eidx);
1428str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1430 const int termlen = TERM_LEN(str);
1435 if (str_embed_capa(str2) >=
len + termlen) {
1436 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1437 STR_SET_EMBED(str2);
1438 memcpy(ptr2, RSTRING_PTR(str),
len);
1439 TERM_FILL(ptr2+
len, termlen);
1443 if (STR_SHARED_P(str)) {
1444 root =
RSTRING(str)->as.heap.aux.shared;
1453 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1455 rb_fatal(
"about to free a possible shared root");
1457 char *ptr2 = STR_HEAP_PTR(str2);
1459 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1462 FL_SET(str2, STR_NOEMBED);
1464 STR_SET_SHARED(str2, root);
1467 STR_SET_LEN(str2,
len);
1475 str_replace_shared_without_enc(str2, str);
1476 rb_enc_cr_str_exact_copy(str2, str);
1483 return str_replace_shared(str_alloc_heap(klass), str);
1500rb_str_new_frozen_String(
VALUE orig)
1508rb_str_frozen_bare_string(
VALUE orig)
1510 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1515rb_str_tmp_frozen_acquire(
VALUE orig)
1518 return str_new_frozen_buffer(0, orig, FALSE);
1522rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1524 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1525 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1527 VALUE str = str_alloc_heap(0);
1530 FL_SET(str, STR_SHARED_ROOT);
1532 size_t capa = str_capacity(orig, TERM_LEN(orig));
1538 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1539 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1546 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1547 RBASIC(orig)->flags &= ~STR_NOFREE;
1548 STR_SET_SHARED(orig, str);
1558rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1563 if (STR_EMBED_P(tmp)) {
1566 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1572 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1576 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1577 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1582 STR_SET_LEN(tmp, 0);
1590 return str_new_frozen_buffer(klass, orig, TRUE);
1599 VALUE str = str_alloc_heap(klass);
1600 STR_SET_LEN(str, RSTRING_LEN(orig));
1601 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1602 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1603 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1604 RBASIC(orig)->flags &= ~STR_NOFREE;
1605 STR_SET_SHARED(orig, str);
1612str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1616 long len = RSTRING_LEN(orig);
1617 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1618 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1620 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1621 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1627 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1628 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1634 if ((ofs > 0) || (rest > 0) ||
1637 str = str_new_shared(klass,
shared);
1639 RSTRING(str)->as.heap.ptr += ofs;
1640 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1648 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1649 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1651 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1652 STR_SET_LEN(str, RSTRING_LEN(orig));
1657 str = heap_str_make_shared(klass, orig);
1661 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1673str_new_empty_String(
VALUE str)
1676 rb_enc_copy(v, str);
/* Minimum buffer capacity: in the use visible in this file, a requested
 * capacity below this value is raised to it. */
1680#define STR_BUF_MIN_SIZE 63
1685 if (STR_EMBEDDABLE_P(
capa, 1)) {
1693 RSTRING(str)->as.heap.ptr[0] =
'\0';
1713 return str_new(0, 0,
len);
1719 if (STR_EMBED_P(str)) {
1720 RB_DEBUG_COUNTER_INC(obj_str_embed);
1722 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1724 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1727 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1728 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1733rb_str_memsize(
VALUE str)
1735 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1736 return STR_HEAP_SIZE(str);
1746 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
/* Forward declaration. The definition later in this file checks that `str` is
 * modifiable and, when it is neither embedded nor STR_SHARED/STR_NOFREE,
 * frees the heap buffer, nulls the pointer and sets the length to 0. */
1749static inline void str_discard(
VALUE str);
/* Forward declaration; invoked just below only when `str` and `str2` are
 * different objects. */
1750static void str_shared_replace(
VALUE str,
VALUE str2);
1755 if (str != str2) str_shared_replace(str, str2);
1766 enc = STR_ENC_GET(str2);
1769 termlen = rb_enc_mbminlen(enc);
1771 STR_SET_LEN(str, RSTRING_LEN(str2));
1773 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1775 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1776 rb_enc_associate(str, enc);
1780 if (STR_EMBED_P(str2)) {
1782 long len = RSTRING_LEN(str2);
1785 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1786 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1787 RSTRING(str2)->as.heap.ptr = new_ptr;
1788 STR_SET_LEN(str2,
len);
1790 STR_SET_NOEMBED(str2);
1793 STR_SET_NOEMBED(str);
1795 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1797 if (
FL_TEST(str2, STR_SHARED)) {
1799 STR_SET_SHARED(str,
shared);
1802 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1806 STR_SET_EMBED(str2);
1807 RSTRING_PTR(str2)[0] = 0;
1808 STR_SET_LEN(str2, 0);
1809 rb_enc_associate(str, enc);
1823 return rb_obj_as_string_result(str, obj);
1839 len = RSTRING_LEN(str2);
1840 if (STR_SHARED_P(str2)) {
1843 STR_SET_NOEMBED(str);
1844 STR_SET_LEN(str,
len);
1845 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1846 STR_SET_SHARED(str,
shared);
1847 rb_enc_cr_str_exact_copy(str, str2);
1850 str_replace_shared(str, str2);
1859 size_t size = rb_str_embed_size(
capa);
1863 NEWOBJ_OF(str,
struct RString, klass,
1872 NEWOBJ_OF(str,
struct RString, klass,
1883 encidx = rb_enc_get_index(str);
1884 flags &= ~ENCODING_MASK;
1887 if (encidx) rb_enc_associate_index(dup, encidx);
1897 long len = RSTRING_LEN(str);
1902 STR_SET_LEN(dup, RSTRING_LEN(str));
1903 return str_duplicate_setup_encoding(str, dup, flags);
1912 root =
RSTRING(str)->as.heap.aux.shared;
1915 root = str = str_new_frozen(klass, str);
1921 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1922 FL_SET(root, STR_SHARED_ROOT);
1924 flags |= RSTRING_NOEMBED | STR_SHARED;
1926 STR_SET_LEN(dup, RSTRING_LEN(str));
1927 return str_duplicate_setup_encoding(str, dup, flags);
1933 if (STR_EMBED_P(str)) {
1934 return str_duplicate_setup_embed(klass, str, dup);
1937 return str_duplicate_setup_heap(klass, str, dup);
1945 if (STR_EMBED_P(str)) {
1946 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1949 dup = str_alloc_heap(klass);
1952 return str_duplicate_setup(klass, str, dup);
1963rb_str_dup_m(
VALUE str)
1965 if (LIKELY(BARE_STRING_P(str))) {
1976 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1983 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1987 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 str_duplicate_setup_embed(klass, str, new_str);
1991 new_str = ec_str_alloc_heap(ec, klass);
1992 str_duplicate_setup_heap(klass, str, new_str);
2001rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2003 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2027 static ID keyword_ids[2];
2028 VALUE orig, opt, venc, vcapa;
2033 if (!keyword_ids[0]) {
2034 keyword_ids[0] = rb_id_encoding();
2035 CONST_ID(keyword_ids[1],
"capacity");
2043 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2044 enc = rb_to_encoding(venc);
2046 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2049 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2051 if (
capa < STR_BUF_MIN_SIZE) {
2052 capa = STR_BUF_MIN_SIZE;
2056 len = RSTRING_LEN(orig);
2060 if (orig == str) n = 0;
2062 str_modifiable(str);
2063 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2065 const size_t size = (size_t)
capa + termlen;
2066 const char *
const old_ptr = RSTRING_PTR(str);
2067 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2068 char *new_ptr =
ALLOC_N(
char, size);
2069 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2070 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2072 RSTRING(str)->as.heap.ptr = new_ptr;
2074 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2075 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2076 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2078 STR_SET_LEN(str,
len);
2081 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2082 rb_enc_cr_str_exact_copy(str, orig);
2084 FL_SET(str, STR_NOEMBED);
2091 rb_enc_associate(str, enc);
2103rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2109 static ID keyword_ids[2];
2119 keyword_ids[0] = rb_id_encoding();
2120 CONST_ID(keyword_ids[1],
"capacity");
2122 encoding = kwargs[0];
2123 capacity = kwargs[1];
2132 if (UNDEF_P(encoding)) {
2134 encoding = rb_obj_encoding(orig);
2138 if (!UNDEF_P(encoding)) {
2139 enc = rb_to_encoding(encoding);
2143 if (UNDEF_P(capacity)) {
2145 VALUE empty_str = str_new(klass,
"", 0);
2147 rb_enc_associate(empty_str, enc);
2151 VALUE copy = str_duplicate(klass, orig);
2152 rb_enc_associate(copy, enc);
2165 if (orig_capa >
capa) {
2170 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2171 STR_SET_LEN(str, 0);
/* Non-zero unless the top two bits of `c` are binary 10, i.e. unless `c` is a
 * UTF-8 continuation byte (0x80..0xBF). */
2182#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2197static inline uintptr_t
2198count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2203 d = (d>>6) | (~d>>7);
2204 d &= NONASCII_MASK >> 7;
2207#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2209 return rb_popcount_intptr(d);
2213# if SIZEOF_VOIDP == 8
2222enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2228 long diff = (long)(e - p);
2229 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2234 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2235 const uintptr_t *s, *t;
2236 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2237 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2238 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2239 while (p < (
const char *)s) {
2240 if (is_utf8_lead_byte(*p))
len++;
2244 len += count_utf8_lead_bytes_with_word(s);
2247 p = (
const char *)s;
2250 if (is_utf8_lead_byte(*p))
len++;
2256 else if (rb_enc_asciicompat(enc)) {
2261 q = search_nonascii(p, e);
2267 p += rb_enc_fast_mbclen(p, e, enc);
2274 q = search_nonascii(p, e);
2280 p += rb_enc_mbclen(p, e, enc);
2287 for (c=0; p<e; c++) {
2288 p += rb_enc_mbclen(p, e, enc);
2303rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2311 long diff = (long)(e - p);
2312 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2314 else if (rb_enc_asciicompat(enc)) {
2318 q = search_nonascii(p, e);
2326 ret = rb_enc_precise_mbclen(p, e, enc);
2341 for (c=0; p<e; c++) {
2342 ret = rb_enc_precise_mbclen(p, e, enc);
2349 if (p + rb_enc_mbminlen(enc) <= e)
2350 p += rb_enc_mbminlen(enc);
2366 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2367 if (!enc) enc = STR_ENC_GET(str);
2368 p = RSTRING_PTR(str);
2373 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2378 return enc_strlen(p, e, enc, cr);
2385 return str_strlen(str, NULL);
2399 return LONG2NUM(str_strlen(str, NULL));
2411rb_str_bytesize(
VALUE str)
2430rb_str_empty(
VALUE str)
2432 return RBOOL(RSTRING_LEN(str) == 0);
2451 char *ptr1, *ptr2, *ptr3;
2456 enc = rb_enc_check_str(str1, str2);
2459 termlen = rb_enc_mbminlen(enc);
2460 if (len1 > LONG_MAX - len2) {
2461 rb_raise(rb_eArgError,
"string size too big");
2463 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2464 ptr3 = RSTRING_PTR(str3);
2465 memcpy(ptr3, ptr1, len1);
2466 memcpy(ptr3+len1, ptr2, len2);
2467 TERM_FILL(&ptr3[len1+len2], termlen);
2483 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2486 int enc1 = rb_enc_get_index(str1);
2487 int enc2 = rb_enc_get_index(str2);
2492 else if (enc2 < 0) {
2495 else if (enc1 != enc2) {
2498 else if (len1 > LONG_MAX - len2) {
2532 rb_enc_copy(str2, str);
2537 rb_raise(rb_eArgError,
"negative argument");
2539 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2540 if (STR_EMBEDDABLE_P(
len, 1)) {
2542 memset(RSTRING_PTR(str2), 0,
len + 1);
2549 STR_SET_LEN(str2,
len);
2550 rb_enc_copy(str2, str);
2553 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2554 rb_raise(rb_eArgError,
"argument too big");
2557 len *= RSTRING_LEN(str);
2558 termlen = TERM_LEN(str);
2560 ptr2 = RSTRING_PTR(str2);
2562 n = RSTRING_LEN(str);
2563 memcpy(ptr2, RSTRING_PTR(str), n);
2564 while (n <=
len/2) {
2565 memcpy(ptr2 + n, ptr2, n);
2568 memcpy(ptr2 + n, ptr2,
len-n);
2570 STR_SET_LEN(str2,
len);
2571 TERM_FILL(&ptr2[
len], termlen);
2572 rb_enc_cr_str_copy_for_substr(str2, str);
2609rb_check_lockedtmp(
VALUE str)
2611 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags that make str_modifiable() leave its fast path: if any is set it runs
 * the chilled-string, tmplock and frozen checks. */
2618#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2620str_modifiable(
VALUE str)
2624 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2625 if (CHILLED_STRING_P(str)) {
2626 CHILLED_STRING_MUTATED(str);
2628 rb_check_lockedtmp(str);
2629 rb_check_frozen(str);
2634str_dependent_p(
VALUE str)
2636 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* Flags tested by str_independent(): the unmodifiable flags plus STR_SHARED
 * and STR_NOFREE. If any is set, str_independent() calls str_modifiable() and
 * then consults str_dependent_p(). */
2646#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2648str_independent(
VALUE str)
2652 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2653 str_modifiable(str);
2654 return !str_dependent_p(str);
2660str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2670 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2675 STR_SET_LEN(str,
len);
2680 oldptr = RSTRING_PTR(str);
2682 memcpy(
ptr, oldptr,
len);
2684 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2687 STR_SET_NOEMBED(str);
2688 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2689 TERM_FILL(
ptr +
len, termlen);
2691 STR_SET_LEN(str,
len);
2698 if (!str_independent(str))
2699 str_make_independent(str);
2708 int termlen = TERM_LEN(str);
2709 long len = RSTRING_LEN(str);
2712 rb_raise(rb_eArgError,
"negative expanding string size");
2714 if (expand >= LONG_MAX -
len) {
2715 rb_raise(rb_eArgError,
"string size too big");
2718 if (!str_independent(str)) {
2719 str_make_independent_expand(str,
len, expand, termlen);
2721 else if (expand > 0) {
2722 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2729str_modify_keep_cr(
VALUE str)
2731 if (!str_independent(str))
2732 str_make_independent(str);
2739str_discard(
VALUE str)
2741 str_modifiable(str);
2742 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2743 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2744 RSTRING(str)->as.heap.ptr = 0;
2745 STR_SET_LEN(str, 0);
2752 int encindex = rb_enc_get_index(str);
2754 if (RB_UNLIKELY(encindex == -1)) {
2758 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2763 if (!rb_enc_asciicompat(enc)) {
2785 return RSTRING_PTR(str);
2789zero_filled(
const char *s,
int n)
2791 for (; n > 0; --n) {
2798str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2800 const char *e = s +
len;
2802 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2803 if (zero_filled(s, minlen))
return s;
2809str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2814 if (str_dependent_p(str)) {
2815 if (!zero_filled(s +
len, termlen))
2816 str_make_independent_expand(str,
len, 0L, termlen);
2819 TERM_FILL(s +
len, termlen);
2822 return RSTRING_PTR(str);
2826rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2828 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2829 long len = RSTRING_LEN(str);
2833 rb_check_lockedtmp(str);
2834 str_make_independent_expand(str,
len, 0L, termlen);
2836 else if (str_dependent_p(str)) {
2837 if (termlen > oldtermlen)
2838 str_make_independent_expand(str,
len, 0L, termlen);
2841 if (!STR_EMBED_P(str)) {
2846 if (termlen > oldtermlen) {
2847 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2855str_null_check(
VALUE str,
int *w)
2857 char *s = RSTRING_PTR(str);
2858 long len = RSTRING_LEN(str);
2860 const int minlen = rb_enc_mbminlen(enc);
2864 if (str_null_char(s,
len, minlen, enc)) {
2867 return str_fill_term(str, s,
len, minlen);
2870 if (!s || memchr(s, 0,
len)) {
2874 s = str_fill_term(str, s,
len, minlen);
2880rb_str_to_cstr(
VALUE str)
2883 return str_null_check(str, &w);
2891 char *s = str_null_check(str, &w);
2894 rb_raise(rb_eArgError,
"string contains null char");
2896 rb_raise(rb_eArgError,
"string contains null byte");
2902rb_str_fill_terminator(
VALUE str,
const int newminlen)
2904 char *s = RSTRING_PTR(str);
2905 long len = RSTRING_LEN(str);
2906 return str_fill_term(str, s,
len, newminlen);
2912 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2938str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2947 else if (rb_enc_asciicompat(enc)) {
2948 const char *p2, *e2;
2951 while (p < e && 0 < nth) {
2958 p2 = search_nonascii(p, e2);
2967 n = rb_enc_mbclen(p, e, enc);
2978 while (p < e && nth--) {
2979 p += rb_enc_mbclen(p, e, enc);
2990 return str_nth_len(p, e, &nth, enc);
2994str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2999 p = str_nth_len(p, e, &nth, enc);
3008str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3010 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3011 if (!pp)
return e - p;
3018 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3019 STR_ENC_GET(str), single_byte_optimizable(str));
3024str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3027 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3028 const uintptr_t *s, *t;
3029 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3030 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3031 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3032 while (p < (
const char *)s) {
3033 if (is_utf8_lead_byte(*p)) nth--;
3037 nth -= count_utf8_lead_bytes_with_word(s);
3039 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3043 if (is_utf8_lead_byte(*p)) {
3044 if (nth == 0)
break;
3054str_utf8_offset(
const char *p,
const char *e,
long nth)
3056 const char *pp = str_utf8_nth(p, e, &nth);
3065 if (single_byte_optimizable(str) || pos < 0)
3068 char *p = RSTRING_PTR(str);
3069 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3074str_subseq(
VALUE str,
long beg,
long len)
3082 const int termlen = TERM_LEN(str);
3083 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3090 if (str_embed_capa(str2) >=
len + termlen) {
3091 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3092 STR_SET_EMBED(str2);
3093 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3094 TERM_FILL(ptr2+
len, termlen);
3096 STR_SET_LEN(str2,
len);
3100 str_replace_shared(str2, str);
3103 RSTRING(str2)->as.heap.ptr += beg;
3104 if (RSTRING_LEN(str2) >
len) {
3105 STR_SET_LEN(str2,
len);
3115 VALUE str2 = str_subseq(str, beg,
len);
3116 rb_enc_cr_str_copy_for_substr(str2, str);
3125 const long blen = RSTRING_LEN(str);
3127 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3129 if (
len < 0)
return 0;
3130 if (beg < 0 && -beg < 0)
return 0;
3134 if (single_byte_optimizable(str)) {
3135 if (beg > blen)
return 0;
3138 if (beg < 0)
return 0;
3140 if (
len > blen - beg)
3142 if (
len < 0)
return 0;
3147 if (
len > -beg)
len = -beg;
3151 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3154 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3160 slen = str_strlen(str, enc);
3162 if (beg < 0)
return 0;
3164 if (
len == 0)
goto end;
3167 else if (beg > 0 && beg > blen) {
3171 if (beg > str_strlen(str, enc))
return 0;
3176 enc == rb_utf8_encoding()) {
3177 p = str_utf8_nth(s, e, &beg);
3178 if (beg > 0)
return 0;
3179 len = str_utf8_offset(p, e,
len);
3185 p = s + beg * char_sz;
3189 else if (
len * char_sz > e - p)
3194 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3195 if (beg > 0)
return 0;
3199 len = str_offset(p, e,
len, enc, 0);
3207static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3212 return str_substr(str, beg,
len, TRUE);
3222str_substr(
VALUE str,
long beg,
long len,
int empty)
3226 if (!p)
return Qnil;
3227 if (!
len && !empty)
return Qnil;
3229 beg = p - RSTRING_PTR(str);
3231 VALUE str2 = str_subseq(str, beg,
len);
3232 rb_enc_cr_str_copy_for_substr(str2, str);
3240 if (CHILLED_STRING_P(str)) {
3245 rb_str_resize(str, RSTRING_LEN(str));
3263 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3306str_uminus(
VALUE str)
3311 return rb_fstring(str);
3315#define rb_str_dup_frozen rb_str_new_frozen
3320 rb_check_frozen(str);
3321 if (
FL_TEST(str, STR_TMPLOCK)) {
3324 FL_SET(str, STR_TMPLOCK);
3331 rb_check_frozen(str);
3332 if (!
FL_TEST(str, STR_TMPLOCK)) {
3352 const int termlen = TERM_LEN(str);
3354 str_modifiable(str);
3355 if (STR_SHARED_P(str)) {
3358 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3359 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3370 else if (
len > RSTRING_LEN(str)) {
3374 const char *
const new_end = RSTRING_PTR(str) +
len;
3384 else if (
len < RSTRING_LEN(str)) {
3392 STR_SET_LEN(str,
len);
3393 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3400 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3403 int independent = str_independent(str);
3404 long slen = RSTRING_LEN(str);
3405 const int termlen = TERM_LEN(str);
3407 if (slen >
len || (termlen != 1 && slen <
len)) {
3413 if (STR_EMBED_P(str)) {
3414 if (
len == slen)
return str;
3415 if (str_embed_capa(str) >=
len + termlen) {
3416 STR_SET_LEN(str,
len);
3420 str_make_independent_expand(str, slen,
len - slen, termlen);
3422 else if (str_embed_capa(str) >=
len + termlen) {
3423 char *
ptr = STR_HEAP_PTR(str);
3425 if (slen >
len) slen =
len;
3428 STR_SET_LEN(str,
len);
3429 if (independent) ruby_xfree(
ptr);
3432 else if (!independent) {
3433 if (
len == slen)
return str;
3434 str_make_independent_expand(str, slen,
len - slen, termlen);
3438 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3439 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3442 else if (
len == slen)
return str;
3443 STR_SET_LEN(str,
len);
3450str_ensure_available_capa(
VALUE str,
long len)
3452 str_modify_keep_cr(str);
3454 const int termlen = TERM_LEN(str);
3455 long olen = RSTRING_LEN(str);
3457 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3458 rb_raise(rb_eArgError,
"string sizes too big");
3461 long total = olen +
len;
3462 long capa = str_capacity(str, termlen);
3465 if (total >= LONG_MAX / 2) {
3468 while (total >
capa) {
3471 RESIZE_CAPA_TERM(str,
capa, termlen);
3476str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3479 str_modify_keep_cr(str);
3484 if (
len == 0)
return 0;
3486 long total, olen,
off = -1;
3488 const int termlen = TERM_LEN(str);
3491 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3495 long capa = str_capacity(str, termlen);
3497 if (olen > LONG_MAX -
len) {
3498 rb_raise(rb_eArgError,
"string sizes too big");
3502 if (total >= LONG_MAX / 2) {
3505 while (total >
capa) {
3508 RESIZE_CAPA_TERM(str,
capa, termlen);
3509 sptr = RSTRING_PTR(str);
3514 memcpy(sptr + olen,
ptr,
len);
3515 STR_SET_LEN(str, total);
3516 TERM_FILL(sptr + total, termlen);
3521#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3522#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3527 if (
len == 0)
return str;
3529 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3531 return str_buf_cat(str,
ptr,
len);
3542rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3547 if (UNLIKELY(!str_independent(str))) {
3548 str_make_independent(str);
3551 long string_length = -1;
3552 const int null_terminator_length = 1;
3557 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3558 rb_raise(rb_eArgError,
"string sizes too big");
3561 long string_capacity = str_capacity(str, null_terminator_length);
3567 if (LIKELY(string_capacity >= string_length + 1)) {
3569 sptr[string_length] = byte;
3570 STR_SET_LEN(str, string_length + 1);
3571 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3575 str_buf_cat(str, (
char *)&
byte, 1);
3591 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3602rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3603 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3612 if (str_encindex == ptr_encindex) {
3614 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3618 str_enc = rb_enc_from_index(str_encindex);
3619 ptr_enc = rb_enc_from_index(ptr_encindex);
3620 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3623 if (RSTRING_LEN(str) == 0) {
3626 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3632 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3641 *ptr_cr_ret = ptr_cr;
3643 if (str_encindex != ptr_encindex &&
3646 str_enc = rb_enc_from_index(str_encindex);
3647 ptr_enc = rb_enc_from_index(ptr_encindex);
3652 res_encindex = str_encindex;
3657 res_encindex = str_encindex;
3661 res_encindex = ptr_encindex;
3666 res_encindex = str_encindex;
3673 res_encindex = str_encindex;
3679 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3681 str_buf_cat(str,
ptr,
len);
3687 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3694 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3704 if (rb_enc_asciicompat(enc)) {
3705 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3711 unsigned int c = (
unsigned char)*
ptr;
3712 int len = rb_enc_codelen(c, enc);
3713 rb_enc_mbcput(c, buf, enc);
3714 rb_enc_cr_str_buf_cat(str, buf,
len,
3727 if (str_enc_fastpath(str)) {
3731 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3737 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3748 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3764rb_str_concat_literals(
size_t num,
const VALUE *strary)
3768 unsigned long len = 1;
3773 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3775 str_enc_copy_direct(str, strary[0]);
3777 for (i = s; i < num; ++i) {
3778 const VALUE v = strary[i];
3782 if (encidx != ENCINDEX_US_ASCII) {
3784 rb_enc_set_index(str, encidx);
3797rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3799 str_modifiable(str);
3804 else if (argc > 1) {
3807 rb_enc_copy(arg_str, str);
3808 for (i = 0; i < argc; i++) {
3843rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3845 long needed_capacity = 0;
3849 for (
int index = 0; index < argc; index++) {
3850 VALUE obj = argv[index];
3858 needed_capacity += RSTRING_LEN(obj);
3863 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3870 str_ensure_available_capa(str, needed_capacity);
3873 for (
int index = 0; index < argc; index++) {
3874 VALUE obj = argv[index];
3879 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3880 char byte = (char)(
NUM2INT(obj) & 0xFF);
3894 rb_bug(
"append_as_bytes arguments should have been validated");
3898 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3899 TERM_FILL(sptr, TERM_LEN(str));
3904 for (
int index = 0; index < argc; index++) {
3905 VALUE obj = argv[index];
3922 rb_bug(
"append_as_bytes arguments should have been validated");
4001 if (rb_num_to_uint(str2, &code) == 0) {
4014 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4017 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4020 long pos = RSTRING_LEN(str1);
4025 switch (
len = rb_enc_codelen(code, enc)) {
4026 case ONIGERR_INVALID_CODE_POINT_VALUE:
4027 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4029 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4035 rb_enc_mbcput(code, buf, enc);
4036 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4037 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4039 rb_str_resize(str1, pos+
len);
4040 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4053rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4055 int encidx = rb_enc_to_index(enc);
4057 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4062 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4063 return ENCINDEX_ASCII_8BIT;
4086rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4088 str_modifiable(str);
4093 else if (argc > 1) {
4096 rb_enc_copy(arg_str, str);
4097 for (i = 0; i < argc; i++) {
4110 st_index_t precomputed_hash;
4111 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4113 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4114 return precomputed_hash;
4117 return str_do_hash(str);
4124 const char *ptr1, *ptr2;
4127 return (len1 != len2 ||
4129 memcmp(ptr1, ptr2, len1) != 0);
4143rb_str_hash_m(
VALUE str)
4149#define lesser(a,b) (((a)>(b))?(b):(a))
4157 if (RSTRING_LEN(str1) == 0)
return TRUE;
4158 if (RSTRING_LEN(str2) == 0)
return TRUE;
4161 if (idx1 == idx2)
return TRUE;
4166 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4170 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4180 const char *ptr1, *ptr2;
4183 if (str1 == str2)
return 0;
4186 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4195 if (len1 > len2)
return 1;
4198 if (retval > 0)
return 1;
4232 if (str1 == str2)
return Qtrue;
4239 return rb_str_eql_internal(str1, str2);
4253 if (str1 == str2)
return Qtrue;
4255 return rb_str_eql_internal(str1, str2);
4287 return rb_invcmp(str1, str2);
4329 return str_casecmp(str1, s);
4337 const char *p1, *p1end, *p2, *p2end;
4339 enc = rb_enc_compatible(str1, str2);
4344 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4345 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4346 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4347 while (p1 < p1end && p2 < p2end) {
4349 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4350 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4352 return INT2FIX(c1 < c2 ? -1 : 1);
4359 while (p1 < p1end && p2 < p2end) {
4360 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4361 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4363 if (0 <= c1 && 0 <= c2) {
4367 return INT2FIX(c1 < c2 ? -1 : 1);
4371 l1 = rb_enc_mbclen(p1, p1end, enc);
4372 l2 = rb_enc_mbclen(p2, p2end, enc);
4373 len = l1 < l2 ? l1 : l2;
4374 r = memcmp(p1, p2,
len);
4376 return INT2FIX(r < 0 ? -1 : 1);
4378 return INT2FIX(l1 < l2 ? -1 : 1);
4384 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4385 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4418 return str_casecmp_p(str1, s);
4425 VALUE folded_str1, folded_str2;
4426 VALUE fold_opt = sym_fold;
4428 enc = rb_enc_compatible(str1, str2);
4433 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4434 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4436 return rb_str_eql(folded_str1, folded_str2);
4440strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4441 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4443 const char *search_start = str_ptr;
4444 long pos, search_len = str_len - offset;
4448 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4449 if (pos < 0)
return pos;
4451 if (t == search_start + pos)
break;
4452 search_len -= t - search_start;
4453 if (search_len <= 0)
return -1;
4454 offset += t - search_start;
4457 return pos + offset;
4461#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4462#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4465rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4467 const char *str_ptr, *str_ptr_end, *sub_ptr;
4468 long str_len, sub_len;
4471 enc = rb_enc_check(str, sub);
4472 if (is_broken_string(sub))
return -1;
4474 str_ptr = RSTRING_PTR(str);
4476 str_len = RSTRING_LEN(str);
4477 sub_ptr = RSTRING_PTR(sub);
4478 sub_len = RSTRING_LEN(sub);
4480 if (str_len < sub_len)
return -1;
4483 long str_len_char, sub_len_char;
4484 int single_byte = single_byte_optimizable(str);
4485 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4486 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4488 offset += str_len_char;
4489 if (offset < 0)
return -1;
4491 if (str_len_char - offset < sub_len_char)
return -1;
4492 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4495 if (sub_len == 0)
return offset;
4498 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4512rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4519 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4520 long slen = str_strlen(str, enc);
4522 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4534 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4535 enc, single_byte_optimizable(str));
4546 pos = rb_str_index(str, sub, pos);
4560str_ensure_byte_pos(
VALUE str,
long pos)
4562 if (!single_byte_optimizable(str)) {
4563 const char *s = RSTRING_PTR(str);
4565 const char *p = s + pos;
4566 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4568 "offset %ld does not land on character boundary", pos);
4641rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4647 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4648 long slen = RSTRING_LEN(str);
4650 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4661 str_ensure_byte_pos(str, pos);
4673 pos = rb_str_byteindex(str, sub, pos);
4674 if (pos >= 0)
return LONG2NUM(pos);
4681memrchr(
const char *search_str,
int chr,
long search_len)
4683 const char *ptr = search_str + search_len;
4684 while (ptr > search_str) {
4685 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4695 char *hit, *adjusted;
4697 long slen, searchlen;
4700 sbeg = RSTRING_PTR(str);
4701 slen = RSTRING_LEN(sub);
4702 if (slen == 0)
return s - sbeg;
4704 t = RSTRING_PTR(sub);
4706 searchlen = s - sbeg + 1;
4708 if (memcmp(s, t, slen) == 0) {
4713 hit = memrchr(sbeg, c, searchlen);
4716 if (hit != adjusted) {
4717 searchlen = adjusted - sbeg;
4720 if (memcmp(hit, t, slen) == 0)
4722 searchlen = adjusted - sbeg;
4723 }
while (searchlen > 0);
4737 enc = rb_enc_check(str, sub);
4738 if (is_broken_string(sub))
return -1;
4739 singlebyte = single_byte_optimizable(str);
4740 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4741 slen = str_strlen(sub, enc);
4744 if (
len < slen)
return -1;
4745 if (
len - pos < slen) pos =
len - slen;
4746 if (
len == 0)
return pos;
4748 sbeg = RSTRING_PTR(str);
4751 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4757 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4758 return str_rindex(str, sub, s, enc);
4819rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4824 long pos,
len = str_strlen(str, enc);
4826 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4828 if (pos < 0 && (pos +=
len) < 0) {
4834 if (pos >
len) pos =
len;
4842 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4843 enc, single_byte_optimizable(str));
4854 pos = rb_str_rindex(str, sub, pos);
4864rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4870 enc = rb_enc_check(str, sub);
4871 if (is_broken_string(sub))
return -1;
4872 len = RSTRING_LEN(str);
4873 slen = RSTRING_LEN(sub);
4876 if (
len < slen)
return -1;
4877 if (
len - pos < slen) pos =
len - slen;
4878 if (
len == 0)
return pos;
4880 sbeg = RSTRING_PTR(str);
4883 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4890 return str_rindex(str, sub, s, enc);
4980rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4984 long pos,
len = RSTRING_LEN(str);
4986 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4988 if (pos < 0 && (pos +=
len) < 0) {
4994 if (pos >
len) pos =
len;
5000 str_ensure_byte_pos(str, pos);
5012 pos = rb_str_byterindex(str, sub, pos);
5013 if (pos >= 0)
return LONG2NUM(pos);
5052 switch (OBJ_BUILTIN_TYPE(y)) {
5104rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5111 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5143rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5147 re = get_pat(argv[0]);
5148 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5157static enum neighbor_char
5163 if (rb_enc_mbminlen(enc) > 1) {
5165 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5167 return NEIGHBOR_NOT_CHAR;
5169 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5171 if (!l)
return NEIGHBOR_NOT_CHAR;
5172 if (l !=
len)
return NEIGHBOR_WRAPPED;
5173 rb_enc_mbcput(c, p, enc);
5174 r = rb_enc_precise_mbclen(p, p +
len, enc);
5176 return NEIGHBOR_NOT_CHAR;
5178 return NEIGHBOR_FOUND;
5181 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5184 return NEIGHBOR_WRAPPED;
5185 ++((
unsigned char*)p)[i];
5186 l = rb_enc_precise_mbclen(p, p+
len, enc);
5190 return NEIGHBOR_FOUND;
5193 memset(p+l, 0xff,
len-l);
5199 for (len2 =
len-1; 0 < len2; len2--) {
5200 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5204 memset(p+len2+1, 0xff,
len-(len2+1));
5209static enum neighbor_char
5214 if (rb_enc_mbminlen(enc) > 1) {
5216 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5218 return NEIGHBOR_NOT_CHAR;
5220 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5221 if (!c)
return NEIGHBOR_NOT_CHAR;
5224 if (!l)
return NEIGHBOR_NOT_CHAR;
5225 if (l !=
len)
return NEIGHBOR_WRAPPED;
5226 rb_enc_mbcput(c, p, enc);
5227 r = rb_enc_precise_mbclen(p, p +
len, enc);
5229 return NEIGHBOR_NOT_CHAR;
5231 return NEIGHBOR_FOUND;
5234 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5237 return NEIGHBOR_WRAPPED;
5238 --((
unsigned char*)p)[i];
5239 l = rb_enc_precise_mbclen(p, p+
len, enc);
5243 return NEIGHBOR_FOUND;
5246 memset(p+l, 0,
len-l);
5252 for (len2 =
len-1; 0 < len2; len2--) {
5253 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5257 memset(p+len2+1, 0,
len-(len2+1));
5271static enum neighbor_char
5272enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5274 enum neighbor_char ret;
5278 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5282 const int max_gaps = 1;
5284 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5286 ctype = ONIGENC_CTYPE_DIGIT;
5288 ctype = ONIGENC_CTYPE_ALPHA;
5290 return NEIGHBOR_NOT_CHAR;
5293 for (
try = 0;
try <= max_gaps; ++
try) {
5294 ret = enc_succ_char(p,
len, enc);
5295 if (ret == NEIGHBOR_FOUND) {
5296 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5298 return NEIGHBOR_FOUND;
5305 ret = enc_pred_char(p,
len, enc);
5306 if (ret == NEIGHBOR_FOUND) {
5307 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5320 return NEIGHBOR_NOT_CHAR;
5323 if (ctype != ONIGENC_CTYPE_DIGIT) {
5325 return NEIGHBOR_WRAPPED;
5329 enc_succ_char(carry,
len, enc);
5330 return NEIGHBOR_WRAPPED;
5398 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5399 rb_enc_cr_str_copy_for_substr(str, orig);
5400 return str_succ(str);
5407 char *sbeg, *s, *e, *last_alnum = 0;
5408 int found_alnum = 0;
5410 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5411 long carry_pos = 0, carry_len = 1;
5412 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5414 slen = RSTRING_LEN(str);
5415 if (slen == 0)
return str;
5417 enc = STR_ENC_GET(str);
5418 sbeg = RSTRING_PTR(str);
5419 s = e = sbeg + slen;
5421 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5422 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5428 l = rb_enc_precise_mbclen(s, e, enc);
5429 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5430 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5431 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5433 case NEIGHBOR_NOT_CHAR:
5435 case NEIGHBOR_FOUND:
5437 case NEIGHBOR_WRAPPED:
5442 carry_pos = s - sbeg;
5447 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5448 enum neighbor_char neighbor;
5449 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5450 l = rb_enc_precise_mbclen(s, e, enc);
5451 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5452 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5454 neighbor = enc_succ_char(tmp, l, enc);
5456 case NEIGHBOR_FOUND:
5460 case NEIGHBOR_WRAPPED:
5463 case NEIGHBOR_NOT_CHAR:
5466 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5468 enc_succ_char(s, l, enc);
5470 if (!rb_enc_asciicompat(enc)) {
5471 MEMCPY(carry, s,
char, l);
5474 carry_pos = s - sbeg;
5478 RESIZE_CAPA(str, slen + carry_len);
5479 sbeg = RSTRING_PTR(str);
5480 s = sbeg + carry_pos;
5481 memmove(s + carry_len, s, slen - carry_pos);
5482 memmove(s, carry, carry_len);
5484 STR_SET_LEN(str, slen);
5485 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5499rb_str_succ_bang(
VALUE str)
5507all_digits_p(
const char *s,
long len)
5561 VALUE end, exclusive;
5565 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5571 VALUE current, after_end;
5578 enc = rb_enc_check(beg, end);
5579 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5581 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5582 char c = RSTRING_PTR(beg)[0];
5583 char e = RSTRING_PTR(end)[0];
5585 if (c > e || (excl && c == e))
return beg;
5587 VALUE str = rb_enc_str_new(&c, 1, enc);
5589 if ((*each)(str, arg))
break;
5590 if (!excl && c == e)
break;
5592 if (excl && c == e)
break;
5597 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5598 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5599 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5604 b = rb_str_to_inum(beg, 10, FALSE);
5605 e = rb_str_to_inum(end, 10, FALSE);
5612 if (excl && bi == ei)
break;
5613 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5618 ID op = excl ?
'<' : idLE;
5619 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5624 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5625 b = rb_funcallv(b, succ, 0, 0);
5632 if (n > 0 || (excl && n == 0))
return beg;
5634 after_end = rb_funcallv(end, succ, 0, 0);
5639 next = rb_funcallv(current, succ, 0, 0);
5640 if ((*each)(current, arg))
break;
5641 if (
NIL_P(next))
break;
5645 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5660 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5661 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5662 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5664 b = rb_str_to_inum(beg, 10, FALSE);
5670 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5678 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5679 b = rb_funcallv(b, succ, 0, 0);
5685 VALUE next = rb_funcallv(current, succ, 0, 0);
5686 if ((*each)(current, arg))
break;
5689 if (RSTRING_LEN(current) == 0)
5700 if (!
rb_equal(str, *argp))
return 0;
5714 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5715 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5716 rb_enc_asciicompat(STR_ENC_GET(val))) {
5717 const char *bp = RSTRING_PTR(beg);
5718 const char *ep = RSTRING_PTR(end);
5719 const char *vp = RSTRING_PTR(val);
5720 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5721 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5729 if (b <= v && v < e)
return Qtrue;
5730 return RBOOL(!
RTEST(exclusive) && v == e);
5737 all_digits_p(bp, RSTRING_LEN(beg)) &&
5738 all_digits_p(ep, RSTRING_LEN(end))) {
5743 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5745 return RBOOL(
NIL_P(val));
5768 return rb_str_subpat(str, indx,
INT2FIX(0));
5771 if (rb_str_index(str, indx, 0) != -1)
5777 long beg,
len = str_strlen(str, NULL);
5789 return str_substr(str, idx, 1, FALSE);
5808rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5812 return rb_str_subpat(str, argv[0], argv[1]);
5815 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5819 return rb_str_aref(str, argv[0]);
5825 char *ptr = RSTRING_PTR(str);
5826 long olen = RSTRING_LEN(str), nlen;
5828 str_modifiable(str);
5829 if (
len > olen)
len = olen;
5831 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5833 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5835 ptr =
RSTRING(str)->as.embed.ary;
5836 memmove(ptr, oldptr +
len, nlen);
5837 if (fl == STR_NOEMBED)
xfree(oldptr);
5840 if (!STR_SHARED_P(str)) {
5842 rb_enc_cr_str_exact_copy(shared, str);
5847 STR_SET_LEN(str, nlen);
5849 if (!SHARABLE_MIDDLE_SUBSTRING) {
5850 TERM_FILL(ptr + nlen, TERM_LEN(str));
5857rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5863 if (beg == 0 && vlen == 0) {
5868 str_modify_keep_cr(str);
5872 RESIZE_CAPA(str, slen + vlen -
len);
5873 sptr = RSTRING_PTR(str);
5882 memmove(sptr + beg + vlen,
5884 slen - (beg +
len));
5886 if (vlen < beg &&
len < 0) {
5890 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5893 STR_SET_LEN(str, slen);
5894 TERM_FILL(&sptr[slen], TERM_LEN(str));
5901 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5910 int singlebyte = single_byte_optimizable(str);
5916 enc = rb_enc_check(str, val);
5917 slen = str_strlen(str, enc);
5919 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5928 if (
len > slen - beg) {
5931 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5936 beg = p - RSTRING_PTR(str);
5938 rb_str_update_0(str, beg,
len, val);
5939 rb_enc_associate(str, enc);
5950 long start, end,
len;
5960 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5964 nth += regs->num_regs;
5974 enc = rb_enc_check_str(str, val);
5975 rb_str_update_0(str, start,
len, val);
5976 rb_enc_associate(str, enc);
5984 switch (
TYPE(indx)) {
5986 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5990 beg = rb_str_index(str, indx, 0);
6045rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
6049 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6057 return rb_str_aset(str, argv[0], argv[1]);
6117rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6125 str_modify_keep_cr(str);
6133 if ((nth += regs->num_regs) <= 0)
return Qnil;
6135 else if (nth >= regs->num_regs)
return Qnil;
6137 len = END(nth) - beg;
6140 else if (argc == 2) {
6149 beg = p - RSTRING_PTR(str);
6153 beg = rb_str_index(str, indx, 0);
6154 if (beg == -1)
return Qnil;
6155 len = RSTRING_LEN(indx);
6167 beg = p - RSTRING_PTR(str);
6176 beg = p - RSTRING_PTR(str);
6180 rb_enc_cr_str_copy_for_substr(result, str);
6188 char *sptr = RSTRING_PTR(str);
6189 long slen = RSTRING_LEN(str);
6190 if (beg +
len > slen)
6194 slen - (beg +
len));
6196 STR_SET_LEN(str, slen);
6197 TERM_FILL(&sptr[slen], TERM_LEN(str));
6208 switch (OBJ_BUILTIN_TYPE(pat)) {
6227get_pat_quoted(
VALUE pat,
int check)
6231 switch (OBJ_BUILTIN_TYPE(pat)) {
6245 if (check && is_broken_string(pat)) {
6252rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6255 pos = rb_str_byteindex(str, pat, pos);
6256 if (set_backref_str) {
6258 str = rb_str_new_frozen_String(str);
6259 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6261 *match = match_data;
6271 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6276rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6278 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6297rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6311 hash = rb_check_hash_type(argv[1]);
6317 pat = get_pat_quoted(argv[0], 1);
6319 str_modifiable(str);
6320 beg = rb_pat_search(pat, str, 0, 1);
6334 end0 = beg0 + RSTRING_LEN(pat);
6343 if (iter || !
NIL_P(hash)) {
6344 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6350 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6353 str_mod_check(str, p,
len);
6354 rb_check_frozen(str);
6360 enc = rb_enc_compatible(str, repl);
6363 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6367 rb_enc_inspect_name(str_enc),
6368 rb_enc_inspect_name(STR_ENC_GET(repl)));
6370 enc = STR_ENC_GET(repl);
6373 rb_enc_associate(str, enc);
6383 rlen = RSTRING_LEN(repl);
6384 len = RSTRING_LEN(str);
6386 RESIZE_CAPA(str,
len + rlen - plen);
6388 p = RSTRING_PTR(str);
6390 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6392 rp = RSTRING_PTR(repl);
6393 memmove(p + beg0, rp, rlen);
6395 STR_SET_LEN(str,
len);
6396 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6425 rb_str_sub_bang(argc, argv, str);
6430str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6433 long beg, beg0, end0;
6434 long offset, blen, slen,
len, last;
6435 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6437 int need_backref_str = -1;
6447 hash = rb_check_hash_type(argv[1]);
6451 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6459 rb_error_arity(argc, 1, 2);
6462 pat = get_pat_quoted(argv[0], 1);
6463 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6466 if (bang)
return Qnil;
6471 blen = RSTRING_LEN(str) + 30;
6473 sp = RSTRING_PTR(str);
6474 slen = RSTRING_LEN(str);
6476 str_enc = STR_ENC_GET(str);
6477 rb_enc_associate(dest, str_enc);
6484 end0 = beg0 + RSTRING_LEN(pat);
6500 if (mode == FAST_MAP) {
6509 val = rb_hash_aref(hash, key);
6512 str_mod_check(str, sp, slen);
6517 else if (need_backref_str) {
6519 if (need_backref_str < 0) {
6520 need_backref_str = val != repl;
6527 len = beg0 - offset;
6541 if (RSTRING_LEN(str) <= end0)
break;
6542 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6544 offset = end0 +
len;
6546 cp = RSTRING_PTR(str) + offset;
6547 if (offset > RSTRING_LEN(str))
break;
6550 if (mode != FAST_MAP && mode != STR) {
6553 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6558 if (RSTRING_LEN(str) > offset) {
6561 rb_pat_search0(pat, str, last, 1, &match);
6563 str_shared_replace(str, dest);
6588rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6590 str_modify_keep_cr(str);
6591 return str_gsub(argc, argv, str, 1);
6641 return str_gsub(argc, argv, str, 0);
6659 str_modifiable(str);
6660 if (str == str2)
return str;
6664 return str_replace(str, str2);
6681rb_str_clear(
VALUE str)
6685 STR_SET_LEN(str, 0);
6686 RSTRING_PTR(str)[0] = 0;
6687 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6703rb_str_chr(
VALUE str)
6721 pos += RSTRING_LEN(str);
6722 if (pos < 0 || RSTRING_LEN(str) <= pos)
6725 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6744 long len = RSTRING_LEN(str);
6745 char *
ptr, *head, *left = 0;
6749 if (pos < -
len ||
len <= pos)
6756 char byte = (char)(
NUM2INT(w) & 0xFF);
6758 if (!str_independent(str))
6759 str_make_independent(str);
6760 enc = STR_ENC_GET(str);
6761 head = RSTRING_PTR(str);
6763 if (!STR_EMBED_P(str)) {
6770 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6778 width = rb_enc_precise_mbclen(left, head+
len, enc);
6780 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6796str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6798 long n = RSTRING_LEN(str);
6800 if (beg > n ||
len < 0)
return Qnil;
6803 if (beg < 0)
return Qnil;
6808 if (!empty)
return Qnil;
6812 VALUE str2 = str_subseq(str, beg,
len);
6814 str_enc_copy_direct(str2, str);
6816 if (RSTRING_LEN(str2) == 0) {
6817 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6851 long beg,
len = RSTRING_LEN(str);
6859 return str_byte_substr(str, beg,
len, TRUE);
6864 return str_byte_substr(str, idx, 1, FALSE);
6876rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6881 return str_byte_substr(str, beg,
len, TRUE);
6884 return str_byte_aref(str, argv[0]);
6888str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6890 long end, slen = RSTRING_LEN(str);
6893 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6902 if (*
len > slen - *beg) {
6906 str_ensure_byte_pos(str, *beg);
6907 str_ensure_byte_pos(str, end);
6921rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6923 long beg,
len, vbeg, vlen;
6928 if (!(argc == 2 || argc == 3 || argc == 5)) {
6929 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6933 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6934 rb_builtin_class_name(argv[0]));
6941 vlen = RSTRING_LEN(val);
6946 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6947 rb_builtin_class_name(argv[2]));
6959 vlen = RSTRING_LEN(val);
6967 str_check_beg_len(str, &beg, &
len);
6968 str_check_beg_len(val, &vbeg, &vlen);
6969 str_modify_keep_cr(str);
6972 rb_enc_associate(str, rb_enc_check(str, val));
6975 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6993rb_str_reverse(
VALUE str)
7000 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
7001 enc = STR_ENC_GET(str);
7007 if (RSTRING_LEN(str) > 1) {
7008 if (single_byte_optimizable(str)) {
7015 int clen = rb_enc_fast_mbclen(s, e, enc);
7023 cr = rb_enc_asciicompat(enc) ?
7026 int clen = rb_enc_mbclen(s, e, enc);
7035 STR_SET_LEN(rev, RSTRING_LEN(str));
7036 str_enc_copy_direct(rev, str);
7056rb_str_reverse_bang(
VALUE str)
7058 if (RSTRING_LEN(str) > 1) {
7059 if (single_byte_optimizable(str)) {
7062 str_modify_keep_cr(str);
7063 s = RSTRING_PTR(str);
7072 str_shared_replace(str, rb_str_reverse(str));
7076 str_modify_keep_cr(str);
7101 i = rb_str_index(str, arg, 0);
7103 return RBOOL(i != -1);
7145 rb_raise(rb_eArgError,
"invalid radix %d", base);
7147 return rb_str_to_inum(str, base, FALSE);
7171rb_str_to_f(
VALUE str)
7186rb_str_to_s(
VALUE str)
7198 char s[RUBY_MAX_CHAR_LEN];
7199 int n = rb_enc_codelen(c, enc);
7201 rb_enc_mbcput(c, s, enc);
7206#define CHAR_ESC_LEN 13
7209rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7211 char buf[CHAR_ESC_LEN + 1];
7219 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7221 else if (c < 0x10000) {
7222 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7225 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7230 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7233 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7236 l = (int)strlen(buf);
7242ruby_escaped_char(
int c)
7245 case '\0':
return "\\0";
7246 case '\n':
return "\\n";
7247 case '\r':
return "\\r";
7248 case '\t':
return "\\t";
7249 case '\f':
return "\\f";
7250 case '\013':
return "\\v";
7251 case '\010':
return "\\b";
7252 case '\007':
return "\\a";
7253 case '\033':
return "\\e";
7254 case '\x7f':
return "\\c?";
7260rb_str_escape(
VALUE str)
7264 const char *p = RSTRING_PTR(str);
7266 const char *prev = p;
7267 char buf[CHAR_ESC_LEN + 1];
7269 int unicode_p = rb_enc_unicode_p(enc);
7270 int asciicompat = rb_enc_asciicompat(enc);
7275 int n = rb_enc_precise_mbclen(p, pend, enc);
7277 if (p > prev) str_buf_cat(result, prev, p - prev);
7278 n = rb_enc_mbminlen(enc);
7280 n = (int)(pend - p);
7282 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7283 str_buf_cat(result, buf, strlen(buf));
7289 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7291 cc = ruby_escaped_char(c);
7293 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7294 str_buf_cat(result, cc, strlen(cc));
7297 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7300 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7301 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7305 if (p > prev) str_buf_cat(result, prev, p - prev);
7329 const char *p, *pend, *prev;
7330 char buf[CHAR_ESC_LEN + 1];
7332 rb_encoding *resenc = rb_default_internal_encoding();
7333 int unicode_p = rb_enc_unicode_p(enc);
7334 int asciicompat = rb_enc_asciicompat(enc);
7336 if (resenc == NULL) resenc = rb_default_external_encoding();
7337 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7338 rb_enc_associate(result, resenc);
7339 str_buf_cat2(result,
"\"");
7347 n = rb_enc_precise_mbclen(p, pend, enc);
7349 if (p > prev) str_buf_cat(result, prev, p - prev);
7350 n = rb_enc_mbminlen(enc);
7352 n = (int)(pend - p);
7354 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7355 str_buf_cat(result, buf, strlen(buf));
7361 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7363 if ((asciicompat || unicode_p) &&
7364 (c ==
'"'|| c ==
'\\' ||
7369 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7370 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7371 str_buf_cat2(result,
"\\");
7372 if (asciicompat || enc == resenc) {
7378 case '\n': cc =
'n';
break;
7379 case '\r': cc =
'r';
break;
7380 case '\t': cc =
't';
break;
7381 case '\f': cc =
'f';
break;
7382 case '\013': cc =
'v';
break;
7383 case '\010': cc =
'b';
break;
7384 case '\007': cc =
'a';
break;
7385 case 033: cc =
'e';
break;
7386 default: cc = 0;
break;
7389 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7392 str_buf_cat(result, buf, 2);
7405 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7409 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7410 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7415 if (p > prev) str_buf_cat(result, prev, p - prev);
7416 str_buf_cat2(result,
"\"");
7421#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7434 int encidx = rb_enc_get_index(str);
7437 const char *p, *pend;
7440 int u8 = (encidx == rb_utf8_encindex());
7441 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7444 if (!rb_enc_asciicompat(enc)) {
7446 len += strlen(enc->name);
7449 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7452 unsigned char c = *p++;
7455 case '"':
case '\\':
7456 case '\n':
case '\r':
7457 case '\t':
case '\f':
7458 case '\013':
case '\010':
case '\007':
case '\033':
7463 clen = IS_EVSTR(p, pend) ? 2 : 1;
7471 if (u8 && c > 0x7F) {
7472 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7474 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7477 else if (cc <= 0xFFFFF)
7490 if (clen > LONG_MAX -
len) {
7497 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7498 q = RSTRING_PTR(result); qend = q +
len + 1;
7502 unsigned char c = *p++;
7504 if (c ==
'"' || c ==
'\\') {
7508 else if (c ==
'#') {
7509 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7512 else if (c ==
'\n') {
7516 else if (c ==
'\r') {
7520 else if (c ==
'\t') {
7524 else if (c ==
'\f') {
7528 else if (c ==
'\013') {
7532 else if (c ==
'\010') {
7536 else if (c ==
'\007') {
7540 else if (c ==
'\033') {
7550 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7552 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7555 snprintf(q, qend-q,
"u%04X", cc);
7557 snprintf(q, qend-q,
"u{%X}", cc);
7562 snprintf(q, qend-q,
"x%02X", c);
7568 if (!rb_enc_asciicompat(enc)) {
7569 snprintf(q, qend-q, nonascii_suffix, enc->name);
7570 encidx = rb_ascii8bit_encindex();
7573 rb_enc_associate_index(result, encidx);
7579unescape_ascii(
unsigned int c)
7603undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7605 const char *s = *ss;
7609 unsigned char buf[6];
7627 *buf = unescape_ascii(*s);
7639 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7640 if (*penc != enc_utf8) {
7642 rb_enc_associate(undumped, enc_utf8);
7659 if (hexlen == 0 || hexlen > 6) {
7665 if (0xd800 <= c && c <= 0xdfff) {
7668 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7678 if (0xd800 <= c && c <= 0xdfff) {
7681 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7709static VALUE rb_str_is_ascii_only_p(
VALUE str);
7727str_undump(
VALUE str)
7729 const char *s = RSTRING_PTR(str);
7732 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7734 bool binary =
false;
7738 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7741 if (!str_null_check(str, &w)) {
7744 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7745 if (*s !=
'"')
goto invalid_format;
7763 static const char force_encoding_suffix[] =
".force_encoding(\"";
7764 static const char dup_suffix[] =
".dup";
7765 const char *encname;
7770 size =
sizeof(dup_suffix) - 1;
7771 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7773 size =
sizeof(force_encoding_suffix) - 1;
7774 if (s_end - s <= size)
goto invalid_format;
7775 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7779 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7783 s = memchr(s,
'"', s_end-s);
7785 if (!s)
goto invalid_format;
7786 if (s_end - s != 2)
goto invalid_format;
7787 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7789 encidx = rb_enc_find_index2(encname, (
long)size);
7793 rb_enc_associate_index(undumped, encidx);
7803 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7814 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7820 if (rb_enc_dummy_p(enc)) {
7827str_true_enc(
VALUE str)
7830 rb_str_check_dummy_enc(enc);
7834static OnigCaseFoldType
7835check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7840 rb_raise(rb_eArgError,
"too many options");
7841 if (argv[0]==sym_turkic) {
7842 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7844 if (argv[1]==sym_lithuanian)
7845 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7847 rb_raise(rb_eArgError,
"invalid second option");
7850 else if (argv[0]==sym_lithuanian) {
7851 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7853 if (argv[1]==sym_turkic)
7854 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7856 rb_raise(rb_eArgError,
"invalid second option");
7860 rb_raise(rb_eArgError,
"too many options");
7861 else if (argv[0]==sym_ascii)
7862 flags |= ONIGENC_CASE_ASCII_ONLY;
7863 else if (argv[0]==sym_fold) {
7864 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7865 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7867 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7870 rb_raise(rb_eArgError,
"invalid option");
7877 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7883#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7884#ifndef CASEMAP_DEBUG
7885# define CASEMAP_DEBUG 0
7893 OnigUChar space[FLEX_ARY_LEN];
7897mapping_buffer_free(
void *p)
7901 while (current_buffer) {
7902 previous_buffer = current_buffer;
7903 current_buffer = current_buffer->next;
7904 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7910 {0, mapping_buffer_free,},
7911 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7919 const OnigUChar *source_current, *source_end;
7920 int target_length = 0;
7921 VALUE buffer_anchor;
7924 size_t buffer_count = 0;
7925 int buffer_length_or_invalid;
7927 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7929 source_current = (OnigUChar*)RSTRING_PTR(source);
7934 while (source_current < source_end) {
7936 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7937 if (CASEMAP_DEBUG) {
7938 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7941 *pre_buffer = current_buffer;
7942 pre_buffer = ¤t_buffer->next;
7943 current_buffer->next = NULL;
7944 current_buffer->capa =
capa;
7945 buffer_length_or_invalid = enc->case_map(flags,
7946 &source_current, source_end,
7947 current_buffer->space,
7948 current_buffer->space+current_buffer->capa,
7950 if (buffer_length_or_invalid < 0) {
7951 current_buffer =
DATA_PTR(buffer_anchor);
7953 mapping_buffer_free(current_buffer);
7954 rb_raise(rb_eArgError,
"input string invalid");
7956 target_length += current_buffer->used = buffer_length_or_invalid;
7958 if (CASEMAP_DEBUG) {
7959 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7962 if (buffer_count==1) {
7963 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7966 char *target_current;
7969 target_current = RSTRING_PTR(target);
7970 current_buffer =
DATA_PTR(buffer_anchor);
7971 while (current_buffer) {
7972 memcpy(target_current, current_buffer->space, current_buffer->used);
7973 target_current += current_buffer->used;
7974 current_buffer = current_buffer->next;
7977 current_buffer =
DATA_PTR(buffer_anchor);
7979 mapping_buffer_free(current_buffer);
7984 str_enc_copy_direct(target, source);
7993 const OnigUChar *source_current, *source_end;
7994 OnigUChar *target_current, *target_end;
7995 long old_length = RSTRING_LEN(source);
7996 int length_or_invalid;
7998 if (old_length == 0)
return Qnil;
8000 source_current = (OnigUChar*)RSTRING_PTR(source);
8002 if (source == target) {
8003 target_current = (OnigUChar*)source_current;
8004 target_end = (OnigUChar*)source_end;
8007 target_current = (OnigUChar*)RSTRING_PTR(target);
8011 length_or_invalid = onigenc_ascii_only_case_map(flags,
8012 &source_current, source_end,
8013 target_current, target_end, enc);
8014 if (length_or_invalid < 0)
8015 rb_raise(rb_eArgError,
"input string invalid");
8016 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8017 fprintf(stderr,
"problem with rb_str_ascii_casemap"
8018 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8019 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
8020 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8023 str_enc_copy(target, source);
8029upcase_single(
VALUE str)
8031 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8032 bool modified =
false;
8035 unsigned int c = *(
unsigned char*)s;
8037 if (
'a' <= c && c <=
'z') {
8038 *s =
'A' + (c -
'a');
8066rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8069 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8071 flags = check_case_options(argc, argv, flags);
8072 str_modify_keep_cr(str);
8073 enc = str_true_enc(str);
8074 if (case_option_single_p(flags, enc, str)) {
8075 if (upcase_single(str))
8076 flags |= ONIGENC_CASE_MODIFIED;
8078 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8079 rb_str_ascii_casemap(str, str, &flags, enc);
8081 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8083 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8105rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8108 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8111 flags = check_case_options(argc, argv, flags);
8112 enc = str_true_enc(str);
8113 if (case_option_single_p(flags, enc, str)) {
8114 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8115 str_enc_copy_direct(ret, str);
8118 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8120 rb_str_ascii_casemap(str, ret, &flags, enc);
8123 ret = rb_str_casemap(str, &flags, enc);
8130downcase_single(
VALUE str)
8132 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8133 bool modified =
false;
8136 unsigned int c = *(
unsigned char*)s;
8138 if (
'A' <= c && c <=
'Z') {
8139 *s =
'a' + (c -
'A');
8161rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8164 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8166 flags = check_case_options(argc, argv, flags);
8167 str_modify_keep_cr(str);
8168 enc = str_true_enc(str);
8169 if (case_option_single_p(flags, enc, str)) {
8170 if (downcase_single(str))
8171 flags |= ONIGENC_CASE_MODIFIED;
8173 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8174 rb_str_ascii_casemap(str, str, &flags, enc);
8176 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8178 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8192rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8195 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8198 flags = check_case_options(argc, argv, flags);
8199 enc = str_true_enc(str);
8200 if (case_option_single_p(flags, enc, str)) {
8201 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8202 str_enc_copy_direct(ret, str);
8203 downcase_single(ret);
8205 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8207 rb_str_ascii_casemap(str, ret, &flags, enc);
8210 ret = rb_str_casemap(str, &flags, enc);
8230rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8233 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8235 flags = check_case_options(argc, argv, flags);
8236 str_modify_keep_cr(str);
8237 enc = str_true_enc(str);
8238 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8239 if (flags&ONIGENC_CASE_ASCII_ONLY)
8240 rb_str_ascii_casemap(str, str, &flags, enc);
8242 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8244 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8277rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8280 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8283 flags = check_case_options(argc, argv, flags);
8284 enc = str_true_enc(str);
8285 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8286 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8288 rb_str_ascii_casemap(str, ret, &flags, enc);
8291 ret = rb_str_casemap(str, &flags, enc);
8318rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8321 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8323 flags = check_case_options(argc, argv, flags);
8324 str_modify_keep_cr(str);
8325 enc = str_true_enc(str);
8326 if (flags&ONIGENC_CASE_ASCII_ONLY)
8327 rb_str_ascii_casemap(str, str, &flags, enc);
8329 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8331 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8355rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8358 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8361 flags = check_case_options(argc, argv, flags);
8362 enc = str_true_enc(str);
8363 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8364 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8366 rb_str_ascii_casemap(str, ret, &flags, enc);
8369 ret = rb_str_casemap(str, &flags, enc);
8374typedef unsigned char *USTR;
8378 unsigned int now, max;
8390 if (t->p == t->pend)
return -1;
8391 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8394 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8396 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8398 if (t->p < t->pend) {
8399 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8402 if (t->now < 0x80 && c < 0x80) {
8403 rb_raise(rb_eArgError,
8404 "invalid range \"%c-%c\" in string transliteration",
8408 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8412 else if (t->now < c) {
8421 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8422 if (t->now == t->max) {
8427 if (t->now < t->max) {
8443 const unsigned int errc = -1;
8444 unsigned int trans[256];
8446 struct tr trsrc, trrepl;
8448 unsigned int c, c0, last = 0;
8449 int modify = 0, i, l;
8450 unsigned char *s, *send;
8452 int singlebyte = single_byte_optimizable(str);
8456#define CHECK_IF_ASCII(c) \
8457 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8458 (cr = ENC_CODERANGE_VALID) : 0)
8462 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8463 if (RSTRING_LEN(repl) == 0) {
8464 return rb_str_delete_bang(1, &src, str);
8468 e1 = rb_enc_check(str, src);
8469 e2 = rb_enc_check(str, repl);
8474 enc = rb_enc_check(src, repl);
8476 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8477 if (RSTRING_LEN(src) > 1 &&
8478 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8479 trsrc.p + l < trsrc.pend) {
8483 trrepl.p = RSTRING_PTR(repl);
8484 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8485 trsrc.gen = trrepl.gen = 0;
8486 trsrc.now = trrepl.now = 0;
8487 trsrc.max = trrepl.max = 0;
8490 for (i=0; i<256; i++) {
8493 while ((c = trnext(&trsrc, enc)) != errc) {
8498 if (!hash) hash = rb_hash_new();
8502 while ((c = trnext(&trrepl, enc)) != errc)
8505 for (i=0; i<256; i++) {
8506 if (trans[i] != errc) {
8514 for (i=0; i<256; i++) {
8517 while ((c = trnext(&trsrc, enc)) != errc) {
8518 r = trnext(&trrepl, enc);
8519 if (r == errc) r = trrepl.now;
8522 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8525 if (!hash) hash = rb_hash_new();
8533 str_modify_keep_cr(str);
8534 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8535 termlen = rb_enc_mbminlen(enc);
8538 long offset, max = RSTRING_LEN(str);
8539 unsigned int save = -1;
8540 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8545 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8548 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8551 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8553 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8562 if (cflag) c = last;
8565 else if (cflag) c = errc;
8571 if (c != (
unsigned int)-1) {
8577 tlen = rb_enc_codelen(c, enc);
8583 if (enc != e1) may_modify = 1;
8585 if ((offset = t - buf) + tlen > max) {
8586 size_t MAYBE_UNUSED(old) = max + termlen;
8587 max = offset + tlen + (send - s);
8588 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8591 rb_enc_mbcput(c, t, enc);
8592 if (may_modify && memcmp(s, t, tlen) != 0) {
8598 if (!STR_EMBED_P(str)) {
8599 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8601 TERM_FILL((
char *)t, termlen);
8602 RSTRING(str)->as.heap.ptr = (
char *)buf;
8603 STR_SET_LEN(str, t - buf);
8604 STR_SET_NOEMBED(str);
8605 RSTRING(str)->as.heap.aux.capa = max;
8609 c = (
unsigned char)*s;
8610 if (trans[c] != errc) {
8627 long offset, max = (long)((send - s) * 1.2);
8628 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8633 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8636 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8639 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8641 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8649 if (cflag) c = last;
8652 else if (cflag) c = errc;
8656 c = cflag ? last : errc;
8659 tlen = rb_enc_codelen(c, enc);
8664 if (enc != e1) may_modify = 1;
8666 if ((offset = t - buf) + tlen > max) {
8667 size_t MAYBE_UNUSED(old) = max + termlen;
8668 max = offset + tlen + (long)((send - s) * 1.2);
8669 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8673 rb_enc_mbcput(c, t, enc);
8674 if (may_modify && memcmp(s, t, tlen) != 0) {
8682 if (!STR_EMBED_P(str)) {
8683 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8685 TERM_FILL((
char *)t, termlen);
8686 RSTRING(str)->as.heap.ptr = (
char *)buf;
8687 STR_SET_LEN(str, t - buf);
8688 STR_SET_NOEMBED(str);
8689 RSTRING(str)->as.heap.aux.capa = max;
8695 rb_enc_associate(str, enc);
8714 return tr_trans(str, src, repl, 0);
8761 tr_trans(str, src, repl, 0);
8765#define TR_TABLE_MAX (UCHAR_MAX+1)
8766#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8768tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8771 const unsigned int errc = -1;
8772 char buf[TR_TABLE_MAX];
8775 VALUE table = 0, ptable = 0;
8776 int i, l, cflag = 0;
8778 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8779 tr.gen =
tr.now =
tr.max = 0;
8781 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8786 for (i=0; i<TR_TABLE_MAX; i++) {
8789 stable[TR_TABLE_MAX] = cflag;
8791 else if (stable[TR_TABLE_MAX] && !cflag) {
8792 stable[TR_TABLE_MAX] = 0;
8794 for (i=0; i<TR_TABLE_MAX; i++) {
8798 while ((c = trnext(&
tr, enc)) != errc) {
8799 if (c < TR_TABLE_MAX) {
8800 buf[(
unsigned char)c] = !cflag;
8805 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8808 table = ptable ? ptable : rb_hash_new();
8812 table = rb_hash_new();
8817 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8818 rb_hash_aset(table, key,
Qtrue);
8822 for (i=0; i<TR_TABLE_MAX; i++) {
8823 stable[i] = stable[i] && buf[i];
8825 if (!table && !cflag) {
8832tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8834 if (c < TR_TABLE_MAX) {
8835 return table[c] != 0;
8841 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8842 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8846 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8849 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8864rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8866 char squeez[TR_TABLE_SIZE];
8869 VALUE del = 0, nodel = 0;
8871 int i, ascompat, cr;
8873 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8875 for (i=0; i<argc; i++) {
8879 enc = rb_enc_check(str, s);
8880 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8883 str_modify_keep_cr(str);
8884 ascompat = rb_enc_asciicompat(enc);
8885 s = t = RSTRING_PTR(str);
8892 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8903 c = rb_enc_codepoint_len(s, send, &clen, enc);
8905 if (tr_find(c, squeez, del, nodel)) {
8909 if (t != s) rb_enc_mbcput(c, t, enc);
8916 TERM_FILL(t, TERM_LEN(str));
8917 STR_SET_LEN(str, t - RSTRING_PTR(str));
8920 if (modify)
return str;
8934rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8937 rb_str_delete_bang(argc, argv, str);
8951rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8953 char squeez[TR_TABLE_SIZE];
8955 VALUE del = 0, nodel = 0;
8956 unsigned char *s, *send, *t;
8958 int ascompat, singlebyte = single_byte_optimizable(str);
8962 enc = STR_ENC_GET(str);
8965 for (i=0; i<argc; i++) {
8969 enc = rb_enc_check(str, s);
8970 if (singlebyte && !single_byte_optimizable(s))
8972 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8976 str_modify_keep_cr(str);
8977 s = t = (
unsigned char *)RSTRING_PTR(str);
8978 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8981 ascompat = rb_enc_asciicompat(enc);
8985 unsigned int c = *s++;
8986 if (c != save || (argc > 0 && !squeez[c])) {
8996 if (ascompat && (c = *s) < 0x80) {
8997 if (c != save || (argc > 0 && !squeez[c])) {
9003 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
9005 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9006 if (t != s) rb_enc_mbcput(c, t, enc);
9015 TERM_FILL((
char *)t, TERM_LEN(str));
9016 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9017 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
9021 if (modify)
return str;
9044rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
9047 rb_str_squeeze_bang(argc, argv, str);
9065 return tr_trans(str, src, repl, 1);
9088 tr_trans(str, src, repl, 1);
9101rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9103 char table[TR_TABLE_SIZE];
9105 VALUE del = 0, nodel = 0, tstr;
9115 enc = rb_enc_check(str, tstr);
9118 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9119 (ptstr = RSTRING_PTR(tstr),
9120 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9121 !is_broken_string(str)) {
9123 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9125 s = RSTRING_PTR(str);
9126 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9129 if (*(
unsigned char*)s++ == c) n++;
9135 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9136 for (i=1; i<argc; i++) {
9139 enc = rb_enc_check(str, tstr);
9140 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9143 s = RSTRING_PTR(str);
9144 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9146 ascompat = rb_enc_asciicompat(enc);
9150 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9158 c = rb_enc_codepoint_len(s, send, &clen, enc);
9159 if (tr_find(c, table, del, nodel)) {
9170rb_fs_check(
VALUE val)
9174 if (
NIL_P(val))
return 0;
/* Byte-indexed lookup table: entries 0x09-0x0D and 0x20 are 1, every other entry is 0. */
9179static const char isspacetable[256] = {
9180 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9182 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* Looks up c in isspacetable; the cast to unsigned char keeps the index within 0..255 even when c is a negative char. */
9198#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9201split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9203 if (empty_count >= 0 &&
len == 0) {
9204 return empty_count + 1;
9206 if (empty_count > 0) {
9211 }
while (--empty_count > 0);
9215 rb_yield(str_new_empty_String(str));
9216 }
while (--empty_count > 0);
9230 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9234literal_split_pattern(
VALUE spat, split_type_t default_type)
9242 return SPLIT_TYPE_CHARS;
9244 else if (rb_enc_asciicompat(enc)) {
9245 if (
len == 1 && ptr[0] ==
' ') {
9246 return SPLIT_TYPE_AWK;
9251 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9252 return SPLIT_TYPE_AWK;
9255 return default_type;
9268rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9273 split_type_t split_type;
9274 long beg, end, i = 0, empty_count = -1;
9279 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9281 if (lim <= 0) limit =
Qnil;
9282 else if (lim == 1) {
9283 if (RSTRING_LEN(str) == 0)
9294 if (
NIL_P(limit) && !lim) empty_count = 0;
9296 enc = STR_ENC_GET(str);
9297 split_type = SPLIT_TYPE_REGEXP;
9299 spat = get_pat_quoted(spat, 0);
9301 else if (
NIL_P(spat = rb_fs)) {
9302 split_type = SPLIT_TYPE_AWK;
9304 else if (!(spat = rb_fs_check(spat))) {
9305 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9310 if (split_type != SPLIT_TYPE_AWK) {
9315 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9316 if (split_type == SPLIT_TYPE_AWK) {
9318 split_type = SPLIT_TYPE_STRING;
9323 mustnot_broken(spat);
9324 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Passes the field (beg, len) of str to split_string() and stores its return
 * value back into empty_count, then calls str_mod_check() against the
 * str_start/str_len snapshot. Relies on result, str, empty_count, str_start
 * and str_len being in scope at the expansion site.
 */
9332#define SPLIT_STR(beg, len) ( \
9333 empty_count = split_string(result, str, beg, len, empty_count), \
9334 str_mod_check(str, str_start, str_len))
9337 char *ptr = RSTRING_PTR(str);
9338 char *
const str_start = ptr;
9339 const long str_len = RSTRING_LEN(str);
9340 char *
const eptr = str_start + str_len;
9341 if (split_type == SPLIT_TYPE_AWK) {
9348 if (is_ascii_string(str)) {
9349 while (ptr < eptr) {
9350 c = (
unsigned char)*ptr++;
9352 if (ascii_isspace(c)) {
9358 if (!
NIL_P(limit) && lim <= i)
break;
9361 else if (ascii_isspace(c)) {
9362 SPLIT_STR(beg, end-beg);
9365 if (!
NIL_P(limit)) ++i;
9373 while (ptr < eptr) {
9376 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9385 if (!
NIL_P(limit) && lim <= i)
break;
9389 SPLIT_STR(beg, end-beg);
9392 if (!
NIL_P(limit)) ++i;
9400 else if (split_type == SPLIT_TYPE_STRING) {
9401 char *substr_start = ptr;
9402 char *sptr = RSTRING_PTR(spat);
9403 long slen = RSTRING_LEN(spat);
9406 mustnot_broken(str);
9407 enc = rb_enc_check(str, spat);
9408 while (ptr < eptr &&
9409 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9412 if (t != ptr + end) {
9416 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9417 str_mod_check(spat, sptr, slen);
9420 if (!
NIL_P(limit) && lim <= ++i)
break;
9422 beg = ptr - str_start;
9424 else if (split_type == SPLIT_TYPE_CHARS) {
9428 mustnot_broken(str);
9429 enc = rb_enc_get(str);
9430 while (ptr < eptr &&
9431 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9432 SPLIT_STR(ptr - str_start, n);
9434 if (!
NIL_P(limit) && lim <= ++i)
break;
9436 beg = ptr - str_start;
9440 long len = RSTRING_LEN(str);
9448 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9453 if (start == end && BEG(0) == END(0)) {
9458 else if (last_null == 1) {
9459 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9466 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9472 SPLIT_STR(beg, end-beg);
9473 beg = start = END(0);
9477 for (idx=1; idx < regs->num_regs; idx++) {
9478 if (BEG(idx) == -1)
continue;
9479 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9481 if (!
NIL_P(limit) && lim <= ++i)
break;
9483 if (match) rb_match_unbusy(match);
9485 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9486 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9489 return result ? result : str;
9499 return rb_str_split_m(1, &sep, str);
/* Evaluates to a new array of capacity `size` when no block is given, otherwise 0; the `m` argument is not used by the expansion. */
9502#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9517#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9520chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9522 const char *prev = rb_enc_prev_char(p, e, e, enc);
9525 prev = rb_enc_prev_char(p, e, e, enc);
9526 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9538 RSTRING_LEN(rs) != 1 ||
9539 RSTRING_PTR(rs)[0] !=
'\n')) {
9545#define rb_rs get_rs()
9552 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9553 long pos,
len, rslen;
9559 static ID keywords[1];
9564 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9568 if (!ENUM_ELEM(ary, str)) {
9576 if (!RSTRING_LEN(str))
goto end;
9578 ptr = subptr = RSTRING_PTR(str);
9580 len = RSTRING_LEN(str);
9582 rslen = RSTRING_LEN(rs);
9585 enc = rb_enc_get(str);
9587 enc = rb_enc_check(str, rs);
9592 const char *eol = NULL;
9594 while (subend < pend) {
9595 long chomp_rslen = 0;
9597 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9599 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9601 if (eol == subend)
break;
9605 chomp_rslen = -rslen;
9609 if (!subptr) subptr = subend;
9613 }
while (subend < pend);
9615 if (rslen == 0) chomp_rslen = 0;
9617 subend - subptr + (chomp ? chomp_rslen : rslen));
9618 if (ENUM_ELEM(ary, line)) {
9619 str_mod_check(str, ptr,
len);
9621 subptr = eol = NULL;
9626 rsptr = RSTRING_PTR(rs);
9627 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9636 rsptr = RSTRING_PTR(rs);
9637 rslen = RSTRING_LEN(rs);
9640 while (subptr < pend) {
9641 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9645 if (hit != adjusted) {
9649 subend = hit += rslen;
9652 subend = chomp_newline(subptr, subend, enc);
9659 if (ENUM_ELEM(ary, line)) {
9660 str_mod_check(str, ptr,
len);
9665 if (subptr != pend) {
9668 pend = chomp_newline(subptr, pend, enc);
9670 else if (pend - subptr >= rslen &&
9671 memcmp(pend - rslen, rsptr, rslen) == 0) {
9676 ENUM_ELEM(ary, line);
9697rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9700 return rb_str_enumerate_lines(argc, argv, str, 0);
9713rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9715 VALUE ary = WANTARRAY(
"lines", 0);
9716 return rb_str_enumerate_lines(argc, argv, str, ary);
9730 for (i=0; i<RSTRING_LEN(str); i++) {
9731 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9749rb_str_each_byte(
VALUE str)
9752 return rb_str_enumerate_bytes(str, 0);
9764rb_str_bytes(
VALUE str)
9766 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9767 return rb_str_enumerate_bytes(str, ary);
9785 ptr = RSTRING_PTR(str);
9786 len = RSTRING_LEN(str);
9787 enc = rb_enc_get(str);
9790 for (i = 0; i <
len; i += n) {
9791 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9796 for (i = 0; i <
len; i += n) {
9797 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9818rb_str_each_char(
VALUE str)
9821 return rb_str_enumerate_chars(str, 0);
9833rb_str_chars(
VALUE str)
9836 return rb_str_enumerate_chars(str, ary);
9840rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9845 const char *ptr, *end;
9848 if (single_byte_optimizable(str))
9849 return rb_str_enumerate_bytes(str, ary);
9852 ptr = RSTRING_PTR(str);
9854 enc = STR_ENC_GET(str);
9857 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9878rb_str_each_codepoint(
VALUE str)
9881 return rb_str_enumerate_codepoints(str, 0);
9893rb_str_codepoints(
VALUE str)
9896 return rb_str_enumerate_codepoints(str, ary);
9902 int encidx = rb_enc_to_index(enc);
9904 const OnigUChar source_ascii[] =
"\\X";
9905 const OnigUChar *source = source_ascii;
9906 size_t source_len =
sizeof(source_ascii) - 1;
/* Expands to two comma-separated OnigUChar values for x: (x)>>8 first, then x. */
9909#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
/* Expands to two comma-separated OnigUChar values for x: x first, then (x)>>8. */
9910#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
/* Expands to four OnigUChar values: CHARS_16BE of (x)>>16 followed by CHARS_16BE of x. */
9911#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
/* Expands to four OnigUChar values: CHARS_16LE of x followed by CHARS_16LE of (x)>>16. */
9912#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9913#define CASE_UTF(e) \
9914 case ENCINDEX_UTF_##e: { \
9915 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9916 source = source_UTF_##e; \
9917 source_len = sizeof(source_UTF_##e); \
9920 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9928 regex_t *reg_grapheme_cluster;
9930 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9931 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9933 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9934 onig_error_code_to_str(message, r, &einfo);
9935 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9938 return reg_grapheme_cluster;
9944 int encidx = rb_enc_to_index(enc);
9945 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9947 if (encidx == rb_utf8_encindex()) {
9948 if (!reg_grapheme_cluster_utf8) {
9949 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9952 return reg_grapheme_cluster_utf8;
9961 size_t grapheme_cluster_count = 0;
9963 const char *ptr, *end;
9965 if (!rb_enc_unicode_p(enc)) {
9969 bool cached_reg_grapheme_cluster =
true;
9970 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9971 if (!reg_grapheme_cluster) {
9972 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9973 cached_reg_grapheme_cluster =
false;
9976 ptr = RSTRING_PTR(str);
9980 OnigPosition
len = onig_match(reg_grapheme_cluster,
9981 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9982 (
const OnigUChar *)ptr, NULL, 0);
9983 if (
len <= 0)
break;
9984 grapheme_cluster_count++;
9988 if (!cached_reg_grapheme_cluster) {
9989 onig_free(reg_grapheme_cluster);
9992 return SIZET2NUM(grapheme_cluster_count);
9996rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
10000 const char *ptr0, *ptr, *end;
10002 if (!rb_enc_unicode_p(enc)) {
10003 return rb_str_enumerate_chars(str, ary);
10008 bool cached_reg_grapheme_cluster =
true;
10009 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10010 if (!reg_grapheme_cluster) {
10011 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10012 cached_reg_grapheme_cluster =
false;
10015 ptr0 = ptr = RSTRING_PTR(str);
10018 while (ptr < end) {
10019 OnigPosition
len = onig_match(reg_grapheme_cluster,
10020 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10021 (
const OnigUChar *)ptr, NULL, 0);
10022 if (
len <= 0)
break;
10027 if (!cached_reg_grapheme_cluster) {
10028 onig_free(reg_grapheme_cluster);
10048rb_str_each_grapheme_cluster(
VALUE str)
10051 return rb_str_enumerate_grapheme_clusters(str, 0);
10063rb_str_grapheme_clusters(
VALUE str)
10066 return rb_str_enumerate_grapheme_clusters(str, ary);
10070chopped_length(
VALUE str)
10073 const char *p, *p2, *beg, *end;
10075 beg = RSTRING_PTR(str);
10076 end = beg + RSTRING_LEN(str);
10077 if (beg >= end)
return 0;
10078 p = rb_enc_prev_char(beg, end, end, enc);
10080 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10081 p2 = rb_enc_prev_char(beg, p, end, enc);
10082 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10100rb_str_chop_bang(
VALUE str)
10102 str_modify_keep_cr(str);
10103 if (RSTRING_LEN(str) > 0) {
10105 len = chopped_length(str);
10106 STR_SET_LEN(str,
len);
10107 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10126rb_str_chop(
VALUE str)
10132smart_chomp(
VALUE str,
const char *e,
const char *p)
10135 if (rb_enc_mbminlen(enc) > 1) {
10140 pp = e - rb_enc_mbminlen(enc);
10143 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10151 if (--e > p && *(e-1) ==
'\r') {
10168 char *pp, *e, *rsptr;
10170 char *
const p = RSTRING_PTR(str);
10171 long len = RSTRING_LEN(str);
10173 if (
len == 0)
return 0;
10176 return smart_chomp(str, e, p);
10179 enc = rb_enc_get(str);
10182 if (rb_enc_mbminlen(enc) > 1) {
10187 pp -= rb_enc_mbminlen(enc);
10190 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10197 while (e > p && *(e-1) ==
'\n') {
10199 if (e > p && *(e-1) ==
'\r')
10205 if (rslen >
len)
return len;
10207 enc = rb_enc_get(rs);
10208 newline = rsptr[rslen-1];
10209 if (rslen == rb_enc_mbminlen(enc)) {
10211 if (newline ==
'\n')
10212 return smart_chomp(str, e, p);
10216 return smart_chomp(str, e, p);
10220 enc = rb_enc_check(str, rs);
10221 if (is_broken_string(rs)) {
10225 if (p[
len-1] == newline &&
10227 memcmp(rsptr, pp, rslen) == 0)) {
10228 if (at_char_boundary(p, pp, e, enc))
10229 return len - rslen;
10241chomp_rs(
int argc,
const VALUE *argv)
10245 VALUE rs = argv[0];
10257 long olen = RSTRING_LEN(str);
10258 long len = chompped_length(str, rs);
10259 if (
len >= olen)
return Qnil;
10260 str_modify_keep_cr(str);
10261 STR_SET_LEN(str,
len);
10262 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10282rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10285 str_modifiable(str);
10286 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10287 rs = chomp_rs(argc, argv);
10289 return rb_str_chomp_string(str, rs);
10302rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10304 VALUE rs = chomp_rs(argc, argv);
10312 const char *
const start = s;
10314 if (!s || s >= e)
return 0;
10317 if (single_byte_optimizable(str)) {
10318 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10323 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10343rb_str_lstrip_bang(
VALUE str)
10347 long olen, loffset;
10349 str_modify_keep_cr(str);
10350 enc = STR_ENC_GET(str);
10352 loffset = lstrip_offset(str, start, start+olen, enc);
10354 long len = olen-loffset;
10355 s = start + loffset;
10356 memmove(start, s,
len);
10357 STR_SET_LEN(str,
len);
10358 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10381rb_str_lstrip(
VALUE str)
10386 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10387 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10396 rb_str_check_dummy_enc(enc);
10400 if (!s || s >= e)
return 0;
10404 if (single_byte_optimizable(str)) {
10406 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10411 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10431rb_str_rstrip_bang(
VALUE str)
10435 long olen, roffset;
10437 str_modify_keep_cr(str);
10438 enc = STR_ENC_GET(str);
10440 roffset = rstrip_offset(str, start, start+olen, enc);
10442 long len = olen - roffset;
10444 STR_SET_LEN(str,
len);
10445 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10468rb_str_rstrip(
VALUE str)
10472 long olen, roffset;
10474 enc = STR_ENC_GET(str);
10476 roffset = rstrip_offset(str, start, start+olen, enc);
10478 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10494rb_str_strip_bang(
VALUE str)
10497 long olen, loffset, roffset;
10500 str_modify_keep_cr(str);
10501 enc = STR_ENC_GET(str);
10503 loffset = lstrip_offset(str, start, start+olen, enc);
10504 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10506 if (loffset > 0 || roffset > 0) {
10507 long len = olen-roffset;
10510 memmove(start, start + loffset,
len);
10512 STR_SET_LEN(str,
len);
10513 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10536rb_str_strip(
VALUE str)
10539 long olen, loffset, roffset;
10543 loffset = lstrip_offset(str, start, start+olen, enc);
10544 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10546 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10551scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10554 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10560 end = pos + RSTRING_LEN(pat);
10574 if (RSTRING_LEN(str) > end)
10575 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10584 if (!regs || regs->num_regs == 1) {
10590 for (
int i = 1; i < regs->num_regs; i++) {
10651 long last = -1, prev = 0;
10652 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10654 pat = get_pat_quoted(pat, 1);
10655 mustnot_broken(str);
10659 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10664 if (last >= 0) rb_pat_search(pat, str, last, 1);
10669 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10673 str_mod_check(str, p,
len);
10675 if (last >= 0) rb_pat_search(pat, str, last, 1);
10699rb_str_hex(
VALUE str)
10701 return rb_str_to_inum(str, 16, FALSE);
10726rb_str_oct(
VALUE str)
10728 return rb_str_to_inum(str, -8, FALSE);
10731#ifndef HAVE_CRYPT_R
10736 rb_nativethread_lock_t lock;
10737} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10806# define CRYPT_END() ALLOCV_END(databuf)
10809 extern char *crypt(
const char *,
const char *);
10810# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10813 const char *s, *saltp;
10816 char salt_8bit_clean[3];
10820 mustnot_wchar(str);
10821 mustnot_wchar(salt);
10823 saltp = RSTRING_PTR(salt);
10824 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10825 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10829 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10830 salt_8bit_clean[0] = saltp[0] & 0x7f;
10831 salt_8bit_clean[1] = saltp[1] & 0x7f;
10832 salt_8bit_clean[2] =
'\0';
10833 saltp = salt_8bit_clean;
10838# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10839 data->initialized = 0;
10841 res = crypt_r(s, saltp, data);
10844 res = crypt(s, saltp);
10859 size_t res_size = strlen(res)+1;
10860 tmp_buf =
ALLOCA_N(
char, res_size);
10861 memcpy(tmp_buf, res, res_size);
10898 char *ptr, *p, *pend;
10901 unsigned long sum0 = 0;
10906 ptr = p = RSTRING_PTR(str);
10907 len = RSTRING_LEN(str);
10913 str_mod_check(str, ptr,
len);
10916 sum0 += (
unsigned char)*p;
10927 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10928 sum0 &= (((
unsigned long)1)<<bits)-1;
10948rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10952 long width,
len, flen = 1, fclen = 1;
10955 const char *f =
" ";
10956 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10958 int singlebyte = 1, cr;
10962 enc = STR_ENC_GET(str);
10963 termlen = rb_enc_mbminlen(enc);
10967 enc = rb_enc_check(str, pad);
10968 f = RSTRING_PTR(pad);
10969 flen = RSTRING_LEN(pad);
10970 fclen = str_strlen(pad, enc);
10971 singlebyte = single_byte_optimizable(pad);
10972 if (flen == 0 || fclen == 0) {
10973 rb_raise(rb_eArgError,
"zero width padding");
10976 len = str_strlen(str, enc);
10977 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10979 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10983 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10984 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10986 size = RSTRING_LEN(str);
10987 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10988 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10989 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10990 rb_raise(rb_eArgError,
"argument too big");
10994 p = RSTRING_PTR(res);
10996 memset(p, *f, llen);
11000 while (llen >= fclen) {
11006 memcpy(p, f, llen2);
11010 memcpy(p, RSTRING_PTR(str), size);
11013 memset(p, *f, rlen);
11017 while (rlen >= fclen) {
11023 memcpy(p, f, rlen2);
11027 TERM_FILL(p, termlen);
11028 STR_SET_LEN(res, p-RSTRING_PTR(res));
11051rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11053 return rb_str_justify(argc, argv, str,
'l');
11067rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11069 return rb_str_justify(argc, argv, str,
'r');
11082rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11084 return rb_str_justify(argc, argv, str,
'c');
11100 sep = get_pat_quoted(sep, 0);
11112 pos = rb_str_index(str, sep, 0);
11113 if (pos < 0)
goto failed;
11118 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11121 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11135 long pos = RSTRING_LEN(str);
11137 sep = get_pat_quoted(sep, 0);
11150 pos = rb_str_rindex(str, sep, pos);
11159 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11161 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11173rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11177 for (i=0; i<argc; i++) {
11178 VALUE tmp = argv[i];
11180 if (rb_reg_start_with_p(tmp, str))
11184 const char *p, *s, *e;
11189 enc = rb_enc_check(str, tmp);
11190 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11191 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11192 p = RSTRING_PTR(str);
11195 if (!at_char_right_boundary(p, s, e, enc))
11197 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11213rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11217 for (i=0; i<argc; i++) {
11218 VALUE tmp = argv[i];
11219 const char *p, *s, *e;
11224 enc = rb_enc_check(str, tmp);
11225 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11226 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11227 p = RSTRING_PTR(str);
11230 if (!at_char_boundary(p, s, e, enc))
11232 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11248deleted_prefix_length(
VALUE str,
VALUE prefix)
11250 const char *strptr, *prefixptr;
11251 long olen, prefixlen;
11256 if (!is_broken_string(prefix) ||
11257 !rb_enc_asciicompat(enc) ||
11258 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11259 enc = rb_enc_check(str, prefix);
11263 prefixlen = RSTRING_LEN(prefix);
11264 if (prefixlen <= 0)
return 0;
11265 olen = RSTRING_LEN(str);
11266 if (olen < prefixlen)
return 0;
11267 strptr = RSTRING_PTR(str);
11268 prefixptr = RSTRING_PTR(prefix);
11269 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11270 if (is_broken_string(prefix)) {
11271 if (!is_broken_string(str)) {
11275 const char *strend = strptr + olen;
11276 const char *after_prefix = strptr + prefixlen;
11277 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11298rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11301 str_modify_keep_cr(str);
11303 prefixlen = deleted_prefix_length(str, prefix);
11304 if (prefixlen <= 0)
return Qnil;
11318rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11322 prefixlen = deleted_prefix_length(str, prefix);
11323 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11325 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11338deleted_suffix_length(
VALUE str,
VALUE suffix)
11340 const char *strptr, *suffixptr;
11341 long olen, suffixlen;
11345 if (is_broken_string(suffix))
return 0;
11346 enc = rb_enc_check(str, suffix);
11349 suffixlen = RSTRING_LEN(suffix);
11350 if (suffixlen <= 0)
return 0;
11351 olen = RSTRING_LEN(str);
11352 if (olen < suffixlen)
return 0;
11353 strptr = RSTRING_PTR(str);
11354 suffixptr = RSTRING_PTR(suffix);
11355 const char *strend = strptr + olen;
11356 const char *before_suffix = strend - suffixlen;
11357 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11358 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11374rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11376 long olen, suffixlen,
len;
11377 str_modifiable(str);
11379 suffixlen = deleted_suffix_length(str, suffix);
11380 if (suffixlen <= 0)
return Qnil;
11382 olen = RSTRING_LEN(str);
11383 str_modify_keep_cr(str);
11384 len = olen - suffixlen;
11385 STR_SET_LEN(str,
len);
11386 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11402rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11406 suffixlen = deleted_suffix_length(str, suffix);
11407 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11409 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11416 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11424 val = rb_fs_check(val);
11427 "value of %"PRIsVALUE
" must be String or Regexp",
11431 rb_warn_deprecated(
"'$;'", NULL);
11448 str_modifiable(str);
11451 int idx = rb_enc_to_index(encoding);
11458 rb_enc_associate_index(str, idx);
11482 if (STR_EMBED_P(str)) {
11483 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11488 str_replace_shared_without_enc(str2, str);
11490 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11523rb_str_valid_encoding_p(
VALUE str)
11543rb_str_is_ascii_only_p(
VALUE str)
11553 static const char ellipsis[] =
"...";
11554 const long ellipsislen =
sizeof(ellipsis) - 1;
11556 const long blen = RSTRING_LEN(str);
11557 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11558 VALUE estr, ret = 0;
11561 if (
len * rb_enc_mbminlen(enc) >= blen ||
11565 else if (
len <= ellipsislen ||
11567 if (rb_enc_asciicompat(enc)) {
11569 rb_enc_associate(ret, enc);
11576 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11581 rb_enc_from_encoding(enc), 0,
Qnil);
11594 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11600 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11619 if (enc == STR_ENC_GET(str)) {
11624 return enc_str_scrub(enc, str, repl, cr);
11632 const char *rep, *p, *e, *p1, *sp;
11638 rb_raise(rb_eArgError,
"both of block and replacement given");
11645 if (!
NIL_P(repl)) {
11646 repl = str_compat_and_valid(repl, enc);
11649 if (rb_enc_dummy_p(enc)) {
11652 encidx = rb_enc_to_index(enc);
11654#define DEFAULT_REPLACE_CHAR(str) do { \
11655 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11656 rep = replace; replen = (int)sizeof(replace); \
11659 slen = RSTRING_LEN(str);
11660 p = RSTRING_PTR(str);
11665 if (rb_enc_asciicompat(enc)) {
11671 else if (!
NIL_P(repl)) {
11672 rep = RSTRING_PTR(repl);
11673 replen = RSTRING_LEN(repl);
11676 else if (encidx == rb_utf8_encindex()) {
11677 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11681 DEFAULT_REPLACE_CHAR(
"?");
11686 p = search_nonascii(p, e);
11691 int ret = rb_enc_precise_mbclen(p, e, enc);
11710 if (e - p < clen) clen = e - p;
11717 for (; clen > 1; clen--) {
11718 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11729 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11730 str_mod_check(str, sp, slen);
11731 repl = str_compat_and_valid(repl, enc);
11738 p = search_nonascii(p, e);
11764 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11765 str_mod_check(str, sp, slen);
11766 repl = str_compat_and_valid(repl, enc);
11775 long mbminlen = rb_enc_mbminlen(enc);
11779 else if (!
NIL_P(repl)) {
11780 rep = RSTRING_PTR(repl);
11781 replen = RSTRING_LEN(repl);
11783 else if (encidx == ENCINDEX_UTF_16BE) {
11784 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11786 else if (encidx == ENCINDEX_UTF_16LE) {
11787 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11789 else if (encidx == ENCINDEX_UTF_32BE) {
11790 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11792 else if (encidx == ENCINDEX_UTF_32LE) {
11793 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11796 DEFAULT_REPLACE_CHAR(
"?");
11800 int ret = rb_enc_precise_mbclen(p, e, enc);
11813 if (e - p < clen) clen = e - p;
11814 if (clen <= mbminlen * 2) {
11819 for (; clen > mbminlen; clen-=mbminlen) {
11820 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11830 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11831 str_mod_check(str, sp, slen);
11832 repl = str_compat_and_valid(repl, enc);
11857 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11858 str_mod_check(str, sp, slen);
11859 repl = str_compat_and_valid(repl, enc);
11895str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11903static ID id_normalize;
11904static ID id_normalized_p;
11905static VALUE mUnicodeNormalize;
11908unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11910 static int UnicodeNormalizeRequired = 0;
11913 if (!UnicodeNormalizeRequired) {
11914 rb_require(
"unicode_normalize/normalize.rb");
11915 UnicodeNormalizeRequired = 1;
11919 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11956rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11958 return unicode_normalize_common(argc, argv, str, id_normalize);
11972rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11974 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12001rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12003 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12135#define sym_equal rb_obj_equal
12138sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12142 int c = rb_enc_precise_mbclen(s, send, enc);
12146 c = rb_enc_mbc_to_codepoint(s, send, enc);
12154rb_str_symname_p(
VALUE sym)
12159 rb_encoding *resenc = rb_default_internal_encoding();
12161 if (resenc == NULL) resenc = rb_default_external_encoding();
12162 enc = STR_ENC_GET(sym);
12163 ptr = RSTRING_PTR(sym);
12164 len = RSTRING_LEN(sym);
12165 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12173rb_str_quote_unprintable(
VALUE str)
12181 resenc = rb_default_internal_encoding();
12182 if (resenc == NULL) resenc = rb_default_external_encoding();
12183 enc = STR_ENC_GET(str);
12184 ptr = RSTRING_PTR(str);
12185 len = RSTRING_LEN(str);
12186 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12187 !sym_printable(ptr, ptr +
len, enc)) {
12188 return rb_str_escape(str);
12194rb_id_quote_unprintable(
ID id)
12196 VALUE str = rb_id2str(
id);
12197 if (!rb_str_symname_p(str)) {
12198 return rb_str_escape(str);
12216sym_inspect(
VALUE sym)
12223 if (!rb_str_symname_p(str)) {
12225 len = RSTRING_LEN(str);
12226 rb_str_resize(str,
len + 1);
12227 dest = RSTRING_PTR(str);
12228 memmove(dest + 1, dest,
len);
12232 VALUE orig_str = str;
12234 len = RSTRING_LEN(orig_str);
12235 str = rb_enc_str_new(0,
len + 1, enc);
12238 ptr = RSTRING_PTR(orig_str);
12239 dest = RSTRING_PTR(str);
12240 memcpy(dest + 1, ptr,
len);
12260rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12265 rb_raise(rb_eArgError,
"no receiver given");
12362 return rb_str_match(
rb_sym2str(sym), other);
12377sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12379 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12392sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12394 return rb_str_match_m_p(argc, argv, sym);
12412 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12423sym_length(
VALUE sym)
12437sym_empty(
VALUE sym)
12471sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12487sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12503sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12517sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12519 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12532sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12534 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12546sym_encoding(
VALUE sym)
12552string_for_symbol(
VALUE name)
12557 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12571 name = string_for_symbol(name);
12572 return rb_intern_str(name);
12581 name = string_for_symbol(name);
12605 return rb_fstring(str);
12612 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12624 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12625 rb_enc_autoload(enc);
12629 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12635 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12636 rb_enc_autoload(enc);
12640 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12651rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12656 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12657 rb_str_buf_cat_byte(str, (
char) code);
12667fstring_set_class_i(
VALUE *str,
void *data)
12671 return ST_CONTINUE;
12679 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12846 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
Return value of rb_econv_convert().
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.