14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
48#include "ruby_assert.h"
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
/*
 * Shorthand for entry `no` of the beg[] / end[] arrays reached through a
 * pointer variable named `regs`.  The macros do not take `regs` as an
 * argument, so a variable with that exact name must be in scope wherever
 * they are expanded.
 * NOTE(review): presumably `regs` is a regexp match-register struct and the
 * entries are byte offsets of a capture group -- confirm at the use sites.
 */
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
66#undef rb_usascii_str_new
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
/* NOTE(review): presumably an upper bound on the byte length of one
 * character; no use is visible in this part of the file -- confirm. */
130#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits, mapped onto the generic FL_USERn object flags. */
/* Set by str_store_precomputed_hash() after it has copied the hash value
 * into the bytes that follow the string's terminator. */
131#define STR_PRECOMPUTED_HASH FL_USER4
/* Set by STR_SET_SHARED() on the string whose buffer is being shared. */
132#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED() on the shared root when that root has no class
 * (RBASIC_CLASS() == 0). */
133#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(); also part of STR_UNMODIFIABLE_MASK. */
134#define STR_TMPLOCK FL_USER7
/* Set by str_new_static().  str_discard() and rb_str_memsize() only free /
 * count the heap buffer when this flag (and STR_SHARED) is clear. */
135#define STR_NOFREE FL_USER18
/* Tested by STR_SET_SHARED() before it links a string to a shared root.
 * NOTE(review): presumably marks the stack-allocated strings built by
 * setup_fake_str() -- the line that sets it is not visible here. */
136#define STR_FAKESTR FL_USER19
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/*
 * Flag `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE
 * in a single FL_UNSET().  Only the flag bits are touched: callers in this
 * file copy the bytes and set the length themselves around the call.
 */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
149str_encindex_fastpath(
int encindex)
153 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_US_ASCII:
163str_enc_fastpath(
VALUE str)
/*
 * Length in bytes of the terminator that follows the contents of `str`:
 * 1 when str_enc_fastpath(str) holds, otherwise the minimum character
 * length (rb_enc_mbminlen) of the string's encoding.
 * `str` appears twice in the expansion, so pass an expression without side
 * effects.
 */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
177#define RESIZE_CAPA(str,capacity) do {\
178 const int termlen = TERM_LEN(str);\
179 RESIZE_CAPA_TERM(str,capacity,termlen);\
181#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
182 if (STR_EMBED_P(str)) {\
183 if (str_embed_capa(str) < capacity + termlen) {\
184 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
185 const long tlen = RSTRING_LEN(str);\
186 memcpy(tmp, RSTRING_PTR(str), tlen);\
187 RSTRING(str)->as.heap.ptr = tmp;\
188 RSTRING(str)->len = tlen;\
189 STR_SET_NOEMBED(str);\
190 RSTRING(str)->as.heap.aux.capa = (capacity);\
194 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
195 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
196 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
197 RSTRING(str)->as.heap.aux.capa = (capacity);\
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Pointer to the heap buffer of a non-embedded string. */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/*
 * Byte size of the heap allocation: the recorded capacity plus the
 * terminator length.  This matches RESIZE_CAPA_TERM(), which allocates
 * `capacity + termlen` bytes but stores only `capacity` in aux.capa.
 * NOTE(review): aux.capa is also read through aux.shared elsewhere in this
 * file, so this is presumably only meaningful for unshared strings (the
 * resize macro asserts !STR_SHARED before using it) -- confirm.
 */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Alias for get_encoding(): the encoding object of `str`. */
217#define STR_ENC_GET(str) get_encoding(str)
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
230str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
236rb_str_reembeddable_p(
VALUE str)
238 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242rb_str_embed_size(
long capa)
248rb_str_size_as_embedded(
VALUE str)
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
260 real_size =
sizeof(
struct RString);
264 real_size +=
sizeof(st_index_t);
271STR_EMBEDDABLE_P(
long len,
long termlen)
273 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/* Forward declarations of static helpers that are defined further down in
 * this file but called before their definitions. */
/* `copy_encoding` selects between the encoding of `orig` (non-zero) and
 * ASCII-8BIT (zero) for the new string. */
278static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
/* Builds a string for caller-owned bytes at `ptr`; the heap-allocated result
 * is flagged STR_NOFREE and associated with encoding index `encindex`. */
279static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
/* str_make_independent() is the `expand == 0` case of this function. */
281static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
/* Checks the STR_UNMODIFIABLE_MASK flags (frozen / tmp-locked / chilled)
 * before a mutation. */
282static inline void str_modifiable(
VALUE str);
287str_make_independent(
VALUE str)
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str),
len, 0L, termlen);
294static inline int str_dependent_p(
VALUE str);
297rb_str_make_independent(
VALUE str)
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
305rb_str_make_embedded(
VALUE str)
310 char *buf =
RSTRING(str)->as.heap.ptr;
314 STR_SET_LEN(str,
len);
317 memcpy(RSTRING_PTR(str), buf,
len);
321 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
325rb_debug_rstring_null_ptr(
const char *func)
327 fprintf(stderr,
"%s is returning NULL!! "
328 "SIGSEGV is highly expected to follow immediately.\n"
329 "If you could reproduce, attach your debugger here, "
330 "and look at the passed string.\n",
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
338get_encoding(
VALUE str)
344mustnot_broken(
VALUE str)
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
352mustnot_wchar(
VALUE str)
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
360static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
368BARE_STRING_P(
VALUE str)
373static inline st_index_t
374str_do_hash(
VALUE str)
376 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 if (e && !is_ascii_string(str)) {
385str_store_precomputed_hash(
VALUE str, st_index_t hash)
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
398 FL_SET(str, STR_PRECOMPUTED_HASH);
411 if (
FL_TEST(str, RSTRING_FSTR))
414 bare = BARE_STRING_P(str);
416 if (STR_EMBED_P(str)) {
421 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 rb_str_resize(str, RSTRING_LEN(str));
430 fstr = register_fstring(str,
false,
false);
433 str_replace_shared_without_enc(str, fstr);
/*
 * The table of registered fstrings.  Init_fstring_table() creates it with
 * rb_concurrent_set_new() (initial size argument 8192) and registers its
 * address with the GC; register_fstring() looks strings up in it and
 * rb_gc_free_fstring() removes entries by identity.
 */
440static VALUE fstring_table_obj;
443fstring_concurrent_set_hash(
VALUE str)
445#ifdef PRECOMPUTED_FAKESTR_HASH
449 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
466 const char *aptr, *bptr;
473 return (alen == blen &&
475 memcmp(aptr, bptr, alen) == 0);
480 bool force_precompute_hash;
484fstring_concurrent_set_create(
VALUE str,
void *data)
494 long len = RSTRING_LEN(str);
495 long capa =
len +
sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
525 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
528 if (STR_SHARED_P(str)) {
530 str_make_independent(str);
533 if (!BARE_STRING_P(str)) {
539 RBASIC(str)->flags |= RSTRING_FSTR;
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
559Init_fstring_table(
void)
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
566register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
570 .force_precompute_hash = force_precompute_hash
573#if SIZEOF_VOIDP == SIZEOF_LONG
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593rb_obj_is_fstring_table(
VALUE obj)
597 return obj == fstring_table_obj;
601rb_gc_free_fstring(
VALUE obj)
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
614rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
622setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
625 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
638 return (
VALUE)fake_str;
647 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
656rb_fstring_new(
const char *ptr,
long len)
659 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
666 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
670rb_fstring_cstr(
const char *
ptr)
672 return rb_fstring_new(
ptr, strlen(
ptr));
676single_byte_optimizable(
VALUE str)
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
703static inline const char *
704search_nonascii(
const char *p,
const char *e)
706 const uintptr_t *s, *t;
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
714# error "don't know what to do."
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL
722# error "don't know what to do."
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
734 case 7:
if (p[-7]&0x80)
return p-7;
735 case 6:
if (p[-6]&0x80)
return p-6;
736 case 5:
if (p[-5]&0x80)
return p-5;
737 case 4:
if (p[-4]&0x80)
return p-4;
739 case 3:
if (p[-3]&0x80)
return p-3;
740 case 2:
if (p[-2]&0x80)
return p-2;
741 case 1:
if (p[-1]&0x80)
return p-1;
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
750#define aligned_ptr(value) (uintptr_t *)(value)
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
756 if (*s & NONASCII_MASK) {
757#ifdef WORDS_BIGENDIAN
758 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
760 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
770 case 7:
if (e[-7]&0x80)
return e-7;
771 case 6:
if (e[-6]&0x80)
return e-6;
772 case 5:
if (e[-5]&0x80)
return e-5;
773 case 4:
if (e[-4]&0x80)
return e-4;
775 case 3:
if (e[-3]&0x80)
return e-3;
776 case 2:
if (e[-2]&0x80)
return e-2;
777 case 1:
if (e[-1]&0x80)
return e-1;
785 const char *e = p +
len;
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
789 p = search_nonascii(p, e);
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
797 int ret = rb_enc_precise_mbclen(p, e, enc);
801 p = search_nonascii(p, e);
807 int ret = rb_enc_precise_mbclen(p, e, enc);
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
826 p = search_nonascii(p, e);
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
837 int ret = rb_enc_precise_mbclen(p, e, enc);
844 p = search_nonascii(p, e);
850 int ret = rb_enc_precise_mbclen(p, e, enc);
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
883rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
888 str_enc_copy(dest, src);
889 if (RSTRING_LEN(dest) == 0) {
890 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
901 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
902 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
913rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
915 str_enc_copy(dest, src);
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
928 return enc_coderange_scan(str, enc);
937 cr = enc_coderange_scan(str, get_encoding(str));
944rb_enc_str_asciicompat(
VALUE str)
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
964str_mod_check(
VALUE s,
const char *p,
long len)
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
972str_capacity(
VALUE str,
const int termlen)
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
977 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
981 return RSTRING(str)->as.heap.aux.capa;
988 return str_capacity(str, TERM_LEN(str));
992must_not_null(
const char *
ptr)
995 rb_raise(rb_eArgError,
"NULL pointer given");
1000str_alloc_embed(
VALUE klass,
size_t capa)
1002 size_t size = rb_str_embed_size(
capa);
1006 NEWOBJ_OF(str,
struct RString, klass,
1010 str->as.embed.ary[0] = 0;
1016str_alloc_heap(
VALUE klass)
1018 NEWOBJ_OF(str,
struct RString, klass,
1022 str->as.heap.aux.capa = 0;
1023 str->as.heap.ptr = NULL;
1029empty_str_alloc(
VALUE klass)
1031 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1032 VALUE str = str_alloc_embed(klass, 0);
1033 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1044 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1048 enc = rb_ascii8bit_encoding();
1051 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1053 int termlen = rb_enc_mbminlen(enc);
1055 if (STR_EMBEDDABLE_P(
len, termlen)) {
1056 str = str_alloc_embed(klass,
len + termlen);
1062 str = str_alloc_heap(klass);
1068 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1071 rb_enc_raw_set(str, enc);
1074 memcpy(RSTRING_PTR(str),
ptr,
len);
1077 memset(RSTRING_PTR(str), 0,
len);
1080 STR_SET_LEN(str,
len);
1081 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1088 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1123 __msan_unpoison_string(
ptr);
1143 if (rb_enc_mbminlen(enc) != 1) {
1144 rb_raise(rb_eArgError,
"wchar encoding given");
1146 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1150str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1155 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1159 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1162 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1163 str = str_alloc_heap(klass);
1167 RBASIC(str)->flags |= STR_NOFREE;
1168 rb_enc_associate_index(str, encindex);
1197static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1199 int ecflags,
VALUE ecopts);
1204 int encidx = rb_enc_to_index(enc);
1205 if (rb_enc_get_index(str) == encidx)
1206 return is_ascii_string(str);
1217 if (!to)
return str;
1218 if (!from) from = rb_enc_get(str);
1219 if (from == to)
return str;
1220 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1221 rb_is_ascii8bit_enc(to)) {
1222 if (STR_ENC_GET(str) != to) {
1224 rb_enc_associate(str, to);
1231 from, to, ecflags, ecopts);
1232 if (
NIL_P(newstr)) {
1240rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1245 olen = RSTRING_LEN(newstr);
1246 if (ofs < -olen || olen < ofs)
1248 if (ofs < 0) ofs += olen;
1250 STR_SET_LEN(newstr, ofs);
1254 rb_str_modify(newstr);
1255 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1263 STR_SET_LEN(str, 0);
1264 rb_enc_associate(str, enc);
1270str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1272 int ecflags,
VALUE ecopts)
1277 VALUE econv_wrapper;
1278 const unsigned char *start, *sp;
1279 unsigned char *dest, *dp;
1280 size_t converted_output = (size_t)ofs;
1285 RBASIC_CLEAR_CLASS(econv_wrapper);
1287 if (!ec)
return Qnil;
1290 sp = (
unsigned char*)
ptr;
1292 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1293 (dp = dest + converted_output),
1297 size_t converted_input = sp - start;
1298 size_t rest =
len - converted_input;
1299 converted_output = dp - dest;
1301 if (converted_input && converted_output &&
1302 rest < (LONG_MAX / converted_output)) {
1303 rest = (rest * converted_output) / converted_input;
1308 olen += rest < 2 ? 2 : rest;
1309 rb_str_resize(newstr, olen);
1316 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1318 rb_enc_associate(newstr, to);
1337 const int eidx = rb_enc_to_index(eenc);
1340 return rb_enc_str_new(
ptr,
len, eenc);
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1349 ienc = rb_default_internal_encoding();
1350 if (!ienc || eenc == ienc) {
1351 return rb_enc_str_new(
ptr,
len, eenc);
1355 if ((eidx == rb_ascii8bit_encindex()) ||
1356 (eidx == rb_usascii_encindex()) ||
1357 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1358 return rb_enc_str_new(
ptr,
len, ienc);
1361 str = rb_enc_str_new(NULL, 0, ienc);
1364 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1365 rb_str_initialize(str,
ptr,
len, eenc);
1373 int eidx = rb_enc_to_index(eenc);
1374 if (eidx == rb_usascii_encindex() &&
1375 !is_ascii_string(str)) {
1376 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1379 rb_enc_associate_index(str, eidx);
1438str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1440 const int termlen = TERM_LEN(str);
1445 if (str_embed_capa(str2) >=
len + termlen) {
1446 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1447 STR_SET_EMBED(str2);
1448 memcpy(ptr2, RSTRING_PTR(str),
len);
1449 TERM_FILL(ptr2+
len, termlen);
1453 if (STR_SHARED_P(str)) {
1454 root =
RSTRING(str)->as.heap.aux.shared;
1463 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1465 rb_fatal(
"about to free a possible shared root");
1467 char *ptr2 = STR_HEAP_PTR(str2);
1469 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1472 FL_SET(str2, STR_NOEMBED);
1474 STR_SET_SHARED(str2, root);
1477 STR_SET_LEN(str2,
len);
1485 str_replace_shared_without_enc(str2, str);
1486 rb_enc_cr_str_exact_copy(str2, str);
1493 return str_replace_shared(str_alloc_heap(klass), str);
1510rb_str_new_frozen_String(
VALUE orig)
1518rb_str_frozen_bare_string(
VALUE orig)
1520 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1525rb_str_tmp_frozen_acquire(
VALUE orig)
1528 return str_new_frozen_buffer(0, orig, FALSE);
1532rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1534 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1535 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1537 VALUE str = str_alloc_heap(0);
1540 FL_SET(str, STR_SHARED_ROOT);
1542 size_t capa = str_capacity(orig, TERM_LEN(orig));
1548 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1549 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1556 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1557 RBASIC(orig)->flags &= ~STR_NOFREE;
1558 STR_SET_SHARED(orig, str);
1568rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1573 if (STR_EMBED_P(tmp)) {
1576 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1582 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1586 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1587 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1592 STR_SET_LEN(tmp, 0);
1600 return str_new_frozen_buffer(klass, orig, TRUE);
1609 VALUE str = str_alloc_heap(klass);
1610 STR_SET_LEN(str, RSTRING_LEN(orig));
1611 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1612 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1613 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1614 RBASIC(orig)->flags &= ~STR_NOFREE;
1615 STR_SET_SHARED(orig, str);
1622str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1626 long len = RSTRING_LEN(orig);
1627 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1628 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1630 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1631 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1637 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1638 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1644 if ((ofs > 0) || (rest > 0) ||
1647 str = str_new_shared(klass,
shared);
1649 RSTRING(str)->as.heap.ptr += ofs;
1650 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1658 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1659 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1661 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1662 STR_SET_LEN(str, RSTRING_LEN(orig));
1667 str = heap_str_make_shared(klass, orig);
1671 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1683str_new_empty_String(
VALUE str)
1686 rb_enc_copy(v, str);
/*
 * Minimum buffer capacity in bytes.  The `capacity` keyword handling later
 * in this file raises any smaller requested capacity to this value.
 */
1690#define STR_BUF_MIN_SIZE 63
1695 if (STR_EMBEDDABLE_P(
capa, 1)) {
1703 RSTRING(str)->as.heap.ptr[0] =
'\0';
1723 return str_new(0, 0,
len);
1729 if (STR_EMBED_P(str)) {
1730 RB_DEBUG_COUNTER_INC(obj_str_embed);
1732 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1733 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1734 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1737 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1738 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1743rb_str_memsize(
VALUE str)
1745 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1746 return STR_HEAP_SIZE(str);
1756 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1759static inline void str_discard(
VALUE str);
1760static void str_shared_replace(
VALUE str,
VALUE str2);
1765 if (str != str2) str_shared_replace(str, str2);
1776 enc = STR_ENC_GET(str2);
1779 termlen = rb_enc_mbminlen(enc);
1781 STR_SET_LEN(str, RSTRING_LEN(str2));
1783 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1785 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1786 rb_enc_associate(str, enc);
1790 if (STR_EMBED_P(str2)) {
1792 long len = RSTRING_LEN(str2);
1795 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1796 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1797 RSTRING(str2)->as.heap.ptr = new_ptr;
1798 STR_SET_LEN(str2,
len);
1800 STR_SET_NOEMBED(str2);
1803 STR_SET_NOEMBED(str);
1805 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1807 if (
FL_TEST(str2, STR_SHARED)) {
1809 STR_SET_SHARED(str,
shared);
1812 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1816 STR_SET_EMBED(str2);
1817 RSTRING_PTR(str2)[0] = 0;
1818 STR_SET_LEN(str2, 0);
1819 rb_enc_associate(str, enc);
1833 return rb_obj_as_string_result(str, obj);
1849 len = RSTRING_LEN(str2);
1850 if (STR_SHARED_P(str2)) {
1853 STR_SET_NOEMBED(str);
1854 STR_SET_LEN(str,
len);
1855 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1856 STR_SET_SHARED(str,
shared);
1857 rb_enc_cr_str_exact_copy(str, str2);
1860 str_replace_shared(str, str2);
1869 size_t size = rb_str_embed_size(
capa);
1873 NEWOBJ_OF(str,
struct RString, klass,
1884 NEWOBJ_OF(str,
struct RString, klass,
1887 str->as.heap.aux.capa = 0;
1888 str->as.heap.ptr = NULL;
1898 encidx = rb_enc_get_index(str);
1899 flags &= ~ENCODING_MASK;
1902 if (encidx) rb_enc_associate_index(dup, encidx);
1912 long len = RSTRING_LEN(str);
1917 STR_SET_LEN(dup, RSTRING_LEN(str));
1918 return str_duplicate_setup_encoding(str, dup, flags);
1927 root =
RSTRING(str)->as.heap.aux.shared;
1930 root = str = str_new_frozen(klass, str);
1936 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1937 FL_SET(root, STR_SHARED_ROOT);
1939 flags |= RSTRING_NOEMBED | STR_SHARED;
1941 STR_SET_LEN(dup, RSTRING_LEN(str));
1942 return str_duplicate_setup_encoding(str, dup, flags);
1948 if (STR_EMBED_P(str)) {
1949 return str_duplicate_setup_embed(klass, str, dup);
1952 return str_duplicate_setup_heap(klass, str, dup);
1960 if (STR_EMBED_P(str)) {
1961 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1964 dup = str_alloc_heap(klass);
1967 return str_duplicate_setup(klass, str, dup);
1978rb_str_dup_m(
VALUE str)
1980 if (LIKELY(BARE_STRING_P(str))) {
1991 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1998 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2002 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2003 str_duplicate_setup_embed(klass, str, new_str);
2006 new_str = ec_str_alloc_heap(ec, klass);
2007 str_duplicate_setup_heap(klass, str, new_str);
2016rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2018 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2042 static ID keyword_ids[2];
2043 VALUE orig, opt, venc, vcapa;
2048 if (!keyword_ids[0]) {
2049 keyword_ids[0] = rb_id_encoding();
2050 CONST_ID(keyword_ids[1],
"capacity");
2058 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2059 enc = rb_to_encoding(venc);
2061 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2064 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2066 if (
capa < STR_BUF_MIN_SIZE) {
2067 capa = STR_BUF_MIN_SIZE;
2071 len = RSTRING_LEN(orig);
2075 if (orig == str) n = 0;
2077 str_modifiable(str);
2078 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2080 const size_t size = (size_t)
capa + termlen;
2081 const char *
const old_ptr = RSTRING_PTR(str);
2082 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2083 char *new_ptr =
ALLOC_N(
char, size);
2084 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2085 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2087 RSTRING(str)->as.heap.ptr = new_ptr;
2089 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2090 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2091 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2093 STR_SET_LEN(str,
len);
2096 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2097 rb_enc_cr_str_exact_copy(str, orig);
2099 FL_SET(str, STR_NOEMBED);
2106 rb_enc_associate(str, enc);
2118rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2124 static ID keyword_ids[2];
2134 keyword_ids[0] = rb_id_encoding();
2135 CONST_ID(keyword_ids[1],
"capacity");
2137 encoding = kwargs[0];
2138 capacity = kwargs[1];
2147 if (UNDEF_P(encoding)) {
2149 encoding = rb_obj_encoding(orig);
2153 if (!UNDEF_P(encoding)) {
2154 enc = rb_to_encoding(encoding);
2158 if (UNDEF_P(capacity)) {
2160 VALUE empty_str = str_new(klass,
"", 0);
2162 rb_enc_associate(empty_str, enc);
2166 VALUE copy = str_duplicate(klass, orig);
2167 rb_enc_associate(copy, enc);
2180 if (orig_capa >
capa) {
2185 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2186 STR_SET_LEN(str, 0);
/*
 * Non-zero unless the top two bits of `c` are 10, i.e. unless `c` is a
 * UTF-8 continuation byte (0x80..0xBF).  ASCII bytes therefore count as
 * lead bytes too, which is what enc_strlen() relies on when it counts one
 * character per lead byte.
 */
2197#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2212static inline uintptr_t
2213count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2218 d = (d>>6) | (~d>>7);
2219 d &= NONASCII_MASK >> 7;
2222#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2224 return rb_popcount_intptr(d);
2228# if SIZEOF_VOIDP == 8
2237enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2243 long diff = (long)(e - p);
2244 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2249 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2250 const uintptr_t *s, *t;
2251 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2252 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2253 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2254 while (p < (
const char *)s) {
2255 if (is_utf8_lead_byte(*p))
len++;
2259 len += count_utf8_lead_bytes_with_word(s);
2262 p = (
const char *)s;
2265 if (is_utf8_lead_byte(*p))
len++;
2271 else if (rb_enc_asciicompat(enc)) {
2276 q = search_nonascii(p, e);
2282 p += rb_enc_fast_mbclen(p, e, enc);
2289 q = search_nonascii(p, e);
2295 p += rb_enc_mbclen(p, e, enc);
2302 for (c=0; p<e; c++) {
2303 p += rb_enc_mbclen(p, e, enc);
2318rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2326 long diff = (long)(e - p);
2327 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2329 else if (rb_enc_asciicompat(enc)) {
2333 q = search_nonascii(p, e);
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2356 for (c=0; p<e; c++) {
2357 ret = rb_enc_precise_mbclen(p, e, enc);
2364 if (p + rb_enc_mbminlen(enc) <= e)
2365 p += rb_enc_mbminlen(enc);
2381 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2382 if (!enc) enc = STR_ENC_GET(str);
2383 p = RSTRING_PTR(str);
2388 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2393 return enc_strlen(p, e, enc, cr);
2400 return str_strlen(str, NULL);
2414 return LONG2NUM(str_strlen(str, NULL));
2426rb_str_bytesize(
VALUE str)
2445rb_str_empty(
VALUE str)
2447 return RBOOL(RSTRING_LEN(str) == 0);
2466 char *ptr1, *ptr2, *ptr3;
2471 enc = rb_enc_check_str(str1, str2);
2474 termlen = rb_enc_mbminlen(enc);
2475 if (len1 > LONG_MAX - len2) {
2476 rb_raise(rb_eArgError,
"string size too big");
2478 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2479 ptr3 = RSTRING_PTR(str3);
2480 memcpy(ptr3, ptr1, len1);
2481 memcpy(ptr3+len1, ptr2, len2);
2482 TERM_FILL(&ptr3[len1+len2], termlen);
2498 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2501 int enc1 = rb_enc_get_index(str1);
2502 int enc2 = rb_enc_get_index(str2);
2507 else if (enc2 < 0) {
2510 else if (enc1 != enc2) {
2513 else if (len1 > LONG_MAX - len2) {
2547 rb_enc_copy(str2, str);
2552 rb_raise(rb_eArgError,
"negative argument");
2554 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2555 if (STR_EMBEDDABLE_P(
len, 1)) {
2557 memset(RSTRING_PTR(str2), 0,
len + 1);
2564 STR_SET_LEN(str2,
len);
2565 rb_enc_copy(str2, str);
2568 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2569 rb_raise(rb_eArgError,
"argument too big");
2572 len *= RSTRING_LEN(str);
2573 termlen = TERM_LEN(str);
2575 ptr2 = RSTRING_PTR(str2);
2577 n = RSTRING_LEN(str);
2578 memcpy(ptr2, RSTRING_PTR(str), n);
2579 while (n <=
len/2) {
2580 memcpy(ptr2 + n, ptr2, n);
2583 memcpy(ptr2 + n, ptr2,
len-n);
2585 STR_SET_LEN(str2,
len);
2586 TERM_FILL(&ptr2[
len], termlen);
2587 rb_enc_cr_str_copy_for_substr(str2, str);
2624rb_check_lockedtmp(
VALUE str)
2626 if (
FL_TEST(str, STR_TMPLOCK)) {
/*
 * Flags that stop a string from being modified as-is: frozen, temporarily
 * locked, or chilled.  str_modifiable() tests them all at once and only
 * runs the individual checks when at least one bit is set.
 */
2633#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2635str_modifiable(
VALUE str)
2639 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2640 if (CHILLED_STRING_P(str)) {
2641 CHILLED_STRING_MUTATED(str);
2643 rb_check_lockedtmp(str);
2644 rb_check_frozen(str);
2649str_dependent_p(
VALUE str)
2651 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/*
 * STR_UNMODIFIABLE_MASK plus the flags of a buffer this string does not own
 * outright (shared or no-free).  str_independent() tests these in one go and
 * only then calls str_modifiable() and str_dependent_p().
 */
2661#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2663str_independent(
VALUE str)
2667 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2668 str_modifiable(str);
2669 return !str_dependent_p(str);
2675str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2685 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2690 STR_SET_LEN(str,
len);
2695 oldptr = RSTRING_PTR(str);
2697 memcpy(
ptr, oldptr,
len);
2699 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2702 STR_SET_NOEMBED(str);
2703 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2704 TERM_FILL(
ptr +
len, termlen);
2706 STR_SET_LEN(str,
len);
2713 if (!str_independent(str))
2714 str_make_independent(str);
2723 int termlen = TERM_LEN(str);
2724 long len = RSTRING_LEN(str);
2727 rb_raise(rb_eArgError,
"negative expanding string size");
2729 if (expand >= LONG_MAX -
len) {
2730 rb_raise(rb_eArgError,
"string size too big");
2733 if (!str_independent(str)) {
2734 str_make_independent_expand(str,
len, expand, termlen);
2736 else if (expand > 0) {
2737 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2744str_modify_keep_cr(
VALUE str)
2746 if (!str_independent(str))
2747 str_make_independent(str);
2754str_discard(
VALUE str)
2756 str_modifiable(str);
2757 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2758 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2759 RSTRING(str)->as.heap.ptr = 0;
2760 STR_SET_LEN(str, 0);
2767 int encindex = rb_enc_get_index(str);
2769 if (RB_UNLIKELY(encindex == -1)) {
2773 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2778 if (!rb_enc_asciicompat(enc)) {
2800 return RSTRING_PTR(str);
2804zero_filled(
const char *s,
int n)
2806 for (; n > 0; --n) {
2813str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2815 const char *e = s +
len;
2817 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2818 if (zero_filled(s, minlen))
return s;
2824str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2829 if (str_dependent_p(str)) {
2830 if (!zero_filled(s +
len, termlen))
2831 str_make_independent_expand(str,
len, 0L, termlen);
2834 TERM_FILL(s +
len, termlen);
2837 return RSTRING_PTR(str);
2841rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2843 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2844 long len = RSTRING_LEN(str);
2848 rb_check_lockedtmp(str);
2849 str_make_independent_expand(str,
len, 0L, termlen);
2851 else if (str_dependent_p(str)) {
2852 if (termlen > oldtermlen)
2853 str_make_independent_expand(str,
len, 0L, termlen);
2856 if (!STR_EMBED_P(str)) {
2861 if (termlen > oldtermlen) {
2862 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2870str_null_check(
VALUE str,
int *w)
2872 char *s = RSTRING_PTR(str);
2873 long len = RSTRING_LEN(str);
2875 const int minlen = rb_enc_mbminlen(enc);
2879 if (str_null_char(s,
len, minlen, enc)) {
2882 return str_fill_term(str, s,
len, minlen);
2885 if (!s || memchr(s, 0,
len)) {
2889 s = str_fill_term(str, s,
len, minlen);
2895rb_str_to_cstr(
VALUE str)
2898 return str_null_check(str, &w);
2906 char *s = str_null_check(str, &w);
2909 rb_raise(rb_eArgError,
"string contains null char");
2911 rb_raise(rb_eArgError,
"string contains null byte");
2917rb_str_fill_terminator(
VALUE str,
const int newminlen)
2919 char *s = RSTRING_PTR(str);
2920 long len = RSTRING_LEN(str);
2921 return str_fill_term(str, s,
len, newminlen);
2927 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2953str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2962 else if (rb_enc_asciicompat(enc)) {
2963 const char *p2, *e2;
2966 while (p < e && 0 < nth) {
2973 p2 = search_nonascii(p, e2);
2982 n = rb_enc_mbclen(p, e, enc);
2993 while (p < e && nth--) {
2994 p += rb_enc_mbclen(p, e, enc);
3005 return str_nth_len(p, e, &nth, enc);
3009str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3014 p = str_nth_len(p, e, &nth, enc);
3023str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3025 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3026 if (!pp)
return e - p;
3033 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3034 STR_ENC_GET(str), single_byte_optimizable(str));
3039str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3042 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3043 const uintptr_t *s, *t;
3044 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3045 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3046 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3047 while (p < (
const char *)s) {
3048 if (is_utf8_lead_byte(*p)) nth--;
3052 nth -= count_utf8_lead_bytes_with_word(s);
3054 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3058 if (is_utf8_lead_byte(*p)) {
3059 if (nth == 0)
break;
3069str_utf8_offset(
const char *p,
const char *e,
long nth)
3071 const char *pp = str_utf8_nth(p, e, &nth);
3080 if (single_byte_optimizable(str) || pos < 0)
3083 char *p = RSTRING_PTR(str);
3084 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3089str_subseq(
VALUE str,
long beg,
long len)
3097 const int termlen = TERM_LEN(str);
3098 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3105 if (str_embed_capa(str2) >=
len + termlen) {
3106 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3107 STR_SET_EMBED(str2);
3108 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3109 TERM_FILL(ptr2+
len, termlen);
3111 STR_SET_LEN(str2,
len);
3115 str_replace_shared(str2, str);
3118 RSTRING(str2)->as.heap.ptr += beg;
3119 if (RSTRING_LEN(str2) >
len) {
3120 STR_SET_LEN(str2,
len);
3130 VALUE str2 = str_subseq(str, beg,
len);
3131 rb_enc_cr_str_copy_for_substr(str2, str);
3140 const long blen = RSTRING_LEN(str);
3142 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3144 if (
len < 0)
return 0;
3145 if (beg < 0 && -beg < 0)
return 0;
3149 if (single_byte_optimizable(str)) {
3150 if (beg > blen)
return 0;
3153 if (beg < 0)
return 0;
3155 if (
len > blen - beg)
3157 if (
len < 0)
return 0;
3162 if (
len > -beg)
len = -beg;
3166 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3169 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3175 slen = str_strlen(str, enc);
3177 if (beg < 0)
return 0;
3179 if (
len == 0)
goto end;
3182 else if (beg > 0 && beg > blen) {
3186 if (beg > str_strlen(str, enc))
return 0;
3191 enc == rb_utf8_encoding()) {
3192 p = str_utf8_nth(s, e, &beg);
3193 if (beg > 0)
return 0;
3194 len = str_utf8_offset(p, e,
len);
3200 p = s + beg * char_sz;
3204 else if (
len * char_sz > e - p)
3209 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3210 if (beg > 0)
return 0;
3214 len = str_offset(p, e,
len, enc, 0);
3222static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3227 return str_substr(str, beg,
len, TRUE);
3237str_substr(
VALUE str,
long beg,
long len,
int empty)
3241 if (!p)
return Qnil;
3242 if (!
len && !empty)
return Qnil;
3244 beg = p - RSTRING_PTR(str);
3246 VALUE str2 = str_subseq(str, beg,
len);
3247 rb_enc_cr_str_copy_for_substr(str2, str);
3255 if (CHILLED_STRING_P(str)) {
3260 rb_str_resize(str, RSTRING_LEN(str));
3278 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3321str_uminus(
VALUE str)
3326 return rb_fstring(str);
3330#define rb_str_dup_frozen rb_str_new_frozen
3335 rb_check_frozen(str);
3336 if (
FL_TEST(str, STR_TMPLOCK)) {
3339 FL_SET(str, STR_TMPLOCK);
3346 rb_check_frozen(str);
3347 if (!
FL_TEST(str, STR_TMPLOCK)) {
3367 const int termlen = TERM_LEN(str);
3369 str_modifiable(str);
3370 if (STR_SHARED_P(str)) {
3373 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3374 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3385 else if (
len > RSTRING_LEN(str)) {
3389 const char *
const new_end = RSTRING_PTR(str) +
len;
3399 else if (
len < RSTRING_LEN(str)) {
3407 STR_SET_LEN(str,
len);
3408 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3415 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3418 int independent = str_independent(str);
3419 long slen = RSTRING_LEN(str);
3420 const int termlen = TERM_LEN(str);
3422 if (slen >
len || (termlen != 1 && slen <
len)) {
3428 if (STR_EMBED_P(str)) {
3429 if (
len == slen)
return str;
3430 if (str_embed_capa(str) >=
len + termlen) {
3431 STR_SET_LEN(str,
len);
3435 str_make_independent_expand(str, slen,
len - slen, termlen);
3437 else if (str_embed_capa(str) >=
len + termlen) {
3438 char *
ptr = STR_HEAP_PTR(str);
3440 if (slen >
len) slen =
len;
3443 STR_SET_LEN(str,
len);
3444 if (independent) ruby_xfree(
ptr);
3447 else if (!independent) {
3448 if (
len == slen)
return str;
3449 str_make_independent_expand(str, slen,
len - slen, termlen);
3453 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3454 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3457 else if (
len == slen)
return str;
3458 STR_SET_LEN(str,
len);
3465str_ensure_available_capa(
VALUE str,
long len)
3467 str_modify_keep_cr(str);
3469 const int termlen = TERM_LEN(str);
3470 long olen = RSTRING_LEN(str);
3472 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3473 rb_raise(rb_eArgError,
"string sizes too big");
3476 long total = olen +
len;
3477 long capa = str_capacity(str, termlen);
3480 if (total >= LONG_MAX / 2) {
3483 while (total >
capa) {
3486 RESIZE_CAPA_TERM(str,
capa, termlen);
3491str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3494 str_modify_keep_cr(str);
3499 if (
len == 0)
return 0;
3501 long total, olen,
off = -1;
3503 const int termlen = TERM_LEN(str);
3506 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3510 long capa = str_capacity(str, termlen);
3512 if (olen > LONG_MAX -
len) {
3513 rb_raise(rb_eArgError,
"string sizes too big");
3517 if (total >= LONG_MAX / 2) {
3520 while (total >
capa) {
3523 RESIZE_CAPA_TERM(str,
capa, termlen);
3524 sptr = RSTRING_PTR(str);
3529 memcpy(sptr + olen,
ptr,
len);
3530 STR_SET_LEN(str, total);
3531 TERM_FILL(sptr + total, termlen);
3536#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3537#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3542 if (
len == 0)
return str;
3544 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3546 return str_buf_cat(str,
ptr,
len);
3557rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3562 if (UNLIKELY(!str_independent(str))) {
3563 str_make_independent(str);
3566 long string_length = -1;
3567 const int null_terminator_length = 1;
3572 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3573 rb_raise(rb_eArgError,
"string sizes too big");
3576 long string_capacity = str_capacity(str, null_terminator_length);
3582 if (LIKELY(string_capacity >= string_length + 1)) {
3584 sptr[string_length] = byte;
3585 STR_SET_LEN(str, string_length + 1);
3586 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3590 str_buf_cat(str, (
char *)&
byte, 1);
3606 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3617rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3618 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3627 if (str_encindex == ptr_encindex) {
3629 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3633 str_enc = rb_enc_from_index(str_encindex);
3634 ptr_enc = rb_enc_from_index(ptr_encindex);
3635 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3638 if (RSTRING_LEN(str) == 0) {
3641 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3647 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3656 *ptr_cr_ret = ptr_cr;
3658 if (str_encindex != ptr_encindex &&
3661 str_enc = rb_enc_from_index(str_encindex);
3662 ptr_enc = rb_enc_from_index(ptr_encindex);
3667 res_encindex = str_encindex;
3672 res_encindex = str_encindex;
3676 res_encindex = ptr_encindex;
3681 res_encindex = str_encindex;
3688 res_encindex = str_encindex;
3694 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3696 str_buf_cat(str,
ptr,
len);
3702 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3709 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3719 if (rb_enc_asciicompat(enc)) {
3720 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3726 unsigned int c = (
unsigned char)*
ptr;
3727 int len = rb_enc_codelen(c, enc);
3728 rb_enc_mbcput(c, buf, enc);
3729 rb_enc_cr_str_buf_cat(str, buf,
len,
3742 if (str_enc_fastpath(str)) {
3746 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3752 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3763 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3779rb_str_concat_literals(
size_t num,
const VALUE *strary)
3783 unsigned long len = 1;
3788 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3790 str_enc_copy_direct(str, strary[0]);
3792 for (i = s; i < num; ++i) {
3793 const VALUE v = strary[i];
3797 if (encidx != ENCINDEX_US_ASCII) {
3799 rb_enc_set_index(str, encidx);
3812rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3814 str_modifiable(str);
3819 else if (argc > 1) {
3822 rb_enc_copy(arg_str, str);
3823 for (i = 0; i < argc; i++) {
3858rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3860 long needed_capacity = 0;
3864 for (
int index = 0; index < argc; index++) {
3865 VALUE obj = argv[index];
3873 needed_capacity += RSTRING_LEN(obj);
3878 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3885 str_ensure_available_capa(str, needed_capacity);
3888 for (
int index = 0; index < argc; index++) {
3889 VALUE obj = argv[index];
3894 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3895 char byte = (char)(
NUM2INT(obj) & 0xFF);
3909 rb_bug(
"append_as_bytes arguments should have been validated");
3913 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3914 TERM_FILL(sptr, TERM_LEN(str));
3919 for (
int index = 0; index < argc; index++) {
3920 VALUE obj = argv[index];
3937 rb_bug(
"append_as_bytes arguments should have been validated");
4016 if (rb_num_to_uint(str2, &code) == 0) {
4029 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4032 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4035 long pos = RSTRING_LEN(str1);
4040 switch (
len = rb_enc_codelen(code, enc)) {
4041 case ONIGERR_INVALID_CODE_POINT_VALUE:
4042 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4044 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4050 rb_enc_mbcput(code, buf, enc);
4051 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4052 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4054 rb_str_resize(str1, pos+
len);
4055 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4068rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4070 int encidx = rb_enc_to_index(enc);
4072 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4077 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4078 return ENCINDEX_ASCII_8BIT;
4101rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4103 str_modifiable(str);
4108 else if (argc > 1) {
4111 rb_enc_copy(arg_str, str);
4112 for (i = 0; i < argc; i++) {
4125 st_index_t precomputed_hash;
4126 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4128 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4129 return precomputed_hash;
4132 return str_do_hash(str);
4139 const char *ptr1, *ptr2;
4142 return (len1 != len2 ||
4144 memcmp(ptr1, ptr2, len1) != 0);
4156rb_str_hash_m(
VALUE str)
4162#define lesser(a,b) (((a)>(b))?(b):(a))
4170 if (RSTRING_LEN(str1) == 0)
return TRUE;
4171 if (RSTRING_LEN(str2) == 0)
return TRUE;
4174 if (idx1 == idx2)
return TRUE;
4179 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4183 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4193 const char *ptr1, *ptr2;
4196 if (str1 == str2)
return 0;
4199 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4208 if (len1 > len2)
return 1;
4211 if (retval > 0)
return 1;
4245 if (str1 == str2)
return Qtrue;
4252 return rb_str_eql_internal(str1, str2);
4266 if (str1 == str2)
return Qtrue;
4268 return rb_str_eql_internal(str1, str2);
4300 return rb_invcmp(str1, str2);
4342 return str_casecmp(str1, s);
4350 const char *p1, *p1end, *p2, *p2end;
4352 enc = rb_enc_compatible(str1, str2);
4357 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4358 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4359 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4360 while (p1 < p1end && p2 < p2end) {
4362 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4363 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4365 return INT2FIX(c1 < c2 ? -1 : 1);
4372 while (p1 < p1end && p2 < p2end) {
4373 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4374 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4376 if (0 <= c1 && 0 <= c2) {
4380 return INT2FIX(c1 < c2 ? -1 : 1);
4384 l1 = rb_enc_mbclen(p1, p1end, enc);
4385 l2 = rb_enc_mbclen(p2, p2end, enc);
4386 len = l1 < l2 ? l1 : l2;
4387 r = memcmp(p1, p2,
len);
4389 return INT2FIX(r < 0 ? -1 : 1);
4391 return INT2FIX(l1 < l2 ? -1 : 1);
4397 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4398 if (p1 == p1end)
return INT2FIX(-1);
4431 return str_casecmp_p(str1, s);
4438 VALUE folded_str1, folded_str2;
4439 VALUE fold_opt = sym_fold;
4441 enc = rb_enc_compatible(str1, str2);
4446 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4447 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4449 return rb_str_eql(folded_str1, folded_str2);
4453strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4454 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4456 const char *search_start = str_ptr;
4457 long pos, search_len = str_len - offset;
4461 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4462 if (pos < 0)
return pos;
4464 if (t == search_start + pos)
break;
4465 search_len -= t - search_start;
4466 if (search_len <= 0)
return -1;
4467 offset += t - search_start;
4470 return pos + offset;
4474#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4475#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4478rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4480 const char *str_ptr, *str_ptr_end, *sub_ptr;
4481 long str_len, sub_len;
4484 enc = rb_enc_check(str, sub);
4485 if (is_broken_string(sub))
return -1;
4487 str_ptr = RSTRING_PTR(str);
4489 str_len = RSTRING_LEN(str);
4490 sub_ptr = RSTRING_PTR(sub);
4491 sub_len = RSTRING_LEN(sub);
4493 if (str_len < sub_len)
return -1;
4496 long str_len_char, sub_len_char;
4497 int single_byte = single_byte_optimizable(str);
4498 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4499 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4501 offset += str_len_char;
4502 if (offset < 0)
return -1;
4504 if (str_len_char - offset < sub_len_char)
return -1;
4505 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4508 if (sub_len == 0)
return offset;
4511 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4524rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4531 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4532 long slen = str_strlen(str, enc);
4534 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4546 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4547 enc, single_byte_optimizable(str));
4558 pos = rb_str_index(str, sub, pos);
4572str_ensure_byte_pos(
VALUE str,
long pos)
4574 if (!single_byte_optimizable(str)) {
4575 const char *s = RSTRING_PTR(str);
4577 const char *p = s + pos;
4578 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4580 "offset %ld does not land on character boundary", pos);
4653rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4659 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4660 long slen = RSTRING_LEN(str);
4662 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4673 str_ensure_byte_pos(str, pos);
4685 pos = rb_str_byteindex(str, sub, pos);
4686 if (pos >= 0)
return LONG2NUM(pos);
4693memrchr(
const char *search_str,
int chr,
long search_len)
4695 const char *ptr = search_str + search_len;
4696 while (ptr > search_str) {
4697 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4707 char *hit, *adjusted;
4709 long slen, searchlen;
4712 sbeg = RSTRING_PTR(str);
4713 slen = RSTRING_LEN(sub);
4714 if (slen == 0)
return s - sbeg;
4716 t = RSTRING_PTR(sub);
4718 searchlen = s - sbeg + 1;
4720 if (memcmp(s, t, slen) == 0) {
4725 hit = memrchr(sbeg, c, searchlen);
4728 if (hit != adjusted) {
4729 searchlen = adjusted - sbeg;
4732 if (memcmp(hit, t, slen) == 0)
4734 searchlen = adjusted - sbeg;
4735 }
while (searchlen > 0);
4749 enc = rb_enc_check(str, sub);
4750 if (is_broken_string(sub))
return -1;
4751 singlebyte = single_byte_optimizable(str);
4752 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4753 slen = str_strlen(sub, enc);
4756 if (
len < slen)
return -1;
4757 if (
len - pos < slen) pos =
len - slen;
4758 if (
len == 0)
return pos;
4760 sbeg = RSTRING_PTR(str);
4763 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4769 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4770 return str_rindex(str, sub, s, enc);
4831rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4836 long pos,
len = str_strlen(str, enc);
4838 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4840 if (pos < 0 && (pos +=
len) < 0) {
4846 if (pos >
len) pos =
len;
4854 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4855 enc, single_byte_optimizable(str));
4866 pos = rb_str_rindex(str, sub, pos);
4876rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4882 enc = rb_enc_check(str, sub);
4883 if (is_broken_string(sub))
return -1;
4884 len = RSTRING_LEN(str);
4885 slen = RSTRING_LEN(sub);
4888 if (
len < slen)
return -1;
4889 if (
len - pos < slen) pos =
len - slen;
4890 if (
len == 0)
return pos;
4892 sbeg = RSTRING_PTR(str);
4895 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4902 return str_rindex(str, sub, s, enc);
4992rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4996 long pos,
len = RSTRING_LEN(str);
4998 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
5000 if (pos < 0 && (pos +=
len) < 0) {
5006 if (pos >
len) pos =
len;
5012 str_ensure_byte_pos(str, pos);
5024 pos = rb_str_byterindex(str, sub, pos);
5025 if (pos >= 0)
return LONG2NUM(pos);
5064 switch (OBJ_BUILTIN_TYPE(y)) {
5118rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5125 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5156rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5160 re = get_pat(argv[0]);
5161 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5170static enum neighbor_char
5176 if (rb_enc_mbminlen(enc) > 1) {
5178 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5180 return NEIGHBOR_NOT_CHAR;
5182 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5184 if (!l)
return NEIGHBOR_NOT_CHAR;
5185 if (l !=
len)
return NEIGHBOR_WRAPPED;
5186 rb_enc_mbcput(c, p, enc);
5187 r = rb_enc_precise_mbclen(p, p +
len, enc);
5189 return NEIGHBOR_NOT_CHAR;
5191 return NEIGHBOR_FOUND;
5194 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5197 return NEIGHBOR_WRAPPED;
5198 ++((
unsigned char*)p)[i];
5199 l = rb_enc_precise_mbclen(p, p+
len, enc);
5203 return NEIGHBOR_FOUND;
5206 memset(p+l, 0xff,
len-l);
5212 for (len2 =
len-1; 0 < len2; len2--) {
5213 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5217 memset(p+len2+1, 0xff,
len-(len2+1));
5222static enum neighbor_char
5227 if (rb_enc_mbminlen(enc) > 1) {
5229 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5231 return NEIGHBOR_NOT_CHAR;
5233 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5234 if (!c)
return NEIGHBOR_NOT_CHAR;
5237 if (!l)
return NEIGHBOR_NOT_CHAR;
5238 if (l !=
len)
return NEIGHBOR_WRAPPED;
5239 rb_enc_mbcput(c, p, enc);
5240 r = rb_enc_precise_mbclen(p, p +
len, enc);
5242 return NEIGHBOR_NOT_CHAR;
5244 return NEIGHBOR_FOUND;
5247 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5250 return NEIGHBOR_WRAPPED;
5251 --((
unsigned char*)p)[i];
5252 l = rb_enc_precise_mbclen(p, p+
len, enc);
5256 return NEIGHBOR_FOUND;
5259 memset(p+l, 0,
len-l);
5265 for (len2 =
len-1; 0 < len2; len2--) {
5266 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5270 memset(p+len2+1, 0,
len-(len2+1));
5284static enum neighbor_char
5285enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5287 enum neighbor_char ret;
5291 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5295 const int max_gaps = 1;
5297 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5299 ctype = ONIGENC_CTYPE_DIGIT;
5301 ctype = ONIGENC_CTYPE_ALPHA;
5303 return NEIGHBOR_NOT_CHAR;
5306 for (
try = 0;
try <= max_gaps; ++
try) {
5307 ret = enc_succ_char(p,
len, enc);
5308 if (ret == NEIGHBOR_FOUND) {
5309 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5311 return NEIGHBOR_FOUND;
5318 ret = enc_pred_char(p,
len, enc);
5319 if (ret == NEIGHBOR_FOUND) {
5320 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5333 return NEIGHBOR_NOT_CHAR;
5336 if (ctype != ONIGENC_CTYPE_DIGIT) {
5338 return NEIGHBOR_WRAPPED;
5342 enc_succ_char(carry,
len, enc);
5343 return NEIGHBOR_WRAPPED;
5411 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5412 rb_enc_cr_str_copy_for_substr(str, orig);
5413 return str_succ(str);
5420 char *sbeg, *s, *e, *last_alnum = 0;
5421 int found_alnum = 0;
5423 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5424 long carry_pos = 0, carry_len = 1;
5425 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5427 slen = RSTRING_LEN(str);
5428 if (slen == 0)
return str;
5430 enc = STR_ENC_GET(str);
5431 sbeg = RSTRING_PTR(str);
5432 s = e = sbeg + slen;
5434 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5435 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5441 l = rb_enc_precise_mbclen(s, e, enc);
5442 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5443 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5444 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5446 case NEIGHBOR_NOT_CHAR:
5448 case NEIGHBOR_FOUND:
5450 case NEIGHBOR_WRAPPED:
5455 carry_pos = s - sbeg;
5460 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5461 enum neighbor_char neighbor;
5462 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5463 l = rb_enc_precise_mbclen(s, e, enc);
5464 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5465 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5467 neighbor = enc_succ_char(tmp, l, enc);
5469 case NEIGHBOR_FOUND:
5473 case NEIGHBOR_WRAPPED:
5476 case NEIGHBOR_NOT_CHAR:
5479 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5481 enc_succ_char(s, l, enc);
5483 if (!rb_enc_asciicompat(enc)) {
5484 MEMCPY(carry, s,
char, l);
5487 carry_pos = s - sbeg;
5491 RESIZE_CAPA(str, slen + carry_len);
5492 sbeg = RSTRING_PTR(str);
5493 s = sbeg + carry_pos;
5494 memmove(s + carry_len, s, slen - carry_pos);
5495 memmove(s, carry, carry_len);
5497 STR_SET_LEN(str, slen);
5498 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5512rb_str_succ_bang(
VALUE str)
5520all_digits_p(
const char *s,
long len)
5574 VALUE end, exclusive;
5578 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5584 VALUE current, after_end;
5591 enc = rb_enc_check(beg, end);
5592 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5594 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5595 char c = RSTRING_PTR(beg)[0];
5596 char e = RSTRING_PTR(end)[0];
5598 if (c > e || (excl && c == e))
return beg;
5600 VALUE str = rb_enc_str_new(&c, 1, enc);
5602 if ((*each)(str, arg))
break;
5603 if (!excl && c == e)
break;
5605 if (excl && c == e)
break;
5610 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5611 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5612 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5617 b = rb_str_to_inum(beg, 10, FALSE);
5618 e = rb_str_to_inum(end, 10, FALSE);
5625 if (excl && bi == ei)
break;
5626 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5631 ID op = excl ?
'<' : idLE;
5632 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5637 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5638 b = rb_funcallv(b, succ, 0, 0);
5645 if (n > 0 || (excl && n == 0))
return beg;
5647 after_end = rb_funcallv(end, succ, 0, 0);
5652 next = rb_funcallv(current, succ, 0, 0);
5653 if ((*each)(current, arg))
break;
5654 if (
NIL_P(next))
break;
5658 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5673 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5674 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5675 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5677 b = rb_str_to_inum(beg, 10, FALSE);
5683 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5691 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5692 b = rb_funcallv(b, succ, 0, 0);
5698 VALUE next = rb_funcallv(current, succ, 0, 0);
5699 if ((*each)(current, arg))
break;
5702 if (RSTRING_LEN(current) == 0)
5713 if (!
rb_equal(str, *argp))
return 0;
5727 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5728 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5729 rb_enc_asciicompat(STR_ENC_GET(val))) {
5730 const char *bp = RSTRING_PTR(beg);
5731 const char *ep = RSTRING_PTR(end);
5732 const char *vp = RSTRING_PTR(val);
5733 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5734 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5742 if (b <= v && v < e)
return Qtrue;
5743 return RBOOL(!
RTEST(exclusive) && v == e);
5750 all_digits_p(bp, RSTRING_LEN(beg)) &&
5751 all_digits_p(ep, RSTRING_LEN(end))) {
5756 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5758 return RBOOL(
NIL_P(val));
5781 return rb_str_subpat(str, indx,
INT2FIX(0));
5784 if (rb_str_index(str, indx, 0) != -1)
5790 long beg,
len = str_strlen(str, NULL);
5802 return str_substr(str, idx, 1, FALSE);
5821rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5825 return rb_str_subpat(str, argv[0], argv[1]);
5828 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5832 return rb_str_aref(str, argv[0]);
5838 char *ptr = RSTRING_PTR(str);
5839 long olen = RSTRING_LEN(str), nlen;
5841 str_modifiable(str);
5842 if (
len > olen)
len = olen;
5844 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5846 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5848 ptr =
RSTRING(str)->as.embed.ary;
5849 memmove(ptr, oldptr +
len, nlen);
5850 if (fl == STR_NOEMBED)
xfree(oldptr);
5853 if (!STR_SHARED_P(str)) {
5855 rb_enc_cr_str_exact_copy(shared, str);
5860 STR_SET_LEN(str, nlen);
5862 if (!SHARABLE_MIDDLE_SUBSTRING) {
5863 TERM_FILL(ptr + nlen, TERM_LEN(str));
5870rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5876 if (beg == 0 && vlen == 0) {
5881 str_modify_keep_cr(str);
5885 RESIZE_CAPA(str, slen + vlen -
len);
5886 sptr = RSTRING_PTR(str);
5895 memmove(sptr + beg + vlen,
5897 slen - (beg +
len));
5899 if (vlen < beg &&
len < 0) {
5903 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5906 STR_SET_LEN(str, slen);
5907 TERM_FILL(&sptr[slen], TERM_LEN(str));
5914 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5923 int singlebyte = single_byte_optimizable(str);
5929 enc = rb_enc_check(str, val);
5930 slen = str_strlen(str, enc);
5932 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5941 if (
len > slen - beg) {
5944 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5949 beg = p - RSTRING_PTR(str);
5951 rb_str_update_0(str, beg,
len, val);
5952 rb_enc_associate(str, enc);
5963 long start, end,
len;
5973 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5977 nth += regs->num_regs;
5987 enc = rb_enc_check_str(str, val);
5988 rb_str_update_0(str, start,
len, val);
5989 rb_enc_associate(str, enc);
5997 switch (
TYPE(indx)) {
5999 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
6003 beg = rb_str_index(str, indx, 0);
6058rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
6062 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6070 return rb_str_aset(str, argv[0], argv[1]);
6120rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6128 str_modify_keep_cr(str);
6136 if ((nth += regs->num_regs) <= 0)
return Qnil;
6138 else if (nth >= regs->num_regs)
return Qnil;
6140 len = END(nth) - beg;
6143 else if (argc == 2) {
6152 beg = p - RSTRING_PTR(str);
6156 beg = rb_str_index(str, indx, 0);
6157 if (beg == -1)
return Qnil;
6158 len = RSTRING_LEN(indx);
6170 beg = p - RSTRING_PTR(str);
6179 beg = p - RSTRING_PTR(str);
6183 rb_enc_cr_str_copy_for_substr(result, str);
6191 char *sptr = RSTRING_PTR(str);
6192 long slen = RSTRING_LEN(str);
6193 if (beg +
len > slen)
6197 slen - (beg +
len));
6199 STR_SET_LEN(str, slen);
6200 TERM_FILL(&sptr[slen], TERM_LEN(str));
6211 switch (OBJ_BUILTIN_TYPE(pat)) {
6230get_pat_quoted(
VALUE pat,
int check)
6234 switch (OBJ_BUILTIN_TYPE(pat)) {
6248 if (check && is_broken_string(pat)) {
6255rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6258 pos = rb_str_byteindex(str, pat, pos);
6259 if (set_backref_str) {
6261 str = rb_str_new_frozen_String(str);
6262 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6264 *match = match_data;
6274 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6279rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6281 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6300rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6314 hash = rb_check_hash_type(argv[1]);
6320 pat = get_pat_quoted(argv[0], 1);
6322 str_modifiable(str);
6323 beg = rb_pat_search(pat, str, 0, 1);
6337 end0 = beg0 + RSTRING_LEN(pat);
6346 if (iter || !
NIL_P(hash)) {
6347 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6353 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6356 str_mod_check(str, p,
len);
6357 rb_check_frozen(str);
6363 enc = rb_enc_compatible(str, repl);
6366 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6370 rb_enc_inspect_name(str_enc),
6371 rb_enc_inspect_name(STR_ENC_GET(repl)));
6373 enc = STR_ENC_GET(repl);
6376 rb_enc_associate(str, enc);
6386 rlen = RSTRING_LEN(repl);
6387 len = RSTRING_LEN(str);
6389 RESIZE_CAPA(str,
len + rlen - plen);
6391 p = RSTRING_PTR(str);
6393 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6395 rp = RSTRING_PTR(repl);
6396 memmove(p + beg0, rp, rlen);
6398 STR_SET_LEN(str,
len);
6399 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6428 rb_str_sub_bang(argc, argv, str);
6433str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6436 long beg, beg0, end0;
6437 long offset, blen, slen,
len, last;
6438 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6440 int need_backref_str = -1;
6450 hash = rb_check_hash_type(argv[1]);
6454 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6462 rb_error_arity(argc, 1, 2);
6465 pat = get_pat_quoted(argv[0], 1);
6466 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6469 if (bang)
return Qnil;
6474 blen = RSTRING_LEN(str) + 30;
6476 sp = RSTRING_PTR(str);
6477 slen = RSTRING_LEN(str);
6479 str_enc = STR_ENC_GET(str);
6480 rb_enc_associate(dest, str_enc);
6487 end0 = beg0 + RSTRING_LEN(pat);
6503 if (mode == FAST_MAP) {
6512 val = rb_hash_aref(hash, key);
6515 str_mod_check(str, sp, slen);
6520 else if (need_backref_str) {
6522 if (need_backref_str < 0) {
6523 need_backref_str = val != repl;
6530 len = beg0 - offset;
6544 if (RSTRING_LEN(str) <= end0)
break;
6545 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6547 offset = end0 +
len;
6549 cp = RSTRING_PTR(str) + offset;
6550 if (offset > RSTRING_LEN(str))
break;
6553 if (mode != FAST_MAP && mode != STR) {
6556 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6561 if (RSTRING_LEN(str) > offset) {
6564 rb_pat_search0(pat, str, last, 1, &match);
6566 str_shared_replace(str, dest);
6591rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6593 str_modify_keep_cr(str);
6594 return str_gsub(argc, argv, str, 1);
6644 return str_gsub(argc, argv, str, 0);
6662 str_modifiable(str);
6663 if (str == str2)
return str;
6667 return str_replace(str, str2);
6684rb_str_clear(
VALUE str)
6688 STR_SET_LEN(str, 0);
6689 RSTRING_PTR(str)[0] = 0;
6690 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6706rb_str_chr(
VALUE str)
6724 pos += RSTRING_LEN(str);
6725 if (pos < 0 || RSTRING_LEN(str) <= pos)
6728 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6747 long len = RSTRING_LEN(str);
6748 char *
ptr, *head, *left = 0;
6752 if (pos < -
len ||
len <= pos)
6759 char byte = (char)(
NUM2INT(w) & 0xFF);
6761 if (!str_independent(str))
6762 str_make_independent(str);
6763 enc = STR_ENC_GET(str);
6764 head = RSTRING_PTR(str);
6766 if (!STR_EMBED_P(str)) {
6773 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6781 width = rb_enc_precise_mbclen(left, head+
len, enc);
6783 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6799str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6801 long n = RSTRING_LEN(str);
6803 if (beg > n ||
len < 0)
return Qnil;
6806 if (beg < 0)
return Qnil;
6811 if (!empty)
return Qnil;
6815 VALUE str2 = str_subseq(str, beg,
len);
6817 str_enc_copy_direct(str2, str);
6819 if (RSTRING_LEN(str2) == 0) {
6820 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6854 long beg,
len = RSTRING_LEN(str);
6862 return str_byte_substr(str, beg,
len, TRUE);
6867 return str_byte_substr(str, idx, 1, FALSE);
6879rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6884 return str_byte_substr(str, beg,
len, TRUE);
6887 return str_byte_aref(str, argv[0]);
6891str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6893 long end, slen = RSTRING_LEN(str);
6896 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6905 if (*
len > slen - *beg) {
6909 str_ensure_byte_pos(str, *beg);
6910 str_ensure_byte_pos(str, end);
6924rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6926 long beg,
len, vbeg, vlen;
6931 if (!(argc == 2 || argc == 3 || argc == 5)) {
6932 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6936 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6937 rb_builtin_class_name(argv[0]));
6944 vlen = RSTRING_LEN(val);
6949 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6950 rb_builtin_class_name(argv[2]));
6962 vlen = RSTRING_LEN(val);
6970 str_check_beg_len(str, &beg, &
len);
6971 str_check_beg_len(val, &vbeg, &vlen);
6972 str_modify_keep_cr(str);
6975 rb_enc_associate(str, rb_enc_check(str, val));
6978 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6996rb_str_reverse(
VALUE str)
7003 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
7004 enc = STR_ENC_GET(str);
7010 if (RSTRING_LEN(str) > 1) {
7011 if (single_byte_optimizable(str)) {
7018 int clen = rb_enc_fast_mbclen(s, e, enc);
7026 cr = rb_enc_asciicompat(enc) ?
7029 int clen = rb_enc_mbclen(s, e, enc);
7038 STR_SET_LEN(rev, RSTRING_LEN(str));
7039 str_enc_copy_direct(rev, str);
7059rb_str_reverse_bang(
VALUE str)
7061 if (RSTRING_LEN(str) > 1) {
7062 if (single_byte_optimizable(str)) {
7065 str_modify_keep_cr(str);
7066 s = RSTRING_PTR(str);
7075 str_shared_replace(str, rb_str_reverse(str));
7079 str_modify_keep_cr(str);
7108 i = rb_str_index(str, arg, 0);
7110 return RBOOL(i != -1);
7152 rb_raise(rb_eArgError,
"invalid radix %d", base);
7154 return rb_str_to_inum(str, base, FALSE);
7178rb_str_to_f(
VALUE str)
7193rb_str_to_s(
VALUE str)
7205 char s[RUBY_MAX_CHAR_LEN];
7206 int n = rb_enc_codelen(c, enc);
7208 rb_enc_mbcput(c, s, enc);
7213#define CHAR_ESC_LEN 13
7216rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7218 char buf[CHAR_ESC_LEN + 1];
7226 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7228 else if (c < 0x10000) {
7229 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7232 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7237 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7240 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7243 l = (int)strlen(buf);
7249ruby_escaped_char(
int c)
7252 case '\0':
return "\\0";
7253 case '\n':
return "\\n";
7254 case '\r':
return "\\r";
7255 case '\t':
return "\\t";
7256 case '\f':
return "\\f";
7257 case '\013':
return "\\v";
7258 case '\010':
return "\\b";
7259 case '\007':
return "\\a";
7260 case '\033':
return "\\e";
7261 case '\x7f':
return "\\c?";
7267rb_str_escape(
VALUE str)
7271 const char *p = RSTRING_PTR(str);
7273 const char *prev = p;
7274 char buf[CHAR_ESC_LEN + 1];
7276 int unicode_p = rb_enc_unicode_p(enc);
7277 int asciicompat = rb_enc_asciicompat(enc);
7282 int n = rb_enc_precise_mbclen(p, pend, enc);
7284 if (p > prev) str_buf_cat(result, prev, p - prev);
7285 n = rb_enc_mbminlen(enc);
7287 n = (int)(pend - p);
7289 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7290 str_buf_cat(result, buf, strlen(buf));
7296 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7298 cc = ruby_escaped_char(c);
7300 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7301 str_buf_cat(result, cc, strlen(cc));
7304 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7307 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7308 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7312 if (p > prev) str_buf_cat(result, prev, p - prev);
7331 const char *p, *pend, *prev;
7332 char buf[CHAR_ESC_LEN + 1];
7334 rb_encoding *resenc = rb_default_internal_encoding();
7335 int unicode_p = rb_enc_unicode_p(enc);
7336 int asciicompat = rb_enc_asciicompat(enc);
7338 if (resenc == NULL) resenc = rb_default_external_encoding();
7339 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7340 rb_enc_associate(result, resenc);
7341 str_buf_cat2(result,
"\"");
7349 n = rb_enc_precise_mbclen(p, pend, enc);
7351 if (p > prev) str_buf_cat(result, prev, p - prev);
7352 n = rb_enc_mbminlen(enc);
7354 n = (int)(pend - p);
7356 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7357 str_buf_cat(result, buf, strlen(buf));
7363 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7365 if ((asciicompat || unicode_p) &&
7366 (c ==
'"'|| c ==
'\\' ||
7371 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7372 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7373 str_buf_cat2(result,
"\\");
7374 if (asciicompat || enc == resenc) {
7380 case '\n': cc =
'n';
break;
7381 case '\r': cc =
'r';
break;
7382 case '\t': cc =
't';
break;
7383 case '\f': cc =
'f';
break;
7384 case '\013': cc =
'v';
break;
7385 case '\010': cc =
'b';
break;
7386 case '\007': cc =
'a';
break;
7387 case 033: cc =
'e';
break;
7388 default: cc = 0;
break;
7391 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7394 str_buf_cat(result, buf, 2);
7407 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7411 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7412 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7417 if (p > prev) str_buf_cat(result, prev, p - prev);
7418 str_buf_cat2(result,
"\"");
7423#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7436 int encidx = rb_enc_get_index(str);
7439 const char *p, *pend;
7442 int u8 = (encidx == rb_utf8_encindex());
7443 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7446 if (!rb_enc_asciicompat(enc)) {
7448 len += strlen(enc->name);
7451 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7454 unsigned char c = *p++;
7457 case '"':
case '\\':
7458 case '\n':
case '\r':
7459 case '\t':
case '\f':
7460 case '\013':
case '\010':
case '\007':
case '\033':
7465 clen = IS_EVSTR(p, pend) ? 2 : 1;
7473 if (u8 && c > 0x7F) {
7474 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7476 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7479 else if (cc <= 0xFFFFF)
7492 if (clen > LONG_MAX -
len) {
7499 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7500 q = RSTRING_PTR(result); qend = q +
len + 1;
7504 unsigned char c = *p++;
7506 if (c ==
'"' || c ==
'\\') {
7510 else if (c ==
'#') {
7511 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7514 else if (c ==
'\n') {
7518 else if (c ==
'\r') {
7522 else if (c ==
'\t') {
7526 else if (c ==
'\f') {
7530 else if (c ==
'\013') {
7534 else if (c ==
'\010') {
7538 else if (c ==
'\007') {
7542 else if (c ==
'\033') {
7552 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7554 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7557 snprintf(q, qend-q,
"u%04X", cc);
7559 snprintf(q, qend-q,
"u{%X}", cc);
7564 snprintf(q, qend-q,
"x%02X", c);
7570 if (!rb_enc_asciicompat(enc)) {
7571 snprintf(q, qend-q, nonascii_suffix, enc->name);
7572 encidx = rb_ascii8bit_encindex();
7575 rb_enc_associate_index(result, encidx);
7581unescape_ascii(
unsigned int c)
7605undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7607 const char *s = *ss;
7611 unsigned char buf[6];
7629 *buf = unescape_ascii(*s);
7641 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7642 if (*penc != enc_utf8) {
7644 rb_enc_associate(undumped, enc_utf8);
7661 if (hexlen == 0 || hexlen > 6) {
7667 if (0xd800 <= c && c <= 0xdfff) {
7670 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7680 if (0xd800 <= c && c <= 0xdfff) {
7683 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7711static VALUE rb_str_is_ascii_only_p(
VALUE str);
7729str_undump(
VALUE str)
7731 const char *s = RSTRING_PTR(str);
7734 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7736 bool binary =
false;
7740 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7743 if (!str_null_check(str, &w)) {
7746 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7747 if (*s !=
'"')
goto invalid_format;
7765 static const char force_encoding_suffix[] =
".force_encoding(\"";
7766 static const char dup_suffix[] =
".dup";
7767 const char *encname;
7772 size =
sizeof(dup_suffix) - 1;
7773 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7775 size =
sizeof(force_encoding_suffix) - 1;
7776 if (s_end - s <= size)
goto invalid_format;
7777 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7781 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7785 s = memchr(s,
'"', s_end-s);
7787 if (!s)
goto invalid_format;
7788 if (s_end - s != 2)
goto invalid_format;
7789 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7791 encidx = rb_enc_find_index2(encname, (
long)size);
7795 rb_enc_associate_index(undumped, encidx);
7805 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7816 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7822 if (rb_enc_dummy_p(enc)) {
7829str_true_enc(
VALUE str)
7832 rb_str_check_dummy_enc(enc);
7836static OnigCaseFoldType
7837check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7842 rb_raise(rb_eArgError,
"too many options");
7843 if (argv[0]==sym_turkic) {
7844 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7846 if (argv[1]==sym_lithuanian)
7847 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7849 rb_raise(rb_eArgError,
"invalid second option");
7852 else if (argv[0]==sym_lithuanian) {
7853 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7855 if (argv[1]==sym_turkic)
7856 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7858 rb_raise(rb_eArgError,
"invalid second option");
7862 rb_raise(rb_eArgError,
"too many options");
7863 else if (argv[0]==sym_ascii)
7864 flags |= ONIGENC_CASE_ASCII_ONLY;
7865 else if (argv[0]==sym_fold) {
7866 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7867 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7869 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7872 rb_raise(rb_eArgError,
"invalid option");
7879 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7885#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7886#ifndef CASEMAP_DEBUG
7887# define CASEMAP_DEBUG 0
7895 OnigUChar space[FLEX_ARY_LEN];
7899mapping_buffer_free(
void *p)
7903 while (current_buffer) {
7904 previous_buffer = current_buffer;
7905 current_buffer = current_buffer->next;
7906 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7912 {0, mapping_buffer_free,},
7913 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7921 const OnigUChar *source_current, *source_end;
7922 int target_length = 0;
7923 VALUE buffer_anchor;
7926 size_t buffer_count = 0;
7927 int buffer_length_or_invalid;
7929 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7931 source_current = (OnigUChar*)RSTRING_PTR(source);
7936 while (source_current < source_end) {
7938 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7939 if (CASEMAP_DEBUG) {
7940 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7943 *pre_buffer = current_buffer;
7944 pre_buffer = ¤t_buffer->next;
7945 current_buffer->next = NULL;
7946 current_buffer->capa =
capa;
7947 buffer_length_or_invalid = enc->case_map(flags,
7948 &source_current, source_end,
7949 current_buffer->space,
7950 current_buffer->space+current_buffer->capa,
7952 if (buffer_length_or_invalid < 0) {
7953 current_buffer =
DATA_PTR(buffer_anchor);
7955 mapping_buffer_free(current_buffer);
7956 rb_raise(rb_eArgError,
"input string invalid");
7958 target_length += current_buffer->used = buffer_length_or_invalid;
7960 if (CASEMAP_DEBUG) {
7961 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7964 if (buffer_count==1) {
7965 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7968 char *target_current;
7971 target_current = RSTRING_PTR(target);
7972 current_buffer =
DATA_PTR(buffer_anchor);
7973 while (current_buffer) {
7974 memcpy(target_current, current_buffer->space, current_buffer->used);
7975 target_current += current_buffer->used;
7976 current_buffer = current_buffer->next;
7979 current_buffer =
DATA_PTR(buffer_anchor);
7981 mapping_buffer_free(current_buffer);
7986 str_enc_copy_direct(target, source);
7995 const OnigUChar *source_current, *source_end;
7996 OnigUChar *target_current, *target_end;
7997 long old_length = RSTRING_LEN(source);
7998 int length_or_invalid;
8000 if (old_length == 0)
return Qnil;
8002 source_current = (OnigUChar*)RSTRING_PTR(source);
8004 if (source == target) {
8005 target_current = (OnigUChar*)source_current;
8006 target_end = (OnigUChar*)source_end;
8009 target_current = (OnigUChar*)RSTRING_PTR(target);
8013 length_or_invalid = onigenc_ascii_only_case_map(flags,
8014 &source_current, source_end,
8015 target_current, target_end, enc);
8016 if (length_or_invalid < 0)
8017 rb_raise(rb_eArgError,
"input string invalid");
8018 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8019 fprintf(stderr,
"problem with rb_str_ascii_casemap"
8020 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8021 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
8022 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8025 str_enc_copy(target, source);
8031upcase_single(
VALUE str)
8033 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8034 bool modified =
false;
8037 unsigned int c = *(
unsigned char*)s;
8039 if (
'a' <= c && c <=
'z') {
8040 *s =
'A' + (c -
'a');
8068rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8071 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8073 flags = check_case_options(argc, argv, flags);
8074 str_modify_keep_cr(str);
8075 enc = str_true_enc(str);
8076 if (case_option_single_p(flags, enc, str)) {
8077 if (upcase_single(str))
8078 flags |= ONIGENC_CASE_MODIFIED;
8080 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8081 rb_str_ascii_casemap(str, str, &flags, enc);
8083 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8085 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8107rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8110 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8113 flags = check_case_options(argc, argv, flags);
8114 enc = str_true_enc(str);
8115 if (case_option_single_p(flags, enc, str)) {
8116 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8117 str_enc_copy_direct(ret, str);
8120 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8122 rb_str_ascii_casemap(str, ret, &flags, enc);
8125 ret = rb_str_casemap(str, &flags, enc);
8132downcase_single(
VALUE str)
8134 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8135 bool modified =
false;
8138 unsigned int c = *(
unsigned char*)s;
8140 if (
'A' <= c && c <=
'Z') {
8141 *s =
'a' + (c -
'A');
8163rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8166 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8168 flags = check_case_options(argc, argv, flags);
8169 str_modify_keep_cr(str);
8170 enc = str_true_enc(str);
8171 if (case_option_single_p(flags, enc, str)) {
8172 if (downcase_single(str))
8173 flags |= ONIGENC_CASE_MODIFIED;
8175 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8176 rb_str_ascii_casemap(str, str, &flags, enc);
8178 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8180 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8194rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8197 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8200 flags = check_case_options(argc, argv, flags);
8201 enc = str_true_enc(str);
8202 if (case_option_single_p(flags, enc, str)) {
8203 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8204 str_enc_copy_direct(ret, str);
8205 downcase_single(ret);
8207 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8209 rb_str_ascii_casemap(str, ret, &flags, enc);
8212 ret = rb_str_casemap(str, &flags, enc);
8232rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8235 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8237 flags = check_case_options(argc, argv, flags);
8238 str_modify_keep_cr(str);
8239 enc = str_true_enc(str);
8240 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8241 if (flags&ONIGENC_CASE_ASCII_ONLY)
8242 rb_str_ascii_casemap(str, str, &flags, enc);
8244 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8246 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8279rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8282 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8285 flags = check_case_options(argc, argv, flags);
8286 enc = str_true_enc(str);
8287 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8288 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8290 rb_str_ascii_casemap(str, ret, &flags, enc);
8293 ret = rb_str_casemap(str, &flags, enc);
8320rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8323 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8325 flags = check_case_options(argc, argv, flags);
8326 str_modify_keep_cr(str);
8327 enc = str_true_enc(str);
8328 if (flags&ONIGENC_CASE_ASCII_ONLY)
8329 rb_str_ascii_casemap(str, str, &flags, enc);
8331 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8333 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8357rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8360 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8363 flags = check_case_options(argc, argv, flags);
8364 enc = str_true_enc(str);
8365 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8366 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8368 rb_str_ascii_casemap(str, ret, &flags, enc);
8371 ret = rb_str_casemap(str, &flags, enc);
8376typedef unsigned char *USTR;
8380 unsigned int now, max;
8392 if (t->p == t->pend)
return -1;
8393 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8396 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8398 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8400 if (t->p < t->pend) {
8401 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8404 if (t->now < 0x80 && c < 0x80) {
8405 rb_raise(rb_eArgError,
8406 "invalid range \"%c-%c\" in string transliteration",
8410 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8414 else if (t->now < c) {
8423 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8424 if (t->now == t->max) {
8429 if (t->now < t->max) {
8445 const unsigned int errc = -1;
8446 unsigned int trans[256];
8448 struct tr trsrc, trrepl;
8450 unsigned int c, c0, last = 0;
8451 int modify = 0, i, l;
8452 unsigned char *s, *send;
8454 int singlebyte = single_byte_optimizable(str);
8458#define CHECK_IF_ASCII(c) \
8459 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8460 (cr = ENC_CODERANGE_VALID) : 0)
8464 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8465 if (RSTRING_LEN(repl) == 0) {
8466 return rb_str_delete_bang(1, &src, str);
8470 e1 = rb_enc_check(str, src);
8471 e2 = rb_enc_check(str, repl);
8476 enc = rb_enc_check(src, repl);
8478 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8479 if (RSTRING_LEN(src) > 1 &&
8480 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8481 trsrc.p + l < trsrc.pend) {
8485 trrepl.p = RSTRING_PTR(repl);
8486 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8487 trsrc.gen = trrepl.gen = 0;
8488 trsrc.now = trrepl.now = 0;
8489 trsrc.max = trrepl.max = 0;
8492 for (i=0; i<256; i++) {
8495 while ((c = trnext(&trsrc, enc)) != errc) {
8500 if (!hash) hash = rb_hash_new();
8504 while ((c = trnext(&trrepl, enc)) != errc)
8507 for (i=0; i<256; i++) {
8508 if (trans[i] != errc) {
8516 for (i=0; i<256; i++) {
8519 while ((c = trnext(&trsrc, enc)) != errc) {
8520 r = trnext(&trrepl, enc);
8521 if (r == errc) r = trrepl.now;
8524 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8527 if (!hash) hash = rb_hash_new();
8535 str_modify_keep_cr(str);
8536 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8537 termlen = rb_enc_mbminlen(enc);
8540 long offset, max = RSTRING_LEN(str);
8541 unsigned int save = -1;
8542 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8547 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8550 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8553 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8555 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8564 if (cflag) c = last;
8567 else if (cflag) c = errc;
8573 if (c != (
unsigned int)-1) {
8579 tlen = rb_enc_codelen(c, enc);
8585 if (enc != e1) may_modify = 1;
8587 if ((offset = t - buf) + tlen > max) {
8588 size_t MAYBE_UNUSED(old) = max + termlen;
8589 max = offset + tlen + (send - s);
8590 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8593 rb_enc_mbcput(c, t, enc);
8594 if (may_modify && memcmp(s, t, tlen) != 0) {
8600 if (!STR_EMBED_P(str)) {
8601 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8603 TERM_FILL((
char *)t, termlen);
8604 RSTRING(str)->as.heap.ptr = (
char *)buf;
8605 STR_SET_LEN(str, t - buf);
8606 STR_SET_NOEMBED(str);
8607 RSTRING(str)->as.heap.aux.capa = max;
8611 c = (
unsigned char)*s;
8612 if (trans[c] != errc) {
8629 long offset, max = (long)((send - s) * 1.2);
8630 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8635 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8638 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8641 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8643 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8651 if (cflag) c = last;
8654 else if (cflag) c = errc;
8658 c = cflag ? last : errc;
8661 tlen = rb_enc_codelen(c, enc);
8666 if (enc != e1) may_modify = 1;
8668 if ((offset = t - buf) + tlen > max) {
8669 size_t MAYBE_UNUSED(old) = max + termlen;
8670 max = offset + tlen + (long)((send - s) * 1.2);
8671 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8675 rb_enc_mbcput(c, t, enc);
8676 if (may_modify && memcmp(s, t, tlen) != 0) {
8684 if (!STR_EMBED_P(str)) {
8685 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8687 TERM_FILL((
char *)t, termlen);
8688 RSTRING(str)->as.heap.ptr = (
char *)buf;
8689 STR_SET_LEN(str, t - buf);
8690 STR_SET_NOEMBED(str);
8691 RSTRING(str)->as.heap.aux.capa = max;
8697 rb_enc_associate(str, enc);
8716 return tr_trans(str, src, repl, 0);
8763 tr_trans(str, src, repl, 0);
8767#define TR_TABLE_MAX (UCHAR_MAX+1)
8768#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8770tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8773 const unsigned int errc = -1;
8774 char buf[TR_TABLE_MAX];
8777 VALUE table = 0, ptable = 0;
8778 int i, l, cflag = 0;
8780 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8781 tr.gen =
tr.now =
tr.max = 0;
8783 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8788 for (i=0; i<TR_TABLE_MAX; i++) {
8791 stable[TR_TABLE_MAX] = cflag;
8793 else if (stable[TR_TABLE_MAX] && !cflag) {
8794 stable[TR_TABLE_MAX] = 0;
8796 for (i=0; i<TR_TABLE_MAX; i++) {
8800 while ((c = trnext(&
tr, enc)) != errc) {
8801 if (c < TR_TABLE_MAX) {
8802 buf[(
unsigned char)c] = !cflag;
8807 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8810 table = ptable ? ptable : rb_hash_new();
8814 table = rb_hash_new();
8819 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8820 rb_hash_aset(table, key,
Qtrue);
8824 for (i=0; i<TR_TABLE_MAX; i++) {
8825 stable[i] = stable[i] && buf[i];
8827 if (!table && !cflag) {
8834tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8836 if (c < TR_TABLE_MAX) {
8837 return table[c] != 0;
8843 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8844 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8848 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8851 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8866rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8868 char squeez[TR_TABLE_SIZE];
8871 VALUE del = 0, nodel = 0;
8873 int i, ascompat, cr;
8875 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8877 for (i=0; i<argc; i++) {
8881 enc = rb_enc_check(str, s);
8882 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8885 str_modify_keep_cr(str);
8886 ascompat = rb_enc_asciicompat(enc);
8887 s = t = RSTRING_PTR(str);
8894 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8905 c = rb_enc_codepoint_len(s, send, &clen, enc);
8907 if (tr_find(c, squeez, del, nodel)) {
8911 if (t != s) rb_enc_mbcput(c, t, enc);
8918 TERM_FILL(t, TERM_LEN(str));
8919 STR_SET_LEN(str, t - RSTRING_PTR(str));
8922 if (modify)
return str;
8936rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8939 rb_str_delete_bang(argc, argv, str);
8953rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8955 char squeez[TR_TABLE_SIZE];
8957 VALUE del = 0, nodel = 0;
8958 unsigned char *s, *send, *t;
8960 int ascompat, singlebyte = single_byte_optimizable(str);
8964 enc = STR_ENC_GET(str);
8967 for (i=0; i<argc; i++) {
8971 enc = rb_enc_check(str, s);
8972 if (singlebyte && !single_byte_optimizable(s))
8974 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8978 str_modify_keep_cr(str);
8979 s = t = (
unsigned char *)RSTRING_PTR(str);
8980 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8983 ascompat = rb_enc_asciicompat(enc);
8987 unsigned int c = *s++;
8988 if (c != save || (argc > 0 && !squeez[c])) {
8998 if (ascompat && (c = *s) < 0x80) {
8999 if (c != save || (argc > 0 && !squeez[c])) {
9005 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
9007 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9008 if (t != s) rb_enc_mbcput(c, t, enc);
9017 TERM_FILL((
char *)t, TERM_LEN(str));
9018 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9019 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
9023 if (modify)
return str;
9046rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
9049 rb_str_squeeze_bang(argc, argv, str);
9067 return tr_trans(str, src, repl, 1);
9090 tr_trans(str, src, repl, 1);
9103rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9105 char table[TR_TABLE_SIZE];
9107 VALUE del = 0, nodel = 0, tstr;
9117 enc = rb_enc_check(str, tstr);
9120 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9121 (ptstr = RSTRING_PTR(tstr),
9122 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9123 !is_broken_string(str)) {
9125 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9127 s = RSTRING_PTR(str);
9128 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9131 if (*(
unsigned char*)s++ == c) n++;
9137 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9138 for (i=1; i<argc; i++) {
9141 enc = rb_enc_check(str, tstr);
9142 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9145 s = RSTRING_PTR(str);
9146 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9148 ascompat = rb_enc_asciicompat(enc);
9152 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9160 c = rb_enc_codepoint_len(s, send, &clen, enc);
9161 if (tr_find(c, table, del, nodel)) {
9172rb_fs_check(
VALUE val)
9176 if (
NIL_P(val))
return 0;
9181static const char isspacetable[256] = {
9182 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9184 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
9200#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9203split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9205 if (empty_count >= 0 &&
len == 0) {
9206 return empty_count + 1;
9208 if (empty_count > 0) {
9213 }
while (--empty_count > 0);
9217 rb_yield(str_new_empty_String(str));
9218 }
while (--empty_count > 0);
9232 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9236literal_split_pattern(
VALUE spat, split_type_t default_type)
9244 return SPLIT_TYPE_CHARS;
9246 else if (rb_enc_asciicompat(enc)) {
9247 if (
len == 1 && ptr[0] ==
' ') {
9248 return SPLIT_TYPE_AWK;
9253 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9254 return SPLIT_TYPE_AWK;
9257 return default_type;
9270rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9275 split_type_t split_type;
9276 long beg, end, i = 0, empty_count = -1;
9281 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9283 if (lim <= 0) limit =
Qnil;
9284 else if (lim == 1) {
9285 if (RSTRING_LEN(str) == 0)
9296 if (
NIL_P(limit) && !lim) empty_count = 0;
9298 enc = STR_ENC_GET(str);
9299 split_type = SPLIT_TYPE_REGEXP;
9301 spat = get_pat_quoted(spat, 0);
9303 else if (
NIL_P(spat = rb_fs)) {
9304 split_type = SPLIT_TYPE_AWK;
9306 else if (!(spat = rb_fs_check(spat))) {
9307 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9312 if (split_type != SPLIT_TYPE_AWK) {
9317 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9318 if (split_type == SPLIT_TYPE_AWK) {
9320 split_type = SPLIT_TYPE_STRING;
9325 mustnot_broken(spat);
9326 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9334#define SPLIT_STR(beg, len) ( \
9335 empty_count = split_string(result, str, beg, len, empty_count), \
9336 str_mod_check(str, str_start, str_len))
9339 char *ptr = RSTRING_PTR(str);
9340 char *
const str_start = ptr;
9341 const long str_len = RSTRING_LEN(str);
9342 char *
const eptr = str_start + str_len;
9343 if (split_type == SPLIT_TYPE_AWK) {
9350 if (is_ascii_string(str)) {
9351 while (ptr < eptr) {
9352 c = (
unsigned char)*ptr++;
9354 if (ascii_isspace(c)) {
9360 if (!
NIL_P(limit) && lim <= i)
break;
9363 else if (ascii_isspace(c)) {
9364 SPLIT_STR(beg, end-beg);
9367 if (!
NIL_P(limit)) ++i;
9375 while (ptr < eptr) {
9378 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9387 if (!
NIL_P(limit) && lim <= i)
break;
9391 SPLIT_STR(beg, end-beg);
9394 if (!
NIL_P(limit)) ++i;
9402 else if (split_type == SPLIT_TYPE_STRING) {
9403 char *substr_start = ptr;
9404 char *sptr = RSTRING_PTR(spat);
9405 long slen = RSTRING_LEN(spat);
9408 mustnot_broken(str);
9409 enc = rb_enc_check(str, spat);
9410 while (ptr < eptr &&
9411 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9414 if (t != ptr + end) {
9418 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9419 str_mod_check(spat, sptr, slen);
9422 if (!
NIL_P(limit) && lim <= ++i)
break;
9424 beg = ptr - str_start;
9426 else if (split_type == SPLIT_TYPE_CHARS) {
9430 mustnot_broken(str);
9431 enc = rb_enc_get(str);
9432 while (ptr < eptr &&
9433 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9434 SPLIT_STR(ptr - str_start, n);
9436 if (!
NIL_P(limit) && lim <= ++i)
break;
9438 beg = ptr - str_start;
9442 long len = RSTRING_LEN(str);
9450 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9455 if (start == end && BEG(0) == END(0)) {
9460 else if (last_null == 1) {
9461 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9468 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9474 SPLIT_STR(beg, end-beg);
9475 beg = start = END(0);
9479 for (idx=1; idx < regs->num_regs; idx++) {
9480 if (BEG(idx) == -1)
continue;
9481 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9483 if (!
NIL_P(limit) && lim <= ++i)
break;
9485 if (match) rb_match_unbusy(match);
9487 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9488 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9491 return result ? result : str;
9501 return rb_str_split_m(1, &sep, str);
9504#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9519#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9522chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9524 const char *prev = rb_enc_prev_char(p, e, e, enc);
9527 prev = rb_enc_prev_char(p, e, e, enc);
9528 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9540 RSTRING_LEN(rs) != 1 ||
9541 RSTRING_PTR(rs)[0] !=
'\n')) {
9547#define rb_rs get_rs()
9554 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9555 long pos,
len, rslen;
9561 static ID keywords[1];
9566 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9570 if (!ENUM_ELEM(ary, str)) {
9578 if (!RSTRING_LEN(str))
goto end;
9580 ptr = subptr = RSTRING_PTR(str);
9582 len = RSTRING_LEN(str);
9584 rslen = RSTRING_LEN(rs);
9587 enc = rb_enc_get(str);
9589 enc = rb_enc_check(str, rs);
9594 const char *eol = NULL;
9596 while (subend < pend) {
9597 long chomp_rslen = 0;
9599 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9601 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9603 if (eol == subend)
break;
9607 chomp_rslen = -rslen;
9611 if (!subptr) subptr = subend;
9615 }
while (subend < pend);
9617 if (rslen == 0) chomp_rslen = 0;
9619 subend - subptr + (chomp ? chomp_rslen : rslen));
9620 if (ENUM_ELEM(ary, line)) {
9621 str_mod_check(str, ptr,
len);
9623 subptr = eol = NULL;
9628 rsptr = RSTRING_PTR(rs);
9629 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9638 rsptr = RSTRING_PTR(rs);
9639 rslen = RSTRING_LEN(rs);
9642 while (subptr < pend) {
9643 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9647 if (hit != adjusted) {
9651 subend = hit += rslen;
9654 subend = chomp_newline(subptr, subend, enc);
9661 if (ENUM_ELEM(ary, line)) {
9662 str_mod_check(str, ptr,
len);
9667 if (subptr != pend) {
9670 pend = chomp_newline(subptr, pend, enc);
9672 else if (pend - subptr >= rslen &&
9673 memcmp(pend - rslen, rsptr, rslen) == 0) {
9678 ENUM_ELEM(ary, line);
9699rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9702 return rb_str_enumerate_lines(argc, argv, str, 0);
9757rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9759 VALUE ary = WANTARRAY(
"lines", 0);
9760 return rb_str_enumerate_lines(argc, argv, str, ary);
9774 for (i=0; i<RSTRING_LEN(str); i++) {
9775 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9793rb_str_each_byte(
VALUE str)
9796 return rb_str_enumerate_bytes(str, 0);
9808rb_str_bytes(
VALUE str)
9810 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9811 return rb_str_enumerate_bytes(str, ary);
9829 ptr = RSTRING_PTR(str);
9830 len = RSTRING_LEN(str);
9831 enc = rb_enc_get(str);
9834 for (i = 0; i <
len; i += n) {
9835 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9840 for (i = 0; i <
len; i += n) {
9841 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9862rb_str_each_char(
VALUE str)
9865 return rb_str_enumerate_chars(str, 0);
9877rb_str_chars(
VALUE str)
9880 return rb_str_enumerate_chars(str, ary);
9884rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9889 const char *ptr, *end;
9892 if (single_byte_optimizable(str))
9893 return rb_str_enumerate_bytes(str, ary);
9896 ptr = RSTRING_PTR(str);
9898 enc = STR_ENC_GET(str);
9901 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9922rb_str_each_codepoint(
VALUE str)
9925 return rb_str_enumerate_codepoints(str, 0);
9937rb_str_codepoints(
VALUE str)
9940 return rb_str_enumerate_codepoints(str, ary);
9946 int encidx = rb_enc_to_index(enc);
9948 const OnigUChar source_ascii[] =
"\\X";
9949 const OnigUChar *source = source_ascii;
9950 size_t source_len =
sizeof(source_ascii) - 1;
9953#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9954#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9955#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9956#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9957#define CASE_UTF(e) \
9958 case ENCINDEX_UTF_##e: { \
9959 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9960 source = source_UTF_##e; \
9961 source_len = sizeof(source_UTF_##e); \
9964 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9972 regex_t *reg_grapheme_cluster;
9974 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9975 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9977 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9978 onig_error_code_to_str(message, r, &einfo);
9979 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9982 return reg_grapheme_cluster;
9988 int encidx = rb_enc_to_index(enc);
9989 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9991 if (encidx == rb_utf8_encindex()) {
9992 if (!reg_grapheme_cluster_utf8) {
9993 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9996 return reg_grapheme_cluster_utf8;
10005 size_t grapheme_cluster_count = 0;
10007 const char *ptr, *end;
10009 if (!rb_enc_unicode_p(enc)) {
10013 bool cached_reg_grapheme_cluster =
true;
10014 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10015 if (!reg_grapheme_cluster) {
10016 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10017 cached_reg_grapheme_cluster =
false;
10020 ptr = RSTRING_PTR(str);
10023 while (ptr < end) {
10024 OnigPosition
len = onig_match(reg_grapheme_cluster,
10025 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10026 (
const OnigUChar *)ptr, NULL, 0);
10027 if (
len <= 0)
break;
10028 grapheme_cluster_count++;
10032 if (!cached_reg_grapheme_cluster) {
10033 onig_free(reg_grapheme_cluster);
10036 return SIZET2NUM(grapheme_cluster_count);
10040rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
10044 const char *ptr0, *ptr, *end;
10046 if (!rb_enc_unicode_p(enc)) {
10047 return rb_str_enumerate_chars(str, ary);
10052 bool cached_reg_grapheme_cluster =
true;
10053 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10054 if (!reg_grapheme_cluster) {
10055 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10056 cached_reg_grapheme_cluster =
false;
10059 ptr0 = ptr = RSTRING_PTR(str);
10062 while (ptr < end) {
10063 OnigPosition
len = onig_match(reg_grapheme_cluster,
10064 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
10065 (
const OnigUChar *)ptr, NULL, 0);
10066 if (
len <= 0)
break;
10071 if (!cached_reg_grapheme_cluster) {
10072 onig_free(reg_grapheme_cluster);
10092rb_str_each_grapheme_cluster(
VALUE str)
10095 return rb_str_enumerate_grapheme_clusters(str, 0);
10107rb_str_grapheme_clusters(
VALUE str)
10110 return rb_str_enumerate_grapheme_clusters(str, ary);
10114chopped_length(
VALUE str)
10117 const char *p, *p2, *beg, *end;
10119 beg = RSTRING_PTR(str);
10120 end = beg + RSTRING_LEN(str);
10121 if (beg >= end)
return 0;
10122 p = rb_enc_prev_char(beg, end, end, enc);
10124 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10125 p2 = rb_enc_prev_char(beg, p, end, enc);
10126 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10144rb_str_chop_bang(
VALUE str)
10146 str_modify_keep_cr(str);
10147 if (RSTRING_LEN(str) > 0) {
10149 len = chopped_length(str);
10150 STR_SET_LEN(str,
len);
10151 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10170rb_str_chop(
VALUE str)
10176smart_chomp(
VALUE str,
const char *e,
const char *p)
10179 if (rb_enc_mbminlen(enc) > 1) {
10184 pp = e - rb_enc_mbminlen(enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10195 if (--e > p && *(e-1) ==
'\r') {
10212 char *pp, *e, *rsptr;
10214 char *
const p = RSTRING_PTR(str);
10215 long len = RSTRING_LEN(str);
10217 if (
len == 0)
return 0;
10220 return smart_chomp(str, e, p);
10223 enc = rb_enc_get(str);
10226 if (rb_enc_mbminlen(enc) > 1) {
10231 pp -= rb_enc_mbminlen(enc);
10234 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10241 while (e > p && *(e-1) ==
'\n') {
10243 if (e > p && *(e-1) ==
'\r')
10249 if (rslen >
len)
return len;
10251 enc = rb_enc_get(rs);
10252 newline = rsptr[rslen-1];
10253 if (rslen == rb_enc_mbminlen(enc)) {
10255 if (newline ==
'\n')
10256 return smart_chomp(str, e, p);
10260 return smart_chomp(str, e, p);
10264 enc = rb_enc_check(str, rs);
10265 if (is_broken_string(rs)) {
10269 if (p[
len-1] == newline &&
10271 memcmp(rsptr, pp, rslen) == 0)) {
10272 if (at_char_boundary(p, pp, e, enc))
10273 return len - rslen;
10285chomp_rs(
int argc,
const VALUE *argv)
10289 VALUE rs = argv[0];
10301 long olen = RSTRING_LEN(str);
10302 long len = chompped_length(str, rs);
10303 if (
len >= olen)
return Qnil;
10304 str_modify_keep_cr(str);
10305 STR_SET_LEN(str,
len);
10306 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10326rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10329 str_modifiable(str);
10330 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10331 rs = chomp_rs(argc, argv);
10333 return rb_str_chomp_string(str, rs);
10346rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10348 VALUE rs = chomp_rs(argc, argv);
10356 const char *
const start = s;
10358 if (!s || s >= e)
return 0;
10361 if (single_byte_optimizable(str)) {
10362 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10367 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10389rb_str_lstrip_bang(
VALUE str)
10393 long olen, loffset;
10395 str_modify_keep_cr(str);
10396 enc = STR_ENC_GET(str);
10398 loffset = lstrip_offset(str, start, start+olen, enc);
10400 long len = olen-loffset;
10401 s = start + loffset;
10402 memmove(start, s,
len);
10403 STR_SET_LEN(str,
len);
10404 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10428rb_str_lstrip(
VALUE str)
10433 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10434 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10443 rb_str_check_dummy_enc(enc);
10447 if (!s || s >= e)
return 0;
10451 if (single_byte_optimizable(str)) {
10453 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10458 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10478rb_str_rstrip_bang(
VALUE str)
10482 long olen, roffset;
10484 str_modify_keep_cr(str);
10485 enc = STR_ENC_GET(str);
10487 roffset = rstrip_offset(str, start, start+olen, enc);
10489 long len = olen - roffset;
10491 STR_SET_LEN(str,
len);
10492 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10515rb_str_rstrip(
VALUE str)
10519 long olen, roffset;
10521 enc = STR_ENC_GET(str);
10523 roffset = rstrip_offset(str, start, start+olen, enc);
10525 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10541rb_str_strip_bang(
VALUE str)
10544 long olen, loffset, roffset;
10547 str_modify_keep_cr(str);
10548 enc = STR_ENC_GET(str);
10550 loffset = lstrip_offset(str, start, start+olen, enc);
10551 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10553 if (loffset > 0 || roffset > 0) {
10554 long len = olen-roffset;
10557 memmove(start, start + loffset,
len);
10559 STR_SET_LEN(str,
len);
10560 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10583rb_str_strip(
VALUE str)
10586 long olen, loffset, roffset;
10590 loffset = lstrip_offset(str, start, start+olen, enc);
10591 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10593 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10598scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10601 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10607 end = pos + RSTRING_LEN(pat);
10621 if (RSTRING_LEN(str) > end)
10622 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10631 if (!regs || regs->num_regs == 1) {
10637 for (
int i = 1; i < regs->num_regs; i++) {
10698 long last = -1, prev = 0;
10699 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10701 pat = get_pat_quoted(pat, 1);
10702 mustnot_broken(str);
10706 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10711 if (last >= 0) rb_pat_search(pat, str, last, 1);
10716 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10720 str_mod_check(str, p,
len);
10722 if (last >= 0) rb_pat_search(pat, str, last, 1);
10749rb_str_hex(
VALUE str)
10751 return rb_str_to_inum(str, 16, FALSE);
10835rb_str_oct(
VALUE str)
10837 return rb_str_to_inum(str, -8, FALSE);
10840#ifndef HAVE_CRYPT_R
10845 rb_nativethread_lock_t lock;
10846} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10915# define CRYPT_END() ALLOCV_END(databuf)
10918 extern char *crypt(
const char *,
const char *);
10919# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10922 const char *s, *saltp;
10925 char salt_8bit_clean[3];
10929 mustnot_wchar(str);
10930 mustnot_wchar(salt);
10932 saltp = RSTRING_PTR(salt);
10933 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10934 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10938 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10939 salt_8bit_clean[0] = saltp[0] & 0x7f;
10940 salt_8bit_clean[1] = saltp[1] & 0x7f;
10941 salt_8bit_clean[2] =
'\0';
10942 saltp = salt_8bit_clean;
10947# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10948 data->initialized = 0;
10950 res = crypt_r(s, saltp, data);
10953 res = crypt(s, saltp);
10968 size_t res_size = strlen(res)+1;
10969 tmp_buf =
ALLOCA_N(
char, res_size);
10970 memcpy(tmp_buf, res, res_size);
11007 char *ptr, *p, *pend;
11010 unsigned long sum0 = 0;
11015 ptr = p = RSTRING_PTR(str);
11016 len = RSTRING_LEN(str);
11022 str_mod_check(str, ptr,
len);
11025 sum0 += (
unsigned char)*p;
11036 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
11037 sum0 &= (((
unsigned long)1)<<bits)-1;
11057rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
11061 long width,
len, flen = 1, fclen = 1;
11064 const char *f =
" ";
11065 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11067 int singlebyte = 1, cr;
11071 enc = STR_ENC_GET(str);
11072 termlen = rb_enc_mbminlen(enc);
11076 enc = rb_enc_check(str, pad);
11077 f = RSTRING_PTR(pad);
11078 flen = RSTRING_LEN(pad);
11079 fclen = str_strlen(pad, enc);
11080 singlebyte = single_byte_optimizable(pad);
11081 if (flen == 0 || fclen == 0) {
11082 rb_raise(rb_eArgError,
"zero width padding");
11085 len = str_strlen(str, enc);
11086 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11088 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11092 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11093 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11095 size = RSTRING_LEN(str);
11096 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11097 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11098 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11099 rb_raise(rb_eArgError,
"argument too big");
11103 p = RSTRING_PTR(res);
11105 memset(p, *f, llen);
11109 while (llen >= fclen) {
11115 memcpy(p, f, llen2);
11119 memcpy(p, RSTRING_PTR(str), size);
11122 memset(p, *f, rlen);
11126 while (rlen >= fclen) {
11132 memcpy(p, f, rlen2);
11136 TERM_FILL(p, termlen);
11137 STR_SET_LEN(res, p-RSTRING_PTR(res));
11158rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11160 return rb_str_justify(argc, argv, str,
'l');
11174rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11176 return rb_str_justify(argc, argv, str,
'r');
11189rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11191 return rb_str_justify(argc, argv, str,
'c');
11207 sep = get_pat_quoted(sep, 0);
11219 pos = rb_str_index(str, sep, 0);
11220 if (pos < 0)
goto failed;
11225 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11228 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11242 long pos = RSTRING_LEN(str);
11244 sep = get_pat_quoted(sep, 0);
11257 pos = rb_str_rindex(str, sep, pos);
11266 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11268 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11280rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11284 for (i=0; i<argc; i++) {
11285 VALUE tmp = argv[i];
11287 if (rb_reg_start_with_p(tmp, str))
11291 const char *p, *s, *e;
11296 enc = rb_enc_check(str, tmp);
11297 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11298 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11299 p = RSTRING_PTR(str);
11302 if (!at_char_right_boundary(p, s, e, enc))
11304 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11320rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11324 for (i=0; i<argc; i++) {
11325 VALUE tmp = argv[i];
11326 const char *p, *s, *e;
11331 enc = rb_enc_check(str, tmp);
11332 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11333 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11334 p = RSTRING_PTR(str);
11337 if (!at_char_boundary(p, s, e, enc))
11339 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11355deleted_prefix_length(
VALUE str,
VALUE prefix)
11357 const char *strptr, *prefixptr;
11358 long olen, prefixlen;
11363 if (!is_broken_string(prefix) ||
11364 !rb_enc_asciicompat(enc) ||
11365 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11366 enc = rb_enc_check(str, prefix);
11370 prefixlen = RSTRING_LEN(prefix);
11371 if (prefixlen <= 0)
return 0;
11372 olen = RSTRING_LEN(str);
11373 if (olen < prefixlen)
return 0;
11374 strptr = RSTRING_PTR(str);
11375 prefixptr = RSTRING_PTR(prefix);
11376 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11377 if (is_broken_string(prefix)) {
11378 if (!is_broken_string(str)) {
11382 const char *strend = strptr + olen;
11383 const char *after_prefix = strptr + prefixlen;
11384 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11405rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11408 str_modify_keep_cr(str);
11410 prefixlen = deleted_prefix_length(str, prefix);
11411 if (prefixlen <= 0)
return Qnil;
11425rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11429 prefixlen = deleted_prefix_length(str, prefix);
11430 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11432 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11445deleted_suffix_length(
VALUE str,
VALUE suffix)
11447 const char *strptr, *suffixptr;
11448 long olen, suffixlen;
11452 if (is_broken_string(suffix))
return 0;
11453 enc = rb_enc_check(str, suffix);
11456 suffixlen = RSTRING_LEN(suffix);
11457 if (suffixlen <= 0)
return 0;
11458 olen = RSTRING_LEN(str);
11459 if (olen < suffixlen)
return 0;
11460 strptr = RSTRING_PTR(str);
11461 suffixptr = RSTRING_PTR(suffix);
11462 const char *strend = strptr + olen;
11463 const char *before_suffix = strend - suffixlen;
11464 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11465 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11481rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11483 long olen, suffixlen,
len;
11484 str_modifiable(str);
11486 suffixlen = deleted_suffix_length(str, suffix);
11487 if (suffixlen <= 0)
return Qnil;
11489 olen = RSTRING_LEN(str);
11490 str_modify_keep_cr(str);
11491 len = olen - suffixlen;
11492 STR_SET_LEN(str,
len);
11493 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11509rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11513 suffixlen = deleted_suffix_length(str, suffix);
11514 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11516 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11523 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11531 val = rb_fs_check(val);
11534 "value of %"PRIsVALUE
" must be String or Regexp",
11538 rb_warn_deprecated(
"'$;'", NULL);
11555 str_modifiable(str);
11558 int idx = rb_enc_to_index(encoding);
11565 rb_enc_associate_index(str, idx);
11589 if (STR_EMBED_P(str)) {
11590 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11595 str_replace_shared_without_enc(str2, str);
11597 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11630rb_str_valid_encoding_p(
VALUE str)
11650rb_str_is_ascii_only_p(
VALUE str)
11660 static const char ellipsis[] =
"...";
11661 const long ellipsislen =
sizeof(ellipsis) - 1;
11663 const long blen = RSTRING_LEN(str);
11664 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11665 VALUE estr, ret = 0;
11668 if (
len * rb_enc_mbminlen(enc) >= blen ||
11672 else if (
len <= ellipsislen ||
11674 if (rb_enc_asciicompat(enc)) {
11676 rb_enc_associate(ret, enc);
11683 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11688 rb_enc_from_encoding(enc), 0,
Qnil);
11701 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11707 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11726 if (enc == STR_ENC_GET(str)) {
11731 return enc_str_scrub(enc, str, repl, cr);
11739 const char *rep, *p, *e, *p1, *sp;
11745 rb_raise(rb_eArgError,
"both of block and replacement given");
11752 if (!
NIL_P(repl)) {
11753 repl = str_compat_and_valid(repl, enc);
11756 if (rb_enc_dummy_p(enc)) {
11759 encidx = rb_enc_to_index(enc);
11761#define DEFAULT_REPLACE_CHAR(str) do { \
11762 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11763 rep = replace; replen = (int)sizeof(replace); \
11766 slen = RSTRING_LEN(str);
11767 p = RSTRING_PTR(str);
11772 if (rb_enc_asciicompat(enc)) {
11778 else if (!
NIL_P(repl)) {
11779 rep = RSTRING_PTR(repl);
11780 replen = RSTRING_LEN(repl);
11783 else if (encidx == rb_utf8_encindex()) {
11784 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11788 DEFAULT_REPLACE_CHAR(
"?");
11793 p = search_nonascii(p, e);
11798 int ret = rb_enc_precise_mbclen(p, e, enc);
11817 if (e - p < clen) clen = e - p;
11824 for (; clen > 1; clen--) {
11825 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11836 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11837 str_mod_check(str, sp, slen);
11838 repl = str_compat_and_valid(repl, enc);
11845 p = search_nonascii(p, e);
11871 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11872 str_mod_check(str, sp, slen);
11873 repl = str_compat_and_valid(repl, enc);
11882 long mbminlen = rb_enc_mbminlen(enc);
11886 else if (!
NIL_P(repl)) {
11887 rep = RSTRING_PTR(repl);
11888 replen = RSTRING_LEN(repl);
11890 else if (encidx == ENCINDEX_UTF_16BE) {
11891 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11893 else if (encidx == ENCINDEX_UTF_16LE) {
11894 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11896 else if (encidx == ENCINDEX_UTF_32BE) {
11897 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11899 else if (encidx == ENCINDEX_UTF_32LE) {
11900 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11903 DEFAULT_REPLACE_CHAR(
"?");
11907 int ret = rb_enc_precise_mbclen(p, e, enc);
11920 if (e - p < clen) clen = e - p;
11921 if (clen <= mbminlen * 2) {
11926 for (; clen > mbminlen; clen-=mbminlen) {
11927 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11937 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11938 str_mod_check(str, sp, slen);
11939 repl = str_compat_and_valid(repl, enc);
11964 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11965 str_mod_check(str, sp, slen);
11966 repl = str_compat_and_valid(repl, enc);
12002str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
12010static ID id_normalize;
12011static ID id_normalized_p;
12012static VALUE mUnicodeNormalize;
12015unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
12017 static int UnicodeNormalizeRequired = 0;
12020 if (!UnicodeNormalizeRequired) {
12021 rb_require(
"unicode_normalize/normalize.rb");
12022 UnicodeNormalizeRequired = 1;
12026 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
12063rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
12065 return unicode_normalize_common(argc, argv, str, id_normalize);
12079rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12081 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12108rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12110 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12242#define sym_equal rb_obj_equal
12245sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12249 int c = rb_enc_precise_mbclen(s, send, enc);
12253 c = rb_enc_mbc_to_codepoint(s, send, enc);
12261rb_str_symname_p(
VALUE sym)
12266 rb_encoding *resenc = rb_default_internal_encoding();
12268 if (resenc == NULL) resenc = rb_default_external_encoding();
12269 enc = STR_ENC_GET(sym);
12270 ptr = RSTRING_PTR(sym);
12271 len = RSTRING_LEN(sym);
12272 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12280rb_str_quote_unprintable(
VALUE str)
12288 resenc = rb_default_internal_encoding();
12289 if (resenc == NULL) resenc = rb_default_external_encoding();
12290 enc = STR_ENC_GET(str);
12291 ptr = RSTRING_PTR(str);
12292 len = RSTRING_LEN(str);
12293 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12294 !sym_printable(ptr, ptr +
len, enc)) {
12295 return rb_str_escape(str);
12301rb_id_quote_unprintable(
ID id)
12303 VALUE str = rb_id2str(
id);
12304 if (!rb_str_symname_p(str)) {
12305 return rb_str_escape(str);
12323sym_inspect(
VALUE sym)
12330 if (!rb_str_symname_p(str)) {
12332 len = RSTRING_LEN(str);
12333 rb_str_resize(str,
len + 1);
12334 dest = RSTRING_PTR(str);
12335 memmove(dest + 1, dest,
len);
12339 VALUE orig_str = str;
12341 len = RSTRING_LEN(orig_str);
12342 str = rb_enc_str_new(0,
len + 1, enc);
12345 ptr = RSTRING_PTR(orig_str);
12346 dest = RSTRING_PTR(str);
12347 memcpy(dest + 1, ptr,
len);
12367rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12372 rb_raise(rb_eArgError,
"no receiver given");
12469 return rb_str_match(
rb_sym2str(sym), other);
12484sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12486 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12499sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12501 return rb_str_match_m_p(argc, argv, sym);
12519 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12530sym_length(
VALUE sym)
12544sym_empty(
VALUE sym)
12578sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12594sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12610sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12624sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12626 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12639sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12641 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12653sym_encoding(
VALUE sym)
12659string_for_symbol(
VALUE name)
12664 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12678 name = string_for_symbol(name);
12679 return rb_intern_str(name);
12688 name = string_for_symbol(name);
12712 return rb_fstring(str);
12719 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12731 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12732 rb_enc_autoload(enc);
12736 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12742 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12743 rb_enc_autoload(enc);
12747 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12758rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12763 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12764 rb_str_buf_cat_byte(str, (
char) code);
12774fstring_set_class_i(
VALUE *str,
void *data)
12778 return ST_CONTINUE;
12786 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12953 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single one of the symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.