14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
49#include "ruby_assert.h"
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
67#undef rb_usascii_str_new
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
/* Flag str as non-embedded: set STR_NOEMBED and clear the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Flag str as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
/* Store n directly into the len field of str's RString. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
/* Terminator width for str: 1 when rb_str_enc_fastpath(str) holds, otherwise
 * rb_enc_mbminlen() of the encoding looked up from str's encoding index. */
149#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a zero terminator at ptr. The first byte is always cleared; when
 * termlen is greater than 1, all termlen bytes are zeroed with memset.
 * Both arguments are captured in locals, so each is evaluated once. */
150#define TERM_FILL(ptr, termlen) do {\
151 char *const term_fill_ptr = (ptr);\
152 const int term_fill_len = (termlen);\
153 *term_fill_ptr = '\0';\
154 if (UNLIKELY(term_fill_len > 1))\
155 memset(term_fill_ptr, 0, term_fill_len);\
/* Convenience form of RESIZE_CAPA_TERM that derives the terminator length
 * from str via TERM_LEN. */
158#define RESIZE_CAPA(str,capacity) do {\
159 const int termlen = TERM_LEN(str);\
160 RESIZE_CAPA_TERM(str,capacity,termlen);\
/* Change str's capacity to `capacity` bytes plus `termlen` terminator bytes.
 * Embedded case: when the embedded capacity is smaller than
 * capacity + termlen, a heap buffer of capacity + termlen chars is allocated,
 * str_embed_capa(str) bytes are copied into it, and str is switched to the
 * non-embedded layout with aux.capa = capacity and its length preserved.
 * The remaining visible statements assert that str is not STR_SHARED,
 * resize the heap buffer with SIZED_REALLOC_N and record the new capacity
 * (NOTE(review): the branch keyword introducing them is not in view;
 * presumably the non-embedded case). */
162#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
163 if (STR_EMBED_P(str)) {\
164 if (str_embed_capa(str) < capacity + termlen) {\
165 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
166 const long tlen = RSTRING_LEN(str);\
167 memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
168 RSTRING(str)->as.heap.ptr = tmp;\
169 RSTRING(str)->len = tlen;\
170 STR_SET_NOEMBED(str);\
171 RSTRING(str)->as.heap.aux.capa = (capacity);\
175 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
176 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
177 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
178 RSTRING(str)->as.heap.aux.capa = (capacity);\
/* Make str share the buffer owned by shared_str. Nothing is done for strings
 * flagged STR_FAKESTR. Otherwise: asserts that str's pointer lies within
 * [ptr, ptr + len] of shared_str, stores shared_str into aux.shared through
 * RB_OBJ_WRITE, sets STR_SHARED on str, calls rb_gc_register_pinning_obj(str),
 * sets STR_SHARED_ROOT on shared_str, and additionally sets STR_BORROWED on
 * shared_str when its class is 0. */
182#define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 rb_gc_register_pinning_obj(str); \
189 FL_SET((shared_str), STR_SHARED_ROOT); \
190 if (RBASIC_CLASS((shared_str)) == 0) \
191 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap buffer pointer of a non-embedded string. */
195#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator length. */
196#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Shorthand for get_encoding(str). */
199#define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 when not defined by the build.
 * With it disabled, SHARABLE_SUBSTRING_P is true only when the substring
 * reaches `end` (beg + len == end); the alternative definition is always 1. */
201#if !defined SHARABLE_MIDDLE_SUBSTRING
202# define SHARABLE_MIDDLE_SUBSTRING 0
204#if !SHARABLE_MIDDLE_SUBSTRING
205#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
207#define SHARABLE_SUBSTRING_P(beg, len, end) 1
/* Bytes available for embedded data in str's slot: the GC slot size minus
 * the offset of as.embed.ary within struct RString. */
212str_embed_capa(
VALUE str)
214 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/* True when str has none of STR_NOFREE, STR_SHARED_ROOT or STR_SHARED set. */
218rb_str_reembeddable_p(
VALUE str)
220 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
224rb_str_embed_size(
long capa,
long termlen)
/* Compute real_size for str. For an embedded string, and for one that
 * rb_str_reembeddable_p() accepts, the size is rb_str_embed_size(capa,
 * TERM_LEN(str)), where capa is first grown by sizeof(st_index_t) if
 * STR_PRECOMPUTED_HASH is set. Otherwise the size is sizeof(struct RString).
 * NOTE(review): the initial value of capa in each branch is not in view. */
232rb_str_size_as_embedded(
VALUE str)
235 if (STR_EMBED_P(str)) {
237 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
239 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
243 else if (rb_str_reembeddable_p(str)) {
245 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
247 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
250 real_size =
sizeof(
struct RString);
/* Whether a string of len bytes plus termlen terminator bytes can be
 * embedded: asks rb_gc_size_allocatable_p() about rb_str_embed_size(). */
257STR_EMBEDDABLE_P(
long len,
long termlen)
259 return rb_gc_size_allocatable_p(rb_str_embed_size(
len, termlen));
/* Forward declarations of file-local helpers used before their definitions. */
264static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
265static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
267static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
268static inline void str_modifiable(
VALUE str);
/* Calls str_make_independent_expand() with str's current length and
 * terminator length and an expansion of 0. */
273str_make_independent(
VALUE str)
275 long len = RSTRING_LEN(str);
276 int termlen = TERM_LEN(str);
277 str_make_independent_expand((str),
len, 0L, termlen);
280static inline int str_dependent_p(
VALUE str);
/* Make str independent only if str_dependent_p() reports it is dependent. */
283rb_str_make_independent(
VALUE str)
285 if (str_dependent_p(str)) {
286 str_make_independent(str);
/* Move a heap-backed string's bytes into str's own storage. The old heap
 * pointer and its allocated size (aux.capa + termlen) are read before the
 * length is set; then len bytes are copied to RSTRING_PTR(str) and the old
 * buffer is released with SIZED_FREE_N.
 * NOTE(review): the definition of len, the flag switch to embedded and any
 * guard around the copy are not in view. */
291rb_str_make_embedded(
VALUE str)
296 int termlen = TERM_LEN(str);
297 char *buf =
RSTRING(str)->as.heap.ptr;
298 long old_capa =
RSTRING(str)->as.heap.aux.capa + termlen;
302 STR_SET_LEN(str,
len);
305 memcpy(RSTRING_PTR(str), buf,
len);
306 SIZED_FREE_N(buf, old_capa);
/* Debug aid: prints to stderr a warning that a NULL is being returned and a
 * SIGSEGV is likely to follow. The leading %s is presumably filled with
 * func; the argument list is not in view. */
313rb_debug_rstring_null_ptr(
const char *func)
315 fprintf(stderr,
"%s is returning NULL!! "
316 "SIGSEGV is highly expected to follow immediately.\n"
317 "If you could reproduce, attach your debugger here, "
318 "and look at the passed string.\n",
323static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
326get_encoding(
VALUE str)
/* Raise ArgumentError naming str's encoding when is_broken_string(str)
 * is true. */
332mustnot_broken(
VALUE str)
334 if (is_broken_string(str)) {
335 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
/* Raise ArgumentError when the encoding's minimum character length is
 * greater than one byte. NOTE(review): enc is presumably str's encoding;
 * its definition is not in view. */
340mustnot_wchar(
VALUE str)
343 if (rb_enc_mbminlen(enc) > 1) {
344 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
348static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
350#if SIZEOF_LONG == SIZEOF_VOIDP
351#define PRECOMPUTED_FAKESTR_HASH 1
356BARE_STRING_P(
VALUE str)
/* Hash str's bytes with rb_memhash(). A further step applies when `e` is
 * non-zero and the string is not ASCII-only.
 * NOTE(review): the definition of `e` and the body of that branch are not
 * in view. */
361static inline st_index_t
362str_do_hash(
VALUE str)
364 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
366 if (e && !is_ascii_string(str)) {
/* Store `hash` in the bytes immediately after str's terminator
 * (RSTRING_END(str) + TERM_LEN(str)) and set STR_PRECOMPUTED_HASH.
 * used_bytes is length + terminator; free_bytes is what remains of the
 * embedded capacity (how it is checked is not in view). */
373str_store_precomputed_hash(
VALUE str, st_index_t hash)
379 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
380 size_t free_bytes = str_embed_capa(str) - used_bytes;
384 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
386 FL_SET(str, STR_PRECOMPUTED_HASH);
399 if (
FL_TEST(str, RSTRING_FSTR))
402 bare = BARE_STRING_P(str);
404 if (STR_EMBED_P(str)) {
409 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
416 rb_str_resize(str, RSTRING_LEN(str));
418 fstr = register_fstring(str,
false,
false);
421 str_replace_shared_without_enc(str, fstr);
428static VALUE fstring_table_obj;
/* Hash callback for the fstring table. When PRECOMPUTED_FAKESTR_HASH is
 * defined, the visible path reads the hash from as.heap.aux.capa.
 * NOTE(review): the condition selecting that path is not in view. */
431fstring_concurrent_set_hash(
VALUE str)
433#ifdef PRECOMPUTED_FAKESTR_HASH
437 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
454 const char *aptr, *bptr;
461 return (alen == blen &&
463 memcmp(aptr, bptr, alen) == 0);
468 bool force_precompute_hash;
/* Create callback for the fstring table. Visible behaviour:
 *  - when arg->force_precompute_hash is set and len + sizeof(st_index_t)
 *    is embeddable, str's bytes, length and encoding are copied into new_str
 *    and str_do_hash(str) is stored as its precomputed hash;
 *  - another path copies the encoding and, under PRECOMPUTED_FAKESTR_HASH,
 *    stores the value held in str's aux.capa as the hash when new_str has
 *    room for it;
 *  - later statements make a shared str independent, set RSTRING_FSTR on it
 *    and mark it shareable.
 * NOTE(review): how new_str and arg are obtained, and the conditions
 * guarding each path, are not in view. */
472fstring_concurrent_set_create(
VALUE str,
void *data)
482 long len = RSTRING_LEN(str);
483 long capa =
len +
sizeof(st_index_t);
484 int term_len = TERM_LEN(str);
486 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
488 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
489 STR_SET_LEN(new_str, RSTRING_LEN(str));
491 rb_enc_copy(new_str, str);
492 str_store_precomputed_hash(new_str, str_do_hash(str));
496 rb_enc_copy(new_str, str);
497#ifdef PRECOMPUTED_FAKESTR_HASH
498 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
499 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
513 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
516 if (STR_SHARED_P(str)) {
518 str_make_independent(str);
521 if (!BARE_STRING_P(str)) {
527 RBASIC(str)->flags |= RSTRING_FSTR;
529 RB_OBJ_SET_SHAREABLE(str);
543 .hash = fstring_concurrent_set_hash,
544 .cmp = fstring_concurrent_set_cmp,
545 .create = fstring_concurrent_set_create,
/* Create the fstring table with rb_concurrent_set_new() (second argument
 * 8192) and register the variable holding it with the GC. */
550Init_fstring_table(
void)
552 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
553 rb_gc_register_address(&fstring_table_obj);
/* Look up str in the fstring table, inserting it if absent, via
 * rb_concurrent_set_find_or_insert(). When pointers and longs have the same
 * size, the visible path stashes str_do_hash(str) in str's aux.capa first
 * (NOTE(review): the condition guarding that store is not in view).
 * The result is asserted not to be a garbage object. */
557register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
561 .force_precompute_hash = force_precompute_hash
564#if SIZEOF_VOIDP == SIZEOF_LONG
568 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
572 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
574 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
/* True when obj is the fstring table object itself. */
586rb_obj_is_fstring_table(
VALUE obj)
590 return obj == fstring_table_obj;
/* Remove obj from the fstring table by identity and bump the obj_str_fstr
 * debug counter. Asserts the VM is locked with a barrier on entry. */
594rb_gc_free_fstring(
VALUE obj)
596 ASSERT_vm_locking_with_barrier();
602 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
604 RB_DEBUG_COUNTER_INC(obj_str_fstr);
/* Run callback over the fstring table entries through
 * rb_concurrent_set_foreach_with_replace(); does nothing while the table
 * has not been created. */
610rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
612 if (fstring_table_obj) {
613 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
/* Initialise a caller-provided RString as a temporary string object and
 * return it as a VALUE. Only the shape-id reset to ROOT_SHAPE_ID is visible;
 * NOTE(review): the assignments of name, len and encidx are not in view. */
618setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
621 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
634 return (
VALUE)fake_str;
643 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
/* Register the len bytes at ptr as a US-ASCII fstring: a stack-allocated
 * fake string is set up over the buffer and passed to register_fstring()
 * with copy and force_precompute_hash both false. */
652rb_fstring_new(
const char *ptr,
long len)
654 struct RString fake_str = {RBASIC_INIT};
655 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
661 struct RString fake_str = {RBASIC_INIT};
662 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
/* rb_fstring_new() for a NUL-terminated C string; the length comes from
 * strlen(ptr). */
666rb_fstring_cstr(
const char *
ptr)
668 return rb_fstring_new(
ptr, strlen(
ptr));
672single_byte_optimizable(
VALUE str)
676 case ENCINDEX_ASCII_8BIT:
677 case ENCINDEX_US_ASCII:
/* Scan the byte range [p, e) for a byte with the high bit (0x80) set and
 * return a pointer to it.
 * NONASCII_MASK is a pointer-sized word with 0x80 in every byte, defined for
 * 8- and 4-byte uintptr_t. When unaligned word access is allowed or the
 * range is at least SIZEOF_VOIDP long, the visible code checks leading bytes
 * one at a time up to an alignment boundary, then loads whole words with
 * memcpy and tests them against the mask; the offending byte's offset is
 * derived from the leading-zero count (big endian) or trailing-zero count
 * (little endian) of the masked word, divided by 8. The final switch checks
 * the remaining tail bytes before e individually.
 * NOTE(review): the switch headers, the pointer advance before the first
 * switch and the not-found return value are not in view. */
699static inline const char *
700search_nonascii(
const char *p,
const char *e)
704#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
705# if SIZEOF_UINTPTR_T == 8
706# define NONASCII_MASK UINT64_C(0x8080808080808080)
707# elif SIZEOF_UINTPTR_T == 4
708# define NONASCII_MASK UINT32_C(0x80808080)
710# error "don't know what to do."
713# if SIZEOF_UINTPTR_T == 8
714# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
715# elif SIZEOF_UINTPTR_T == 4
716# define NONASCII_MASK 0x80808080UL
718# error "don't know what to do."
722 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
723#if !UNALIGNED_WORD_ACCESS
724 if ((uintptr_t)p % SIZEOF_VOIDP) {
725 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
730 case 7:
if (p[-7]&0x80)
return p-7;
731 case 6:
if (p[-6]&0x80)
return p-6;
732 case 5:
if (p[-5]&0x80)
return p-5;
733 case 4:
if (p[-4]&0x80)
return p-4;
735 case 3:
if (p[-3]&0x80)
return p-3;
736 case 2:
if (p[-2]&0x80)
return p-2;
737 case 1:
if (p[-1]&0x80)
return p-1;
742#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
743#define aligned_ptr(value) \
744 __builtin_assume_aligned((value), sizeof(uintptr_t))
746#define aligned_ptr(value) (value)
749 t = (e - (SIZEOF_VOIDP-1));
751 for (;s < t; s +=
sizeof(uintptr_t)) {
753 memcpy(&word, s,
sizeof(word));
754 if (word & NONASCII_MASK) {
755#ifdef WORDS_BIGENDIAN
756 return (
const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
758 return (
const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
768 case 7:
if (e[-7]&0x80)
return e-7;
769 case 6:
if (e[-6]&0x80)
return e-6;
770 case 5:
if (e[-5]&0x80)
return e-5;
771 case 4:
if (e[-4]&0x80)
return e-4;
773 case 3:
if (e[-3]&0x80)
return e-3;
774 case 2:
if (e[-2]&0x80)
return e-2;
775 case 1:
if (e[-1]&0x80)
return e-1;
783 const char *e = p +
len;
785 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
787 p = search_nonascii(p, e);
791 if (rb_enc_asciicompat(enc)) {
792 p = search_nonascii(p, e);
795 int ret = rb_enc_precise_mbclen(p, e, enc);
799 p = search_nonascii(p, e);
805 int ret = rb_enc_precise_mbclen(p, e, enc);
821 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
824 p = search_nonascii(p, e);
828 else if (rb_enc_asciicompat(enc)) {
829 p = search_nonascii(p, e);
835 int ret = rb_enc_precise_mbclen(p, e, enc);
842 p = search_nonascii(p, e);
848 int ret = rb_enc_precise_mbclen(p, e, enc);
873 rb_enc_set_index(str1, rb_enc_get_index(str2));
/* Copy src's encoding onto dest with str_enc_copy(), then branch on whether
 * dest is empty, whether src's encoding is ASCII-compatible, and whether
 * dest contains a non-ASCII byte.
 * NOTE(review): the code-range assignments made in each branch are not in
 * view. */
881rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
886 str_enc_copy(dest, src);
887 if (RSTRING_LEN(dest) == 0) {
888 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
899 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
900 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
/* Copy src's encoding onto dest via str_enc_copy().
 * NOTE(review): any further statements are not in view. */
911rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
913 str_enc_copy(dest, src);
920 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
926 return enc_coderange_scan(str, enc);
935 cr = enc_coderange_scan(str, get_encoding(str));
/* True when the encoding index is on the fast path, or when the encoding
 * for that index is ASCII-compatible. NOTE(review): encindex is presumably
 * str's encoding index; its definition is not in view. */
942rb_enc_str_asciicompat(
VALUE str)
945 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
953 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
/* Detect whether s's buffer pointer or length no longer match the
 * previously observed p and len. NOTE(review): the action taken on a
 * mismatch is not in view. */
962str_mod_check(
VALUE s,
const char *p,
long len)
964 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/* Capacity of str excluding the terminator. Embedded strings report the
 * embedded capacity minus termlen. Strings with STR_SHARED or STR_NOFREE
 * take a separate branch whose body is not in view. The remaining visible
 * return yields the recorded aux.capa. */
970str_capacity(
VALUE str,
const int termlen)
972 if (STR_EMBED_P(str)) {
973 return str_embed_capa(str) - termlen;
975 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
979 return RSTRING(str)->as.heap.aux.capa;
986 return str_capacity(str, TERM_LEN(str));
/* Raises ArgumentError with "NULL pointer given". NOTE(review): the guarding
 * condition is not in view; presumably ptr being NULL. */
990must_not_null(
const char *
ptr)
993 rb_raise(rb_eArgError,
"NULL pointer given");
1000 size_t size = rb_str_embed_size(
capa, 0);
1004 NEWOBJ_OF(str,
struct RString, klass,
1008 str->as.embed.ary[0] = 0;
/* Allocate a new RString object of class klass whose heap fields start
 * empty: aux.capa is 0 and the heap pointer is NULL. NOTE(review): the
 * remaining NEWOBJ_OF arguments and the return statement are not in view. */
1014str_alloc_heap(
VALUE klass)
1016 NEWOBJ_OF(str,
struct RString, klass,
1020 str->as.heap.aux.capa = 0;
1021 str->as.heap.ptr = NULL;
/* Allocate an embedded string of class klass with capacity 0 and zero the
 * whole embedded area. Fires the STRING creation DTrace hook with length 0
 * first. */
1027empty_str_alloc(
VALUE klass)
1029 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1030 VALUE str = str_alloc_embed(klass, 0);
1031 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1042 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1046 enc = rb_ascii8bit_encoding();
1049 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1051 int termlen = rb_enc_mbminlen(enc);
1053 if (STR_EMBEDDABLE_P(
len, termlen)) {
1054 str = str_alloc_embed(klass,
len + termlen);
1060 str = str_alloc_heap(klass);
1066 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1069 rb_enc_raw_set(str, enc);
1072 memcpy(RSTRING_PTR(str),
ptr,
len);
1075 memset(RSTRING_PTR(str), 0,
len);
1078 STR_SET_LEN(str,
len);
1079 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1086 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1121 __msan_unpoison_string(
ptr);
1141 if (rb_enc_mbminlen(enc) != 1) {
1142 rb_raise(rb_eArgError,
"wchar encoding given");
1144 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
/* Build a string object over a caller-owned buffer. Visible behaviour:
 * raises ArgumentError for an invalid size; one path creates an ordinary
 * copy through str_enc_new(); the other allocates a heap string, marks it
 * STR_NOFREE and associates encindex.
 * NOTE(review): the conditions selecting each path and the pointer/length
 * assignments are not in view. */
1148str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1153 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1157 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1160 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1161 str = str_alloc_heap(klass);
1165 RBASIC(str)->flags |= STR_NOFREE;
1166 rb_enc_associate_index(str, encindex);
1195static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1197 int ecflags,
VALUE ecopts);
1202 int encidx = rb_enc_to_index(enc);
1203 if (rb_enc_get_index(str) == encidx)
1204 return is_ascii_string(str);
1215 if (!to)
return str;
1216 if (!from) from = rb_enc_get(str);
1217 if (from == to)
return str;
1218 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1219 rb_is_ascii8bit_enc(to)) {
1220 if (STR_ENC_GET(str) != to) {
1222 rb_enc_associate(str, to);
1229 from, to, ecflags, ecopts);
1230 if (
NIL_P(newstr)) {
/* Validate ofs against newstr's current length, then delegate to
 * str_cat_conv_enc_opts(). ofs must lie in [-olen, olen]; a negative ofs is
 * taken relative to the end. The visible code sets newstr's length to ofs
 * and calls rb_str_modify() before delegating.
 * NOTE(review): the handling of an out-of-range ofs and the conditions
 * around the length reset are not in view. */
1238rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1243 olen = RSTRING_LEN(newstr);
1244 if (ofs < -olen || olen < ofs)
1246 if (ofs < 0) ofs += olen;
1248 STR_SET_LEN(newstr, ofs);
1252 rb_str_modify(newstr);
1253 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1261 STR_SET_LEN(str, 0);
1262 rb_enc_associate(str, enc);
/* Convert the bytes at ptr and write the output into newstr starting at
 * byte offset ofs. Returns Qnil when no converter (ec) is available.
 * Inside the loop, the extra room to add is estimated from the remaining
 * input scaled by the output/input ratio seen so far; the scaling is skipped
 * when rest * converted_output could exceed LONG_MAX, and at least 2 bytes
 * are always added before rb_str_resize(). Afterwards the final length is
 * computed from dp and newstr is associated with the target encoding.
 * NOTE(review): converter creation, the conversion call in the loop
 * condition and the return value are not in view. */
1268str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1270 int ecflags,
VALUE ecopts)
1275 VALUE econv_wrapper;
1276 const unsigned char *start, *sp;
1277 unsigned char *dest, *dp;
1278 size_t converted_output = (size_t)ofs;
1283 RBASIC_CLEAR_CLASS(econv_wrapper);
1285 if (!ec)
return Qnil;
1288 sp = (
unsigned char*)
ptr;
1290 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1291 (dp = dest + converted_output),
1295 size_t converted_input = sp - start;
1296 size_t rest =
len - converted_input;
1297 converted_output = dp - dest;
1299 if (converted_input && converted_output &&
1300 rest < (LONG_MAX / converted_output)) {
1301 rest = (rest * converted_output) / converted_input;
1306 olen += rest < 2 ? 2 : rest;
1307 rb_str_resize(newstr, olen);
1314 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1316 rb_enc_associate(newstr, to);
1335 const int eidx = rb_enc_to_index(eenc);
1338 return rb_enc_str_new(
ptr,
len, eenc);
1342 if ((eidx == rb_ascii8bit_encindex()) ||
1343 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1347 ienc = rb_default_internal_encoding();
1348 if (!ienc || eenc == ienc) {
1349 return rb_enc_str_new(
ptr,
len, eenc);
1353 if ((eidx == rb_ascii8bit_encindex()) ||
1354 (eidx == rb_usascii_encindex()) ||
1355 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1356 return rb_enc_str_new(
ptr,
len, ienc);
1359 str = rb_enc_str_new(NULL, 0, ienc);
1362 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1363 rb_str_initialize(str,
ptr,
len, eenc);
1371 int eidx = rb_enc_to_index(eenc);
1372 if (eidx == rb_usascii_encindex() &&
1373 !is_ascii_string(str)) {
1374 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1377 rb_enc_associate_index(str, eidx);
/* Make str2 hold the same contents as str without touching encoding.
 * If str2's embedded capacity fits len + termlen, the bytes are copied into
 * str2's embedded area and terminated. Otherwise str2 is made to share a
 * root string: when str is itself shared, its aux.shared is used as the
 * root. An owned heap buffer of str2 (not embedded, not STR_SHARED or
 * STR_NOFREE) is freed first, with rb_fatal() guarding one case described
 * by its message. Finally str2 is flagged STR_NOEMBED, linked to the root
 * with STR_SET_SHARED and given length len.
 * NOTE(review): the definition of len, the rb_fatal condition and the
 * non-shared root selection are not in view. */
1436str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1438 const int termlen = TERM_LEN(str);
1443 if (str_embed_capa(str2) >=
len + termlen) {
1444 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1445 STR_SET_EMBED(str2);
1446 memcpy(ptr2, RSTRING_PTR(str),
len);
1447 TERM_FILL(ptr2+
len, termlen);
1451 if (STR_SHARED_P(str)) {
1452 root =
RSTRING(str)->as.heap.aux.shared;
1461 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1463 rb_fatal(
"about to free a possible shared root");
1465 char *ptr2 = STR_HEAP_PTR(str2);
1467 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1470 FL_SET(str2, STR_NOEMBED);
1472 STR_SET_SHARED(str2, root);
1475 STR_SET_LEN(str2,
len);
1483 str_replace_shared_without_enc(str2, str);
1484 rb_enc_cr_str_exact_copy(str2, str);
1491 return str_replace_shared(str_alloc_heap(klass), str);
1508rb_str_new_frozen_String(
VALUE orig)
/* Fast path: return orig itself when it is already a frozen bare string.
 * NOTE(review): the fallback for other inputs is not in view. */
1516rb_str_frozen_bare_string(
VALUE orig)
1518 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
/* Obtain a frozen buffer for orig via str_new_frozen_buffer() with class 0
 * and without copying the encoding. NOTE(review): any early return before
 * this call is not in view. */
1523rb_str_tmp_frozen_acquire(
VALUE orig)
1526 return str_new_frozen_buffer(0, orig, FALSE);
/* Obtain a temporary frozen, non-embedded view of orig.
 *  - orig is returned as is when it is frozen, not embedded and not
 *    re-embeddable;
 *  - a shared orig whose root is not embedded goes through
 *    rb_str_tmp_frozen_acquire();
 *  - otherwise a class-less heap string flagged STR_SHARED_ROOT is created.
 *    For an embedded orig, or one with STR_SHARED, STR_SHARED_ROOT or
 *    RSTRING_FSTR, a fresh buffer of capa + terminator bytes is allocated.
 *    The other visible statements transfer STR_NOFREE from orig to the new
 *    string, make orig share it and mark it shareable.
 * aux.capa of the new string is capa adjusted by the difference between the
 * two strings' terminator lengths.
 * NOTE(review): the copy into the new buffer and the branch structure are
 * not in view. */
1530rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1532 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1533 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1535 VALUE str = str_alloc_heap(0);
1538 FL_SET(str, STR_SHARED_ROOT);
1540 size_t capa = str_capacity(orig, TERM_LEN(orig));
1546 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1547 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1554 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1555 RBASIC(orig)->flags &= ~STR_NOFREE;
1556 STR_SET_SHARED(orig, str);
1558 RB_OBJ_SET_SHAREABLE(str);
1564 RSTRING(str)->as.heap.aux.capa =
capa + (TERM_LEN(orig) - TERM_LEN(str));
/* Release a temporary frozen string tmp previously acquired for orig.
 * An embedded tmp takes its own branch (body not in view). When orig has
 * STR_TMPLOCK but not STR_SHARED (plus further conditions not in view), the
 * visible code asserts equal lengths, restores orig's capacity and
 * STR_NOFREE flag from tmp, and resets tmp's length to 0. */
1570rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1575 if (STR_EMBED_P(tmp)) {
1578 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1584 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1588 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1589 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1594 STR_SET_LEN(tmp, 0);
1602 return str_new_frozen_buffer(klass, orig, TRUE);
1612 VALUE str = str_alloc_heap(klass);
1613 STR_SET_LEN(str, RSTRING_LEN(orig));
1614 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1615 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1616 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1617 RBASIC(orig)->flags &= ~STR_NOFREE;
1618 STR_SET_SHARED(orig, str);
/* Produce a string of class klass backed by orig's contents. With
 * copy_encoding false, the ASCII-8BIT encoding and a terminator length of 1
 * are used instead of orig's.
 *  - an embedded or embeddable orig is copied with str_enc_new();
 *  - for a shared orig, ofs and rest measure how far orig's window sits from
 *    the start and end of the shared root; the visible code builds a new
 *    shared string over the root and narrows it by ofs and rest;
 *  - further visible paths create an embedded copy, a plain str_new() copy,
 *    or call heap_str_make_shared().
 * When copy_encoding is set, encoding information is copied with
 * rb_enc_cr_str_exact_copy().
 * NOTE(review): the conditions selecting these paths are largely not in
 * view. */
1625str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1629 long len = RSTRING_LEN(orig);
1630 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1631 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1633 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1634 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1640 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1641 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1647 if ((ofs > 0) || (rest > 0) ||
1650 str = str_new_shared(klass,
shared);
1652 RSTRING(str)->as.heap.ptr += ofs;
1653 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1661 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1662 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1664 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1665 STR_SET_LEN(str, RSTRING_LEN(orig));
1671 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1674 str = heap_str_make_shared(klass, orig);
1679 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
/* Copies str's encoding onto v with rb_enc_copy(). NOTE(review): how v is
 * created and what is returned are not in view. */
1691str_new_empty_String(
VALUE str)
1694 rb_enc_copy(v, str);
1698#define STR_BUF_MIN_SIZE 63
1703 if (STR_EMBEDDABLE_P(
capa, 1)) {
1711 RSTRING(str)->as.heap.ptr[0] =
'\0';
1731 return str_new(0, 0,
len);
1737 if (STR_EMBED_P(str)) {
1738 RB_DEBUG_COUNTER_INC(obj_str_embed);
1740 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1741 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1742 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1745 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1746 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/* Report STR_HEAP_SIZE(str) when str is non-embedded and neither shared nor
 * STR_NOFREE. NOTE(review): the value returned otherwise is not in view. */
1751rb_str_memsize(
VALUE str)
1753 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1754 return STR_HEAP_SIZE(str);
1764 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1767static inline void str_discard(
VALUE str);
1768static void str_shared_replace(
VALUE str,
VALUE str2);
1773 if (str != str2) str_shared_replace(str, str2);
1784 enc = STR_ENC_GET(str2);
1787 termlen = rb_enc_mbminlen(enc);
1789 STR_SET_LEN(str, RSTRING_LEN(str2));
1791 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1793 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1794 rb_enc_associate(str, enc);
1798 if (STR_EMBED_P(str2)) {
1800 long len = RSTRING_LEN(str2);
1803 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1804 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1805 RSTRING(str2)->as.heap.ptr = new_ptr;
1806 STR_SET_LEN(str2,
len);
1808 STR_SET_NOEMBED(str2);
1811 STR_SET_NOEMBED(str);
1813 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1815 if (
FL_TEST(str2, STR_SHARED)) {
1817 STR_SET_SHARED(str,
shared);
1820 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1824 STR_SET_EMBED(str2);
1825 RSTRING_PTR(str2)[0] = 0;
1826 STR_SET_LEN(str2, 0);
1827 rb_enc_associate(str, enc);
1841 return rb_obj_as_string_result(str, obj);
1857 len = RSTRING_LEN(str2);
1858 if (STR_SHARED_P(str2)) {
1861 STR_SET_NOEMBED(str);
1862 STR_SET_LEN(str,
len);
1863 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1864 STR_SET_SHARED(str,
shared);
1865 rb_enc_cr_str_exact_copy(str, str2);
1868 str_replace_shared(str, str2);
1877 size_t size = rb_str_embed_size(
capa, 0);
1881 NEWOBJ_OF(str,
struct RString, klass,
1892 NEWOBJ_OF(str,
struct RString, klass,
1895 str->as.heap.aux.capa = 0;
1896 str->as.heap.ptr = NULL;
1906 encidx = rb_enc_get_index(str);
1907 flags &= ~ENCODING_MASK;
1910 if (encidx) rb_enc_associate_index(dup, encidx);
1920 long len = RSTRING_LEN(str);
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1935 root =
RSTRING(str)->as.heap.aux.shared;
1938 root = str = str_new_frozen(klass, str);
1944 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1946 STR_SET_SHARED(dup, root);
1947 flags |= RSTRING_NOEMBED | STR_SHARED;
1949 STR_SET_LEN(dup, RSTRING_LEN(str));
1950 return str_duplicate_setup_encoding(str, dup, flags);
1956 if (STR_EMBED_P(str)) {
1957 return str_duplicate_setup_embed(klass, str, dup);
1960 return str_duplicate_setup_heap(klass, str, dup);
1968 if (STR_EMBED_P(str)) {
1969 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1972 dup = str_alloc_heap(klass);
1975 return str_duplicate_setup(klass, str, dup);
1986rb_str_dup_m(
VALUE str)
1988 if (LIKELY(BARE_STRING_P(str))) {
1999 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2006 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2010 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2011 str_duplicate_setup_embed(klass, str, new_str);
2014 new_str = ec_str_alloc_heap(ec, klass);
2015 str_duplicate_setup_heap(klass, str, new_str);
2024rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2026 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2050 static ID keyword_ids[2];
2051 VALUE orig, opt, venc, vcapa;
2056 if (!keyword_ids[0]) {
2057 keyword_ids[0] = rb_id_encoding();
2058 CONST_ID(keyword_ids[1],
"capacity");
2066 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2067 enc = rb_to_encoding(venc);
2069 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2072 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2074 if (
capa < STR_BUF_MIN_SIZE) {
2075 capa = STR_BUF_MIN_SIZE;
2079 len = RSTRING_LEN(orig);
2083 if (orig == str) n = 0;
2085 str_modifiable(str);
2086 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2088 const size_t size = (size_t)
capa + termlen;
2089 const char *
const old_ptr = RSTRING_PTR(str);
2090 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2091 char *new_ptr =
ALLOC_N(
char, size);
2092 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2093 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2095 RSTRING(str)->as.heap.ptr = new_ptr;
2097 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2098 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2099 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2101 STR_SET_LEN(str,
len);
2104 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2105 rb_enc_cr_str_exact_copy(str, orig);
2107 FL_SET(str, STR_NOEMBED);
2114 rb_enc_associate(str, enc);
2126rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2132 static ID keyword_ids[2];
2142 keyword_ids[0] = rb_id_encoding();
2143 CONST_ID(keyword_ids[1],
"capacity");
2145 encoding = kwargs[0];
2146 capacity = kwargs[1];
2155 if (UNDEF_P(encoding)) {
2157 encoding = rb_obj_encoding(orig);
2161 if (!UNDEF_P(encoding)) {
2162 enc = rb_to_encoding(encoding);
2166 if (UNDEF_P(capacity)) {
2168 VALUE empty_str = str_new(klass,
"", 0);
2170 rb_enc_associate(empty_str, enc);
2174 VALUE copy = str_duplicate(klass, orig);
2175 rb_enc_associate(copy, enc);
2188 if (orig_capa >
capa) {
2193 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2194 STR_SET_LEN(str, 0);
2205#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2220static inline uintptr_t
2221count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2226 d = (d>>6) | (~d>>7);
2227 d &= NONASCII_MASK >> 7;
2230#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2232 return rb_popcount_intptr(d);
2236# if SIZEOF_VOIDP == 8
/* Count characters in the byte range [p, e) for encoding enc.
 *  - one path divides the byte count by the minimum character length,
 *    rounding up when there is a remainder;
 *  - one path counts UTF-8 lead bytes: for ranges longer than two words it
 *    counts byte-wise up to a word boundary, then word-at-a-time with
 *    count_utf8_lead_bytes_with_word(), then byte-wise for the tail;
 *  - for ASCII-compatible encodings, search_nonascii() locates the next
 *    non-ASCII byte and characters are stepped with rb_enc_fast_mbclen() or
 *    rb_enc_mbclen();
 *  - the general path steps one character at a time with rb_enc_mbclen().
 * NOTE(review): the conditions selecting each path, including the role of
 * cr, are not in view. */
2245enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2251 long diff = (long)(e - p);
2252 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2257 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2258 const uintptr_t *s, *t;
2259 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2260 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2261 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2262 while (p < (
const char *)s) {
2263 if (is_utf8_lead_byte(*p))
len++;
2267 len += count_utf8_lead_bytes_with_word(s);
2270 p = (
const char *)s;
2273 if (is_utf8_lead_byte(*p))
len++;
2279 else if (rb_enc_asciicompat(enc)) {
2284 q = search_nonascii(p, e);
2290 p += rb_enc_fast_mbclen(p, e, enc);
2297 q = search_nonascii(p, e);
2303 p += rb_enc_mbclen(p, e, enc);
2310 for (c=0; p<e; c++) {
2311 p += rb_enc_mbclen(p, e, enc);
2326rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2334 long diff = (long)(e - p);
2335 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2337 else if (rb_enc_asciicompat(enc)) {
2341 q = search_nonascii(p, e);
2349 ret = rb_enc_precise_mbclen(p, e, enc);
2364 for (c=0; p<e; c++) {
2365 ret = rb_enc_precise_mbclen(p, e, enc);
2372 if (p + rb_enc_mbminlen(enc) <= e)
2373 p += rb_enc_mbminlen(enc);
2389 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2390 if (!enc) enc = STR_ENC_GET(str);
2391 p = RSTRING_PTR(str);
2396 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2401 return enc_strlen(p, e, enc, cr);
2408 return str_strlen(str, NULL);
2422 return LONG2NUM(str_strlen(str, NULL));
2434rb_str_bytesize(
VALUE str)
/* Ruby boolean: true when str's byte length is 0. */
2453rb_str_empty(
VALUE str)
2455 return RBOOL(RSTRING_LEN(str) == 0);
2474 char *ptr1, *ptr2, *ptr3;
2479 enc = rb_enc_check_str(str1, str2);
2482 termlen = rb_enc_mbminlen(enc);
2483 if (len1 > LONG_MAX - len2) {
2484 rb_raise(rb_eArgError,
"string size too big");
2486 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2487 ptr3 = RSTRING_PTR(str3);
2488 memcpy(ptr3, ptr1, len1);
2489 memcpy(ptr3+len1, ptr2, len2);
2490 TERM_FILL(&ptr3[len1+len2], termlen);
2506 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2509 int enc1 = rb_enc_get_index(str1);
2510 int enc2 = rb_enc_get_index(str2);
2515 else if (enc2 < 0) {
2518 else if (enc1 != enc2) {
2521 else if (len1 > LONG_MAX - len2) {
2555 rb_enc_copy(str2, str);
2560 rb_raise(rb_eArgError,
"negative argument");
2562 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2563 if (STR_EMBEDDABLE_P(
len, 1)) {
2565 memset(RSTRING_PTR(str2), 0,
len + 1);
2572 STR_SET_LEN(str2,
len);
2573 rb_enc_copy(str2, str);
2576 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2577 rb_raise(rb_eArgError,
"argument too big");
2580 len *= RSTRING_LEN(str);
2581 termlen = TERM_LEN(str);
2583 ptr2 = RSTRING_PTR(str2);
2585 n = RSTRING_LEN(str);
2586 memcpy(ptr2, RSTRING_PTR(str), n);
2587 while (n <=
len/2) {
2588 memcpy(ptr2 + n, ptr2, n);
2591 memcpy(ptr2 + n, ptr2,
len-n);
2593 STR_SET_LEN(str2,
len);
2594 TERM_FILL(&ptr2[
len], termlen);
2595 rb_enc_cr_str_copy_for_substr(str2, str);
/* Tests whether str carries STR_TMPLOCK. NOTE(review): the action taken
 * when it does is not in view. */
2634rb_check_lockedtmp(
VALUE str)
2636 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags that each prevent in-place modification: frozen, temporarily
 * locked, or chilled. */
2643#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
/* Check that str may be modified. Only when one of the
 * STR_UNMODIFIABLE_MASK flags is set does the slow path run: a chilled
 * string is reported through CHILLED_STRING_MUTATED(), then the tmp-lock and
 * frozen checks are applied. */
2645str_modifiable(
VALUE str)
2649 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2650 if (CHILLED_STRING_P(str)) {
2651 CHILLED_STRING_MUTATED(str);
2653 rb_check_lockedtmp(str);
2654 rb_check_frozen(str);
/* Tests whether str is embedded or has neither STR_SHARED nor STR_NOFREE.
 * NOTE(review): the values returned for each outcome are not in view;
 * presumably such strings are not dependent. */
2659str_dependent_p(
VALUE str)
2661 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_UNMODIFIABLE_MASK plus the flags marking a buffer str does not own
 * exclusively (STR_SHARED, STR_NOFREE). */
2671#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
/* When any STR_DEPENDANT_MASK flag is set, run the modifiability check and
 * return whether str is not dependent. NOTE(review): the return value on
 * the fast path is not in view. */
2673str_independent(
VALUE str)
2677 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2678 str_modifiable(str);
2679 return !str_dependent_p(str);
/* Give str its own buffer, with room for `expand` extra bytes.
 * If str is not embedded and capa + termlen fits in its embedded capacity,
 * the visible code takes a path ending with the length set to len
 * (NOTE(review): the copy into embedded storage is not in view).
 * Otherwise len bytes are copied from the old buffer into ptr; the old
 * buffer is freed only when str owned it (STR_NOEMBED set without
 * STR_NOFREE or STR_SHARED); then str is flagged non-embedded, the
 * shared/no-free flags are cleared, the terminator is written and the
 * length is set.
 * NOTE(review): the computation of capa and the allocation of ptr are not
 * in view. */
2685str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2695 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2700 STR_SET_LEN(str,
len);
2705 oldptr = RSTRING_PTR(str);
2707 memcpy(
ptr, oldptr,
len);
2709 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2710 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2712 STR_SET_NOEMBED(str);
2713 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2714 TERM_FILL(
ptr +
len, termlen);
2716 STR_SET_LEN(str,
len);
2723 if (!str_independent(str))
2724 str_make_independent(str);
2733 int termlen = TERM_LEN(str);
2734 long len = RSTRING_LEN(str);
2737 rb_raise(rb_eArgError,
"negative expanding string size");
2739 if (expand >= LONG_MAX -
len) {
2740 rb_raise(rb_eArgError,
"string size too big");
2743 if (!str_independent(str)) {
2744 str_make_independent_expand(str,
len, expand, termlen);
2746 else if (expand > 0) {
2747 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2754str_modify_keep_cr(
VALUE str)
2756 if (!str_independent(str))
2757 str_make_independent(str);
2764str_discard(
VALUE str)
2766 str_modifiable(str);
2767 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2768 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2769 RSTRING(str)->as.heap.ptr = 0;
2770 STR_SET_LEN(str, 0);
2777 int encindex = rb_enc_get_index(str);
2779 if (RB_UNLIKELY(encindex == -1)) {
2783 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2788 if (!rb_enc_asciicompat(enc)) {
2810 return RSTRING_PTR(str);
2814zero_filled(
const char *s,
int n)
2816 for (; n > 0; --n) {
2823str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2825 const char *e = s +
len;
2827 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2828 if (zero_filled(s, minlen))
return s;
2834str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2839 if (str_dependent_p(str)) {
2840 if (!zero_filled(s +
len, termlen))
2841 str_make_independent_expand(str,
len, 0L, termlen);
2844 TERM_FILL(s +
len, termlen);
2847 return RSTRING_PTR(str);
2851rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2853 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2854 long len = RSTRING_LEN(str);
2858 rb_check_lockedtmp(str);
2859 str_make_independent_expand(str,
len, 0L, termlen);
2861 else if (str_dependent_p(str)) {
2862 if (termlen > oldtermlen)
2863 str_make_independent_expand(str,
len, 0L, termlen);
2866 if (!STR_EMBED_P(str)) {
2871 if (termlen > oldtermlen) {
2872 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2880str_null_check(
VALUE str,
int *w)
2882 char *s = RSTRING_PTR(str);
2883 long len = RSTRING_LEN(str);
2886 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2888 minlen = rb_enc_mbminlen(enc);
2892 if (str_null_char(s,
len, minlen, enc)) {
2895 return str_fill_term(str, s,
len, minlen);
2900 if (!s || memchr(s, 0,
len)) {
2904 s = str_fill_term(str, s,
len, minlen);
2910rb_str_null_check(
VALUE str)
2918 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2919 if (!s || memchr(s, 0,
len)) {
2920 rb_raise(rb_eArgError,
"string contains null byte");
2925 const char *s = str_null_check(str, &w);
2928 rb_raise(rb_eArgError,
"string contains null char");
2930 rb_raise(rb_eArgError,
"string contains null byte");
2938rb_str_to_cstr(
VALUE str)
2941 return str_null_check(str, &w);
2949 char *s = str_null_check(str, &w);
2952 rb_raise(rb_eArgError,
"string contains null char");
2954 rb_raise(rb_eArgError,
"string contains null byte");
2960rb_str_fill_terminator(
VALUE str,
const int newminlen)
2962 char *s = RSTRING_PTR(str);
2963 long len = RSTRING_LEN(str);
2964 return str_fill_term(str, s,
len, newminlen);
2970 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2996str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
3005 else if (rb_enc_asciicompat(enc)) {
3006 const char *p2, *e2;
3009 while (p < e && 0 < nth) {
3016 p2 = search_nonascii(p, e2);
3025 n = rb_enc_mbclen(p, e, enc);
3036 while (p < e && nth--) {
3037 p += rb_enc_mbclen(p, e, enc);
3048 return str_nth_len(p, e, &nth, enc);
3052str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3057 p = str_nth_len(p, e, &nth, enc);
3066str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3068 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3069 if (!pp)
return e - p;
3076 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3077 STR_ENC_GET(str), single_byte_optimizable(str));
3082str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3085 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3086 const uintptr_t *s, *t;
3087 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3088 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3089 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3090 while (p < (
const char *)s) {
3091 if (is_utf8_lead_byte(*p)) nth--;
3095 nth -= count_utf8_lead_bytes_with_word(s);
3097 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3101 if (is_utf8_lead_byte(*p)) {
3102 if (nth == 0)
break;
3112str_utf8_offset(
const char *p,
const char *e,
long nth)
3114 const char *pp = str_utf8_nth(p, e, &nth);
3123 if (single_byte_optimizable(str) || pos < 0)
3126 char *p = RSTRING_PTR(str);
3127 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3132str_subseq(
VALUE str,
long beg,
long len)
3140 const int termlen = TERM_LEN(str);
3141 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3142 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg,
len, rb_str_enc_get(str));
3148 if (str_embed_capa(str2) >=
len + termlen) {
3149 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3150 STR_SET_EMBED(str2);
3151 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3152 TERM_FILL(ptr2+
len, termlen);
3154 STR_SET_LEN(str2,
len);
3158 str_replace_shared(str2, str);
3161 RSTRING(str2)->as.heap.ptr += beg;
3162 if (RSTRING_LEN(str2) >
len) {
3163 STR_SET_LEN(str2,
len);
3173 VALUE str2 = str_subseq(str, beg,
len);
3174 rb_enc_cr_str_copy_for_substr(str2, str);
3183 const long blen = RSTRING_LEN(str);
3185 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3187 if (
len < 0)
return 0;
3188 if (beg < 0 && -beg < 0)
return 0;
3192 if (single_byte_optimizable(str)) {
3193 if (beg > blen)
return 0;
3196 if (beg < 0)
return 0;
3198 if (
len > blen - beg)
3200 if (
len < 0)
return 0;
3205 if (
len > -beg)
len = -beg;
3209 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3212 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3218 slen = str_strlen(str, enc);
3220 if (beg < 0)
return 0;
3222 if (
len == 0)
goto end;
3225 else if (beg > 0 && beg > blen) {
3229 if (beg > str_strlen(str, enc))
return 0;
3234 enc == rb_utf8_encoding()) {
3235 p = str_utf8_nth(s, e, &beg);
3236 if (beg > 0)
return 0;
3237 len = str_utf8_offset(p, e,
len);
3243 p = s + beg * char_sz;
3247 else if (
len * char_sz > e - p)
3252 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3253 if (beg > 0)
return 0;
3257 len = str_offset(p, e,
len, enc, 0);
3265static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3270 return str_substr(str, beg,
len, TRUE);
3280str_substr(
VALUE str,
long beg,
long len,
int empty)
3284 if (!p)
return Qnil;
3285 if (!
len && !empty)
return Qnil;
3287 beg = p - RSTRING_PTR(str);
3289 VALUE str2 = str_subseq(str, beg,
len);
3290 rb_enc_cr_str_copy_for_substr(str2, str);
3298 if (CHILLED_STRING_P(str)) {
3303 rb_str_resize(str, RSTRING_LEN(str));
3321 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3364str_uminus(
VALUE str)
3369 return rb_fstring(str);
/*
 * From this point on, rb_str_dup_frozen is a plain textual alias for
 * rb_str_new_frozen (any earlier macro of that name was #undef'ed near the
 * top of the file).
 */
3373#define rb_str_dup_frozen rb_str_new_frozen
3378 rb_check_frozen(str);
3379 if (
FL_TEST(str, STR_TMPLOCK)) {
3382 FL_SET(str, STR_TMPLOCK);
3389 rb_check_frozen(str);
3390 if (!
FL_TEST(str, STR_TMPLOCK)) {
3410 const int termlen = TERM_LEN(str);
3412 str_modifiable(str);
3413 if (STR_SHARED_P(str)) {
3416 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3417 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3428 else if (
len > RSTRING_LEN(str)) {
3432 const char *
const new_end = RSTRING_PTR(str) +
len;
3442 else if (
len < RSTRING_LEN(str)) {
3450 STR_SET_LEN(str,
len);
3451 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3458 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3461 int independent = str_independent(str);
3462 long slen = RSTRING_LEN(str);
3463 const int termlen = TERM_LEN(str);
3465 if (slen >
len || (termlen != 1 && slen <
len)) {
3471 if (STR_EMBED_P(str)) {
3472 if (
len == slen)
return str;
3473 if (str_embed_capa(str) >=
len + termlen) {
3474 STR_SET_LEN(str,
len);
3478 str_make_independent_expand(str, slen,
len - slen, termlen);
3480 else if (str_embed_capa(str) >=
len + termlen) {
3482 char *
ptr = STR_HEAP_PTR(str);
3484 if (slen >
len) slen =
len;
3487 STR_SET_LEN(str,
len);
3489 SIZED_FREE_N(
ptr,
capa + termlen);
3493 else if (!independent) {
3494 if (
len == slen)
return str;
3495 str_make_independent_expand(str, slen,
len - slen, termlen);
3499 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3500 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3503 else if (
len == slen)
return str;
3504 STR_SET_LEN(str,
len);
3511str_ensure_available_capa(
VALUE str,
long len)
3513 str_modify_keep_cr(str);
3515 const int termlen = TERM_LEN(str);
3516 long olen = RSTRING_LEN(str);
3518 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3519 rb_raise(rb_eArgError,
"string sizes too big");
3522 long total = olen +
len;
3523 long capa = str_capacity(str, termlen);
3526 if (total >= LONG_MAX / 2) {
3529 while (total >
capa) {
3532 RESIZE_CAPA_TERM(str,
capa, termlen);
3537str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3540 str_modify_keep_cr(str);
3545 if (
len == 0)
return 0;
3547 long total, olen,
off = -1;
3549 const int termlen = TERM_LEN(str);
3552 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3556 long capa = str_capacity(str, termlen);
3558 if (olen > LONG_MAX -
len) {
3559 rb_raise(rb_eArgError,
"string sizes too big");
3563 if (total >= LONG_MAX / 2) {
3566 while (total >
capa) {
3569 RESIZE_CAPA_TERM(str,
capa, termlen);
3570 sptr = RSTRING_PTR(str);
3575 memcpy(sptr + olen,
ptr,
len);
3576 STR_SET_LEN(str, total);
3577 TERM_FILL(sptr + total, termlen);
/*
 * Convenience wrappers around str_buf_cat4() that always pass
 * keep_cr = false.
 *
 *   str_buf_cat(str, ptr, len)  - append len bytes starting at ptr.
 *   str_buf_cat2(str, ptr)      - length is taken from rb_strlen_lit(ptr);
 *                                 NOTE(review): presumably ptr must be a
 *                                 string literal for that to work - confirm
 *                                 against the rb_strlen_lit definition.
 *
 * Note that ptr is expanded twice in str_buf_cat2.
 */
3582#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3583#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3588 if (
len == 0)
return str;
3590 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3592 return str_buf_cat(str,
ptr,
len);
3603rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3608 if (UNLIKELY(!str_independent(str))) {
3609 str_make_independent(str);
3612 long string_length = -1;
3613 const int null_terminator_length = 1;
3618 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3619 rb_raise(rb_eArgError,
"string sizes too big");
3622 long string_capacity = str_capacity(str, null_terminator_length);
3628 if (LIKELY(string_capacity >= string_length + 1)) {
3630 sptr[string_length] = byte;
3631 STR_SET_LEN(str, string_length + 1);
3632 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3636 str_buf_cat(str, (
char *)&
byte, 1);
3652 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3663rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3664 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3673 if (str_encindex == ptr_encindex) {
3675 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3679 str_enc = rb_enc_from_index(str_encindex);
3680 ptr_enc = rb_enc_from_index(ptr_encindex);
3681 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3684 if (RSTRING_LEN(str) == 0) {
3687 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3693 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3702 *ptr_cr_ret = ptr_cr;
3704 if (str_encindex != ptr_encindex &&
3707 str_enc = rb_enc_from_index(str_encindex);
3708 ptr_enc = rb_enc_from_index(ptr_encindex);
3713 res_encindex = str_encindex;
3718 res_encindex = str_encindex;
3722 res_encindex = ptr_encindex;
3727 res_encindex = str_encindex;
3734 res_encindex = str_encindex;
3740 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3742 str_buf_cat(str,
ptr,
len);
3748 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3755 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3765 if (rb_enc_asciicompat(enc)) {
3766 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3772 unsigned int c = (
unsigned char)*
ptr;
3773 int len = rb_enc_codelen(c, enc);
3774 rb_enc_mbcput(c, buf, enc);
3775 rb_enc_cr_str_buf_cat(str, buf,
len,
3788 if (rb_str_enc_fastpath(str)) {
3792 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3798 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3809 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3825rb_str_concat_literals(
size_t num,
const VALUE *strary)
3829 unsigned long len = 1;
3834 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3836 str_enc_copy_direct(str, strary[0]);
3838 for (i = s; i < num; ++i) {
3839 const VALUE v = strary[i];
3843 if (encidx != ENCINDEX_US_ASCII) {
3845 rb_enc_set_index(str, encidx);
3858rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3860 str_modifiable(str);
3865 else if (argc > 1) {
3868 rb_enc_copy(arg_str, str);
3869 for (i = 0; i < argc; i++) {
3904rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3906 long needed_capacity = 0;
3910 for (
int index = 0; index < argc; index++) {
3911 VALUE obj = argv[index];
3919 needed_capacity += RSTRING_LEN(obj);
3924 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3931 str_ensure_available_capa(str, needed_capacity);
3934 for (
int index = 0; index < argc; index++) {
3935 VALUE obj = argv[index];
3940 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3941 char byte = (char)(
NUM2INT(obj) & 0xFF);
3955 rb_bug(
"append_as_bytes arguments should have been validated");
3959 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3960 TERM_FILL(sptr, TERM_LEN(str));
3965 for (
int index = 0; index < argc; index++) {
3966 VALUE obj = argv[index];
3983 rb_bug(
"append_as_bytes arguments should have been validated");
4062 if (rb_num_to_uint(str2, &code) == 0) {
4075 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4078 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4081 long pos = RSTRING_LEN(str1);
4086 switch (
len = rb_enc_codelen(code, enc)) {
4087 case ONIGERR_INVALID_CODE_POINT_VALUE:
4088 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4090 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4096 rb_enc_mbcput(code, buf, enc);
4097 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4098 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4100 rb_str_resize(str1, pos+
len);
4101 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4114rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4116 int encidx = rb_enc_to_index(enc);
4118 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4123 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4124 return ENCINDEX_ASCII_8BIT;
4146rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4148 str_modifiable(str);
4153 else if (argc > 1) {
4156 rb_enc_copy(arg_str, str);
4157 for (i = 0; i < argc; i++) {
4170 st_index_t precomputed_hash;
4171 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4173 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4174 return precomputed_hash;
4177 return str_do_hash(str);
4184 const char *ptr1, *ptr2;
4187 return (len1 != len2 ||
4189 memcmp(ptr1, ptr2, len1) != 0);
4201rb_str_hash_m(
VALUE str)
/*
 * Yields the smaller of a and b (b when a > b, otherwise a).
 * This is a macro, so the selected argument is evaluated a second time;
 * do not pass expressions with side effects.
 */
4207#define lesser(a,b) (((a)>(b))?(b):(a))
4215 if (RSTRING_LEN(str1) == 0)
return TRUE;
4216 if (RSTRING_LEN(str2) == 0)
return TRUE;
4219 if (idx1 == idx2)
return TRUE;
4224 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4228 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4238 const char *ptr1, *ptr2;
4241 if (str1 == str2)
return 0;
4244 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4253 if (len1 > len2)
return 1;
4256 if (retval > 0)
return 1;
4290 if (str1 == str2)
return Qtrue;
4297 return rb_str_eql_internal(str1, str2);
4311 if (str1 == str2)
return Qtrue;
4313 return rb_str_eql_internal(str1, str2);
4351 return rb_invcmp(str1, str2);
4393 return str_casecmp(str1, s);
4401 const char *p1, *p1end, *p2, *p2end;
4403 enc = rb_enc_compatible(str1, str2);
4408 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4409 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4410 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4411 while (p1 < p1end && p2 < p2end) {
4413 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4414 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4416 return INT2FIX(c1 < c2 ? -1 : 1);
4423 while (p1 < p1end && p2 < p2end) {
4424 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4425 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4427 if (0 <= c1 && 0 <= c2) {
4431 return INT2FIX(c1 < c2 ? -1 : 1);
4435 l1 = rb_enc_mbclen(p1, p1end, enc);
4436 l2 = rb_enc_mbclen(p2, p2end, enc);
4437 len = l1 < l2 ? l1 : l2;
4438 r = memcmp(p1, p2,
len);
4440 return INT2FIX(r < 0 ? -1 : 1);
4442 return INT2FIX(l1 < l2 ? -1 : 1);
4448 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4449 if (p1 == p1end)
return INT2FIX(-1);
4482 return str_casecmp_p(str1, s);
4489 VALUE folded_str1, folded_str2;
4490 VALUE fold_opt = sym_fold;
4492 enc = rb_enc_compatible(str1, str2);
4497 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4498 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4500 return rb_str_eql(folded_str1, folded_str2);
4504strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4505 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4507 const char *search_start = str_ptr;
4508 long pos, search_len = str_len - offset;
4512 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4513 if (pos < 0)
return pos;
4515 if (t == search_start + pos)
break;
4516 search_len -= t - search_start;
4517 if (search_len <= 0)
return -1;
4518 offset += t - search_start;
4521 return pos + offset;
/*
 * Thin wrappers around rb_strseq_index(str, sub, offset, in_byte):
 *
 *   rb_str_index     - in_byte = 0: string lengths are measured with
 *                      str_strlen() and the offset is converted through
 *                      str_offset() before searching.
 *   rb_str_byteindex - in_byte = 1: lengths and offset are used as byte
 *                      counts as-is.
 */
4525#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4526#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4529rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4531 const char *str_ptr, *str_ptr_end, *sub_ptr;
4532 long str_len, sub_len;
4535 enc = rb_enc_check(str, sub);
4536 if (is_broken_string(sub))
return -1;
4538 str_ptr = RSTRING_PTR(str);
4540 str_len = RSTRING_LEN(str);
4541 sub_ptr = RSTRING_PTR(sub);
4542 sub_len = RSTRING_LEN(sub);
4544 if (str_len < sub_len)
return -1;
4547 long str_len_char, sub_len_char;
4548 int single_byte = single_byte_optimizable(str);
4549 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4550 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4552 offset += str_len_char;
4553 if (offset < 0)
return -1;
4555 if (str_len_char - offset < sub_len_char)
return -1;
4556 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4559 if (sub_len == 0)
return offset;
4562 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4575rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4582 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4583 long slen = str_strlen(str, enc);
4585 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4597 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4598 enc, single_byte_optimizable(str));
4609 pos = rb_str_index(str, sub, pos);
4623str_ensure_byte_pos(
VALUE str,
long pos)
4625 if (!single_byte_optimizable(str)) {
4626 const char *s = RSTRING_PTR(str);
4628 const char *p = s + pos;
4629 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4631 "offset %ld does not land on character boundary", pos);
4704rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4710 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4711 long slen = RSTRING_LEN(str);
4713 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4724 str_ensure_byte_pos(str, pos);
4736 pos = rb_str_byteindex(str, sub, pos);
4737 if (pos >= 0)
return LONG2NUM(pos);
4744memrchr(
const char *search_str,
int chr,
long search_len)
4746 const char *ptr = search_str + search_len;
4747 while (ptr > search_str) {
4748 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4758 char *hit, *adjusted;
4760 long slen, searchlen;
4763 sbeg = RSTRING_PTR(str);
4764 slen = RSTRING_LEN(sub);
4765 if (slen == 0)
return s - sbeg;
4767 t = RSTRING_PTR(sub);
4769 searchlen = s - sbeg + 1;
4771 if (memcmp(s, t, slen) == 0) {
4776 hit = memrchr(sbeg, c, searchlen);
4779 if (hit != adjusted) {
4780 searchlen = adjusted - sbeg;
4783 if (memcmp(hit, t, slen) == 0)
4785 searchlen = adjusted - sbeg;
4786 }
while (searchlen > 0);
4800 enc = rb_enc_check(str, sub);
4801 if (is_broken_string(sub))
return -1;
4802 singlebyte = single_byte_optimizable(str);
4803 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4804 slen = str_strlen(sub, enc);
4807 if (
len < slen)
return -1;
4808 if (
len - pos < slen) pos =
len - slen;
4809 if (
len == 0)
return pos;
4811 sbeg = RSTRING_PTR(str);
4814 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4820 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4821 return str_rindex(str, sub, s, enc);
4833rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4838 long pos,
len = str_strlen(str, enc);
4840 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4842 if (pos < 0 && (pos +=
len) < 0) {
4848 if (pos >
len) pos =
len;
4856 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4857 enc, single_byte_optimizable(str));
4868 pos = rb_str_rindex(str, sub, pos);
4878rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4884 enc = rb_enc_check(str, sub);
4885 if (is_broken_string(sub))
return -1;
4886 len = RSTRING_LEN(str);
4887 slen = RSTRING_LEN(sub);
4890 if (
len < slen)
return -1;
4891 if (
len - pos < slen) pos =
len - slen;
4892 if (
len == 0)
return pos;
4894 sbeg = RSTRING_PTR(str);
4897 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4904 return str_rindex(str, sub, s, enc);
4994rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4998 long pos,
len = RSTRING_LEN(str);
5000 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
5002 if (pos < 0 && (pos +=
len) < 0) {
5008 if (pos >
len) pos =
len;
5014 str_ensure_byte_pos(str, pos);
5026 pos = rb_str_byterindex(str, sub, pos);
5027 if (pos >= 0)
return LONG2NUM(pos);
5069 switch (OBJ_BUILTIN_TYPE(y)) {
5123rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5130 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5161rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5165 re = get_pat(argv[0]);
5166 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5175static enum neighbor_char
5181 if (rb_enc_mbminlen(enc) > 1) {
5183 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5185 return NEIGHBOR_NOT_CHAR;
5187 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5189 if (!l)
return NEIGHBOR_NOT_CHAR;
5190 if (l !=
len)
return NEIGHBOR_WRAPPED;
5191 rb_enc_mbcput(c, p, enc);
5192 r = rb_enc_precise_mbclen(p, p +
len, enc);
5194 return NEIGHBOR_NOT_CHAR;
5196 return NEIGHBOR_FOUND;
5199 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5202 return NEIGHBOR_WRAPPED;
5203 ++((
unsigned char*)p)[i];
5204 l = rb_enc_precise_mbclen(p, p+
len, enc);
5208 return NEIGHBOR_FOUND;
5211 memset(p+l, 0xff,
len-l);
5217 for (len2 =
len-1; 0 < len2; len2--) {
5218 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5222 memset(p+len2+1, 0xff,
len-(len2+1));
5227static enum neighbor_char
5232 if (rb_enc_mbminlen(enc) > 1) {
5234 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5236 return NEIGHBOR_NOT_CHAR;
5238 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5239 if (!c)
return NEIGHBOR_NOT_CHAR;
5242 if (!l)
return NEIGHBOR_NOT_CHAR;
5243 if (l !=
len)
return NEIGHBOR_WRAPPED;
5244 rb_enc_mbcput(c, p, enc);
5245 r = rb_enc_precise_mbclen(p, p +
len, enc);
5247 return NEIGHBOR_NOT_CHAR;
5249 return NEIGHBOR_FOUND;
5252 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5255 return NEIGHBOR_WRAPPED;
5256 --((
unsigned char*)p)[i];
5257 l = rb_enc_precise_mbclen(p, p+
len, enc);
5261 return NEIGHBOR_FOUND;
5264 memset(p+l, 0,
len-l);
5270 for (len2 =
len-1; 0 < len2; len2--) {
5271 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5275 memset(p+len2+1, 0,
len-(len2+1));
5289static enum neighbor_char
5290enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5292 enum neighbor_char ret;
5296 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5300 const int max_gaps = 1;
5302 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5304 ctype = ONIGENC_CTYPE_DIGIT;
5306 ctype = ONIGENC_CTYPE_ALPHA;
5308 return NEIGHBOR_NOT_CHAR;
5311 for (
try = 0;
try <= max_gaps; ++
try) {
5312 ret = enc_succ_char(p,
len, enc);
5313 if (ret == NEIGHBOR_FOUND) {
5314 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5316 return NEIGHBOR_FOUND;
5323 ret = enc_pred_char(p,
len, enc);
5324 if (ret == NEIGHBOR_FOUND) {
5325 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5338 return NEIGHBOR_NOT_CHAR;
5341 if (ctype != ONIGENC_CTYPE_DIGIT) {
5343 return NEIGHBOR_WRAPPED;
5347 enc_succ_char(carry,
len, enc);
5348 return NEIGHBOR_WRAPPED;
5366 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5367 rb_enc_cr_str_copy_for_substr(str, orig);
5368 return str_succ(str);
5375 char *sbeg, *s, *e, *last_alnum = 0;
5376 int found_alnum = 0;
5378 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5379 long carry_pos = 0, carry_len = 1;
5380 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5382 slen = RSTRING_LEN(str);
5383 if (slen == 0)
return str;
5385 enc = STR_ENC_GET(str);
5386 sbeg = RSTRING_PTR(str);
5387 s = e = sbeg + slen;
5389 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5390 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5396 l = rb_enc_precise_mbclen(s, e, enc);
5397 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5398 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5399 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5401 case NEIGHBOR_NOT_CHAR:
5403 case NEIGHBOR_FOUND:
5405 case NEIGHBOR_WRAPPED:
5410 carry_pos = s - sbeg;
5415 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5416 enum neighbor_char neighbor;
5417 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5418 l = rb_enc_precise_mbclen(s, e, enc);
5419 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5420 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5422 neighbor = enc_succ_char(tmp, l, enc);
5424 case NEIGHBOR_FOUND:
5428 case NEIGHBOR_WRAPPED:
5431 case NEIGHBOR_NOT_CHAR:
5434 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5436 enc_succ_char(s, l, enc);
5438 if (!rb_enc_asciicompat(enc)) {
5439 MEMCPY(carry, s,
char, l);
5442 carry_pos = s - sbeg;
5446 RESIZE_CAPA(str, slen + carry_len);
5447 sbeg = RSTRING_PTR(str);
5448 s = sbeg + carry_pos;
5449 memmove(s + carry_len, s, slen - carry_pos);
5450 memmove(s, carry, carry_len);
5452 STR_SET_LEN(str, slen);
5453 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5469rb_str_succ_bang(
VALUE str)
5477all_digits_p(
const char *s,
long len)
5505 VALUE end, exclusive;
5509 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5515 VALUE current, after_end;
5522 enc = rb_enc_check(beg, end);
5523 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5525 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5526 char c = RSTRING_PTR(beg)[0];
5527 char e = RSTRING_PTR(end)[0];
5529 if (c > e || (excl && c == e))
return beg;
5531 VALUE str = rb_enc_str_new(&c, 1, enc);
5533 if ((*each)(str, arg))
break;
5534 if (!excl && c == e)
break;
5536 if (excl && c == e)
break;
5541 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5542 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5543 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5548 b = rb_str_to_inum(beg, 10, FALSE);
5549 e = rb_str_to_inum(end, 10, FALSE);
5556 if (excl && bi == ei)
break;
5557 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5562 ID op = excl ?
'<' : idLE;
5563 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5568 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5569 b = rb_funcallv(b, succ, 0, 0);
5576 if (n > 0 || (excl && n == 0))
return beg;
5578 after_end = rb_funcallv(end, succ, 0, 0);
5583 next = rb_funcallv(current, succ, 0, 0);
5584 if ((*each)(current, arg))
break;
5585 if (
NIL_P(next))
break;
5589 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5604 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5605 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5606 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5608 b = rb_str_to_inum(beg, 10, FALSE);
5614 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5622 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5623 b = rb_funcallv(b, succ, 0, 0);
5629 VALUE next = rb_funcallv(current, succ, 0, 0);
5630 if ((*each)(current, arg))
break;
5633 if (RSTRING_LEN(current) == 0)
5644 if (!
rb_equal(str, *argp))
return 0;
5658 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5659 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5660 rb_enc_asciicompat(STR_ENC_GET(val))) {
5661 const char *bp = RSTRING_PTR(beg);
5662 const char *ep = RSTRING_PTR(end);
5663 const char *vp = RSTRING_PTR(val);
5664 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5665 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5673 if (b <= v && v < e)
return Qtrue;
5674 return RBOOL(!
RTEST(exclusive) && v == e);
5681 all_digits_p(bp, RSTRING_LEN(beg)) &&
5682 all_digits_p(ep, RSTRING_LEN(end))) {
5687 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5689 return RBOOL(
NIL_P(val));
5712 return rb_str_subpat(str, indx,
INT2FIX(0));
5715 if (rb_str_index(str, indx, 0) != -1)
5721 long beg,
len = str_strlen(str, NULL);
5733 return str_substr(str, idx, 1, FALSE);
5750rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5754 return rb_str_subpat(str, argv[0], argv[1]);
5757 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5761 return rb_str_aref(str, argv[0]);
5767 char *ptr = RSTRING_PTR(str);
5768 long olen = RSTRING_LEN(str), nlen;
5770 str_modifiable(str);
5771 if (
len > olen)
len = olen;
5773 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5775 size_t old_capa =
RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5776 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5778 ptr =
RSTRING(str)->as.embed.ary;
5779 memmove(ptr, oldptr +
len, nlen);
5780 if (fl == STR_NOEMBED) {
5781 SIZED_FREE_N(oldptr, old_capa);
5785 if (!STR_SHARED_P(str)) {
5787 rb_enc_cr_str_exact_copy(shared, str);
5792 STR_SET_LEN(str, nlen);
5794 if (!SHARABLE_MIDDLE_SUBSTRING) {
5795 TERM_FILL(ptr + nlen, TERM_LEN(str));
5802rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5808 if (beg == 0 && vlen == 0) {
5813 str_modify_keep_cr(str);
5817 RESIZE_CAPA(str, slen + vlen -
len);
5818 sptr = RSTRING_PTR(str);
5827 memmove(sptr + beg + vlen,
5829 slen - (beg +
len));
5831 if (vlen < beg &&
len < 0) {
5835 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5838 STR_SET_LEN(str, slen);
5839 TERM_FILL(&sptr[slen], TERM_LEN(str));
5846 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5855 int singlebyte = single_byte_optimizable(str);
5861 enc = rb_enc_check(str, val);
5862 slen = str_strlen(str, enc);
5864 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5873 if (
len > slen - beg) {
5876 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5881 beg = p - RSTRING_PTR(str);
5883 rb_str_update_0(str, beg,
len, val);
5884 rb_enc_associate(str, enc);
5895 long start, end,
len;
5905 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5909 nth += regs->num_regs;
5919 enc = rb_enc_check_str(str, val);
5920 rb_str_update_0(str, start,
len, val);
5921 rb_enc_associate(str, enc);
5929 switch (
TYPE(indx)) {
5931 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5935 beg = rb_str_index(str, indx, 0);
5974rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5978 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5986 return rb_str_aset(str, argv[0], argv[1]);
6038rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6046 str_modify_keep_cr(str);
6054 if ((nth += regs->num_regs) <= 0)
return Qnil;
6056 else if (nth >= regs->num_regs)
return Qnil;
6058 len = END(nth) - beg;
6061 else if (argc == 2) {
6070 beg = p - RSTRING_PTR(str);
6074 beg = rb_str_index(str, indx, 0);
6075 if (beg == -1)
return Qnil;
6076 len = RSTRING_LEN(indx);
6088 beg = p - RSTRING_PTR(str);
6097 beg = p - RSTRING_PTR(str);
6101 rb_enc_cr_str_copy_for_substr(result, str);
6109 char *sptr = RSTRING_PTR(str);
6110 long slen = RSTRING_LEN(str);
6111 if (beg +
len > slen)
6115 slen - (beg +
len));
6117 STR_SET_LEN(str, slen);
6118 TERM_FILL(&sptr[slen], TERM_LEN(str));
6129 switch (OBJ_BUILTIN_TYPE(pat)) {
6148get_pat_quoted(
VALUE pat,
int check)
6152 switch (OBJ_BUILTIN_TYPE(pat)) {
6166 if (check && is_broken_string(pat)) {
6173rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6176 pos = rb_str_byteindex(str, pat, pos);
6177 if (set_backref_str) {
6179 str = rb_str_new_frozen_String(str);
6180 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6182 *match = match_data;
6192 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6197rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6199 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6217rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6232 hash = rb_check_hash_type(repl);
6239 pat = get_pat_quoted(argv[0], 1);
6241 str_modifiable(str);
6242 beg = rb_pat_search(pat, str, 0, 1);
6256 end0 = beg0 + RSTRING_LEN(pat);
6265 if (iter || !
NIL_P(hash)) {
6266 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6272 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6275 str_mod_check(str, p,
len);
6276 rb_check_frozen(str);
6282 enc = rb_enc_compatible(str, repl);
6285 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6289 rb_enc_inspect_name(str_enc),
6290 rb_enc_inspect_name(STR_ENC_GET(repl)));
6292 enc = STR_ENC_GET(repl);
6295 rb_enc_associate(str, enc);
6305 rlen = RSTRING_LEN(repl);
6306 len = RSTRING_LEN(str);
6308 RESIZE_CAPA(str,
len + rlen - plen);
6310 p = RSTRING_PTR(str);
6312 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6314 rp = RSTRING_PTR(repl);
6315 memmove(p + beg0, rp, rlen);
6317 STR_SET_LEN(str,
len);
6318 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6341 rb_str_sub_bang(argc, argv, str);
6346str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6349 long beg, beg0, end0;
6350 long offset, blen, slen,
len, last;
6351 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6353 int need_backref_str = -1;
6364 hash = rb_check_hash_type(repl);
6368 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6377 rb_error_arity(argc, 1, 2);
6380 pat = get_pat_quoted(argv[0], 1);
6381 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6384 if (bang)
return Qnil;
6389 blen = RSTRING_LEN(str) + 30;
6391 sp = RSTRING_PTR(str);
6392 slen = RSTRING_LEN(str);
6394 str_enc = STR_ENC_GET(str);
6395 rb_enc_associate(dest, str_enc);
6402 end0 = beg0 + RSTRING_LEN(pat);
6416 struct RString fake_str = {RBASIC_INIT};
6418 if (mode == FAST_MAP) {
6427 val = rb_hash_aref(hash, key);
6430 str_mod_check(str, sp, slen);
6435 else if (need_backref_str) {
6437 if (need_backref_str < 0) {
6438 need_backref_str = val != repl;
6445 len = beg0 - offset;
6459 if (RSTRING_LEN(str) <= end0)
break;
6460 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6462 offset = end0 +
len;
6464 cp = RSTRING_PTR(str) + offset;
6465 if (offset > RSTRING_LEN(str))
break;
6468 if (mode != FAST_MAP && mode != STR) {
6471 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6476 if (RSTRING_LEN(str) > offset) {
6479 rb_pat_search0(pat, str, last, 1, &match);
6481 str_shared_replace(str, dest);
6506rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6508 str_modify_keep_cr(str);
6509 return str_gsub(argc, argv, str, 1);
6559 return str_gsub(argc, argv, str, 0);
6579 str_modifiable(str);
6580 if (str == str2)
return str;
6584 return str_replace(str, str2);
6601rb_str_clear(
VALUE str)
6605 STR_SET_LEN(str, 0);
6606 RSTRING_PTR(str)[0] = 0;
6607 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6623rb_str_chr(
VALUE str)
6641 pos += RSTRING_LEN(str);
6642 if (pos < 0 || RSTRING_LEN(str) <= pos)
6645 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6665 long len = RSTRING_LEN(str);
6666 char *
ptr, *head, *left = 0;
6670 if (pos < -
len ||
len <= pos)
6677 char byte = (char)(
NUM2INT(w) & 0xFF);
6679 if (!str_independent(str))
6680 str_make_independent(str);
6681 enc = STR_ENC_GET(str);
6682 head = RSTRING_PTR(str);
6684 if (!STR_EMBED_P(str)) {
6691 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6699 width = rb_enc_precise_mbclen(left, head+
len, enc);
6701 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6717str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6719 long n = RSTRING_LEN(str);
6721 if (beg > n ||
len < 0)
return Qnil;
6724 if (beg < 0)
return Qnil;
6729 if (!empty)
return Qnil;
6733 VALUE str2 = str_subseq(str, beg,
len);
6735 str_enc_copy_direct(str2, str);
6737 if (RSTRING_LEN(str2) == 0) {
6738 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6772 long beg,
len = RSTRING_LEN(str);
6780 return str_byte_substr(str, beg,
len, TRUE);
6785 return str_byte_substr(str, idx, 1, FALSE);
6797rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6802 return str_byte_substr(str, beg,
len, TRUE);
6805 return str_byte_aref(str, argv[0]);
6809str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6811 long end, slen = RSTRING_LEN(str);
6814 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6823 if (*
len > slen - *beg) {
6827 str_ensure_byte_pos(str, *beg);
6828 str_ensure_byte_pos(str, end);
6842rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6844 long beg,
len, vbeg, vlen;
6849 if (!(argc == 2 || argc == 3 || argc == 5)) {
6850 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6854 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6855 rb_builtin_class_name(argv[0]));
6862 vlen = RSTRING_LEN(val);
6867 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6868 rb_builtin_class_name(argv[2]));
6880 vlen = RSTRING_LEN(val);
6888 str_check_beg_len(str, &beg, &
len);
6889 str_check_beg_len(val, &vbeg, &vlen);
6890 str_modify_keep_cr(str);
6893 rb_enc_associate(str, rb_enc_check(str, val));
6896 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6918rb_str_reverse(
VALUE str)
6925 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6926 enc = STR_ENC_GET(str);
6932 if (RSTRING_LEN(str) > 1) {
6933 if (single_byte_optimizable(str)) {
6940 int clen = rb_enc_fast_mbclen(s, e, enc);
6948 cr = rb_enc_asciicompat(enc) ?
6951 int clen = rb_enc_mbclen(s, e, enc);
6960 STR_SET_LEN(rev, RSTRING_LEN(str));
6961 str_enc_copy_direct(rev, str);
6983rb_str_reverse_bang(
VALUE str)
6985 if (RSTRING_LEN(str) > 1) {
6986 if (single_byte_optimizable(str)) {
6989 str_modify_keep_cr(str);
6990 s = RSTRING_PTR(str);
6999 str_shared_replace(str, rb_str_reverse(str));
7003 str_modify_keep_cr(str);
7032 i = rb_str_index(str, arg, 0);
7034 return RBOOL(i != -1);
7078 rb_raise(rb_eArgError,
"invalid radix %d", base);
7080 return rb_str_to_inum(str, base, FALSE);
7105rb_str_to_f(
VALUE str)
7122rb_str_to_s(
VALUE str)
7134 char s[RUBY_MAX_CHAR_LEN];
7135 int n = rb_enc_codelen(c, enc);
7137 rb_enc_mbcput(c, s, enc);
7142#define CHAR_ESC_LEN 13
7145rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7147 char buf[CHAR_ESC_LEN + 1];
7155 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7157 else if (c < 0x10000) {
7158 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7161 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7166 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7169 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7172 l = (int)strlen(buf);
7178ruby_escaped_char(
int c)
7181 case '\0':
return "\\0";
7182 case '\n':
return "\\n";
7183 case '\r':
return "\\r";
7184 case '\t':
return "\\t";
7185 case '\f':
return "\\f";
7186 case '\013':
return "\\v";
7187 case '\010':
return "\\b";
7188 case '\007':
return "\\a";
7189 case '\033':
return "\\e";
7190 case '\x7f':
return "\\c?";
7196rb_str_escape(
VALUE str)
7200 const char *p = RSTRING_PTR(str);
7202 const char *prev = p;
7203 char buf[CHAR_ESC_LEN + 1];
7205 int unicode_p = rb_enc_unicode_p(enc);
7206 int asciicompat = rb_enc_asciicompat(enc);
7211 int n = rb_enc_precise_mbclen(p, pend, enc);
7213 if (p > prev) str_buf_cat(result, prev, p - prev);
7214 n = rb_enc_mbminlen(enc);
7216 n = (int)(pend - p);
7218 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7219 str_buf_cat(result, buf, strlen(buf));
7225 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7227 cc = ruby_escaped_char(c);
7229 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7230 str_buf_cat(result, cc, strlen(cc));
7233 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7236 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7237 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7241 if (p > prev) str_buf_cat(result, prev, p - prev);
7260 const char *p, *pend, *prev;
7261 char buf[CHAR_ESC_LEN + 1];
7263 rb_encoding *resenc = rb_default_internal_encoding();
7264 int unicode_p = rb_enc_unicode_p(enc);
7265 int asciicompat = rb_enc_asciicompat(enc);
7267 if (resenc == NULL) resenc = rb_default_external_encoding();
7268 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7269 rb_enc_associate(result, resenc);
7270 str_buf_cat2(result,
"\"");
7278 n = rb_enc_precise_mbclen(p, pend, enc);
7280 if (p > prev) str_buf_cat(result, prev, p - prev);
7281 n = rb_enc_mbminlen(enc);
7283 n = (int)(pend - p);
7285 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7286 str_buf_cat(result, buf, strlen(buf));
7292 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7294 if ((asciicompat || unicode_p) &&
7295 (c ==
'"'|| c ==
'\\' ||
7300 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7301 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7302 str_buf_cat2(result,
"\\");
7303 if (asciicompat || enc == resenc) {
7309 case '\n': cc =
'n';
break;
7310 case '\r': cc =
'r';
break;
7311 case '\t': cc =
't';
break;
7312 case '\f': cc =
'f';
break;
7313 case '\013': cc =
'v';
break;
7314 case '\010': cc =
'b';
break;
7315 case '\007': cc =
'a';
break;
7316 case 033: cc =
'e';
break;
7317 default: cc = 0;
break;
7320 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7323 str_buf_cat(result, buf, 2);
7336 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7340 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7341 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7346 if (p > prev) str_buf_cat(result, prev, p - prev);
7347 str_buf_cat2(result,
"\"");
7352#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7365 int encidx = rb_enc_get_index(str);
7368 const char *p, *pend;
7371 int u8 = (encidx == rb_utf8_encindex());
7372 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7375 if (!rb_enc_asciicompat(enc)) {
7377 len += strlen(enc->name);
7380 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7383 unsigned char c = *p++;
7386 case '"':
case '\\':
7387 case '\n':
case '\r':
7388 case '\t':
case '\f':
7389 case '\013':
case '\010':
case '\007':
case '\033':
7394 clen = IS_EVSTR(p, pend) ? 2 : 1;
7402 if (u8 && c > 0x7F) {
7403 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7405 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7408 else if (cc <= 0xFFFFF)
7421 if (clen > LONG_MAX -
len) {
7428 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7429 q = RSTRING_PTR(result); qend = q +
len + 1;
7433 unsigned char c = *p++;
7435 if (c ==
'"' || c ==
'\\') {
7439 else if (c ==
'#') {
7440 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7443 else if (c ==
'\n') {
7447 else if (c ==
'\r') {
7451 else if (c ==
'\t') {
7455 else if (c ==
'\f') {
7459 else if (c ==
'\013') {
7463 else if (c ==
'\010') {
7467 else if (c ==
'\007') {
7471 else if (c ==
'\033') {
7481 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7483 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7486 snprintf(q, qend-q,
"u%04X", cc);
7488 snprintf(q, qend-q,
"u{%X}", cc);
7493 snprintf(q, qend-q,
"x%02X", c);
7499 if (!rb_enc_asciicompat(enc)) {
7500 snprintf(q, qend-q, nonascii_suffix, enc->name);
7501 encidx = rb_ascii8bit_encindex();
7504 rb_enc_associate_index(result, encidx);
7510unescape_ascii(
unsigned int c)
7534undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7536 const char *s = *ss;
7540 unsigned char buf[6];
7558 *buf = unescape_ascii(*s);
7570 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7571 if (*penc != enc_utf8) {
7573 rb_enc_associate(undumped, enc_utf8);
7590 if (hexlen == 0 || hexlen > 6) {
7596 if (0xd800 <= c && c <= 0xdfff) {
7599 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7609 if (0xd800 <= c && c <= 0xdfff) {
7612 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7642static VALUE rb_str_is_ascii_only_p(
VALUE str);
7654str_undump(
VALUE str)
7656 const char *s = RSTRING_PTR(str);
7659 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7661 bool binary =
false;
7665 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7668 if (!str_null_check(str, &w)) {
7671 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7672 if (*s !=
'"')
goto invalid_format;
7690 static const char force_encoding_suffix[] =
".force_encoding(\"";
7691 static const char dup_suffix[] =
".dup";
7692 const char *encname;
7697 size =
sizeof(dup_suffix) - 1;
7698 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7700 size =
sizeof(force_encoding_suffix) - 1;
7701 if (s_end - s <= size)
goto invalid_format;
7702 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7706 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7710 s = memchr(s,
'"', s_end-s);
7712 if (!s)
goto invalid_format;
7713 if (s_end - s != 2)
goto invalid_format;
7714 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7716 encidx = rb_enc_find_index2(encname, (
long)size);
7720 rb_enc_associate_index(undumped, encidx);
7730 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7741 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7747 if (rb_enc_dummy_p(enc)) {
7754str_true_enc(
VALUE str)
7757 rb_str_check_dummy_enc(enc);
7761static OnigCaseFoldType
7762check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7767 rb_raise(rb_eArgError,
"too many options");
7768 if (argv[0]==sym_turkic) {
7769 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7771 if (argv[1]==sym_lithuanian)
7772 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7774 rb_raise(rb_eArgError,
"invalid second option");
7777 else if (argv[0]==sym_lithuanian) {
7778 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7780 if (argv[1]==sym_turkic)
7781 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7783 rb_raise(rb_eArgError,
"invalid second option");
7787 rb_raise(rb_eArgError,
"too many options");
7788 else if (argv[0]==sym_ascii)
7789 flags |= ONIGENC_CASE_ASCII_ONLY;
7790 else if (argv[0]==sym_fold) {
7791 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7792 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7794 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7797 rb_raise(rb_eArgError,
"invalid option");
7804 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7810#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7811#ifndef CASEMAP_DEBUG
7812# define CASEMAP_DEBUG 0
7820 OnigUChar space[FLEX_ARY_LEN];
7824mapping_buffer_free(
void *p)
7828 while (current_buffer) {
7829 previous_buffer = current_buffer;
7830 current_buffer = current_buffer->next;
7831 ruby_sized_xfree(previous_buffer, offsetof(
mapping_buffer, space) + previous_buffer->capa);
7837 {0, mapping_buffer_free,},
7846 const OnigUChar *source_current, *source_end;
7847 int target_length = 0;
7848 VALUE buffer_anchor;
7851 size_t buffer_count = 0;
7852 int buffer_length_or_invalid;
7854 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7856 source_current = (OnigUChar*)RSTRING_PTR(source);
7861 while (source_current < source_end) {
7863 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7864 if (CASEMAP_DEBUG) {
7865 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7868 *pre_buffer = current_buffer;
7869 pre_buffer = ¤t_buffer->next;
7870 current_buffer->next = NULL;
7871 current_buffer->capa =
capa;
7872 buffer_length_or_invalid = enc->case_map(flags,
7873 &source_current, source_end,
7874 current_buffer->space,
7875 current_buffer->space+current_buffer->capa,
7877 if (buffer_length_or_invalid < 0) {
7878 current_buffer =
DATA_PTR(buffer_anchor);
7880 mapping_buffer_free(current_buffer);
7881 rb_raise(rb_eArgError,
"input string invalid");
7883 target_length += current_buffer->used = buffer_length_or_invalid;
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7889 if (buffer_count==1) {
7890 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7893 char *target_current;
7896 target_current = RSTRING_PTR(target);
7897 current_buffer =
DATA_PTR(buffer_anchor);
7898 while (current_buffer) {
7899 memcpy(target_current, current_buffer->space, current_buffer->used);
7900 target_current += current_buffer->used;
7901 current_buffer = current_buffer->next;
7904 current_buffer =
DATA_PTR(buffer_anchor);
7906 mapping_buffer_free(current_buffer);
7911 str_enc_copy_direct(target, source);
7920 const OnigUChar *source_current, *source_end;
7921 OnigUChar *target_current, *target_end;
7922 long old_length = RSTRING_LEN(source);
7923 int length_or_invalid;
7925 if (old_length == 0)
return Qnil;
7927 source_current = (OnigUChar*)RSTRING_PTR(source);
7929 if (source == target) {
7930 target_current = (OnigUChar*)source_current;
7931 target_end = (OnigUChar*)source_end;
7934 target_current = (OnigUChar*)RSTRING_PTR(target);
7938 length_or_invalid = onigenc_ascii_only_case_map(flags,
7939 &source_current, source_end,
7940 target_current, target_end, enc);
7941 if (length_or_invalid < 0)
7942 rb_raise(rb_eArgError,
"input string invalid");
7943 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7944 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7945 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7946 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7947 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7950 str_enc_copy(target, source);
7956upcase_single(
VALUE str)
7958 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7959 bool modified =
false;
7962 unsigned int c = *(
unsigned char*)s;
7964 if (
'a' <= c && c <=
'z') {
7965 *s =
'A' + (c -
'a');
7986rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7989 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7991 flags = check_case_options(argc, argv, flags);
7992 str_modify_keep_cr(str);
7993 enc = str_true_enc(str);
7994 if (case_option_single_p(flags, enc, str)) {
7995 if (upcase_single(str))
7996 flags |= ONIGENC_CASE_MODIFIED;
7998 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7999 rb_str_ascii_casemap(str, str, &flags, enc);
8001 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8003 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8016rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8019 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8022 flags = check_case_options(argc, argv, flags);
8023 enc = str_true_enc(str);
8024 if (case_option_single_p(flags, enc, str)) {
8025 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8026 str_enc_copy_direct(ret, str);
8029 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8031 rb_str_ascii_casemap(str, ret, &flags, enc);
8034 ret = rb_str_casemap(str, &flags, enc);
8041downcase_single(
VALUE str)
8043 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8044 bool modified =
false;
8047 unsigned int c = *(
unsigned char*)s;
8049 if (
'A' <= c && c <=
'Z') {
8050 *s =
'a' + (c -
'A');
8072rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8075 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8077 flags = check_case_options(argc, argv, flags);
8078 str_modify_keep_cr(str);
8079 enc = str_true_enc(str);
8080 if (case_option_single_p(flags, enc, str)) {
8081 if (downcase_single(str))
8082 flags |= ONIGENC_CASE_MODIFIED;
8084 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8085 rb_str_ascii_casemap(str, str, &flags, enc);
8087 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8089 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8103rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8106 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8109 flags = check_case_options(argc, argv, flags);
8110 enc = str_true_enc(str);
8111 if (case_option_single_p(flags, enc, str)) {
8112 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8113 str_enc_copy_direct(ret, str);
8114 downcase_single(ret);
8116 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8118 rb_str_ascii_casemap(str, ret, &flags, enc);
8121 ret = rb_str_casemap(str, &flags, enc);
8141rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8144 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8146 flags = check_case_options(argc, argv, flags);
8147 str_modify_keep_cr(str);
8148 enc = str_true_enc(str);
8149 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8150 if (flags&ONIGENC_CASE_ASCII_ONLY)
8151 rb_str_ascii_casemap(str, str, &flags, enc);
8153 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8155 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8169rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8172 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8175 flags = check_case_options(argc, argv, flags);
8176 enc = str_true_enc(str);
8177 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8178 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8180 rb_str_ascii_casemap(str, ret, &flags, enc);
8183 ret = rb_str_casemap(str, &flags, enc);
8202rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8205 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8207 flags = check_case_options(argc, argv, flags);
8208 str_modify_keep_cr(str);
8209 enc = str_true_enc(str);
8210 if (flags&ONIGENC_CASE_ASCII_ONLY)
8211 rb_str_ascii_casemap(str, str, &flags, enc);
8213 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8215 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8229rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8232 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8235 flags = check_case_options(argc, argv, flags);
8236 enc = str_true_enc(str);
8237 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8238 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8240 rb_str_ascii_casemap(str, ret, &flags, enc);
8243 ret = rb_str_casemap(str, &flags, enc);
8248typedef unsigned char *USTR;
8252 unsigned int now, max;
8264 if (t->p == t->pend)
return -1;
8265 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8268 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8270 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8272 if (t->p < t->pend) {
8273 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8276 if (t->now < 0x80 && c < 0x80) {
8277 rb_raise(rb_eArgError,
8278 "invalid range \"%c-%c\" in string transliteration",
8282 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8286 else if (t->now < c) {
8295 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8296 if (t->now == t->max) {
8301 if (t->now < t->max) {
8317 const unsigned int errc = -1;
8318 unsigned int trans[256];
8320 struct tr trsrc, trrepl;
8322 unsigned int c, c0, last = 0;
8323 int modify = 0, i, l;
8324 unsigned char *s, *send;
8326 int singlebyte = single_byte_optimizable(str);
8330#define CHECK_IF_ASCII(c) \
8331 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8332 (cr = ENC_CODERANGE_VALID) : 0)
8336 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8337 if (RSTRING_LEN(repl) == 0) {
8338 return rb_str_delete_bang(1, &src, str);
8342 e1 = rb_enc_check(str, src);
8343 e2 = rb_enc_check(str, repl);
8348 enc = rb_enc_check(src, repl);
8350 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8351 if (RSTRING_LEN(src) > 1 &&
8352 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8353 trsrc.p + l < trsrc.pend) {
8357 trrepl.p = RSTRING_PTR(repl);
8358 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8359 trsrc.gen = trrepl.gen = 0;
8360 trsrc.now = trrepl.now = 0;
8361 trsrc.max = trrepl.max = 0;
8364 for (i=0; i<256; i++) {
8367 while ((c = trnext(&trsrc, enc)) != errc) {
8372 if (!hash) hash = rb_hash_new();
8376 while ((c = trnext(&trrepl, enc)) != errc)
8379 for (i=0; i<256; i++) {
8380 if (trans[i] != errc) {
8388 for (i=0; i<256; i++) {
8391 while ((c = trnext(&trsrc, enc)) != errc) {
8392 r = trnext(&trrepl, enc);
8393 if (r == errc) r = trrepl.now;
8396 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8399 if (!hash) hash = rb_hash_new();
8407 str_modify_keep_cr(str);
8408 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8409 termlen = rb_enc_mbminlen(enc);
8412 long offset, max = RSTRING_LEN(str);
8413 unsigned int save = -1;
8414 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8419 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8421 SIZED_FREE_N(buf, max + termlen);
8422 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8425 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8427 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8436 if (cflag) c = last;
8439 else if (cflag) c = errc;
8445 if (c != (
unsigned int)-1) {
8451 tlen = rb_enc_codelen(c, enc);
8457 if (enc != e1) may_modify = 1;
8459 if ((offset = t - buf) + tlen > max) {
8460 size_t MAYBE_UNUSED(old) = max + termlen;
8461 max = offset + tlen + (send - s);
8462 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8465 rb_enc_mbcput(c, t, enc);
8466 if (may_modify && memcmp(s, t, tlen) != 0) {
8472 if (!STR_EMBED_P(str)) {
8473 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8475 TERM_FILL((
char *)t, termlen);
8476 RSTRING(str)->as.heap.ptr = (
char *)buf;
8477 STR_SET_LEN(str, t - buf);
8478 STR_SET_NOEMBED(str);
8479 RSTRING(str)->as.heap.aux.capa = max;
8483 c = (
unsigned char)*s;
8484 if (trans[c] != errc) {
8501 long offset, max = (long)((send - s) * 1.2);
8502 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8507 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8509 SIZED_FREE_N(buf, max + termlen);
8510 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8513 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8515 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8523 if (cflag) c = last;
8526 else if (cflag) c = errc;
8530 c = cflag ? last : errc;
8533 tlen = rb_enc_codelen(c, enc);
8538 if (enc != e1) may_modify = 1;
8540 if ((offset = t - buf) + tlen > max) {
8541 size_t MAYBE_UNUSED(old) = max + termlen;
8542 max = offset + tlen + (long)((send - s) * 1.2);
8543 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8547 rb_enc_mbcput(c, t, enc);
8548 if (may_modify && memcmp(s, t, tlen) != 0) {
8556 if (!STR_EMBED_P(str)) {
8557 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8559 TERM_FILL((
char *)t, termlen);
8560 RSTRING(str)->as.heap.ptr = (
char *)buf;
8561 STR_SET_LEN(str, t - buf);
8562 STR_SET_NOEMBED(str);
8563 RSTRING(str)->as.heap.aux.capa = max;
8569 rb_enc_associate(str, enc);
8591 return tr_trans(str, src, repl, 0);
8636 tr_trans(str, src, repl, 0);
8640#define TR_TABLE_MAX (UCHAR_MAX+1)
8641#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8643tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8646 const unsigned int errc = -1;
8647 char buf[TR_TABLE_MAX];
8650 VALUE table = 0, ptable = 0;
8651 int i, l, cflag = 0;
8653 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8654 tr.gen =
tr.now =
tr.max = 0;
8656 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8661 for (i=0; i<TR_TABLE_MAX; i++) {
8664 stable[TR_TABLE_MAX] = cflag;
8666 else if (stable[TR_TABLE_MAX] && !cflag) {
8667 stable[TR_TABLE_MAX] = 0;
8669 for (i=0; i<TR_TABLE_MAX; i++) {
8673 while ((c = trnext(&
tr, enc)) != errc) {
8674 if (c < TR_TABLE_MAX) {
8675 buf[(
unsigned char)c] = !cflag;
8680 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8683 table = ptable ? ptable : rb_hash_new();
8687 table = rb_hash_new();
8692 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8693 rb_hash_aset(table, key,
Qtrue);
8697 for (i=0; i<TR_TABLE_MAX; i++) {
8698 stable[i] = stable[i] && buf[i];
8700 if (!table && !cflag) {
8707tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8709 if (c < TR_TABLE_MAX) {
8710 return table[c] != 0;
8716 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8717 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8721 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8724 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8739rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8741 char squeez[TR_TABLE_SIZE];
8744 VALUE del = 0, nodel = 0;
8746 int i, ascompat, cr;
8748 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8750 for (i=0; i<argc; i++) {
8754 enc = rb_enc_check(str, s);
8755 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8758 str_modify_keep_cr(str);
8759 ascompat = rb_enc_asciicompat(enc);
8760 s = t = RSTRING_PTR(str);
8767 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8778 c = rb_enc_codepoint_len(s, send, &clen, enc);
8780 if (tr_find(c, squeez, del, nodel)) {
8784 if (t != s) rb_enc_mbcput(c, t, enc);
8791 TERM_FILL(t, TERM_LEN(str));
8792 STR_SET_LEN(str, t - RSTRING_PTR(str));
8795 if (modify)
return str;
8809rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8812 rb_str_delete_bang(argc, argv, str);
8830rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8832 char squeez[TR_TABLE_SIZE];
8834 VALUE del = 0, nodel = 0;
8835 unsigned char *s, *send, *t;
8837 int ascompat, singlebyte = single_byte_optimizable(str);
8841 enc = STR_ENC_GET(str);
8844 for (i=0; i<argc; i++) {
8848 enc = rb_enc_check(str, s);
8849 if (singlebyte && !single_byte_optimizable(s))
8851 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8855 str_modify_keep_cr(str);
8856 s = t = (
unsigned char *)RSTRING_PTR(str);
8857 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8860 ascompat = rb_enc_asciicompat(enc);
8864 unsigned int c = *s++;
8865 if (c != save || (argc > 0 && !squeez[c])) {
8875 if (ascompat && (c = *s) < 0x80) {
8876 if (c != save || (argc > 0 && !squeez[c])) {
8882 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8884 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8885 if (t != s) rb_enc_mbcput(c, t, enc);
8894 TERM_FILL((
char *)t, TERM_LEN(str));
8895 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8896 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8900 if (modify)
return str;
8914rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8917 rb_str_squeeze_bang(argc, argv, str);
8937 return tr_trans(str, src, repl, 1);
8965 tr_trans(str, src, repl, 1);
8978rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8980 char table[TR_TABLE_SIZE];
8982 VALUE del = 0, nodel = 0, tstr;
8992 enc = rb_enc_check(str, tstr);
8995 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8996 (ptstr = RSTRING_PTR(tstr),
8997 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
8998 !is_broken_string(str)) {
9000 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9002 s = RSTRING_PTR(str);
9003 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9006 if (*(
unsigned char*)s++ == c) n++;
9012 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9013 for (i=1; i<argc; i++) {
9016 enc = rb_enc_check(str, tstr);
9017 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9020 s = RSTRING_PTR(str);
9021 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9023 ascompat = rb_enc_asciicompat(enc);
9027 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9035 c = rb_enc_codepoint_len(s, send, &clen, enc);
9036 if (tr_find(c, table, del, nodel)) {
9047rb_fs_check(
VALUE val)
9051 if (
NIL_P(val))
return 0;
9056static const char isspacetable[256] = {
9057 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9059 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9060 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9061 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9062 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9063 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9064 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9065 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9066 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9067 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9068 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9069 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9070 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9071 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9072 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Table-driven whitespace test: yields isspacetable[c], which is 1 for the
 * bytes 0x09..0x0D and 0x20 and 0 for every other byte value.
 * The argument is cast to unsigned char first so that a plain (possibly
 * signed) char never produces a negative index into the 256-entry table.
 */
9075#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9078split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9080 if (empty_count >= 0 &&
len == 0) {
9081 return empty_count + 1;
9083 if (empty_count > 0) {
9088 }
while (--empty_count > 0);
9092 rb_yield(str_new_empty_String(str));
9093 }
while (--empty_count > 0);
9107 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9111literal_split_pattern(
VALUE spat, split_type_t default_type)
9119 return SPLIT_TYPE_CHARS;
9121 else if (rb_enc_asciicompat(enc)) {
9122 if (
len == 1 && ptr[0] ==
' ') {
9123 return SPLIT_TYPE_AWK;
9128 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9129 return SPLIT_TYPE_AWK;
9132 return default_type;
9145rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9150 split_type_t split_type;
9151 long beg, end, i = 0, empty_count = -1;
9156 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9158 if (lim <= 0) limit =
Qnil;
9159 else if (lim == 1) {
9160 if (RSTRING_LEN(str) == 0)
9171 if (
NIL_P(limit) && !lim) empty_count = 0;
9173 enc = STR_ENC_GET(str);
9174 split_type = SPLIT_TYPE_REGEXP;
9176 spat = get_pat_quoted(spat, 0);
9178 else if (
NIL_P(spat = rb_fs)) {
9179 split_type = SPLIT_TYPE_AWK;
9181 else if (!(spat = rb_fs_check(spat))) {
9182 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9187 if (split_type != SPLIT_TYPE_AWK) {
9192 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9193 if (split_type == SPLIT_TYPE_AWK) {
9195 split_type = SPLIT_TYPE_STRING;
9200 mustnot_broken(spat);
9201 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Emit one piece of the string being split.
 *
 * Expands to a comma expression that
 *   1. calls split_string(result, str, beg, len, empty_count) and stores its
 *      return value back into empty_count, then
 *   2. calls str_mod_check(str, str_start, str_len).
 *
 * The macro is not hygienic: besides its two arguments it uses the names
 * result, str, empty_count, str_start and str_len, which must be in scope
 * at the point of expansion.
 * NOTE(review): str_mod_check is defined outside this view; presumably it
 * raises when str was modified behind our back -- confirm.
 */
9209#define SPLIT_STR(beg, len) ( \
9210 empty_count = split_string(result, str, beg, len, empty_count), \
9211 str_mod_check(str, str_start, str_len))
9214 char *ptr = RSTRING_PTR(str);
9215 char *
const str_start = ptr;
9216 const long str_len = RSTRING_LEN(str);
9217 char *
const eptr = str_start + str_len;
9218 if (split_type == SPLIT_TYPE_AWK) {
9225 if (is_ascii_string(str)) {
9226 while (ptr < eptr) {
9227 c = (
unsigned char)*ptr++;
9229 if (ascii_isspace(c)) {
9235 if (!
NIL_P(limit) && lim <= i)
break;
9238 else if (ascii_isspace(c)) {
9239 SPLIT_STR(beg, end-beg);
9242 if (!
NIL_P(limit)) ++i;
9250 while (ptr < eptr) {
9253 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9262 if (!
NIL_P(limit) && lim <= i)
break;
9266 SPLIT_STR(beg, end-beg);
9269 if (!
NIL_P(limit)) ++i;
9277 else if (split_type == SPLIT_TYPE_STRING) {
9278 char *substr_start = ptr;
9279 char *sptr = RSTRING_PTR(spat);
9280 long slen = RSTRING_LEN(spat);
9283 mustnot_broken(str);
9284 enc = rb_enc_check(str, spat);
9285 while (ptr < eptr &&
9286 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9289 if (t != ptr + end) {
9293 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9294 str_mod_check(spat, sptr, slen);
9297 if (!
NIL_P(limit) && lim <= ++i)
break;
9299 beg = ptr - str_start;
9301 else if (split_type == SPLIT_TYPE_CHARS) {
9305 mustnot_broken(str);
9306 enc = rb_enc_get(str);
9307 while (ptr < eptr &&
9308 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9309 SPLIT_STR(ptr - str_start, n);
9311 if (!
NIL_P(limit) && lim <= ++i)
break;
9313 beg = ptr - str_start;
9317 long len = RSTRING_LEN(str);
9325 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9330 if (start == end && BEG(0) == END(0)) {
9335 else if (last_null == 1) {
9336 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9343 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9349 SPLIT_STR(beg, end-beg);
9350 beg = start = END(0);
9354 for (idx=1; idx < regs->num_regs; idx++) {
9355 if (BEG(idx) == -1)
continue;
9356 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9358 if (!
NIL_P(limit) && lim <= ++i)
break;
9360 if (match) rb_match_unbusy(match);
9362 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9363 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9366 return result ? result : str;
9376 return rb_str_split_m(1, &sep, str);
/*
 * Result holder for the enumerate helpers: evaluates to a fresh array from
 * rb_ary_new_capa(size) when no block is given, and to 0 when a block is
 * given.  The first parameter, m, does not appear in the expansion; callers
 * pass the method name there (e.g. "lines", "bytes").
 */
9379#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/*
 * Shorthand for enumerator_element(ary, e).  Callers in this file test the
 * result as a truth value, e.g. `if (ENUM_ELEM(ary, line))`.
 * NOTE(review): enumerator_element is defined outside this view; presumably
 * it either stores e into ary or yields it to the block -- confirm.
 */
9394#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9397chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9399 const char *prev = rb_enc_prev_char(p, e, e, enc);
9402 prev = rb_enc_prev_char(p, e, e, enc);
9403 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9415 RSTRING_LEN(rs) != 1 ||
9416 RSTRING_PTR(rs)[0] !=
'\n')) {
/*
 * From here on, every use of the name rb_rs expands to a call to get_rs().
 * NOTE(review): get_rs is defined outside this view; presumably it wraps
 * access to the record-separator value -- confirm.
 */
9422#define rb_rs get_rs()
9429 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9430 long pos,
len, rslen;
9436 static ID keywords[1];
9441 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9445 if (!ENUM_ELEM(ary, str)) {
9453 if (!RSTRING_LEN(str))
goto end;
9455 ptr = subptr = RSTRING_PTR(str);
9457 len = RSTRING_LEN(str);
9459 rslen = RSTRING_LEN(rs);
9462 enc = rb_enc_get(str);
9464 enc = rb_enc_check(str, rs);
9469 const char *eol = NULL;
9471 while (subend < pend) {
9472 long chomp_rslen = 0;
9474 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9476 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9478 if (eol == subend)
break;
9482 chomp_rslen = -rslen;
9486 if (!subptr) subptr = subend;
9490 }
while (subend < pend);
9492 if (rslen == 0) chomp_rslen = 0;
9494 subend - subptr + (chomp ? chomp_rslen : rslen));
9495 if (ENUM_ELEM(ary, line)) {
9496 str_mod_check(str, ptr,
len);
9498 subptr = eol = NULL;
9503 rsptr = RSTRING_PTR(rs);
9504 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9513 rsptr = RSTRING_PTR(rs);
9514 rslen = RSTRING_LEN(rs);
9517 while (subptr < pend) {
9518 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9522 if (hit != adjusted) {
9526 subend = hit += rslen;
9529 subend = chomp_newline(subptr, subend, enc);
9536 if (ENUM_ELEM(ary, line)) {
9537 str_mod_check(str, ptr,
len);
9542 if (subptr != pend) {
9545 pend = chomp_newline(subptr, pend, enc);
9547 else if (pend - subptr >= rslen &&
9548 memcmp(pend - rslen, rsptr, rslen) == 0) {
9553 ENUM_ELEM(ary, line);
9574rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9577 return rb_str_enumerate_lines(argc, argv, str, 0);
9632rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9634 VALUE ary = WANTARRAY(
"lines", 0);
9635 return rb_str_enumerate_lines(argc, argv, str, ary);
9649 for (i=0; i<RSTRING_LEN(str); i++) {
9650 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9668rb_str_each_byte(
VALUE str)
9671 return rb_str_enumerate_bytes(str, 0);
9683rb_str_bytes(
VALUE str)
9685 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9686 return rb_str_enumerate_bytes(str, ary);
9704 ptr = RSTRING_PTR(str);
9705 len = RSTRING_LEN(str);
9706 enc = rb_enc_get(str);
9709 for (i = 0; i <
len; i += n) {
9710 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9715 for (i = 0; i <
len; i += n) {
9716 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9737rb_str_each_char(
VALUE str)
9740 return rb_str_enumerate_chars(str, 0);
9752rb_str_chars(
VALUE str)
9755 return rb_str_enumerate_chars(str, ary);
9759rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9764 const char *ptr, *end;
9767 if (single_byte_optimizable(str))
9768 return rb_str_enumerate_bytes(str, ary);
9771 ptr = RSTRING_PTR(str);
9773 enc = STR_ENC_GET(str);
9776 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9797rb_str_each_codepoint(
VALUE str)
9800 return rb_str_enumerate_codepoints(str, 0);
9812rb_str_codepoints(
VALUE str)
9815 return rb_str_enumerate_codepoints(str, ary);
9821 int encidx = rb_enc_to_index(enc);
9823 const OnigUChar source_ascii[] =
"\\X";
9824 const OnigUChar *source = source_ascii;
9825 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that spell one code unit x as a comma-separated list of OnigUChar
 * bytes, for use inside an array initializer (see CASE_UTF):
 *   CHARS_16BE(x): two bytes, (x)>>8 first, then x.
 *   CHARS_16LE(x): two bytes, x first, then (x)>>8.
 *   CHARS_32BE(x): four bytes, CHARS_16BE of (x)>>16 followed by CHARS_16BE(x).
 *   CHARS_32LE(x): four bytes, CHARS_16LE(x) followed by CHARS_16LE of (x)>>16.
 * Each byte is produced by a cast to OnigUChar.
 */
9828#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9829#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9830#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9831#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9832#define CASE_UTF(e) \
9833 case ENCINDEX_UTF_##e: { \
9834 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9835 source = source_UTF_##e; \
9836 source_len = sizeof(source_UTF_##e); \
9839 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9847 regex_t *reg_grapheme_cluster;
9849 int r = onig_new(®_grapheme_cluster, source, source + source_len,
9850 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9852 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9853 onig_error_code_to_str(message, r, &einfo);
9854 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9857 return reg_grapheme_cluster;
9863 int encidx = rb_enc_to_index(enc);
9864 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9866 if (encidx == rb_utf8_encindex()) {
9867 if (!reg_grapheme_cluster_utf8) {
9868 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9871 return reg_grapheme_cluster_utf8;
9880 size_t grapheme_cluster_count = 0;
9882 const char *ptr, *end;
9884 if (!rb_enc_unicode_p(enc)) {
9888 bool cached_reg_grapheme_cluster =
true;
9889 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9890 if (!reg_grapheme_cluster) {
9891 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9892 cached_reg_grapheme_cluster =
false;
9895 ptr = RSTRING_PTR(str);
9899 OnigPosition
len = onig_match(reg_grapheme_cluster,
9900 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9901 (
const OnigUChar *)ptr, NULL, 0);
9902 if (
len <= 0)
break;
9903 grapheme_cluster_count++;
9907 if (!cached_reg_grapheme_cluster) {
9908 onig_free(reg_grapheme_cluster);
9911 return SIZET2NUM(grapheme_cluster_count);
9915rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9919 const char *ptr0, *ptr, *end;
9921 if (!rb_enc_unicode_p(enc)) {
9922 return rb_str_enumerate_chars(str, ary);
9927 bool cached_reg_grapheme_cluster =
true;
9928 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9929 if (!reg_grapheme_cluster) {
9930 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9931 cached_reg_grapheme_cluster =
false;
9934 ptr0 = ptr = RSTRING_PTR(str);
9938 OnigPosition
len = onig_match(reg_grapheme_cluster,
9939 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9940 (
const OnigUChar *)ptr, NULL, 0);
9941 if (
len <= 0)
break;
9946 if (!cached_reg_grapheme_cluster) {
9947 onig_free(reg_grapheme_cluster);
9967rb_str_each_grapheme_cluster(
VALUE str)
9970 return rb_str_enumerate_grapheme_clusters(str, 0);
9982rb_str_grapheme_clusters(
VALUE str)
9985 return rb_str_enumerate_grapheme_clusters(str, ary);
9989chopped_length(
VALUE str)
9992 const char *p, *p2, *beg, *end;
9994 beg = RSTRING_PTR(str);
9995 end = beg + RSTRING_LEN(str);
9996 if (beg >= end)
return 0;
9997 p = rb_enc_prev_char(beg, end, end, enc);
9999 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10000 p2 = rb_enc_prev_char(beg, p, end, enc);
10001 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10019rb_str_chop_bang(
VALUE str)
10021 str_modify_keep_cr(str);
10022 if (RSTRING_LEN(str) > 0) {
10024 len = chopped_length(str);
10025 STR_SET_LEN(str,
len);
10026 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10045rb_str_chop(
VALUE str)
10051smart_chomp(
VALUE str,
const char *e,
const char *p)
10054 if (rb_enc_mbminlen(enc) > 1) {
10059 pp = e - rb_enc_mbminlen(enc);
10062 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10070 if (--e > p && *(e-1) ==
'\r') {
10087 char *pp, *e, *rsptr;
10089 char *
const p = RSTRING_PTR(str);
10090 long len = RSTRING_LEN(str);
10092 if (
len == 0)
return 0;
10095 return smart_chomp(str, e, p);
10098 enc = rb_enc_get(str);
10101 if (rb_enc_mbminlen(enc) > 1) {
10106 pp -= rb_enc_mbminlen(enc);
10109 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10116 while (e > p && *(e-1) ==
'\n') {
10118 if (e > p && *(e-1) ==
'\r')
10124 if (rslen >
len)
return len;
10126 enc = rb_enc_get(rs);
10127 newline = rsptr[rslen-1];
10128 if (rslen == rb_enc_mbminlen(enc)) {
10130 if (newline ==
'\n')
10131 return smart_chomp(str, e, p);
10135 return smart_chomp(str, e, p);
10139 enc = rb_enc_check(str, rs);
10140 if (is_broken_string(rs)) {
10144 if (p[
len-1] == newline &&
10146 memcmp(rsptr, pp, rslen) == 0)) {
10147 if (at_char_boundary(p, pp, e, enc))
10148 return len - rslen;
10160chomp_rs(
int argc,
const VALUE *argv)
10164 VALUE rs = argv[0];
10176 long olen = RSTRING_LEN(str);
10177 long len = chompped_length(str, rs);
10178 if (
len >= olen)
return Qnil;
10179 str_modify_keep_cr(str);
10180 STR_SET_LEN(str,
len);
10181 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10201rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10204 str_modifiable(str);
10205 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10206 rs = chomp_rs(argc, argv);
10208 return rb_str_chomp_string(str, rs);
10221rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10223 VALUE rs = chomp_rs(argc, argv);
10229tr_setup_table_multi(
char table[TR_TABLE_SIZE],
VALUE *tablep,
VALUE *ctablep,
10230 VALUE str,
int num_selectors,
VALUE *selectors)
10234 for (i=0; i<num_selectors; i++) {
10235 VALUE selector = selectors[i];
10239 enc = rb_enc_check(str, selector);
10240 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10247 const char *
const start = s;
10249 if (!s || s >= e)
return 0;
10252 if (single_byte_optimizable(str)) {
10253 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10258 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10268lstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10269 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10271 const char *
const start = s;
10273 if (!s || s >= e)
return 0;
10278 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10280 if (!tr_find(cc, table, del, nodel))
break;
10299rb_str_lstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10303 long olen, loffset;
10305 str_modify_keep_cr(str);
10306 enc = STR_ENC_GET(str);
10309 char table[TR_TABLE_SIZE];
10310 VALUE del = 0, nodel = 0;
10312 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10313 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10316 loffset = lstrip_offset(str, start, start+olen, enc);
10320 long len = olen-loffset;
10321 s = start + loffset;
10322 memmove(start, s,
len);
10323 STR_SET_LEN(str,
len);
10324 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10359rb_str_lstrip(
int argc,
VALUE *argv,
VALUE str)
10366 char table[TR_TABLE_SIZE];
10367 VALUE del = 0, nodel = 0;
10369 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10370 loffset = lstrip_offset_table(str, start, start+
len, STR_ENC_GET(str), table, del, nodel);
10373 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10375 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10384 rb_str_check_dummy_enc(enc);
10388 if (!s || s >= e)
return 0;
10392 if (single_byte_optimizable(str)) {
10394 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10399 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10409rstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10410 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10415 rb_str_check_dummy_enc(enc);
10419 if (!s || s >= e)
return 0;
10423 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10425 if (!tr_find(c, table, del, nodel))
break;
10445rb_str_rstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10449 long olen, roffset;
10451 str_modify_keep_cr(str);
10452 enc = STR_ENC_GET(str);
10455 char table[TR_TABLE_SIZE];
10456 VALUE del = 0, nodel = 0;
10458 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10459 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10462 roffset = rstrip_offset(str, start, start+olen, enc);
10465 long len = olen - roffset;
10467 STR_SET_LEN(str,
len);
10468 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10502rb_str_rstrip(
int argc,
VALUE *argv,
VALUE str)
10506 long olen, roffset;
10508 enc = STR_ENC_GET(str);
10511 char table[TR_TABLE_SIZE];
10512 VALUE del = 0, nodel = 0;
10514 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10515 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10518 roffset = rstrip_offset(str, start, start+olen, enc);
10520 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10538rb_str_strip_bang(
int argc,
VALUE *argv,
VALUE str)
10541 long olen, loffset, roffset;
10544 str_modify_keep_cr(str);
10545 enc = STR_ENC_GET(str);
10549 char table[TR_TABLE_SIZE];
10550 VALUE del = 0, nodel = 0;
10552 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10553 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10554 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10557 loffset = lstrip_offset(str, start, start+olen, enc);
10558 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10561 if (loffset > 0 || roffset > 0) {
10562 long len = olen-roffset;
10565 memmove(start, start + loffset,
len);
10567 STR_SET_LEN(str,
len);
10568 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10603rb_str_strip(
int argc,
VALUE *argv,
VALUE str)
10606 long olen, loffset, roffset;
10612 char table[TR_TABLE_SIZE];
10613 VALUE del = 0, nodel = 0;
10615 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10616 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10617 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10620 loffset = lstrip_offset(str, start, start+olen, enc);
10621 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10624 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10629scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10632 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10638 end = pos + RSTRING_LEN(pat);
10652 if (RSTRING_LEN(str) > end)
10653 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10662 if (!regs || regs->num_regs == 1) {
10668 for (
int i = 1; i < regs->num_regs; i++) {
10699 long last = -1, prev = 0;
10700 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10702 pat = get_pat_quoted(pat, 1);
10703 mustnot_broken(str);
10707 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10712 if (last >= 0) rb_pat_search(pat, str, last, 1);
10717 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10721 str_mod_check(str, p,
len);
10723 if (last >= 0) rb_pat_search(pat, str, last, 1);
10775rb_str_hex(
VALUE str)
10777 return rb_str_to_inum(str, 16, FALSE);
10861rb_str_oct(
VALUE str)
10863 return rb_str_to_inum(str, -8, FALSE);
10866#ifndef HAVE_CRYPT_R
10871 rb_nativethread_lock_t lock;
10872} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10941# define CRYPT_END() ALLOCV_END(databuf)
10944 extern char *crypt(
const char *,
const char *);
10945# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10948 const char *s, *saltp;
10951 char salt_8bit_clean[3];
10955 mustnot_wchar(str);
10956 mustnot_wchar(salt);
10958 saltp = RSTRING_PTR(salt);
10959 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10960 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10964 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10965 salt_8bit_clean[0] = saltp[0] & 0x7f;
10966 salt_8bit_clean[1] = saltp[1] & 0x7f;
10967 salt_8bit_clean[2] =
'\0';
10968 saltp = salt_8bit_clean;
10973# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10974 data->initialized = 0;
10976 res = crypt_r(s, saltp, data);
10979 res = crypt(s, saltp);
10994 size_t res_size = strlen(res)+1;
10995 tmp_buf =
ALLOCA_N(
char, res_size);
10996 memcpy(tmp_buf, res, res_size);
11033 char *ptr, *p, *pend;
11036 unsigned long sum0 = 0;
11041 ptr = p = RSTRING_PTR(str);
11042 len = RSTRING_LEN(str);
11048 str_mod_check(str, ptr,
len);
11051 sum0 += (
unsigned char)*p;
11062 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
11063 sum0 &= (((
unsigned long)1)<<bits)-1;
11083rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
11087 long width,
len, flen = 1, fclen = 1;
11090 const char *f =
" ";
11091 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11093 int singlebyte = 1, cr;
11097 enc = STR_ENC_GET(str);
11098 termlen = rb_enc_mbminlen(enc);
11102 enc = rb_enc_check(str, pad);
11103 f = RSTRING_PTR(pad);
11104 flen = RSTRING_LEN(pad);
11105 fclen = str_strlen(pad, enc);
11106 singlebyte = single_byte_optimizable(pad);
11107 if (flen == 0 || fclen == 0) {
11108 rb_raise(rb_eArgError,
"zero width padding");
11111 len = str_strlen(str, enc);
11112 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11114 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11118 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11119 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11121 size = RSTRING_LEN(str);
11122 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11123 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11124 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11125 rb_raise(rb_eArgError,
"argument too big");
11129 p = RSTRING_PTR(res);
11131 memset(p, *f, llen);
11135 while (llen >= fclen) {
11141 memcpy(p, f, llen2);
11145 memcpy(p, RSTRING_PTR(str), size);
11148 memset(p, *f, rlen);
11152 while (rlen >= fclen) {
11158 memcpy(p, f, rlen2);
11162 TERM_FILL(p, termlen);
11163 STR_SET_LEN(res, p-RSTRING_PTR(res));
11184rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11186 return rb_str_justify(argc, argv, str,
'l');
11198rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11200 return rb_str_justify(argc, argv, str,
'r');
11213rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11215 return rb_str_justify(argc, argv, str,
'c');
11231 sep = get_pat_quoted(sep, 0);
11243 pos = rb_str_index(str, sep, 0);
11244 if (pos < 0)
goto failed;
11249 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11252 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11266 long pos = RSTRING_LEN(str);
11268 sep = get_pat_quoted(sep, 0);
11281 pos = rb_str_rindex(str, sep, pos);
11290 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11292 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11304rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11308 for (i=0; i<argc; i++) {
11309 VALUE tmp = argv[i];
11311 if (rb_reg_start_with_p(tmp, str))
11315 const char *p, *s, *e;
11320 enc = rb_enc_check(str, tmp);
11321 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11322 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11323 p = RSTRING_PTR(str);
11326 if (!at_char_right_boundary(p, s, e, enc))
11328 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11344rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11348 for (i=0; i<argc; i++) {
11349 VALUE tmp = argv[i];
11350 const char *p, *s, *e;
11355 enc = rb_enc_check(str, tmp);
11356 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11357 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11358 p = RSTRING_PTR(str);
11361 if (!at_char_boundary(p, s, e, enc))
11363 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11379deleted_prefix_length(
VALUE str,
VALUE prefix)
11381 const char *strptr, *prefixptr;
11382 long olen, prefixlen;
11387 if (!is_broken_string(prefix) ||
11388 !rb_enc_asciicompat(enc) ||
11389 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11390 enc = rb_enc_check(str, prefix);
11394 prefixlen = RSTRING_LEN(prefix);
11395 if (prefixlen <= 0)
return 0;
11396 olen = RSTRING_LEN(str);
11397 if (olen < prefixlen)
return 0;
11398 strptr = RSTRING_PTR(str);
11399 prefixptr = RSTRING_PTR(prefix);
11400 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11401 if (is_broken_string(prefix)) {
11402 if (!is_broken_string(str)) {
11406 const char *strend = strptr + olen;
11407 const char *after_prefix = strptr + prefixlen;
11408 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11429rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11432 str_modify_keep_cr(str);
11434 prefixlen = deleted_prefix_length(str, prefix);
11435 if (prefixlen <= 0)
return Qnil;
11449rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11453 prefixlen = deleted_prefix_length(str, prefix);
11454 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11456 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11469deleted_suffix_length(
VALUE str,
VALUE suffix)
11471 const char *strptr, *suffixptr;
11472 long olen, suffixlen;
11476 if (is_broken_string(suffix))
return 0;
11477 enc = rb_enc_check(str, suffix);
11480 suffixlen = RSTRING_LEN(suffix);
11481 if (suffixlen <= 0)
return 0;
11482 olen = RSTRING_LEN(str);
11483 if (olen < suffixlen)
return 0;
11484 strptr = RSTRING_PTR(str);
11485 suffixptr = RSTRING_PTR(suffix);
11486 const char *strend = strptr + olen;
11487 const char *before_suffix = strend - suffixlen;
11488 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11489 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11505rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11507 long olen, suffixlen,
len;
11508 str_modifiable(str);
11510 suffixlen = deleted_suffix_length(str, suffix);
11511 if (suffixlen <= 0)
return Qnil;
11513 olen = RSTRING_LEN(str);
11514 str_modify_keep_cr(str);
11515 len = olen - suffixlen;
11516 STR_SET_LEN(str,
len);
11517 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11533rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11537 suffixlen = deleted_suffix_length(str, suffix);
11538 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11540 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11547 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11553nil_setter_warning(
ID id)
11555 rb_warn_deprecated(
"non-nil '%"PRIsVALUE
"'", NULL, rb_id2str(
id));
11562 if (!
NIL_P(*var)) {
11563 nil_setter_warning(
id);
11570 val = rb_fs_check(val);
11573 "value of %"PRIsVALUE
" must be String or Regexp",
11577 nil_setter_warning(
id);
11594 str_modifiable(str);
11597 int idx = rb_enc_to_index(encoding);
11604 rb_enc_associate_index(str, idx);
11628 if (STR_EMBED_P(str)) {
11629 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11634 str_replace_shared_without_enc(str2, str);
11636 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11666rb_str_valid_encoding_p(
VALUE str)
11686rb_str_is_ascii_only_p(
VALUE str)
11696 static const char ellipsis[] =
"...";
11697 const long ellipsislen =
sizeof(ellipsis) - 1;
11699 const long blen = RSTRING_LEN(str);
11700 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11701 VALUE estr, ret = 0;
11704 if (
len * rb_enc_mbminlen(enc) >= blen ||
11708 else if (
len <= ellipsislen ||
11710 if (rb_enc_asciicompat(enc)) {
11712 rb_enc_associate(ret, enc);
11719 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11724 rb_enc_from_encoding(enc), 0,
Qnil);
11737 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11743 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11762 if (enc == STR_ENC_GET(str)) {
11767 return enc_str_scrub(enc, str, repl, cr);
11775 const char *rep, *p, *e, *p1, *sp;
11781 rb_raise(rb_eArgError,
"both of block and replacement given");
11788 if (!
NIL_P(repl)) {
11789 repl = str_compat_and_valid(repl, enc);
11792 if (rb_enc_dummy_p(enc)) {
11795 encidx = rb_enc_to_index(enc);
11797#define DEFAULT_REPLACE_CHAR(str) do { \
11798 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11799 rep = replace; replen = (int)sizeof(replace); \
11802 slen = RSTRING_LEN(str);
11803 p = RSTRING_PTR(str);
11808 if (rb_enc_asciicompat(enc)) {
11814 else if (!
NIL_P(repl)) {
11815 rep = RSTRING_PTR(repl);
11816 replen = RSTRING_LEN(repl);
11819 else if (encidx == rb_utf8_encindex()) {
11820 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11824 DEFAULT_REPLACE_CHAR(
"?");
11829 p = search_nonascii(p, e);
11834 int ret = rb_enc_precise_mbclen(p, e, enc);
11853 if (e - p < clen) clen = e - p;
11860 for (; clen > 1; clen--) {
11861 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11872 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11873 str_mod_check(str, sp, slen);
11874 repl = str_compat_and_valid(repl, enc);
11881 p = search_nonascii(p, e);
11907 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11908 str_mod_check(str, sp, slen);
11909 repl = str_compat_and_valid(repl, enc);
11918 long mbminlen = rb_enc_mbminlen(enc);
11922 else if (!
NIL_P(repl)) {
11923 rep = RSTRING_PTR(repl);
11924 replen = RSTRING_LEN(repl);
11926 else if (encidx == ENCINDEX_UTF_16BE) {
11927 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11929 else if (encidx == ENCINDEX_UTF_16LE) {
11930 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11932 else if (encidx == ENCINDEX_UTF_32BE) {
11933 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11935 else if (encidx == ENCINDEX_UTF_32LE) {
11936 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11939 DEFAULT_REPLACE_CHAR(
"?");
11943 int ret = rb_enc_precise_mbclen(p, e, enc);
11956 if (e - p < clen) clen = e - p;
11957 if (clen <= mbminlen * 2) {
11962 for (; clen > mbminlen; clen-=mbminlen) {
11963 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11973 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11974 str_mod_check(str, sp, slen);
11975 repl = str_compat_and_valid(repl, enc);
12000 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
12001 str_mod_check(str, sp, slen);
12002 repl = str_compat_and_valid(repl, enc);
12042str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
12050static ID id_normalize;
12051static ID id_normalized_p;
12052static VALUE mUnicodeNormalize;
12055unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
12057 static int UnicodeNormalizeRequired = 0;
12060 if (!UnicodeNormalizeRequired) {
12061 rb_require(
"unicode_normalize/normalize.rb");
12062 UnicodeNormalizeRequired = 1;
12066 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
12077rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
12079 return unicode_normalize_common(argc, argv, str, id_normalize);
12093rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12095 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12122rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12124 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/*
 * sym_equal is an alias: the name expands to rb_obj_equal wherever it is
 * used, so no separate comparison function is defined under this name.
 */
12256#define sym_equal rb_obj_equal
12259sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12263 int c = rb_enc_precise_mbclen(s, send, enc);
12267 c = rb_enc_mbc_to_codepoint(s, send, enc);
12275rb_str_symname_p(
VALUE sym)
12280 rb_encoding *resenc = rb_default_internal_encoding();
12282 if (resenc == NULL) resenc = rb_default_external_encoding();
12283 enc = STR_ENC_GET(sym);
12284 ptr = RSTRING_PTR(sym);
12285 len = RSTRING_LEN(sym);
12286 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12294rb_str_quote_unprintable(
VALUE str)
12302 resenc = rb_default_internal_encoding();
12303 if (resenc == NULL) resenc = rb_default_external_encoding();
12304 enc = STR_ENC_GET(str);
12305 ptr = RSTRING_PTR(str);
12306 len = RSTRING_LEN(str);
12307 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12308 !sym_printable(ptr, ptr +
len, enc)) {
12309 return rb_str_escape(str);
12315rb_id_quote_unprintable(
ID id)
12317 VALUE str = rb_id2str(
id);
12318 if (!rb_str_symname_p(str)) {
12319 return rb_str_escape(str);
12337sym_inspect(
VALUE sym)
12344 if (!rb_str_symname_p(str)) {
12346 len = RSTRING_LEN(str);
12347 rb_str_resize(str,
len + 1);
12348 dest = RSTRING_PTR(str);
12349 memmove(dest + 1, dest,
len);
12353 VALUE orig_str = str;
12355 len = RSTRING_LEN(orig_str);
12356 str = rb_enc_str_new(0,
len + 1, enc);
12359 ptr = RSTRING_PTR(orig_str);
12360 dest = RSTRING_PTR(str);
12361 memcpy(dest + 1, ptr,
len);
12381rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12386 rb_raise(rb_eArgError,
"no receiver given");
12489 return rb_str_match(
rb_sym2str(sym), other);
12504sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12506 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12519sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12521 return rb_str_match_m_p(argc, argv, sym);
12539 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12550sym_length(
VALUE sym)
12564sym_empty(
VALUE sym)
12598sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12614sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12630sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12644sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12646 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12659sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12661 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12673sym_encoding(
VALUE sym)
12679string_for_symbol(
VALUE name)
12684 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12698 name = string_for_symbol(name);
12699 return rb_intern_str(name);
12708 name = string_for_symbol(name);
12732 return rb_fstring(str);
12738 struct RString fake_str = {RBASIC_INIT};
12739 int encidx = ENCINDEX_US_ASCII;
12742 encidx = ENCINDEX_ASCII_8BIT;
12745 VALUE str = setup_fake_str(&fake_str,
ptr,
len, encidx);
12747 return register_fstring(str,
true,
false);
12759 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12760 rb_enc_autoload(enc);
12763 struct RString fake_str = {RBASIC_INIT};
12764 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12770 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12771 rb_enc_autoload(enc);
12774 struct RString fake_str = {RBASIC_INIT};
12775 VALUE str = register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12786#if USE_YJIT || USE_ZJIT
12788rb_jit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12793 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12794 rb_str_buf_cat_byte(str, (
char) code);
12804fstring_set_class_i(
VALUE *str,
void *data)
12808 return ST_CONTINUE;
12816 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12983 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that asserts regardless of RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_cObject
Object class.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single one of the symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously classified as shareable or not.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.