14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
49#include "ruby_assert.h"
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
67#undef rb_usascii_str_new
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
131#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits; each one is defined as an alias of an FL_USERn object flag. */
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
/* Sets STR_NOEMBED on str and clears STR_SHARED, STR_SHARED_ROOT and STR_BORROWED. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clears STR_NOEMBED, STR_SHARED and STR_NOFREE on str in a single FL_UNSET. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
/* Stores n into the len field of str's RString. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
/*
 * Terminator length of str: 1 when rb_str_enc_fastpath(str) holds, otherwise
 * rb_enc_mbminlen() of the encoding looked up from the string's encoding index.
 */
149#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/*
 * Zeroes the terminator at ptr.  ptr and termlen are each evaluated once
 * into locals; the first byte is always cleared, and when termlen is
 * greater than 1 all termlen bytes are cleared with memset.
 */
150#define TERM_FILL(ptr, termlen) do {\
151 char *const term_fill_ptr = (ptr);\
152 const int term_fill_len = (termlen);\
153 *term_fill_ptr = '\0';\
154 if (UNLIKELY(term_fill_len > 1))\
155 memset(term_fill_ptr, 0, term_fill_len);\
/* Computes TERM_LEN(str) and forwards to RESIZE_CAPA_TERM with that terminator length. */
158#define RESIZE_CAPA(str,capacity) do {\
159 const int termlen = TERM_LEN(str);\
160 RESIZE_CAPA_TERM(str,capacity,termlen);\
/*
 * Changes the capacity of str to `capacity` bytes plus `termlen` terminator bytes.
 *
 * Embedded string whose embed capacity is smaller than capacity + termlen:
 * a buffer of capacity + termlen bytes is allocated, str_embed_capa(str)
 * bytes are copied from the current contents, the heap pointer and the saved
 * length are stored, and the string is flagged as non-embedded.
 *
 * The remaining statements assert that the string is not STR_SHARED and
 * reallocate as.heap.ptr to capacity + termlen bytes, passing
 * STR_HEAP_SIZE(str) as the old size.
 *
 * Both paths record `capacity` in as.heap.aux.capa.  Note that `capacity`
 * and `termlen` are expanded more than once.
 */
162#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
163 if (STR_EMBED_P(str)) {\
164 if (str_embed_capa(str) < capacity + termlen) {\
165 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
166 const long tlen = RSTRING_LEN(str);\
167 memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
168 RSTRING(str)->as.heap.ptr = tmp;\
169 RSTRING(str)->len = tlen;\
170 STR_SET_NOEMBED(str);\
171 RSTRING(str)->as.heap.aux.capa = (capacity);\
175 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
176 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
177 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
178 RSTRING(str)->as.heap.aux.capa = (capacity);\
/*
 * Makes str refer to shared_str as its shared buffer owner.  Nothing is done
 * when str carries STR_FAKESTR.  Otherwise:
 *  - asserts that str's pointer lies within
 *    [RSTRING_PTR(shared_str), RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)];
 *  - stores shared_str into as.heap.aux.shared through RB_OBJ_WRITE;
 *  - sets STR_SHARED on str and passes str to rb_gc_register_pinning_obj();
 *  - sets STR_SHARED_ROOT on shared_str, and additionally STR_BORROWED
 *    (via FL_SET_RAW) when shared_str's class is 0.
 */
182#define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 rb_gc_register_pinning_obj(str); \
189 FL_SET((shared_str), STR_SHARED_ROOT); \
190 if (RBASIC_CLASS((shared_str)) == 0) \
191 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap buffer pointer of a string (reads as.heap.ptr directly). */
195#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator length. */
196#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Encoding lookup; simply forwards to get_encoding(). */
199#define STR_ENC_GET(str) get_encoding(str)
/*
 * Checks bytes starting at s for being zero; a non-zero byte makes the
 * function return false.  n is presumably the number of bytes to inspect
 * (callers pass a terminator length) -- TODO confirm.
 */
202zero_filled(
const char *s,
int n)
205 if (*s++)
return false;
210#if !defined SHARABLE_MIDDLE_SUBSTRING
211# define SHARABLE_MIDDLE_SUBSTRING 0
/*
 * Decides whether the len bytes of str starting at beg may be shared as a
 * substring.  The result depends on the SHARABLE_MIDDLE_SUBSTRING build switch.
 */
215SHARABLE_SUBSTRING_P(
VALUE str,
long beg,
long len)
217#if SHARABLE_MIDDLE_SUBSTRING
/* End offset of the requested substring within str. */
220 long end = beg +
len;
221 long source_len = RSTRING_LEN(str);
/*
 * Acceptable when the substring reaches the end of the source, or when the
 * TERM_LEN(str) bytes right after it are already zero.
 */
222 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
/*
 * Number of bytes available for embedded contents: the GC slot size of str
 * minus the offset of as.embed.ary inside struct RString.
 */
227str_embed_capa(
VALUE str)
229 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/* True when none of STR_NOFREE, STR_SHARED_ROOT and STR_SHARED is set on str. */
233rb_str_reembeddable_p(
VALUE str)
235 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239rb_str_embed_size(
long capa,
long termlen)
/*
 * Computes the size str occupies (or would occupy) in embedded form.
 * Three cases are distinguished: already embedded, re-embeddable, and
 * everything else.
 */
247rb_str_size_as_embedded(
VALUE str)
250 if (STR_EMBED_P(str)) {
/* A precomputed hash takes an extra st_index_t worth of bytes. */
252 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
254 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
258 else if (rb_str_reembeddable_p(str)) {
/* Same accounting as the embedded case. */
260 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
262 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
/* Otherwise only the RString header itself is counted. */
265 real_size =
sizeof(
struct RString);
/*
 * Whether contents of len bytes plus termlen terminator bytes can be
 * embedded: asks rb_gc_size_allocatable_p() about the size returned by
 * rb_str_embed_size(len, termlen).
 */
272STR_EMBEDDABLE_P(
long len,
long termlen)
274 return rb_gc_size_allocatable_p(rb_str_embed_size(
len, termlen));
/* Forward declarations of static helpers that are used before their definitions. */
279static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
280static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
282static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
283static inline void str_modifiable(
VALUE str);
/*
 * Gives str its own buffer without growing it: delegates to
 * str_make_independent_expand() with the current length, an expansion of 0
 * and the string's terminator length.
 */
288str_make_independent(
VALUE str)
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str),
len, 0L, termlen);
295static inline int str_dependent_p(
VALUE str);
/* Makes str independent, but only when str_dependent_p(str) reports a dependency. */
298rb_str_make_independent(
VALUE str)
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
/*
 * Moves the contents of a heap-allocated string back into the object itself
 * and releases the heap buffer.
 */
306rb_str_make_embedded(
VALUE str)
311 int termlen = TERM_LEN(str);
/* Remember the heap buffer and its allocated size (capacity + terminator) before the switch. */
312 char *buf =
RSTRING(str)->as.heap.ptr;
313 long old_capa =
RSTRING(str)->as.heap.aux.capa + termlen;
317 STR_SET_LEN(str,
len);
/* Copy the bytes from the old heap buffer to the string's current pointer, then free the old buffer. */
320 memcpy(RSTRING_PTR(str), buf,
len);
321 SIZED_FREE_N(buf, old_capa);
/*
 * Debugging aid: writes a diagnostic to stderr saying that a function is
 * returning NULL and that a SIGSEGV is expected to follow.  The leading %s
 * is presumably filled from func -- the argument line is not shown here.
 */
328rb_debug_rstring_null_ptr(
const char *func)
330 fprintf(stderr,
"%s is returning NULL!! "
331 "SIGSEGV is highly expected to follow immediately.\n"
332 "If you could reproduce, attach your debugger here, "
333 "and look at the passed string.\n",
338static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341get_encoding(
VALUE str)
/*
 * Raises ArgumentError ("invalid byte sequence in <encoding name>") when
 * is_broken_string(str) is true.
 */
347mustnot_broken(
VALUE str)
349 if (is_broken_string(str)) {
350 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
/*
 * Raises ArgumentError ("wide char encoding: <encoding name>") when the
 * minimum character length of enc is greater than 1.
 */
355mustnot_wchar(
VALUE str)
358 if (rb_enc_mbminlen(enc) > 1) {
359 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
363static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
365#if SIZEOF_LONG == SIZEOF_VOIDP
366#define PRECOMPUTED_FAKESTR_HASH 1
371BARE_STRING_P(
VALUE str)
/* Computes the hash of a string, starting from rb_memhash() over its bytes. */
376static inline st_index_t
377str_do_hash(
VALUE str)
379 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
/* Extra step taken only when e is non-zero and the string is not pure ASCII. */
381 if (e && !is_ascii_string(str)) {
/*
 * Stores `hash` inside the string's buffer, directly after the contents and
 * their terminator, and flags the string with STR_PRECOMPUTED_HASH.
 */
388str_store_precomputed_hash(
VALUE str, st_index_t hash)
/* Bytes in use (contents + terminator) and what is left of the embed capacity. */
394 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
395 size_t free_bytes = str_embed_capa(str) - used_bytes;
/* memcpy is used for the store; the destination is a char buffer offset. */
399 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
401 FL_SET(str, STR_PRECOMPUTED_HASH);
414 if (
FL_TEST(str, RSTRING_FSTR))
417 bare = BARE_STRING_P(str);
419 if (STR_EMBED_P(str)) {
424 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
431 rb_str_resize(str, RSTRING_LEN(str));
433 fstr = register_fstring(str,
false,
false);
436 str_replace_shared_without_enc(str, fstr);
443static VALUE fstring_table_obj;
/*
 * Hash callback of the fstring set.  When PRECOMPUTED_FAKESTR_HASH is
 * defined, a hash value is read back from the as.heap.aux.capa field
 * (register_fstring() stores the hash there when long and void* have the
 * same size).
 */
446fstring_concurrent_set_hash(
VALUE str)
448#ifdef PRECOMPUTED_FAKESTR_HASH
452 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
469 const char *aptr, *bptr;
476 return (alen == blen &&
478 memcmp(aptr, bptr, alen) == 0);
483 bool force_precompute_hash;
/*
 * Create callback of the fstring set: prepares the string object that gets
 * stored in the table for str.  data carries the caller's arguments
 * (accessed below as arg).
 */
487fstring_concurrent_set_create(
VALUE str,
void *data)
497 long len = RSTRING_LEN(str);
/* Capacity needed to hold the bytes plus a trailing st_index_t hash. */
498 long capa =
len +
sizeof(st_index_t);
499 int term_len = TERM_LEN(str);
/* Embedded copy with the hash stored inline, when requested and when it fits. */
501 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
503 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
504 STR_SET_LEN(new_str, RSTRING_LEN(str));
506 rb_enc_copy(new_str, str);
507 str_store_precomputed_hash(new_str, str_do_hash(str));
511 rb_enc_copy(new_str, str);
512#ifdef PRECOMPUTED_FAKESTR_HASH
/* Reuse the hash kept in the source's aux.capa field if new_str has room for it. */
513 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
514 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
/* Strings that are not frozen, or only chilled, take this branch. */
528 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
/* A shared string gets its own buffer first. */
531 if (STR_SHARED_P(str)) {
533 str_make_independent(str);
536 if (!BARE_STRING_P(str)) {
/* Mark the object as an fstring and as shareable. */
542 RBASIC(str)->flags |= RSTRING_FSTR;
544 RB_OBJ_SET_SHAREABLE(str);
558 .hash = fstring_concurrent_set_hash,
559 .cmp = fstring_concurrent_set_cmp,
560 .create = fstring_concurrent_set_create,
/*
 * Creates the global fstring table as a concurrent set using
 * fstring_concurrent_set_funcs (second argument 8192) and registers the
 * address of the global with the GC.
 */
565Init_fstring_table(
void)
567 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
568 rb_gc_register_address(&fstring_table_obj);
/*
 * Looks str up in the fstring table, inserting it when absent, and yields
 * the table's entry.  force_precompute_hash is forwarded to the create
 * callback through the args structure.
 */
572register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
576 .force_precompute_hash = force_precompute_hash
579#if SIZEOF_VOIDP == SIZEOF_LONG
/* Stash the hash in the aux.capa field so the set's hash callback can read it back. */
583 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
587 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
/* The entry handed back must not be a garbage object. */
589 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
/* Identity test: is obj the global fstring table object? */
601rb_obj_is_fstring_table(
VALUE obj)
605 return obj == fstring_table_obj;
/*
 * Removes obj from the fstring table by identity and bumps the
 * obj_str_fstr debug counter.  The VM lock (with barrier) is asserted
 * to be held on entry.
 */
609rb_gc_free_fstring(
VALUE obj)
611 ASSERT_vm_locking_with_barrier();
617 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
619 RB_DEBUG_COUNTER_INC(obj_str_fstr);
/*
 * Runs callback(str_slot, data) over the fstring table via
 * rb_concurrent_set_foreach_with_replace().  Does nothing while the table
 * has not been created (fstring_table_obj is 0).
 */
625rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
627 if (fstring_table_obj) {
628 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
/*
 * Initializes the caller-provided RString as a temporary "fake" string for
 * name/len/encidx and returns it cast to VALUE.  The shape id is reset to
 * ROOT_SHAPE_ID.
 */
633setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
636 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
649 return (
VALUE)fake_str;
658 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
/*
 * Returns the fstring for the len bytes at ptr, treated as US-ASCII.
 * A fake string on the stack is used as the lookup key; register_fstring()
 * is called with copy = false and force_precompute_hash = false.
 */
667rb_fstring_new(
const char *ptr,
long len)
669 struct RString fake_str = {RBASIC_INIT};
670 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
676 struct RString fake_str = {RBASIC_INIT};
677 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
/* rb_fstring_new() for a NUL-terminated C string; the length is taken with strlen(). */
681rb_fstring_cstr(
const char *
ptr)
683 return rb_fstring_new(
ptr, strlen(
ptr));
687single_byte_optimizable(
VALUE str)
691 case ENCINDEX_ASCII_8BIT:
692 case ENCINDEX_US_ASCII:
/*
 * Scans the bytes in [p, e) for one with the high bit (0x80) set and
 * returns a pointer to it when found.  The scan works a machine word at a
 * time where possible, with byte-wise checks for the unaligned head and
 * the tail.
 */
714static inline const char *
715search_nonascii(
const char *p,
const char *e)
/* NONASCII_MASK has the top bit of every byte of a uintptr_t set. */
719#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
720# if SIZEOF_UINTPTR_T == 8
721# define NONASCII_MASK UINT64_C(0x8080808080808080)
722# elif SIZEOF_UINTPTR_T == 4
723# define NONASCII_MASK UINT32_C(0x80808080)
725# error "don't know what to do."
728# if SIZEOF_UINTPTR_T == 8
729# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
730# elif SIZEOF_UINTPTR_T == 4
731# define NONASCII_MASK 0x80808080UL
733# error "don't know what to do."
/* Word-wise scanning is used when unaligned access is allowed or at least one word remains. */
737 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
738#if !UNALIGNED_WORD_ACCESS
/* Head: p is not word aligned; l is the byte count up to the next boundary. */
739 if ((uintptr_t)p % SIZEOF_VOIDP) {
740 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
/* Cases fall through, testing each of the bytes just before p in order. */
745 case 7:
if (p[-7]&0x80)
return p-7;
746 case 6:
if (p[-6]&0x80)
return p-6;
747 case 5:
if (p[-5]&0x80)
return p-5;
748 case 4:
if (p[-4]&0x80)
return p-4;
750 case 3:
if (p[-3]&0x80)
return p-3;
751 case 2:
if (p[-2]&0x80)
return p-2;
752 case 1:
if (p[-1]&0x80)
return p-1;
/* aligned_ptr: alignment hint for the compiler when available, identity otherwise. */
757#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
758#define aligned_ptr(value) \
759 __builtin_assume_aligned((value), sizeof(uintptr_t))
761#define aligned_ptr(value) (value)
/* t bounds the word loop so that a full word can always be read from s. */
764 t = (e - (SIZEOF_VOIDP-1));
766 for (;s < t; s +=
sizeof(uintptr_t)) {
/* Load the word with memcpy rather than through a casted pointer. */
768 memcpy(&word, s,
sizeof(word));
769 if (word & NONASCII_MASK) {
/*
 * Locate the first flagged byte inside the word: leading zero count on
 * big-endian, trailing zero count otherwise; >>3 converts bits to bytes.
 */
770#ifdef WORDS_BIGENDIAN
771 return (
const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
773 return (
const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
/* Tail: remaining bytes before e, tested one by one with fall-through cases. */
783 case 7:
if (e[-7]&0x80)
return e-7;
784 case 6:
if (e[-6]&0x80)
return e-6;
785 case 5:
if (e[-5]&0x80)
return e-5;
786 case 4:
if (e[-4]&0x80)
return e-4;
788 case 3:
if (e[-3]&0x80)
return e-3;
789 case 2:
if (e[-2]&0x80)
return e-2;
790 case 1:
if (e[-1]&0x80)
return e-1;
798 const char *e = p +
len;
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
802 p = search_nonascii(p, e);
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
810 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p = search_nonascii(p, e);
820 int ret = rb_enc_precise_mbclen(p, e, enc);
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
839 p = search_nonascii(p, e);
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
850 int ret = rb_enc_precise_mbclen(p, e, enc);
857 p = search_nonascii(p, e);
863 int ret = rb_enc_precise_mbclen(p, e, enc);
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
/*
 * Copies the encoding of src to dest (a string derived from src) and works
 * out dest's code range from what is known about src.
 */
896rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
901 str_enc_copy(dest, src);
/* An empty dest is handled separately, depending on ASCII compatibility of src's encoding. */
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
/* Non-ASCII-compatible encoding, or dest actually contains a high-bit byte. */
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
926rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
928 str_enc_copy(dest, src);
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
941 return enc_coderange_scan(str, enc);
945rbimpl_enc_str_coderange_scan(
VALUE str)
947 int cr = enc_coderange_scan(str, get_encoding(str));
952#undef rb_enc_str_coderange
959 cr = rbimpl_enc_str_coderange_scan(str);
963#define rb_enc_str_coderange rb_enc_str_coderange_inline
/*
 * True when the string's encoding index is one of the fast-path encodings
 * or when the encoding looked up from that index is ASCII compatible.
 */
966rb_enc_str_asciicompat(
VALUE str)
969 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
977 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
986str_mod_check(
VALUE s,
const char *p,
long len)
988 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/*
 * Capacity of str in bytes, excluding the terminator.
 *  - embedded: embed capacity minus termlen;
 *  - STR_SHARED or STR_NOFREE: handled by its own branch;
 *  - otherwise: the capacity recorded in as.heap.aux.capa.
 */
994str_capacity(
VALUE str,
const int termlen)
996 if (STR_EMBED_P(str)) {
997 return str_embed_capa(str) - termlen;
999 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1003 return RSTRING(str)->as.heap.aux.capa;
1010 return str_capacity(str, TERM_LEN(str));
1014must_not_null(
const char *
ptr)
1017 rb_raise(rb_eArgError,
"NULL pointer given");
/*
 * Allocates an embedded string object of class klass.  The object size is
 * rb_str_embed_size(capa, 0), i.e. capa is expected to already include any
 * terminator bytes.  The first embedded byte is set to 0.
 */
1022str_alloc_embed(
VALUE klass,
size_t capa)
1024 size_t size = rb_str_embed_size(
capa, 0);
1031 str->as.embed.ary[0] = 0;
/*
 * Allocates a string object of class klass in heap (non-embedded) form,
 * with capacity 0 and a NULL buffer pointer; the caller attaches the buffer.
 */
1037str_alloc_heap(
VALUE klass)
1042 str->as.heap.aux.capa = 0;
1043 str->as.heap.ptr = NULL;
/*
 * Allocates an empty embedded string of class klass (after firing the
 * string-creation DTrace hook with length 0) and zero-fills its entire
 * embed area.
 */
1049empty_str_alloc(
VALUE klass)
1051 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1052 VALUE str = str_alloc_embed(klass, 0);
1053 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1064 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1068 enc = rb_ascii8bit_encoding();
1071 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1073 int termlen = rb_enc_mbminlen(enc);
1075 if (STR_EMBEDDABLE_P(
len, termlen)) {
1076 str = str_alloc_embed(klass,
len + termlen);
1082 str = str_alloc_heap(klass);
1088 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1091 rb_enc_raw_set(str, enc);
1094 memcpy(RSTRING_PTR(str),
ptr,
len);
1097 memset(RSTRING_PTR(str), 0,
len);
1100 STR_SET_LEN(str,
len);
1101 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1108 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1143 __msan_unpoison_string(
ptr);
1163 if (rb_enc_mbminlen(enc) != 1) {
1164 rb_raise(rb_eArgError,
"wchar encoding given");
1166 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
/*
 * Creates a string of class klass for the len bytes at ptr, tagged with the
 * encoding at encindex.
 *
 * Raises ArgumentError ("negative string size (or size too big)") for an
 * invalid len.  One path builds an ordinary string through str_enc_new();
 * the other allocates a heap string object and marks it STR_NOFREE, so the
 * buffer is not freed together with the string.
 */
1170str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1175 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1179 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1182 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1183 str = str_alloc_heap(klass);
1187 RBASIC(str)->flags |= STR_NOFREE;
1188 rb_enc_associate_index(str, encindex);
1217static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1219 int ecflags,
VALUE ecopts);
1224 int encidx = rb_enc_to_index(enc);
1225 if (rb_enc_get_index(str) == encidx)
1226 return is_ascii_string(str);
1237 if (!to)
return str;
1238 if (!from) from = rb_enc_get(str);
1239 if (from == to)
return str;
1240 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1241 rb_is_ascii8bit_enc(to)) {
1242 if (STR_ENC_GET(str) != to) {
1244 rb_enc_associate(str, to);
1251 from, to, ecflags, ecopts);
1252 if (
NIL_P(newstr)) {
/*
 * Converts the len bytes at ptr and places the result into newstr at
 * offset ofs, delegating the conversion to str_cat_conv_enc_opts().
 * ofs may be negative, in which case it counts from the end of newstr.
 */
1260rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1265 olen = RSTRING_LEN(newstr);
/* ofs must lie within [-olen, olen]. */
1266 if (ofs < -olen || olen < ofs)
/* Normalize a negative offset to an absolute one. */
1268 if (ofs < 0) ofs += olen;
1270 STR_SET_LEN(newstr, ofs);
1274 rb_str_modify(newstr);
1275 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1283 STR_SET_LEN(str, 0);
1284 rb_enc_associate(str, enc);
/*
 * Worker for rb_str_cat_conv_enc_opts(): runs an encoding converter over
 * the len bytes at ptr, writing the output into newstr from offset ofs and
 * growing newstr as needed.  Returns Qnil when no converter (ec) is
 * available.
 */
1290str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1292 int ecflags,
VALUE ecopts)
1297 VALUE econv_wrapper;
1298 const unsigned char *start, *sp;
1299 unsigned char *dest, *dp;
/* Output starts at offset ofs inside newstr. */
1300 size_t converted_output = (size_t)ofs;
/* The wrapper object has its class cleared. */
1305 RBASIC_CLEAR_CLASS(econv_wrapper);
1307 if (!ec)
return Qnil;
1310 sp = (
unsigned char*)
ptr;
/* dest/dp are recomputed on every iteration because newstr may be resized below. */
1312 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1313 (dp = dest + converted_output),
/* Progress so far: consumed input bytes, remaining input, produced output bytes. */
1317 size_t converted_input = sp - start;
1318 size_t rest =
len - converted_input;
1319 converted_output = dp - dest;
/*
 * Scale the remaining input by the output/input ratio seen so far; the
 * LONG_MAX check keeps the multiplication from overflowing.
 */
1321 if (converted_input && converted_output &&
1322 rest < (LONG_MAX / converted_output)) {
1323 rest = (rest * converted_output) / converted_input;
/* Grow by the estimate, but by at least 2 bytes. */
1328 olen += rest < 2 ? 2 : rest;
1329 rb_str_resize(newstr, olen);
/* Resulting length: distance from the start of newstr's buffer to the output cursor. */
1336 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1338 rb_enc_associate(newstr, to);
1357 const int eidx = rb_enc_to_index(eenc);
1360 return rb_enc_str_new(
ptr,
len, eenc);
1364 if ((eidx == rb_ascii8bit_encindex()) ||
1365 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1369 ienc = rb_default_internal_encoding();
1370 if (!ienc || eenc == ienc) {
1371 return rb_enc_str_new(
ptr,
len, eenc);
1375 if ((eidx == rb_ascii8bit_encindex()) ||
1376 (eidx == rb_usascii_encindex()) ||
1377 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1378 return rb_enc_str_new(
ptr,
len, ienc);
1381 str = rb_enc_str_new(NULL, 0, ienc);
1384 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1385 rb_str_initialize(str,
ptr,
len, eenc);
1393 int eidx = rb_enc_to_index(eenc);
1394 if (eidx == rb_usascii_encindex() &&
1395 !is_ascii_string(str)) {
1396 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1399 rb_enc_associate_index(str, eidx);
/*
 * Makes str2 hold the same contents as str without touching str2's
 * encoding: the bytes are copied when they fit into str2's embed area,
 * otherwise str2 is turned into a string sharing a root buffer.
 */
1458str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1460 const int termlen = TERM_LEN(str);
/* Small enough: copy into str2's embedded storage and terminate it. */
1465 if (str_embed_capa(str2) >=
len + termlen) {
1466 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1467 STR_SET_EMBED(str2);
1468 memcpy(ptr2, RSTRING_PTR(str),
len);
1469 TERM_FILL(ptr2+
len, termlen);
/* str is itself shared: use its shared root. */
1473 if (STR_SHARED_P(str)) {
1474 root =
RSTRING(str)->as.heap.aux.shared;
/* str2 has a heap buffer that is neither shared nor STR_NOFREE. */
1483 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1485 rb_fatal(
"about to free a possible shared root");
1487 char *ptr2 = STR_HEAP_PTR(str2);
1489 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
/* Switch str2 to heap form and attach it to root. */
1492 FL_SET(str2, STR_NOEMBED);
1494 STR_SET_SHARED(str2, root);
1497 STR_SET_LEN(str2,
len);
1505 str_replace_shared_without_enc(str2, str);
1506 rb_enc_cr_str_exact_copy(str2, str);
1513 return str_replace_shared(str_alloc_heap(klass), str);
1530rb_str_new_frozen_String(
VALUE orig)
/* Fast path: orig is returned unchanged when it is a bare string that is already frozen. */
1538rb_str_frozen_bare_string(
VALUE orig)
1540 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
/*
 * Obtains a temporary frozen buffer string for orig via
 * str_new_frozen_buffer() with class 0 and without copying the encoding.
 */
1545rb_str_tmp_frozen_acquire(
VALUE orig)
1548 return str_new_frozen_buffer(0, orig, FALSE);
/*
 * Like rb_str_tmp_frozen_acquire(), but the result is a heap
 * (non-embedded) string object.
 */
1552rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
/* Already frozen, on the heap and not re-embeddable: orig itself can be used. */
1554 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
/* Shared with a non-embedded root: the ordinary acquire path is used. */
1555 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
/* New heap string with class 0, marked as a shared root. */
1557 VALUE str = str_alloc_heap(0);
1560 FL_SET(str, STR_SHARED_ROOT);
1562 size_t capa = str_capacity(orig, TERM_LEN(orig));
/* orig's buffer cannot be taken over in these cases; a fresh buffer of capa + terminator bytes is allocated. */
1568 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1569 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
/* Move the STR_NOFREE bit from orig to str and make orig share str. */
1576 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1577 RBASIC(orig)->flags &= ~STR_NOFREE;
1578 STR_SET_SHARED(orig, str);
1580 RB_OBJ_SET_SHAREABLE(str);
/* Adjust capa by the difference between orig's and str's terminator lengths. */
1586 RSTRING(str)->as.heap.aux.capa =
capa + (TERM_LEN(orig) - TERM_LEN(str));
/*
 * Releases a temporary string tmp previously acquired for orig.
 * An embedded tmp is handled by its own branch.  When orig is STR_TMPLOCK
 * but not STR_SHARED (plus further conditions), state is handed back to
 * orig: the capacity and the STR_NOFREE bit are copied from tmp, and tmp's
 * length is reset to 0.
 */
1592rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1597 if (STR_EMBED_P(tmp)) {
1600 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1606 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1610 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1611 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1616 STR_SET_LEN(tmp, 0);
1624 return str_new_frozen_buffer(klass, orig, TRUE);
1634 VALUE str = str_alloc_heap(klass);
1635 STR_SET_LEN(str, RSTRING_LEN(orig));
1636 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1637 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1638 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1639 RBASIC(orig)->flags &= ~STR_NOFREE;
1640 STR_SET_SHARED(orig, str);
/*
 * Builds a string of class klass that holds orig's bytes, either by copying
 * or by sharing a buffer.  With copy_encoding the encoding and code range
 * of orig are carried over; without it the result is treated as
 * ASCII-8BIT with a 1-byte terminator.
 */
1647str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1651 long len = RSTRING_LEN(orig);
1652 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1653 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
/* Embedded or embeddable contents are simply copied into a new string. */
1655 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1656 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
/* Position of orig inside the shared buffer: bytes before (ofs) and after (rest) it. */
1662 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1663 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1669 if ((ofs > 0) || (rest > 0) ||
/* Share the root, then narrow the new string to orig's window. */
1672 str = str_new_shared(klass,
shared);
1674 RSTRING(str)->as.heap.ptr += ofs;
1675 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
/* Embeddable by orig's own length and terminator: copy into an embedded string. */
1683 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1684 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1686 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 STR_SET_LEN(str, RSTRING_LEN(orig));
1693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1696 str = heap_str_make_shared(klass, orig);
1701 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1713str_new_empty_String(
VALUE str)
1716 rb_enc_copy(v, str);
1720#define STR_BUF_MIN_SIZE 63
1725 if (STR_EMBEDDABLE_P(
capa, 1)) {
1733 RSTRING(str)->as.heap.ptr[0] =
'\0';
1753 return str_new(0, 0,
len);
1759 if (STR_EMBED_P(str)) {
1760 RB_DEBUG_COUNTER_INC(obj_str_embed);
1762 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1763 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1764 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1767 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1768 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/*
 * Memory attributed to str beyond the object slot: the heap buffer size
 * when the string is non-embedded and neither STR_SHARED nor STR_NOFREE.
 */
1773rb_str_memsize(
VALUE str)
1775 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1776 return STR_HEAP_SIZE(str);
1786 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1789static inline void str_discard(
VALUE str);
1790static void str_shared_replace(
VALUE str,
VALUE str2);
1795 if (str != str2) str_shared_replace(str, str2);
1806 enc = STR_ENC_GET(str2);
1809 termlen = rb_enc_mbminlen(enc);
1811 STR_SET_LEN(str, RSTRING_LEN(str2));
1813 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1816 rb_enc_associate(str, enc);
1820 if (STR_EMBED_P(str2)) {
1822 long len = RSTRING_LEN(str2);
1825 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1826 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1827 RSTRING(str2)->as.heap.ptr = new_ptr;
1828 STR_SET_LEN(str2,
len);
1830 STR_SET_NOEMBED(str2);
1833 STR_SET_NOEMBED(str);
1835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1837 if (
FL_TEST(str2, STR_SHARED)) {
1839 STR_SET_SHARED(str,
shared);
1842 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1846 STR_SET_EMBED(str2);
1847 RSTRING_PTR(str2)[0] = 0;
1848 STR_SET_LEN(str2, 0);
1849 rb_enc_associate(str, enc);
1863 return rb_obj_as_string_result(str, obj);
1879 len = RSTRING_LEN(str2);
1880 if (STR_SHARED_P(str2)) {
1883 STR_SET_NOEMBED(str);
1884 STR_SET_LEN(str,
len);
1885 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1886 STR_SET_SHARED(str,
shared);
1887 rb_enc_cr_str_exact_copy(str, str2);
1890 str_replace_shared(str, str2);
1899 size_t size = rb_str_embed_size(
capa, 0);
1915 str->as.heap.aux.capa = 0;
1916 str->as.heap.ptr = NULL;
1926 encidx = rb_enc_get_index(str);
1927 flags &= ~ENCODING_MASK;
1930 if (encidx) rb_enc_associate_index(dup, encidx);
1940 long len = RSTRING_LEN(str);
1945 STR_SET_LEN(dup, RSTRING_LEN(str));
1946 return str_duplicate_setup_encoding(str, dup, flags);
1955 root =
RSTRING(str)->as.heap.aux.shared;
1958 root = str = str_new_frozen(klass, str);
1964 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1966 STR_SET_SHARED(dup, root);
1967 flags |= RSTRING_NOEMBED | STR_SHARED;
1969 STR_SET_LEN(dup, RSTRING_LEN(str));
1970 return str_duplicate_setup_encoding(str, dup, flags);
1976 if (STR_EMBED_P(str)) {
1977 return str_duplicate_setup_embed(klass, str, dup);
1980 return str_duplicate_setup_heap(klass, str, dup);
1988 if (STR_EMBED_P(str)) {
1989 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1992 dup = str_alloc_heap(klass);
1995 return str_duplicate_setup(klass, str, dup);
2006rb_str_dup_m(
VALUE str)
2008 if (LIKELY(BARE_STRING_P(str))) {
2019 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2026 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2030 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2031 str_duplicate_setup_embed(klass, str, new_str);
2034 new_str = ec_str_alloc_heap(ec, klass);
2035 str_duplicate_setup_heap(klass, str, new_str);
2044rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2046 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2070 static ID keyword_ids[2];
2071 VALUE orig, opt, venc, vcapa;
2076 if (!keyword_ids[0]) {
2077 keyword_ids[0] = rb_id_encoding();
2078 CONST_ID(keyword_ids[1],
"capacity");
2086 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2087 enc = rb_to_encoding(venc);
2089 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2092 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2094 if (
capa < STR_BUF_MIN_SIZE) {
2095 capa = STR_BUF_MIN_SIZE;
2099 len = RSTRING_LEN(orig);
2103 if (orig == str) n = 0;
2105 str_modifiable(str);
2106 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2108 const size_t size = (size_t)
capa + termlen;
2109 const char *
const old_ptr = RSTRING_PTR(str);
2110 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2111 char *new_ptr =
ALLOC_N(
char, size);
2112 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2113 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2115 RSTRING(str)->as.heap.ptr = new_ptr;
2117 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2118 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2119 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2121 STR_SET_LEN(str,
len);
2124 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2125 rb_enc_cr_str_exact_copy(str, orig);
2127 FL_SET(str, STR_NOEMBED);
2134 rb_enc_associate(str, enc);
/*
 * Class-level constructor taking an optional source string plus the
 * `encoding` and `capacity` keyword arguments.
 */
2146rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
/* Keyword IDs are cached in a function-local static. */
2152 static ID keyword_ids[2];
2162 keyword_ids[0] = rb_id_encoding();
2163 CONST_ID(keyword_ids[1],
"capacity");
2165 encoding = kwargs[0];
2166 capacity = kwargs[1];
/* No encoding keyword: the encoding of orig is used. */
2175 if (UNDEF_P(encoding)) {
2177 encoding = rb_obj_encoding(orig);
2181 if (!UNDEF_P(encoding)) {
2182 enc = rb_to_encoding(encoding);
/* No capacity keyword: either an empty string or a duplicate of orig, tagged with enc. */
2186 if (UNDEF_P(capacity)) {
2188 VALUE empty_str = str_new(klass,
"", 0);
2190 rb_enc_associate(empty_str, enc);
2194 VALUE copy = str_duplicate(klass, orig);
2195 rb_enc_associate(copy, enc);
2208 if (orig_capa >
capa) {
/* Allocate a buffer of capa bytes, then reset the length to 0. */
2213 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2214 STR_SET_LEN(str, 0);
/* True for every byte whose top two bits are not 10, i.e. not of the form 10xxxxxx. */
2225#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Counts, within the machine word at s, the bytes that are not of the form
 * 10xxxxxx (the same test as is_utf8_lead_byte, applied to all bytes at once).
 */
2240static inline uintptr_t
2241count_utf8_lead_bytes_with_word(
const uintptr_t *s)
/*
 * For each byte, bit 0 becomes (bit6 | ~bit7); the mask keeps only that
 * lowest bit per byte.
 */
2246 d = (d>>6) | (~d>>7);
2247 d &= NONASCII_MASK >> 7;
/* The surviving bits are then counted; popcount is used when the build provides it. */
2250#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2252 return rb_popcount_intptr(d);
2256# if SIZEOF_VOIDP == 8
/*
 * Counts the characters in [p, e) for encoding enc, given code range cr.
 * Several strategies are used depending on the encoding.
 */
2265enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
/* Byte span divided by the minimum character width, rounded up. */
2271 long diff = (long)(e - p);
2272 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
/* Lead-byte counting: byte-wise up to the first aligned word, word-wise in the middle, byte-wise for the rest. */
2277 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2278 const uintptr_t *s, *t;
2279 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2280 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2281 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2282 while (p < (
const char *)s) {
2283 if (is_utf8_lead_byte(*p))
len++;
2287 len += count_utf8_lead_bytes_with_word(s);
2290 p = (
const char *)s;
2293 if (is_utf8_lead_byte(*p))
len++;
/* ASCII-compatible encodings: search_nonascii() locates non-ASCII bytes, which are then measured per character. */
2299 else if (rb_enc_asciicompat(enc)) {
2304 q = search_nonascii(p, e);
2310 p += rb_enc_fast_mbclen(p, e, enc);
2317 q = search_nonascii(p, e);
2323 p += rb_enc_mbclen(p, e, enc);
/* Generic fallback: advance one character at a time, counting iterations. */
2330 for (c=0; p<e; c++) {
2331 p += rb_enc_mbclen(p, e, enc);
/*
 * Counts the characters in [p, e) for encoding enc using precise
 * multibyte length checks; cr is an output parameter for the code range.
 */
2346rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
/* Byte span divided by the minimum character width, rounded up. */
2354 long diff = (long)(e - p);
2355 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2357 else if (rb_enc_asciicompat(enc)) {
2361 q = search_nonascii(p, e);
2369 ret = rb_enc_precise_mbclen(p, e, enc);
2384 for (c=0; p<e; c++) {
2385 ret = rb_enc_precise_mbclen(p, e, enc);
/* Advance by the minimum character width while that still fits before e. */
2392 if (p + rb_enc_mbminlen(enc) <= e)
2393 p += rb_enc_mbminlen(enc);
2409 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2410 if (!enc) enc = STR_ENC_GET(str);
2411 p = RSTRING_PTR(str);
2416 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2421 return enc_strlen(p, e, enc, cr);
2428 return str_strlen(str, NULL);
2442 return LONG2NUM(str_strlen(str, NULL));
2454rb_str_bytesize(
VALUE str)
/* Returns a Ruby boolean: true when the byte length of str is 0. */
2473rb_str_empty(
VALUE str)
2475 return RBOOL(RSTRING_LEN(str) == 0);
2494 char *ptr1, *ptr2, *ptr3;
2499 enc = rb_enc_check_str(str1, str2);
2502 termlen = rb_enc_mbminlen(enc);
2503 if (len1 > LONG_MAX - len2) {
2504 rb_raise(rb_eArgError,
"string size too big");
2506 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2507 ptr3 = RSTRING_PTR(str3);
2508 memcpy(ptr3, ptr1, len1);
2509 memcpy(ptr3+len1, ptr2, len2);
2510 TERM_FILL(&ptr3[len1+len2], termlen);
2526 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2529 int enc1 = rb_enc_get_index(str1);
2530 int enc2 = rb_enc_get_index(str2);
2535 else if (enc2 < 0) {
2538 else if (enc1 != enc2) {
2541 else if (len1 > LONG_MAX - len2) {
2575 rb_enc_copy(str2, str);
2580 rb_raise(rb_eArgError,
"negative argument");
2582 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2583 if (STR_EMBEDDABLE_P(
len, 1)) {
2585 memset(RSTRING_PTR(str2), 0,
len + 1);
2592 STR_SET_LEN(str2,
len);
2593 rb_enc_copy(str2, str);
2596 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2597 rb_raise(rb_eArgError,
"argument too big");
2600 len *= RSTRING_LEN(str);
2601 termlen = TERM_LEN(str);
2603 ptr2 = RSTRING_PTR(str2);
2605 n = RSTRING_LEN(str);
2606 memcpy(ptr2, RSTRING_PTR(str), n);
2607 while (n <=
len/2) {
2608 memcpy(ptr2 + n, ptr2, n);
2611 memcpy(ptr2 + n, ptr2,
len-n);
2613 STR_SET_LEN(str2,
len);
2614 TERM_FILL(&ptr2[
len], termlen);
2615 rb_enc_cr_str_copy_for_substr(str2, str);
/* Checks whether str is temporarily locked (STR_TMPLOCK set) and reacts if so. */
2654rb_check_lockedtmp(
VALUE str)
2656 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Any of these flags makes a string unmodifiable: frozen, temporarily locked, or chilled. */
2663#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
/*
 * Verifies that str may be modified.  The common case (no flag of
 * STR_UNMODIFIABLE_MASK set) is a single flag test; otherwise a chilled
 * string is reported through CHILLED_STRING_MUTATED(), and the tmp-lock
 * and frozen checks are run.
 */
2665str_modifiable(
VALUE str)
2669 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2670 if (CHILLED_STRING_P(str)) {
2671 CHILLED_STRING_MUTATED(str);
2673 rb_check_lockedtmp(str);
2674 rb_check_frozen(str);
/*
 * Tells whether str depends on a buffer it does not own.  Embedded strings
 * and heap strings with neither STR_SHARED nor STR_NOFREE take the first branch.
 */
2679str_dependent_p(
VALUE str)
2681 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* Flags that require the slow path: the unmodifiable flags plus STR_SHARED and STR_NOFREE. */
2691#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
/*
 * Checks that str is modifiable and reports whether it owns its buffer.
 * When any flag of STR_DEPENDANT_MASK is set, str_modifiable() is run and
 * the result is the negation of str_dependent_p().
 */
2693str_independent(
VALUE str)
2697 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2698 str_modifiable(str);
2699 return !str_dependent_p(str);
/*
 * Gives str a private buffer holding its first len bytes, with room for
 * `expand` additional bytes and a terminator of termlen bytes.
 */
2705str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
/* A heap string whose new capacity fits into the object's embed area. */
2715 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2720 STR_SET_LEN(str,
len);
/* Otherwise copy the contents from the old buffer into the new one (ptr). */
2725 oldptr = RSTRING_PTR(str);
2727 memcpy(
ptr, oldptr,
len);
/* The old buffer is freed only when str owned it: non-embedded, not STR_NOFREE, not STR_SHARED. */
2729 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2730 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2732 STR_SET_NOEMBED(str);
2733 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2734 TERM_FILL(
ptr +
len, termlen);
2736 STR_SET_LEN(str,
len);
2743 if (!str_independent(str))
2744 str_make_independent(str);
2753 int termlen = TERM_LEN(str);
2754 long len = RSTRING_LEN(str);
2757 rb_raise(rb_eArgError,
"negative expanding string size");
2759 if (expand >= LONG_MAX -
len) {
2760 rb_raise(rb_eArgError,
"string size too big");
2763 if (!str_independent(str)) {
2764 str_make_independent_expand(str,
len, expand, termlen);
2766 else if (expand > 0) {
2767 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2774str_modify_keep_cr(
VALUE str)
2776 if (!str_independent(str))
2777 str_make_independent(str);
2784str_discard(
VALUE str)
2786 str_modifiable(str);
2787 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2788 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2789 RSTRING(str)->as.heap.ptr = 0;
2790 STR_SET_LEN(str, 0);
2797 int encindex = rb_enc_get_index(str);
2799 if (RB_UNLIKELY(encindex == -1)) {
2803 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2808 if (!rb_enc_asciicompat(enc)) {
2830 return RSTRING_PTR(str);
2834str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2836 const char *e = s +
len;
2838 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2839 if (zero_filled(s, minlen))
return s;
2845str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2850 if (str_dependent_p(str)) {
2851 if (!zero_filled(s +
len, termlen))
2852 str_make_independent_expand(str,
len, 0L, termlen);
2855 TERM_FILL(s +
len, termlen);
2858 return RSTRING_PTR(str);
2862rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2864 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2865 long len = RSTRING_LEN(str);
2869 rb_check_lockedtmp(str);
2870 str_make_independent_expand(str,
len, 0L, termlen);
2872 else if (str_dependent_p(str)) {
2873 if (termlen > oldtermlen)
2874 str_make_independent_expand(str,
len, 0L, termlen);
2877 if (!STR_EMBED_P(str)) {
2882 if (termlen > oldtermlen) {
2883 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2891str_null_check(
VALUE str,
int *w)
2893 char *s = RSTRING_PTR(str);
2894 long len = RSTRING_LEN(str);
2897 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2899 minlen = rb_enc_mbminlen(enc);
2903 if (str_null_char(s,
len, minlen, enc)) {
2906 return str_fill_term(str, s,
len, minlen);
2911 if (!s || memchr(s, 0,
len)) {
2915 s = str_fill_term(str, s,
len, minlen);
2921rb_str_null_check(
VALUE str)
2929 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2930 if (!s || memchr(s, 0,
len)) {
2931 rb_raise(rb_eArgError,
"string contains null byte");
2936 const char *s = str_null_check(str, &w);
2939 rb_raise(rb_eArgError,
"string contains null char");
2941 rb_raise(rb_eArgError,
"string contains null byte");
2949rb_str_to_cstr(
VALUE str)
2952 return str_null_check(str, &w);
2960 char *s = str_null_check(str, &w);
2963 rb_raise(rb_eArgError,
"string contains null char");
2965 rb_raise(rb_eArgError,
"string contains null byte");
2971rb_str_fill_terminator(
VALUE str,
const int newminlen)
2973 char *s = RSTRING_PTR(str);
2974 long len = RSTRING_LEN(str);
2975 return str_fill_term(str, s,
len, newminlen);
2981 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
3007str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
3016 else if (rb_enc_asciicompat(enc)) {
3017 const char *p2, *e2;
3020 while (p < e && 0 < nth) {
3027 p2 = search_nonascii(p, e2);
3036 n = rb_enc_mbclen(p, e, enc);
3047 while (p < e && nth--) {
3048 p += rb_enc_mbclen(p, e, enc);
3059 return str_nth_len(p, e, &nth, enc);
3063str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3068 p = str_nth_len(p, e, &nth, enc);
3077str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
3079 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3080 if (!pp)
return e - p;
3087 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
3088 STR_ENC_GET(str), single_byte_optimizable(str));
3093str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3096 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3097 const uintptr_t *s, *t;
3098 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3099 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3100 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3101 while (p < (
const char *)s) {
3102 if (is_utf8_lead_byte(*p)) nth--;
3106 nth -= count_utf8_lead_bytes_with_word(s);
3108 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3112 if (is_utf8_lead_byte(*p)) {
3113 if (nth == 0)
break;
3123str_utf8_offset(
const char *p,
const char *e,
long nth)
3125 const char *pp = str_utf8_nth(p, e, &nth);
3134 if (single_byte_optimizable(str) || pos < 0)
3137 char *p = RSTRING_PTR(str);
3138 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3143str_subseq(
VALUE str,
long beg,
long len)
3151 const int termlen = TERM_LEN(str);
3152 if (!SHARABLE_SUBSTRING_P(str, beg,
len)) {
3153 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg,
len, rb_str_enc_get(str));
3162 if (str_embed_capa(str2) >=
len + termlen) {
3163 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3164 STR_SET_EMBED(str2);
3165 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3166 TERM_FILL(ptr2+
len, termlen);
3168 STR_SET_LEN(str2,
len);
3176 str_replace_shared(str2, str);
3182 RSTRING(str2)->as.heap.ptr += beg;
3183 if (RSTRING_LEN(str2) >
len) {
3184 STR_SET_LEN(str2,
len);
3194 VALUE str2 = str_subseq(str, beg,
len);
3195 rb_enc_cr_str_copy_for_substr(str2, str);
3204 const long blen = RSTRING_LEN(str);
3206 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3208 if (
len < 0)
return 0;
3209 if (beg < 0 && -beg < 0)
return 0;
3213 if (single_byte_optimizable(str)) {
3214 if (beg > blen)
return 0;
3217 if (beg < 0)
return 0;
3219 if (
len > blen - beg)
3221 if (
len < 0)
return 0;
3226 if (
len > -beg)
len = -beg;
3230 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3233 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3239 slen = str_strlen(str, enc);
3241 if (beg < 0)
return 0;
3243 if (
len == 0)
goto end;
3246 else if (beg > 0 && beg > blen) {
3250 if (beg > str_strlen(str, enc))
return 0;
3255 enc == rb_utf8_encoding()) {
3256 p = str_utf8_nth(s, e, &beg);
3257 if (beg > 0)
return 0;
3258 len = str_utf8_offset(p, e,
len);
3264 p = s + beg * char_sz;
3268 else if (
len * char_sz > e - p)
3273 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3274 if (beg > 0)
return 0;
3278 len = str_offset(p, e,
len, enc, 0);
/*
 * Forward declaration: str_substr() is called (with empty = TRUE) before its
 * definition appears further down in this file.
 */
3286static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3291 return str_substr(str, beg,
len, TRUE);
3301str_substr(
VALUE str,
long beg,
long len,
int empty)
3305 if (!p)
return Qnil;
3306 if (!
len && !empty)
return Qnil;
3308 beg = p - RSTRING_PTR(str);
3310 VALUE str2 = str_subseq(str, beg,
len);
3311 rb_enc_cr_str_copy_for_substr(str2, str);
3319 if (CHILLED_STRING_P(str)) {
3324 rb_str_resize(str, RSTRING_LEN(str));
3342 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3385str_uminus(
VALUE str)
3390 return rb_fstring(str);
/*
 * From this point on, rb_str_dup_frozen expands to rb_str_new_frozen.
 * The name is #undef'ed near the top of the file, so this is the only
 * macro definition of it in effect here.
 */
3394#define rb_str_dup_frozen rb_str_new_frozen
3399 rb_check_frozen(str);
3400 if (
FL_TEST(str, STR_TMPLOCK)) {
3403 FL_SET(str, STR_TMPLOCK);
3410 rb_check_frozen(str);
3411 if (!
FL_TEST(str, STR_TMPLOCK)) {
3431 const int termlen = TERM_LEN(str);
3433 str_modifiable(str);
3434 if (STR_SHARED_P(str)) {
3437 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3438 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3449 else if (
len > RSTRING_LEN(str)) {
3453 const char *
const new_end = RSTRING_PTR(str) +
len;
3463 else if (
len < RSTRING_LEN(str)) {
3471 STR_SET_LEN(str,
len);
3472 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3479 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3482 int independent = str_independent(str);
3483 long slen = RSTRING_LEN(str);
3484 const int termlen = TERM_LEN(str);
3486 if (slen >
len || (termlen != 1 && slen <
len)) {
3492 if (STR_EMBED_P(str)) {
3493 if (
len == slen)
return str;
3494 if (str_embed_capa(str) >=
len + termlen) {
3495 STR_SET_LEN(str,
len);
3499 str_make_independent_expand(str, slen,
len - slen, termlen);
3501 else if (str_embed_capa(str) >=
len + termlen) {
3503 char *
ptr = STR_HEAP_PTR(str);
3505 if (slen >
len) slen =
len;
3508 STR_SET_LEN(str,
len);
3510 SIZED_FREE_N(
ptr,
capa + termlen);
3514 else if (!independent) {
3515 if (
len == slen)
return str;
3516 str_make_independent_expand(str, slen,
len - slen, termlen);
3520 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3521 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3524 else if (
len == slen)
return str;
3525 STR_SET_LEN(str,
len);
3532str_ensure_available_capa(
VALUE str,
long len)
3534 str_modify_keep_cr(str);
3536 const int termlen = TERM_LEN(str);
3537 long olen = RSTRING_LEN(str);
3539 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3540 rb_raise(rb_eArgError,
"string sizes too big");
3543 long total = olen +
len;
3544 long capa = str_capacity(str, termlen);
3547 if (total >= LONG_MAX / 2) {
3550 while (total >
capa) {
3553 RESIZE_CAPA_TERM(str,
capa, termlen);
3558str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3561 str_modify_keep_cr(str);
3566 if (
len == 0)
return 0;
3568 long total, olen,
off = -1;
3570 const int termlen = TERM_LEN(str);
3573 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3577 long capa = str_capacity(str, termlen);
3579 if (olen > LONG_MAX -
len) {
3580 rb_raise(rb_eArgError,
"string sizes too big");
3584 if (total >= LONG_MAX / 2) {
3587 while (total >
capa) {
3590 RESIZE_CAPA_TERM(str,
capa, termlen);
3591 sptr = RSTRING_PTR(str);
3596 memcpy(sptr + olen,
ptr,
len);
3597 STR_SET_LEN(str, total);
3598 TERM_FILL(sptr + total, termlen);
/*
 * Shorthands for str_buf_cat4() that always pass keep_cr = false.
 *
 *   str_buf_cat(str, ptr, len)  - append len bytes starting at ptr.
 *   str_buf_cat2(str, ptr)      - length is taken from rb_strlen_lit(ptr);
 *                                 presumably ptr must be a string literal
 *                                 (TODO confirm against rb_strlen_lit).
 *
 * NOTE(review): `len` is substituted without parentheses, unlike `str` and
 * `ptr`; ptr is expanded twice in str_buf_cat2.
 */
3603#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3604#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3609 if (
len == 0)
return str;
3611 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3613 return str_buf_cat(str,
ptr,
len);
3624rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3629 if (UNLIKELY(!str_independent(str))) {
3630 str_make_independent(str);
3633 long string_length = -1;
3634 const int null_terminator_length = 1;
3639 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3640 rb_raise(rb_eArgError,
"string sizes too big");
3643 long string_capacity = str_capacity(str, null_terminator_length);
3649 if (LIKELY(string_capacity >= string_length + 1)) {
3651 sptr[string_length] = byte;
3652 STR_SET_LEN(str, string_length + 1);
3653 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3657 str_buf_cat(str, (
char *)&
byte, 1);
3673 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3684rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3685 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3694 if (str_encindex == ptr_encindex) {
3696 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3700 str_enc = rb_enc_from_index(str_encindex);
3701 ptr_enc = rb_enc_from_index(ptr_encindex);
3702 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3705 if (RSTRING_LEN(str) == 0) {
3708 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3714 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3718 str_cr = rb_enc_str_coderange(str);
3723 *ptr_cr_ret = ptr_cr;
3725 if (str_encindex != ptr_encindex &&
3728 str_enc = rb_enc_from_index(str_encindex);
3729 ptr_enc = rb_enc_from_index(ptr_encindex);
3734 res_encindex = str_encindex;
3739 res_encindex = str_encindex;
3743 res_encindex = ptr_encindex;
3748 res_encindex = str_encindex;
3755 res_encindex = str_encindex;
3761 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3763 str_buf_cat(str,
ptr,
len);
3769 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3776 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3786 if (rb_enc_asciicompat(enc)) {
3787 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3793 unsigned int c = (
unsigned char)*
ptr;
3794 int len = rb_enc_codelen(c, enc);
3795 rb_enc_mbcput(c, buf, enc);
3796 rb_enc_cr_str_buf_cat(str, buf,
len,
3807 int str2_cr = rb_enc_str_coderange(str2);
3809 if (rb_str_enc_fastpath(str)) {
3813 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3819 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3830 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3846rb_str_concat_literals(
size_t num,
const VALUE *strary)
3850 unsigned long len = 1;
3855 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3857 str_enc_copy_direct(str, strary[0]);
3859 for (i = s; i < num; ++i) {
3860 const VALUE v = strary[i];
3864 if (encidx != ENCINDEX_US_ASCII) {
3866 rb_enc_set_index(str, encidx);
3879rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3881 str_modifiable(str);
3886 else if (argc > 1) {
3889 rb_enc_copy(arg_str, str);
3890 for (i = 0; i < argc; i++) {
3925rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3927 long needed_capacity = 0;
3931 for (
int index = 0; index < argc; index++) {
3932 VALUE obj = argv[index];
3940 needed_capacity += RSTRING_LEN(obj);
3945 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3952 str_ensure_available_capa(str, needed_capacity);
3955 for (
int index = 0; index < argc; index++) {
3956 VALUE obj = argv[index];
3961 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3962 char byte = (char)(
NUM2INT(obj) & 0xFF);
3976 rb_bug(
"append_as_bytes arguments should have been validated");
3980 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3981 TERM_FILL(sptr, TERM_LEN(str));
3986 for (
int index = 0; index < argc; index++) {
3987 VALUE obj = argv[index];
4004 rb_bug(
"append_as_bytes arguments should have been validated");
4083 if (rb_num_to_uint(str2, &code) == 0) {
4096 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4099 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4102 long pos = RSTRING_LEN(str1);
4107 switch (
len = rb_enc_codelen(code, enc)) {
4108 case ONIGERR_INVALID_CODE_POINT_VALUE:
4109 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4111 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4117 rb_enc_mbcput(code, buf, enc);
4118 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4119 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4121 rb_str_resize(str1, pos+
len);
4122 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
4135rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
4137 int encidx = rb_enc_to_index(enc);
4139 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4144 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4145 return ENCINDEX_ASCII_8BIT;
4167rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4169 str_modifiable(str);
4174 else if (argc > 1) {
4177 rb_enc_copy(arg_str, str);
4178 for (i = 0; i < argc; i++) {
4191 st_index_t precomputed_hash;
4192 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4194 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4195 return precomputed_hash;
4198 return str_do_hash(str);
4205 const char *ptr1, *ptr2;
4208 return (len1 != len2 ||
4210 memcmp(ptr1, ptr2, len1) != 0);
4222rb_str_hash_m(
VALUE str)
/*
 * Evaluates to the smaller of a and b.  Each argument appears twice in the
 * expansion, so it may be evaluated twice: pass only side-effect-free
 * expressions.
 */
4228#define lesser(a,b) (((a)>(b))?(b):(a))
4236 if (RSTRING_LEN(str1) == 0)
return TRUE;
4237 if (RSTRING_LEN(str2) == 0)
return TRUE;
4240 if (idx1 == idx2)
return TRUE;
4241 rc1 = rb_enc_str_coderange(str1);
4242 rc2 = rb_enc_str_coderange(str2);
4245 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4249 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4259 const char *ptr1, *ptr2;
4262 if (str1 == str2)
return 0;
4265 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4274 if (len1 > len2)
return 1;
4277 if (retval > 0)
return 1;
4311 if (str1 == str2)
return Qtrue;
4318 return rb_str_eql_internal(str1, str2);
4332 if (str1 == str2)
return Qtrue;
4334 return rb_str_eql_internal(str1, str2);
4372 return rb_invcmp(str1, str2);
4414 return str_casecmp(str1, s);
4422 const char *p1, *p1end, *p2, *p2end;
4424 enc = rb_enc_compatible(str1, str2);
4429 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4430 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4431 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4432 while (p1 < p1end && p2 < p2end) {
4434 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4435 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4437 return INT2FIX(c1 < c2 ? -1 : 1);
4444 while (p1 < p1end && p2 < p2end) {
4445 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4446 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4448 if (0 <= c1 && 0 <= c2) {
4452 return INT2FIX(c1 < c2 ? -1 : 1);
4456 l1 = rb_enc_mbclen(p1, p1end, enc);
4457 l2 = rb_enc_mbclen(p2, p2end, enc);
4458 len = l1 < l2 ? l1 : l2;
4459 r = memcmp(p1, p2,
len);
4461 return INT2FIX(r < 0 ? -1 : 1);
4463 return INT2FIX(l1 < l2 ? -1 : 1);
4469 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4470 if (p1 == p1end)
return INT2FIX(-1);
4503 return str_casecmp_p(str1, s);
4510 VALUE folded_str1, folded_str2;
4511 VALUE fold_opt = sym_fold;
4513 enc = rb_enc_compatible(str1, str2);
4518 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4519 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4521 return rb_str_eql(folded_str1, folded_str2);
4525strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4526 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4528 const char *search_start = str_ptr;
4529 long pos, search_len = str_len - offset;
4533 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4534 if (pos < 0)
return pos;
4536 if (t == search_start + pos)
break;
4537 search_len -= t - search_start;
4538 if (search_len <= 0)
return -1;
4539 offset += t - search_start;
4542 return pos + offset;
/*
 * Both names forward to rb_strseq_index(); they differ only in the final
 * in_byte argument.
 *
 *   rb_str_index      - in_byte = 0: rb_strseq_index() converts offset with
 *                       str_offset() before searching.
 *   rb_str_byteindex  - in_byte = 1: offset is used as given, and lengths
 *                       are the byte lengths from RSTRING_LEN().
 */
4546#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4547#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4550rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4552 const char *str_ptr, *str_ptr_end, *sub_ptr;
4553 long str_len, sub_len;
4556 enc = rb_enc_check(str, sub);
4557 if (is_broken_string(sub))
return -1;
4559 str_ptr = RSTRING_PTR(str);
4561 str_len = RSTRING_LEN(str);
4562 sub_ptr = RSTRING_PTR(sub);
4563 sub_len = RSTRING_LEN(sub);
4565 if (str_len < sub_len)
return -1;
4568 long str_len_char, sub_len_char;
4569 int single_byte = single_byte_optimizable(str);
4570 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4571 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4573 offset += str_len_char;
4574 if (offset < 0)
return -1;
4576 if (str_len_char - offset < sub_len_char)
return -1;
4577 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4580 if (sub_len == 0)
return offset;
4583 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4596rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4603 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4604 long slen = str_strlen(str, enc);
4606 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4618 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4619 enc, single_byte_optimizable(str));
4630 pos = rb_str_index(str, sub, pos);
4644str_ensure_byte_pos(
VALUE str,
long pos)
4646 if (!single_byte_optimizable(str)) {
4647 const char *s = RSTRING_PTR(str);
4649 const char *p = s + pos;
4650 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4652 "offset %ld does not land on character boundary", pos);
4725rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4731 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4732 long slen = RSTRING_LEN(str);
4734 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4745 str_ensure_byte_pos(str, pos);
4757 pos = rb_str_byteindex(str, sub, pos);
4758 if (pos >= 0)
return LONG2NUM(pos);
4765memrchr(
const char *search_str,
int chr,
long search_len)
4767 const char *ptr = search_str + search_len;
4768 while (ptr > search_str) {
4769 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4779 char *hit, *adjusted;
4781 long slen, searchlen;
4784 sbeg = RSTRING_PTR(str);
4785 slen = RSTRING_LEN(sub);
4786 if (slen == 0)
return s - sbeg;
4788 t = RSTRING_PTR(sub);
4790 searchlen = s - sbeg + 1;
4792 if (memcmp(s, t, slen) == 0) {
4797 hit = memrchr(sbeg, c, searchlen);
4800 if (hit != adjusted) {
4801 searchlen = adjusted - sbeg;
4804 if (memcmp(hit, t, slen) == 0)
4806 searchlen = adjusted - sbeg;
4807 }
while (searchlen > 0);
4821 enc = rb_enc_check(str, sub);
4822 if (is_broken_string(sub))
return -1;
4823 singlebyte = single_byte_optimizable(str);
4824 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4825 slen = str_strlen(sub, enc);
4828 if (
len < slen)
return -1;
4829 if (
len - pos < slen) pos =
len - slen;
4830 if (
len == 0)
return pos;
4832 sbeg = RSTRING_PTR(str);
4835 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4841 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4842 return str_rindex(str, sub, s, enc);
4854rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4859 long pos,
len = str_strlen(str, enc);
4861 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4863 if (pos < 0 && (pos +=
len) < 0) {
4869 if (pos >
len) pos =
len;
4877 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4878 enc, single_byte_optimizable(str));
4889 pos = rb_str_rindex(str, sub, pos);
4899rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4905 enc = rb_enc_check(str, sub);
4906 if (is_broken_string(sub))
return -1;
4907 len = RSTRING_LEN(str);
4908 slen = RSTRING_LEN(sub);
4911 if (
len < slen)
return -1;
4912 if (
len - pos < slen) pos =
len - slen;
4913 if (
len == 0)
return pos;
4915 sbeg = RSTRING_PTR(str);
4918 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4925 return str_rindex(str, sub, s, enc);
5015rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
5019 long pos,
len = RSTRING_LEN(str);
5021 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
5023 if (pos < 0 && (pos +=
len) < 0) {
5029 if (pos >
len) pos =
len;
5035 str_ensure_byte_pos(str, pos);
5047 pos = rb_str_byterindex(str, sub, pos);
5048 if (pos >= 0)
return LONG2NUM(pos);
5090 switch (OBJ_BUILTIN_TYPE(y)) {
5144rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5151 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5182rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5186 re = get_pat(argv[0]);
5187 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5196static enum neighbor_char
5202 if (rb_enc_mbminlen(enc) > 1) {
5204 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5206 return NEIGHBOR_NOT_CHAR;
5208 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5210 if (!l)
return NEIGHBOR_NOT_CHAR;
5211 if (l !=
len)
return NEIGHBOR_WRAPPED;
5212 rb_enc_mbcput(c, p, enc);
5213 r = rb_enc_precise_mbclen(p, p +
len, enc);
5215 return NEIGHBOR_NOT_CHAR;
5217 return NEIGHBOR_FOUND;
5220 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5223 return NEIGHBOR_WRAPPED;
5224 ++((
unsigned char*)p)[i];
5225 l = rb_enc_precise_mbclen(p, p+
len, enc);
5229 return NEIGHBOR_FOUND;
5232 memset(p+l, 0xff,
len-l);
5238 for (len2 =
len-1; 0 < len2; len2--) {
5239 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5243 memset(p+len2+1, 0xff,
len-(len2+1));
5248static enum neighbor_char
5253 if (rb_enc_mbminlen(enc) > 1) {
5255 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5257 return NEIGHBOR_NOT_CHAR;
5259 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5260 if (!c)
return NEIGHBOR_NOT_CHAR;
5263 if (!l)
return NEIGHBOR_NOT_CHAR;
5264 if (l !=
len)
return NEIGHBOR_WRAPPED;
5265 rb_enc_mbcput(c, p, enc);
5266 r = rb_enc_precise_mbclen(p, p +
len, enc);
5268 return NEIGHBOR_NOT_CHAR;
5270 return NEIGHBOR_FOUND;
5273 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5276 return NEIGHBOR_WRAPPED;
5277 --((
unsigned char*)p)[i];
5278 l = rb_enc_precise_mbclen(p, p+
len, enc);
5282 return NEIGHBOR_FOUND;
5285 memset(p+l, 0,
len-l);
5291 for (len2 =
len-1; 0 < len2; len2--) {
5292 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5296 memset(p+len2+1, 0,
len-(len2+1));
5310static enum neighbor_char
5311enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5313 enum neighbor_char ret;
5317 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5321 const int max_gaps = 1;
5323 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5325 ctype = ONIGENC_CTYPE_DIGIT;
5327 ctype = ONIGENC_CTYPE_ALPHA;
5329 return NEIGHBOR_NOT_CHAR;
5332 for (
try = 0;
try <= max_gaps; ++
try) {
5333 ret = enc_succ_char(p,
len, enc);
5334 if (ret == NEIGHBOR_FOUND) {
5335 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5337 return NEIGHBOR_FOUND;
5344 ret = enc_pred_char(p,
len, enc);
5345 if (ret == NEIGHBOR_FOUND) {
5346 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5359 return NEIGHBOR_NOT_CHAR;
5362 if (ctype != ONIGENC_CTYPE_DIGIT) {
5364 return NEIGHBOR_WRAPPED;
5368 enc_succ_char(carry,
len, enc);
5369 return NEIGHBOR_WRAPPED;
5387 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5388 rb_enc_cr_str_copy_for_substr(str, orig);
5389 return str_succ(str);
5396 char *sbeg, *s, *e, *last_alnum = 0;
5397 int found_alnum = 0;
5399 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5400 long carry_pos = 0, carry_len = 1;
5401 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5403 slen = RSTRING_LEN(str);
5404 if (slen == 0)
return str;
5406 enc = STR_ENC_GET(str);
5407 sbeg = RSTRING_PTR(str);
5408 s = e = sbeg + slen;
5410 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5411 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5417 l = rb_enc_precise_mbclen(s, e, enc);
5418 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5419 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5420 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5422 case NEIGHBOR_NOT_CHAR:
5424 case NEIGHBOR_FOUND:
5426 case NEIGHBOR_WRAPPED:
5431 carry_pos = s - sbeg;
5436 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5437 enum neighbor_char neighbor;
5438 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5439 l = rb_enc_precise_mbclen(s, e, enc);
5440 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5441 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5443 neighbor = enc_succ_char(tmp, l, enc);
5445 case NEIGHBOR_FOUND:
5449 case NEIGHBOR_WRAPPED:
5452 case NEIGHBOR_NOT_CHAR:
5455 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5457 enc_succ_char(s, l, enc);
5459 if (!rb_enc_asciicompat(enc)) {
5460 MEMCPY(carry, s,
char, l);
5463 carry_pos = s - sbeg;
5467 RESIZE_CAPA(str, slen + carry_len);
5468 sbeg = RSTRING_PTR(str);
5469 s = sbeg + carry_pos;
5470 memmove(s + carry_len, s, slen - carry_pos);
5471 memmove(s, carry, carry_len);
5473 STR_SET_LEN(str, slen);
5474 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5475 rb_enc_str_coderange(str);
5490rb_str_succ_bang(
VALUE str)
5498all_digits_p(
const char *s,
long len)
5526 VALUE end, exclusive;
5530 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5536 VALUE current, after_end;
5543 enc = rb_enc_check(beg, end);
5544 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5546 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5547 char c = RSTRING_PTR(beg)[0];
5548 char e = RSTRING_PTR(end)[0];
5550 if (c > e || (excl && c == e))
return beg;
5552 VALUE str = rb_enc_str_new(&c, 1, enc);
5554 if ((*each)(str, arg))
break;
5555 if (!excl && c == e)
break;
5557 if (excl && c == e)
break;
5562 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5563 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5564 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5569 b = rb_str_to_inum(beg, 10, FALSE);
5570 e = rb_str_to_inum(end, 10, FALSE);
5577 if (excl && bi == ei)
break;
5578 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5583 ID op = excl ?
'<' : idLE;
5584 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5589 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5590 b = rb_funcallv(b, succ, 0, 0);
5597 if (n > 0 || (excl && n == 0))
return beg;
5599 after_end = rb_funcallv(end, succ, 0, 0);
5604 next = rb_funcallv(current, succ, 0, 0);
5605 if ((*each)(current, arg))
break;
5606 if (
NIL_P(next))
break;
5610 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5625 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5626 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5627 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5629 b = rb_str_to_inum(beg, 10, FALSE);
5635 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5643 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5644 b = rb_funcallv(b, succ, 0, 0);
5650 VALUE next = rb_funcallv(current, succ, 0, 0);
5651 if ((*each)(current, arg))
break;
5654 if (RSTRING_LEN(current) == 0)
5665 if (!
rb_equal(str, *argp))
return 0;
5679 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5680 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5681 rb_enc_asciicompat(STR_ENC_GET(val))) {
5682 const char *bp = RSTRING_PTR(beg);
5683 const char *ep = RSTRING_PTR(end);
5684 const char *vp = RSTRING_PTR(val);
5685 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5686 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5694 if (b <= v && v < e)
return Qtrue;
5695 return RBOOL(!
RTEST(exclusive) && v == e);
5702 all_digits_p(bp, RSTRING_LEN(beg)) &&
5703 all_digits_p(ep, RSTRING_LEN(end))) {
5708 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5710 return RBOOL(
NIL_P(val));
5733 return rb_str_subpat(str, indx,
INT2FIX(0));
5736 if (rb_str_index(str, indx, 0) != -1)
5742 long beg,
len = str_strlen(str, NULL);
5754 return str_substr(str, idx, 1, FALSE);
5771rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5775 return rb_str_subpat(str, argv[0], argv[1]);
5778 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5782 return rb_str_aref(str, argv[0]);
5788 char *ptr = RSTRING_PTR(str);
5789 long olen = RSTRING_LEN(str), nlen;
5791 str_modifiable(str);
5792 if (
len > olen)
len = olen;
5794 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5796 size_t old_capa =
RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5797 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5799 ptr =
RSTRING(str)->as.embed.ary;
5800 memmove(ptr, oldptr +
len, nlen);
5801 if (fl == STR_NOEMBED) {
5802 SIZED_FREE_N(oldptr, old_capa);
5806 if (!STR_SHARED_P(str)) {
5808 rb_enc_cr_str_exact_copy(shared, str);
5813 STR_SET_LEN(str, nlen);
5815 if (!SHARABLE_MIDDLE_SUBSTRING) {
5816 TERM_FILL(ptr + nlen, TERM_LEN(str));
5823rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5829 if (beg == 0 && vlen == 0) {
5834 str_modify_keep_cr(str);
5838 RESIZE_CAPA(str, slen + vlen -
len);
5839 sptr = RSTRING_PTR(str);
5843 cr = rb_enc_str_coderange(val);
5848 memmove(sptr + beg + vlen,
5850 slen - (beg +
len));
5852 if (vlen < beg &&
len < 0) {
5856 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5859 STR_SET_LEN(str, slen);
5860 TERM_FILL(&sptr[slen], TERM_LEN(str));
5867 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5876 int singlebyte = single_byte_optimizable(str);
5882 enc = rb_enc_check(str, val);
5883 slen = str_strlen(str, enc);
5885 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5894 if (
len > slen - beg) {
5897 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5902 beg = p - RSTRING_PTR(str);
5904 rb_str_update_0(str, beg,
len, val);
5905 rb_enc_associate(str, enc);
5916 long start, end,
len;
5926 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5930 nth += regs->num_regs;
5940 enc = rb_enc_check_str(str, val);
5941 rb_str_update_0(str, start,
len, val);
5942 rb_enc_associate(str, enc);
5950 switch (
TYPE(indx)) {
5952 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5956 beg = rb_str_index(str, indx, 0);
5995rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5999 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6007 return rb_str_aset(str, argv[0], argv[1]);
6059rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6067 str_modify_keep_cr(str);
6075 if ((nth += regs->num_regs) <= 0)
return Qnil;
6077 else if (nth >= regs->num_regs)
return Qnil;
6079 len = END(nth) - beg;
6082 else if (argc == 2) {
6091 beg = p - RSTRING_PTR(str);
6095 beg = rb_str_index(str, indx, 0);
6096 if (beg == -1)
return Qnil;
6097 len = RSTRING_LEN(indx);
6109 beg = p - RSTRING_PTR(str);
6118 beg = p - RSTRING_PTR(str);
6122 rb_enc_cr_str_copy_for_substr(result, str);
6130 char *sptr = RSTRING_PTR(str);
6131 long slen = RSTRING_LEN(str);
6132 if (beg +
len > slen)
6136 slen - (beg +
len));
6138 STR_SET_LEN(str, slen);
6139 TERM_FILL(&sptr[slen], TERM_LEN(str));
6150 switch (OBJ_BUILTIN_TYPE(pat)) {
6169get_pat_quoted(
VALUE pat,
int check)
6173 switch (OBJ_BUILTIN_TYPE(pat)) {
6187 if (check && is_broken_string(pat)) {
6194rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6197 pos = rb_str_byteindex(str, pat, pos);
6198 if (set_backref_str) {
6200 str = rb_str_new_frozen_String(str);
6201 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6203 *match = match_data;
6213 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
/*
 * Thin wrapper around rb_pat_search0(): forwards pat, str, pos and
 * set_backref_str unchanged and passes NULL for the match out-parameter.
 * The return value is whatever rb_pat_search0() returns.
 */
6218rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6220 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6238rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6253 hash = rb_check_hash_type(repl);
6260 pat = get_pat_quoted(argv[0], 1);
6262 str_modifiable(str);
6263 beg = rb_pat_search(pat, str, 0, 1);
6277 end0 = beg0 + RSTRING_LEN(pat);
6286 if (iter || !
NIL_P(hash)) {
6287 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6293 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6296 str_mod_check(str, p,
len);
6297 rb_check_frozen(str);
6303 enc = rb_enc_compatible(str, repl);
6306 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6310 rb_enc_inspect_name(str_enc),
6311 rb_enc_inspect_name(STR_ENC_GET(repl)));
6313 enc = STR_ENC_GET(repl);
6316 rb_enc_associate(str, enc);
6326 rlen = RSTRING_LEN(repl);
6327 len = RSTRING_LEN(str);
6329 RESIZE_CAPA(str,
len + rlen - plen);
6331 p = RSTRING_PTR(str);
6333 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6335 rp = RSTRING_PTR(repl);
6336 memmove(p + beg0, rp, rlen);
6338 STR_SET_LEN(str,
len);
6339 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6362 rb_str_sub_bang(argc, argv, str);
6367str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6370 long beg, beg0, end0;
6371 long offset, blen, slen,
len, last;
6372 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6374 int need_backref_str = -1;
6385 hash = rb_check_hash_type(repl);
6389 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6398 rb_error_arity(argc, 1, 2);
6401 pat = get_pat_quoted(argv[0], 1);
6402 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6405 if (bang)
return Qnil;
6410 blen = RSTRING_LEN(str) + 30;
6412 sp = RSTRING_PTR(str);
6413 slen = RSTRING_LEN(str);
6415 str_enc = STR_ENC_GET(str);
6416 rb_enc_associate(dest, str_enc);
6423 end0 = beg0 + RSTRING_LEN(pat);
6437 struct RString fake_str = {RBASIC_INIT};
6439 if (mode == FAST_MAP) {
6448 val = rb_hash_aref(hash, key);
6451 str_mod_check(str, sp, slen);
6456 else if (need_backref_str) {
6458 if (need_backref_str < 0) {
6459 need_backref_str = val != repl;
6466 len = beg0 - offset;
6480 if (RSTRING_LEN(str) <= end0)
break;
6481 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6483 offset = end0 +
len;
6485 cp = RSTRING_PTR(str) + offset;
6486 if (offset > RSTRING_LEN(str))
break;
6489 if (mode != FAST_MAP && mode != STR) {
6492 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6497 if (RSTRING_LEN(str) > offset) {
6500 rb_pat_search0(pat, str, last, 1, &match);
6502 str_shared_replace(str, dest);
6527rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6529 str_modify_keep_cr(str);
6530 return str_gsub(argc, argv, str, 1);
6580 return str_gsub(argc, argv, str, 0);
6600 str_modifiable(str);
6601 if (str == str2)
return str;
6605 return str_replace(str, str2);
6622rb_str_clear(
VALUE str)
6626 STR_SET_LEN(str, 0);
6627 RSTRING_PTR(str)[0] = 0;
6628 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6644rb_str_chr(
VALUE str)
6662 pos += RSTRING_LEN(str);
6663 if (pos < 0 || RSTRING_LEN(str) <= pos)
6666 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6686 long len = RSTRING_LEN(str);
6687 char *
ptr, *head, *left = 0;
6691 if (pos < -
len ||
len <= pos)
6698 char byte = (char)(
NUM2INT(w) & 0xFF);
6700 if (!str_independent(str))
6701 str_make_independent(str);
6702 enc = STR_ENC_GET(str);
6703 head = RSTRING_PTR(str);
6705 if (!STR_EMBED_P(str)) {
6712 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6720 width = rb_enc_precise_mbclen(left, head+
len, enc);
6722 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6738str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6740 long n = RSTRING_LEN(str);
6742 if (beg > n ||
len < 0)
return Qnil;
6745 if (beg < 0)
return Qnil;
6750 if (!empty)
return Qnil;
6754 VALUE str2 = str_subseq(str, beg,
len);
6756 str_enc_copy_direct(str2, str);
6758 if (RSTRING_LEN(str2) == 0) {
6759 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6793 long beg,
len = RSTRING_LEN(str);
6801 return str_byte_substr(str, beg,
len, TRUE);
6806 return str_byte_substr(str, idx, 1, FALSE);
6818rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6823 return str_byte_substr(str, beg,
len, TRUE);
6826 return str_byte_aref(str, argv[0]);
6830str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6832 long end, slen = RSTRING_LEN(str);
6835 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6844 if (*
len > slen - *beg) {
6848 str_ensure_byte_pos(str, *beg);
6849 str_ensure_byte_pos(str, end);
6863rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6865 long beg,
len, vbeg, vlen;
6870 if (!(argc == 2 || argc == 3 || argc == 5)) {
6871 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6875 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6876 rb_builtin_class_name(argv[0]));
6883 vlen = RSTRING_LEN(val);
6888 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6889 rb_builtin_class_name(argv[2]));
6901 vlen = RSTRING_LEN(val);
6909 str_check_beg_len(str, &beg, &
len);
6910 str_check_beg_len(val, &vbeg, &vlen);
6911 str_modify_keep_cr(str);
6914 rb_enc_associate(str, rb_enc_check(str, val));
6917 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6939rb_str_reverse(
VALUE str)
6946 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6947 enc = STR_ENC_GET(str);
6953 if (RSTRING_LEN(str) > 1) {
6954 if (single_byte_optimizable(str)) {
6961 int clen = rb_enc_fast_mbclen(s, e, enc);
6969 cr = rb_enc_asciicompat(enc) ?
6972 int clen = rb_enc_mbclen(s, e, enc);
6981 STR_SET_LEN(rev, RSTRING_LEN(str));
6982 str_enc_copy_direct(rev, str);
7004rb_str_reverse_bang(
VALUE str)
7006 if (RSTRING_LEN(str) > 1) {
7007 if (single_byte_optimizable(str)) {
7010 str_modify_keep_cr(str);
7011 s = RSTRING_PTR(str);
7020 str_shared_replace(str, rb_str_reverse(str));
7024 str_modify_keep_cr(str);
7053 i = rb_str_index(str, arg, 0);
7055 return RBOOL(i != -1);
7099 rb_raise(rb_eArgError,
"invalid radix %d", base);
7101 return rb_str_to_inum(str, base, FALSE);
7126rb_str_to_f(
VALUE str)
7143rb_str_to_s(
VALUE str)
7155 char s[RUBY_MAX_CHAR_LEN];
7156 int n = rb_enc_codelen(c, enc);
7158 rb_enc_mbcput(c, s, enc);
/*
 * Size limit handed to snprintf() when one escaped character is formatted;
 * the scratch buffers are declared one byte larger (CHAR_ESC_LEN + 1).
 * NOTE(review): 13 presumably covers the longest form, a braced hex escape
 * of a 32-bit value (12 characters) plus the terminating NUL -- confirm.
 */
7163#define CHAR_ESC_LEN 13
/*
 * Formats the code c as printable text into a local scratch buffer.  The
 * visible formats are "%c" (the character itself), a four-digit "\u" form,
 * a braced "\u{...}" form, a two-digit "\x" form and a braced "\x{...}"
 * form, all in upper-case hex.
 * NOTE(review): unicode_p presumably selects the \u forms over the \x
 * forms; most of the selecting conditions, and what is done with `result`
 * and `l` afterwards, are outside this excerpt -- confirm.
 */
7166rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7168 char buf[CHAR_ESC_LEN + 1];
7176 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
/* codes below 0x10000 use the fixed four-digit form */
7178 else if (c < 0x10000) {
7179 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7182 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7187 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7190 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
/* length of the text snprintf() just produced */
7193 l = (int)strlen(buf);
/*
 * Maps a control character to its backslash-escaped spelling, returned as
 * a string literal: NUL, newline, carriage return, tab, form feed,
 * vertical tab (013), backspace (010), bell (007), escape (033) and
 * DEL (0x7f, spelled with the "c?" control notation).
 * NOTE(review): the result for any other value of c is outside this
 * excerpt.
 */
7199ruby_escaped_char(
int c)
7202 case '\0':
return "\\0";
7203 case '\n':
return "\\n";
7204 case '\r':
return "\\r";
7205 case '\t':
return "\\t";
7206 case '\f':
return "\\f";
7207 case '\013':
return "\\v";
7208 case '\010':
return "\\b";
7209 case '\007':
return "\\a";
7210 case '\033':
return "\\e";
7211 case '\x7f':
return "\\c?";
7217rb_str_escape(
VALUE str)
7221 const char *p = RSTRING_PTR(str);
7223 const char *prev = p;
7224 char buf[CHAR_ESC_LEN + 1];
7226 int unicode_p = rb_enc_unicode_p(enc);
7227 int asciicompat = rb_enc_asciicompat(enc);
7232 int n = rb_enc_precise_mbclen(p, pend, enc);
7234 if (p > prev) str_buf_cat(result, prev, p - prev);
7235 n = rb_enc_mbminlen(enc);
7237 n = (int)(pend - p);
7239 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7240 str_buf_cat(result, buf, strlen(buf));
7246 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7248 cc = ruby_escaped_char(c);
7250 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7251 str_buf_cat(result, cc, strlen(cc));
7254 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7257 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7258 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7262 if (p > prev) str_buf_cat(result, prev, p - prev);
7281 const char *p, *pend, *prev;
7282 char buf[CHAR_ESC_LEN + 1];
7284 rb_encoding *resenc = rb_default_internal_encoding();
7285 int unicode_p = rb_enc_unicode_p(enc);
7286 int asciicompat = rb_enc_asciicompat(enc);
7288 if (resenc == NULL) resenc = rb_default_external_encoding();
7289 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7290 rb_enc_associate(result, resenc);
7291 str_buf_cat2(result,
"\"");
7299 n = rb_enc_precise_mbclen(p, pend, enc);
7301 if (p > prev) str_buf_cat(result, prev, p - prev);
7302 n = rb_enc_mbminlen(enc);
7304 n = (int)(pend - p);
7306 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7307 str_buf_cat(result, buf, strlen(buf));
7313 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7315 if ((asciicompat || unicode_p) &&
7316 (c ==
'"'|| c ==
'\\' ||
7321 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7322 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7323 str_buf_cat2(result,
"\\");
7324 if (asciicompat || enc == resenc) {
7330 case '\n': cc =
'n';
break;
7331 case '\r': cc =
'r';
break;
7332 case '\t': cc =
't';
break;
7333 case '\f': cc =
'f';
break;
7334 case '\013': cc =
'v';
break;
7335 case '\010': cc =
'b';
break;
7336 case '\007': cc =
'a';
break;
7337 case 033: cc =
'e';
break;
7338 default: cc = 0;
break;
7341 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7344 str_buf_cat(result, buf, 2);
7357 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7361 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7362 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7367 if (p > prev) str_buf_cat(result, prev, p - prev);
7368 str_buf_cat2(result,
"\"");
/*
 * True when p has not reached e and the byte at p is '$', '@' or '{'.
 * The dump code in this file applies it to the position just after a '#'
 * to decide whether that '#' needs a backslash in front of it.
 * p is expanded several times, so pass a side-effect-free expression.
 */
7373#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7386 int encidx = rb_enc_get_index(str);
7389 const char *p, *pend;
7392 int u8 = (encidx == rb_utf8_encindex());
7393 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7396 if (!rb_enc_asciicompat(enc)) {
7398 len += strlen(enc->name);
7401 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7404 unsigned char c = *p++;
7407 case '"':
case '\\':
7408 case '\n':
case '\r':
7409 case '\t':
case '\f':
7410 case '\013':
case '\010':
case '\007':
case '\033':
7415 clen = IS_EVSTR(p, pend) ? 2 : 1;
7423 if (u8 && c > 0x7F) {
7424 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7426 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7429 else if (cc <= 0xFFFFF)
7442 if (clen > LONG_MAX -
len) {
7449 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7450 q = RSTRING_PTR(result); qend = q +
len + 1;
7454 unsigned char c = *p++;
7456 if (c ==
'"' || c ==
'\\') {
7460 else if (c ==
'#') {
7461 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7464 else if (c ==
'\n') {
7468 else if (c ==
'\r') {
7472 else if (c ==
'\t') {
7476 else if (c ==
'\f') {
7480 else if (c ==
'\013') {
7484 else if (c ==
'\010') {
7488 else if (c ==
'\007') {
7492 else if (c ==
'\033') {
7502 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7504 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7507 snprintf(q, qend-q,
"u%04X", cc);
7509 snprintf(q, qend-q,
"u{%X}", cc);
7514 snprintf(q, qend-q,
"x%02X", c);
7520 if (!rb_enc_asciicompat(enc)) {
7521 snprintf(q, qend-q, nonascii_suffix, enc->name);
7522 encidx = rb_ascii8bit_encindex();
7525 rb_enc_associate_index(result, encidx);
7531unescape_ascii(
unsigned int c)
7555undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7557 const char *s = *ss;
7561 unsigned char buf[6];
7579 *buf = unescape_ascii(*s);
7591 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7592 if (*penc != enc_utf8) {
7594 rb_enc_associate(undumped, enc_utf8);
7611 if (hexlen == 0 || hexlen > 6) {
7617 if (0xd800 <= c && c <= 0xdfff) {
7620 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7630 if (0xd800 <= c && c <= 0xdfff) {
7633 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7663static VALUE rb_str_is_ascii_only_p(
VALUE str);
7675str_undump(
VALUE str)
7677 const char *s = RSTRING_PTR(str);
7680 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7682 bool binary =
false;
7686 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7689 if (!str_null_check(str, &w)) {
7692 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7693 if (*s !=
'"')
goto invalid_format;
7711 static const char force_encoding_suffix[] =
".force_encoding(\"";
7712 static const char dup_suffix[] =
".dup";
7713 const char *encname;
7718 size =
sizeof(dup_suffix) - 1;
7719 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7721 size =
sizeof(force_encoding_suffix) - 1;
7722 if (s_end - s <= size)
goto invalid_format;
7723 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7727 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7731 s = memchr(s,
'"', s_end-s);
7733 if (!s)
goto invalid_format;
7734 if (s_end - s != 2)
goto invalid_format;
7735 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7737 encidx = rb_enc_find_index2(encname, (
long)size);
7741 rb_enc_associate_index(undumped, encidx);
7751 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7762 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7768 if (rb_enc_dummy_p(enc)) {
7775str_true_enc(
VALUE str)
7778 rb_str_check_dummy_enc(enc);
/*
 * Folds the optional case-mapping option arguments in argv into flags.
 * Judging by the call sites in this file (flags = check_case_options(...)),
 * the updated flag set is the return value.
 *
 * Rules visible in this excerpt:
 *  - sym_turkic and sym_lithuanian may be combined, in either order; a
 *    different second option raises rb_eArgError ("invalid second option").
 *  - sym_ascii sets ONIGENC_CASE_ASCII_ONLY.
 *  - sym_fold is accepted only when flags requests downcasing alone.
 *  - any other first option raises rb_eArgError ("invalid option").
 * NOTE(review): the argc tests guarding the "too many options" errors and
 * the second-option checks are outside this excerpt.
 */
7782static OnigCaseFoldType
7783check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7788 rb_raise(rb_eArgError,
"too many options");
/* Turkic first; Lithuanian is the only option that may follow it. */
7789 if (argv[0]==sym_turkic) {
7790 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7792 if (argv[1]==sym_lithuanian)
7793 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7795 rb_raise(rb_eArgError,
"invalid second option");
/* Lithuanian first; Turkic is the only option that may follow it. */
7798 else if (argv[0]==sym_lithuanian) {
7799 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7801 if (argv[1]==sym_turkic)
7802 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7804 rb_raise(rb_eArgError,
"invalid second option");
7808 rb_raise(rb_eArgError,
"too many options");
7809 else if (argv[0]==sym_ascii)
7810 flags |= ONIGENC_CASE_ASCII_ONLY;
7811 else if (argv[0]==sym_fold) {
/*
 * Only a pure downcase request may be turned into case folding.
 * DOWNCASE is known to be set here, so the XOR clears it; FOLD is toggled
 * (i.e. set, assuming it was clear on entry).
 */
7812 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7813 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7815 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7818 rb_raise(rb_eArgError,
"invalid option");
7825 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/*
 * Fixed number of extra bytes added to the computed capacity of every
 * case-mapping buffer (see the capa calculation in the case-mapping loop).
 */
7831#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/*
 * Debug switch, off unless defined externally.  When nonzero, the
 * case-mapping code prints buffer allocation/count diagnostics to stderr
 * and reports length mismatches in the ASCII-only mapper.
 */
7832#ifndef CASEMAP_DEBUG
7833# define CASEMAP_DEBUG 0
7841 OnigUChar space[FLEX_ARY_LEN];
/*
 * Releases a chain of mapping_buffer nodes linked through ->next.  Each
 * node is freed with an explicit size: the offset of its `space` member
 * plus the capacity recorded in the node.
 * NOTE(review): how current_buffer is obtained from p is outside this
 * excerpt.
 */
7845mapping_buffer_free(
void *p)
7849 while (current_buffer) {
/* advance first: the node is not touched again once it has been freed */
7850 previous_buffer = current_buffer;
7851 current_buffer = current_buffer->next;
7852 ruby_xfree_sized(previous_buffer, offsetof(
mapping_buffer, space) + previous_buffer->capa);
7858 {0, mapping_buffer_free,},
7867 const OnigUChar *source_current, *source_end;
7868 int target_length = 0;
7869 VALUE buffer_anchor;
7872 size_t buffer_count = 0;
7873 int buffer_length_or_invalid;
7875 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7877 source_current = (OnigUChar*)RSTRING_PTR(source);
7882 while (source_current < source_end) {
7884 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7889 *pre_buffer = current_buffer;
7890 pre_buffer = ¤t_buffer->next;
7891 current_buffer->next = NULL;
7892 current_buffer->capa =
capa;
7893 buffer_length_or_invalid = enc->case_map(flags,
7894 &source_current, source_end,
7895 current_buffer->space,
7896 current_buffer->space+current_buffer->capa,
7898 if (buffer_length_or_invalid < 0) {
7899 current_buffer =
DATA_PTR(buffer_anchor);
7901 mapping_buffer_free(current_buffer);
7902 rb_raise(rb_eArgError,
"input string invalid");
7904 target_length += current_buffer->used = buffer_length_or_invalid;
7906 if (CASEMAP_DEBUG) {
7907 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7910 if (buffer_count==1) {
7911 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7914 char *target_current;
7917 target_current = RSTRING_PTR(target);
7918 current_buffer =
DATA_PTR(buffer_anchor);
7919 while (current_buffer) {
7920 memcpy(target_current, current_buffer->space, current_buffer->used);
7921 target_current += current_buffer->used;
7922 current_buffer = current_buffer->next;
7925 current_buffer =
DATA_PTR(buffer_anchor);
7927 mapping_buffer_free(current_buffer);
7932 str_enc_copy_direct(target, source);
7941 const OnigUChar *source_current, *source_end;
7942 OnigUChar *target_current, *target_end;
7943 long old_length = RSTRING_LEN(source);
7944 int length_or_invalid;
7946 if (old_length == 0)
return Qnil;
7948 source_current = (OnigUChar*)RSTRING_PTR(source);
7950 if (source == target) {
7951 target_current = (OnigUChar*)source_current;
7952 target_end = (OnigUChar*)source_end;
7955 target_current = (OnigUChar*)RSTRING_PTR(target);
7959 length_or_invalid = onigenc_ascii_only_case_map(flags,
7960 &source_current, source_end,
7961 target_current, target_end, enc);
7962 if (length_or_invalid < 0)
7963 rb_raise(rb_eArgError,
"input string invalid");
7964 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7965 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7966 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7967 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7968 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7971 str_enc_copy(target, source);
/*
 * ASCII-only upcasing performed directly on str's bytes: a byte in
 * 'a'..'z' is overwritten with the corresponding letter in 'A'..'Z'.
 * Bytes are read as unsigned char, so values above 0x7f never match.
 * NOTE(review): the loop over s..send and the return are outside this
 * excerpt; `modified` is presumably set when a byte changes and returned,
 * since rb_str_upcase_bang tests the result to set ONIGENC_CASE_MODIFIED.
 */
7977upcase_single(
VALUE str)
7979 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7980 bool modified =
false;
7983 unsigned int c = *(
unsigned char*)s;
7985 if (
'a' <= c && c <=
'z') {
7986 *s =
'A' + (c -
'a');
8007rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
8010 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8012 flags = check_case_options(argc, argv, flags);
8013 str_modify_keep_cr(str);
8014 enc = str_true_enc(str);
8015 if (case_option_single_p(flags, enc, str)) {
8016 if (upcase_single(str))
8017 flags |= ONIGENC_CASE_MODIFIED;
8019 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8020 rb_str_ascii_casemap(str, str, &flags, enc);
8022 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8024 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8037rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8040 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8043 flags = check_case_options(argc, argv, flags);
8044 enc = str_true_enc(str);
8045 if (case_option_single_p(flags, enc, str)) {
8046 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8047 str_enc_copy_direct(ret, str);
8050 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8052 rb_str_ascii_casemap(str, ret, &flags, enc);
8055 ret = rb_str_casemap(str, &flags, enc);
/*
 * ASCII-only downcasing performed directly on str's bytes: a byte in
 * 'A'..'Z' is overwritten with the corresponding letter in 'a'..'z'.
 * Bytes are read as unsigned char, so values above 0x7f never match.
 * NOTE(review): the loop over s..send and the return are outside this
 * excerpt; `modified` is presumably set when a byte changes and returned,
 * since rb_str_downcase_bang tests the result to set ONIGENC_CASE_MODIFIED.
 */
8062downcase_single(
VALUE str)
8064 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8065 bool modified =
false;
8068 unsigned int c = *(
unsigned char*)s;
8070 if (
'A' <= c && c <=
'Z') {
8071 *s =
'a' + (c -
'A');
8093rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8096 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8098 flags = check_case_options(argc, argv, flags);
8099 str_modify_keep_cr(str);
8100 enc = str_true_enc(str);
8101 if (case_option_single_p(flags, enc, str)) {
8102 if (downcase_single(str))
8103 flags |= ONIGENC_CASE_MODIFIED;
8105 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8106 rb_str_ascii_casemap(str, str, &flags, enc);
8108 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8110 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8124rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8127 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8130 flags = check_case_options(argc, argv, flags);
8131 enc = str_true_enc(str);
8132 if (case_option_single_p(flags, enc, str)) {
8133 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8134 str_enc_copy_direct(ret, str);
8135 downcase_single(ret);
8137 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8139 rb_str_ascii_casemap(str, ret, &flags, enc);
8142 ret = rb_str_casemap(str, &flags, enc);
8162rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8165 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8167 flags = check_case_options(argc, argv, flags);
8168 str_modify_keep_cr(str);
8169 enc = str_true_enc(str);
8170 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8171 if (flags&ONIGENC_CASE_ASCII_ONLY)
8172 rb_str_ascii_casemap(str, str, &flags, enc);
8174 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8176 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Non-destructive counterpart of rb_str_capitalize_bang: the mapping is
 * requested with ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE and written
 * to a separate result string (`ret`), never to str.
 *
 * argc/argv carry the optional case-mapping options, validated and merged
 * into flags by check_case_options(); invalid options raise rb_eArgError
 * there.  The encoding comes from str_true_enc().
 */
8190rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8193 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8196 flags = check_case_options(argc, argv, flags);
8197 enc = str_true_enc(str);
/*
 * Nothing to map for an empty receiver, but still hand back a fresh string
 * rather than the receiver itself: returning str would let a caller who
 * mutates the "copy" modify the original.  This matches rb_str_swapcase
 * and rb_str_casemap, which duplicate in the same situation.
 */
8198 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
/* ASCII-only request: map into ret without consulting the encoding tables */
8199 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8201 rb_str_ascii_casemap(str, ret, &flags, enc);
/* otherwise use the encoding's full case mapping */
8204 ret = rb_str_casemap(str, &flags, enc);
8223rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8226 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8228 flags = check_case_options(argc, argv, flags);
8229 str_modify_keep_cr(str);
8230 enc = str_true_enc(str);
8231 if (flags&ONIGENC_CASE_ASCII_ONLY)
8232 rb_str_ascii_casemap(str, str, &flags, enc);
8234 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8236 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8250rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8253 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8256 flags = check_case_options(argc, argv, flags);
8257 enc = str_true_enc(str);
8258 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8259 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8261 rb_str_ascii_casemap(str, ret, &flags, enc);
8264 ret = rb_str_casemap(str, &flags, enc);
8269typedef unsigned char *USTR;
8273 unsigned int now, max;
8285 if (t->p == t->pend)
return -1;
8286 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8289 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8291 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8293 if (t->p < t->pend) {
8294 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8297 if (t->now < 0x80 && c < 0x80) {
8298 rb_raise(rb_eArgError,
8299 "invalid range \"%c-%c\" in string transliteration",
8303 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8307 else if (t->now < c) {
8316 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8317 if (t->now == t->max) {
8322 if (t->now < t->max) {
8338 const unsigned int errc = -1;
8339 unsigned int trans[256];
8341 struct tr trsrc, trrepl;
8343 unsigned int c, c0, last = 0;
8344 int modify = 0, i, l;
8345 unsigned char *s, *send;
8347 int singlebyte = single_byte_optimizable(str);
/*
 * Updates the `cr` variable of the enclosing function: if cr is still
 * ENC_CODERANGE_7BIT and c is not ASCII (per rb_isascii), cr becomes
 * ENC_CODERANGE_VALID; otherwise cr is left unchanged.  The (void) cast
 * discards the value so the macro can be used as a plain statement.
 * c is only evaluated while cr is ENC_CODERANGE_7BIT.
 */
8351#define CHECK_IF_ASCII(c) \
8352 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8353 (cr = ENC_CODERANGE_VALID) : 0)
8357 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8358 if (RSTRING_LEN(repl) == 0) {
8359 return rb_str_delete_bang(1, &src, str);
8363 e1 = rb_enc_check(str, src);
8364 e2 = rb_enc_check(str, repl);
8369 enc = rb_enc_check(src, repl);
8371 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8372 if (RSTRING_LEN(src) > 1 &&
8373 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8374 trsrc.p + l < trsrc.pend) {
8378 trrepl.p = RSTRING_PTR(repl);
8379 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8380 trsrc.gen = trrepl.gen = 0;
8381 trsrc.now = trrepl.now = 0;
8382 trsrc.max = trrepl.max = 0;
8385 for (i=0; i<256; i++) {
8388 while ((c = trnext(&trsrc, enc)) != errc) {
8393 if (!hash) hash = rb_hash_new();
8397 while ((c = trnext(&trrepl, enc)) != errc)
8400 for (i=0; i<256; i++) {
8401 if (trans[i] != errc) {
8409 for (i=0; i<256; i++) {
8412 while ((c = trnext(&trsrc, enc)) != errc) {
8413 r = trnext(&trrepl, enc);
8414 if (r == errc) r = trrepl.now;
8417 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8420 if (!hash) hash = rb_hash_new();
8428 str_modify_keep_cr(str);
8429 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8430 termlen = rb_enc_mbminlen(enc);
8433 long offset, max = RSTRING_LEN(str);
8434 unsigned int save = -1;
8435 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8440 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8442 SIZED_FREE_N(buf, max + termlen);
8443 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8446 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8448 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8457 if (cflag) c = last;
8460 else if (cflag) c = errc;
8466 if (c != (
unsigned int)-1) {
8472 tlen = rb_enc_codelen(c, enc);
8478 if (enc != e1) may_modify = 1;
8480 if ((offset = t - buf) + tlen > max) {
8481 size_t MAYBE_UNUSED(old) = max + termlen;
8482 max = offset + tlen + (send - s);
8483 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8486 rb_enc_mbcput(c, t, enc);
8487 if (may_modify && memcmp(s, t, tlen) != 0) {
8493 if (!STR_EMBED_P(str)) {
8494 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8496 TERM_FILL((
char *)t, termlen);
8497 RSTRING(str)->as.heap.ptr = (
char *)buf;
8498 STR_SET_LEN(str, t - buf);
8499 STR_SET_NOEMBED(str);
8500 RSTRING(str)->as.heap.aux.capa = max;
8504 c = (
unsigned char)*s;
8505 if (trans[c] != errc) {
8522 long offset, max = (long)((send - s) * 1.2);
8523 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8528 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8530 SIZED_FREE_N(buf, max + termlen);
8531 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8534 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8536 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8544 if (cflag) c = last;
8547 else if (cflag) c = errc;
8551 c = cflag ? last : errc;
8554 tlen = rb_enc_codelen(c, enc);
8559 if (enc != e1) may_modify = 1;
8561 if ((offset = t - buf) + tlen > max) {
8562 size_t MAYBE_UNUSED(old) = max + termlen;
8563 max = offset + tlen + (long)((send - s) * 1.2);
8564 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8568 rb_enc_mbcput(c, t, enc);
8569 if (may_modify && memcmp(s, t, tlen) != 0) {
8577 if (!STR_EMBED_P(str)) {
8578 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8580 TERM_FILL((
char *)t, termlen);
8581 RSTRING(str)->as.heap.ptr = (
char *)buf;
8582 STR_SET_LEN(str, t - buf);
8583 STR_SET_NOEMBED(str);
8584 RSTRING(str)->as.heap.aux.capa = max;
8590 rb_enc_associate(str, enc);
8612 return tr_trans(str, src, repl, 0);
8657 tr_trans(str, src, repl, 0);
/*
 * TR_TABLE_MAX is the number of distinct unsigned char values; code points
 * below it are looked up directly in a per-byte table (see tr_find).
 * TR_TABLE_SIZE adds one extra slot at index TR_TABLE_MAX, which this file
 * uses as a per-table flag: tr_setup_table stores cflag there and tr_find
 * returns it as the fallback answer.
 */
8661#define TR_TABLE_MAX (UCHAR_MAX+1)
8662#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8664tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8667 const unsigned int errc = -1;
8668 char buf[TR_TABLE_MAX];
8671 VALUE table = 0, ptable = 0;
8672 int i, l, cflag = 0;
8674 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8675 tr.gen =
tr.now =
tr.max = 0;
8677 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8682 for (i=0; i<TR_TABLE_MAX; i++) {
8685 stable[TR_TABLE_MAX] = cflag;
8687 else if (stable[TR_TABLE_MAX] && !cflag) {
8688 stable[TR_TABLE_MAX] = 0;
8690 for (i=0; i<TR_TABLE_MAX; i++) {
8694 while ((c = trnext(&
tr, enc)) != errc) {
8695 if (c < TR_TABLE_MAX) {
8696 buf[(
unsigned char)c] = !cflag;
8701 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8704 table = ptable ? ptable : rb_hash_new();
8708 table = rb_hash_new();
8713 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8714 rb_hash_aset(table, key,
Qtrue);
8718 for (i=0; i<TR_TABLE_MAX; i++) {
8719 stable[i] = stable[i] && buf[i];
8721 if (!table && !cflag) {
/*
 * Decides whether code point c is selected by a character set prepared by
 * tr_setup_table.
 *
 *  - c below TR_TABLE_MAX: answered directly from the byte table.
 *  - larger c: the `del` and `nodel` hashes are consulted (nodel may be 0,
 *    meaning "no such hash").  The first test matches when c is present in
 *    del and not present in nodel; the second when c is present in nodel.
 *    NOTE(review): the key `v` and the values returned by those two
 *    branches are outside this excerpt.
 *  - otherwise the flag stored in the extra slot table[TR_TABLE_MAX]
 *    decides, normalized to TRUE/FALSE.
 */
8728tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8730 if (c < TR_TABLE_MAX) {
8731 return table[c] != 0;
8737 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8738 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8742 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8745 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8760rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8762 char squeez[TR_TABLE_SIZE];
8765 VALUE del = 0, nodel = 0;
8767 int i, ascompat, cr;
8769 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8771 for (i=0; i<argc; i++) {
8775 enc = rb_enc_check(str, s);
8776 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8779 str_modify_keep_cr(str);
8780 ascompat = rb_enc_asciicompat(enc);
8781 s = t = RSTRING_PTR(str);
8788 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8799 c = rb_enc_codepoint_len(s, send, &clen, enc);
8801 if (tr_find(c, squeez, del, nodel)) {
8805 if (t != s) rb_enc_mbcput(c, t, enc);
8812 TERM_FILL(t, TERM_LEN(str));
8813 STR_SET_LEN(str, t - RSTRING_PTR(str));
8816 if (modify)
return str;
8830rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8833 rb_str_delete_bang(argc, argv, str);
8851rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8853 char squeez[TR_TABLE_SIZE];
8855 VALUE del = 0, nodel = 0;
8856 unsigned char *s, *send, *t;
8858 int ascompat, singlebyte = single_byte_optimizable(str);
8862 enc = STR_ENC_GET(str);
8865 for (i=0; i<argc; i++) {
8869 enc = rb_enc_check(str, s);
8870 if (singlebyte && !single_byte_optimizable(s))
8872 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8876 str_modify_keep_cr(str);
8877 s = t = (
unsigned char *)RSTRING_PTR(str);
8878 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8881 ascompat = rb_enc_asciicompat(enc);
8885 unsigned int c = *s++;
8886 if (c != save || (argc > 0 && !squeez[c])) {
8896 if (ascompat && (c = *s) < 0x80) {
8897 if (c != save || (argc > 0 && !squeez[c])) {
8903 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8905 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8906 if (t != s) rb_enc_mbcput(c, t, enc);
8915 TERM_FILL((
char *)t, TERM_LEN(str));
8916 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8917 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8921 if (modify)
return str;
8935rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8938 rb_str_squeeze_bang(argc, argv, str);
8958 return tr_trans(str, src, repl, 1);
8986 tr_trans(str, src, repl, 1);
/*
 * rb_str_count: counts characters of `str` selected by the arguments
 * (partial listing).
 *
 * Visible structure: a fast path when the selector is a single byte in an
 * ASCII-compatible encoding and `str` is not broken, which compares raw
 * bytes against `c`; otherwise a table is built from every selector with
 * tr_setup_table() and each character is tested with tr_find().
 * Both paths return INT2FIX(0) for an empty string.
 */
8999rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9001 char table[TR_TABLE_SIZE];
9003 VALUE del = 0, nodel = 0, tstr;
9013 enc = rb_enc_check(str, tstr);
9016 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9017 (ptstr = RSTRING_PTR(tstr),
9018 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9019 !is_broken_string(str)) {
9021 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9023 s = RSTRING_PTR(str);
9024 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
/* byte-wise comparison against the single selector byte */
9027 if (*(
unsigned char*)s++ == c) n++;
/* general path: first selector initialises the table, the rest refine it */
9033 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9034 for (i=1; i<argc; i++) {
9037 enc = rb_enc_check(str, tstr);
9038 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9041 s = RSTRING_PTR(str);
9042 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9044 ascompat = rb_enc_asciicompat(enc);
9048 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9056 c = rb_enc_codepoint_len(s, send, &clen, enc);
9057 if (tr_find(c, table, del, nodel)) {
/* rb_fs_check: returns 0 when `val` is nil; the remaining checks are not
 * visible in this listing. */
9068rb_fs_check(
VALUE val)
9072 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table: entries 9 through 13 and entry 32 are 1,
 * every other entry is 0. One row per 16 byte values.
 */
9077static const char isspacetable[256] = {
9078 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9079 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9080 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9081 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9082 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9083 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9084 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9085 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9086 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9087 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9088 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9089 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9090 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9091 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9092 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9093 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* The cast to unsigned char keeps the index in 0..255 even where plain char is signed. */
9096#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * split_string: helper used by SPLIT_STR; returns the updated empty-field
 * counter (partial listing).
 *
 * When `empty_count` is non-negative and the field is empty (len == 0) the
 * field is not emitted; the counter is returned incremented instead.
 * Otherwise, if empty fields are pending (empty_count > 0), the visible
 * do/while tails count them down, one of them yielding an empty string
 * per iteration.
 */
9099split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9101 if (empty_count >= 0 &&
len == 0) {
9102 return empty_count + 1;
9104 if (empty_count > 0) {
9109 }
while (--empty_count > 0);
9113 rb_yield(str_new_empty_String(str));
9114 }
while (--empty_count > 0);
9128 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
/*
 * literal_split_pattern: classifies a literal split pattern (partial
 * listing). Visible outcomes: SPLIT_TYPE_CHARS on one branch; for
 * ASCII-compatible encodings a pattern that is exactly one ' ' byte gives
 * SPLIT_TYPE_AWK; on another branch a pattern whose single character
 * decodes to ' ' via rb_enc_ascget() also gives SPLIT_TYPE_AWK;
 * otherwise `default_type` is returned.
 */
9132literal_split_pattern(
VALUE spat, split_type_t default_type)
9140 return SPLIT_TYPE_CHARS;
9142 else if (rb_enc_asciicompat(enc)) {
9143 if (
len == 1 && ptr[0] ==
' ') {
9144 return SPLIT_TYPE_AWK;
/* `len == l` requires the space character to span the whole pattern */
9149 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9150 return SPLIT_TYPE_AWK;
9153 return default_type;
/*
 * rb_str_split_m: splits `str` by an optional pattern and optional limit
 * (partial listing; many body lines are absent).
 *
 * Visible structure:
 *  - rb_scan_args "02" reads the pattern and limit; a limit <= 0 is turned
 *    into nil.
 *  - When the pattern is absent, rb_fs is used; a nil rb_fs selects
 *    SPLIT_TYPE_AWK, and an rb_fs rejected by rb_fs_check() raises
 *    TypeError.
 *  - split_type is one of AWK / STRING / CHARS / REGEXP and each has its
 *    own scanning loop below.
 *  - Every field is emitted through SPLIT_STR, which calls split_string()
 *    and then str_mod_check() against the start pointer and length
 *    captured before the loops.
 *  - Returns `result` when it is non-zero, otherwise `str`.
 */
9166rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9171 split_type_t split_type;
9172 long beg, end, i = 0, empty_count = -1;
9177 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9179 if (lim <= 0) limit =
Qnil;
9180 else if (lim == 1) {
9181 if (RSTRING_LEN(str) == 0)
/* with no limit at all, start tracking pending empty fields (empty_count becomes 0) */
9192 if (
NIL_P(limit) && !lim) empty_count = 0;
9194 enc = STR_ENC_GET(str);
9195 split_type = SPLIT_TYPE_REGEXP;
9197 spat = get_pat_quoted(spat, 0);
9199 else if (
NIL_P(spat = rb_fs)) {
9200 split_type = SPLIT_TYPE_AWK;
9202 else if (!(spat = rb_fs_check(spat))) {
9203 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9208 if (split_type != SPLIT_TYPE_AWK) {
9213 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9214 if (split_type == SPLIT_TYPE_AWK) {
9216 split_type = SPLIT_TYPE_STRING;
9221 mustnot_broken(spat);
9222 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/* SPLIT_STR: emit one field, then verify `str` was not modified meanwhile */
9230#define SPLIT_STR(beg, len) ( \
9231 empty_count = split_string(result, str, beg, len, empty_count), \
9232 str_mod_check(str, str_start, str_len))
9235 char *ptr = RSTRING_PTR(str);
9236 char *
const str_start = ptr;
9237 const long str_len = RSTRING_LEN(str);
9238 char *
const eptr = str_start + str_len;
9239 if (split_type == SPLIT_TYPE_AWK) {
/* ASCII-only strings use the byte table; other strings decode code points */
9246 if (is_ascii_string(str)) {
9247 while (ptr < eptr) {
9248 c = (
unsigned char)*ptr++;
9250 if (ascii_isspace(c)) {
9256 if (!
NIL_P(limit) && lim <= i)
break;
9259 else if (ascii_isspace(c)) {
9260 SPLIT_STR(beg, end-beg);
9263 if (!
NIL_P(limit)) ++i;
9271 while (ptr < eptr) {
9274 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9283 if (!
NIL_P(limit) && lim <= i)
break;
9287 SPLIT_STR(beg, end-beg);
9290 if (!
NIL_P(limit)) ++i;
9298 else if (split_type == SPLIT_TYPE_STRING) {
9299 char *substr_start = ptr;
9300 char *sptr = RSTRING_PTR(spat);
9301 long slen = RSTRING_LEN(spat);
9304 mustnot_broken(str);
9305 enc = rb_enc_check(str, spat);
9306 while (ptr < eptr &&
9307 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9310 if (t != ptr + end) {
9314 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
/* the separator string is also checked for modification after each field */
9315 str_mod_check(spat, sptr, slen);
9318 if (!
NIL_P(limit) && lim <= ++i)
break;
9320 beg = ptr - str_start;
9322 else if (split_type == SPLIT_TYPE_CHARS) {
9326 mustnot_broken(str);
9327 enc = rb_enc_get(str);
9328 while (ptr < eptr &&
9329 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9330 SPLIT_STR(ptr - str_start, n);
9332 if (!
NIL_P(limit) && lim <= ++i)
break;
9334 beg = ptr - str_start;
9338 long len = RSTRING_LEN(str);
9346 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
/* start == end together with BEG(0) == END(0) identifies a zero-width match at the current position */
9351 if (start == end && BEG(0) == END(0)) {
9356 else if (last_null == 1) {
9357 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9364 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9370 SPLIT_STR(beg, end-beg);
9371 beg = start = END(0);
/* capture groups are emitted as fields too; unmatched groups (BEG == -1) are skipped */
9375 for (idx=1; idx < regs->num_regs; idx++) {
9376 if (BEG(idx) == -1)
continue;
9377 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9379 if (!
NIL_P(limit) && lim <= ++i)
break;
9381 if (match) rb_match_unbusy(match);
/* trailing remainder after the last separator */
9383 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9384 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9387 return result ? result : str;
9397 return rb_str_split_m(1, &sep, str);
/* WANTARRAY: a new array with capacity `size` when no block is given, otherwise 0.
 * The `m` argument does not appear in the expansion. */
9400#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/* ENUM_ELEM: shorthand for enumerator_element(ary, e). */
9415#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
/*
 * chomp_newline: examines the end of the range [p, e) (partial listing).
 * It locates the character before `e` with rb_enc_prev_char(), repeats
 * that lookup, and tests whether the character found is '\r'.
 * The returned value is not visible here.
 */
9418chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9420 const char *prev = rb_enc_prev_char(p, e, e, enc);
9423 prev = rb_enc_prev_char(p, e, e, enc);
9424 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9436 RSTRING_LEN(rs) != 1 ||
9437 RSTRING_PTR(rs)[0] !=
'\n')) {
/* From this point on, every use of rb_rs expands to a call to get_rs(). */
9443#define rb_rs get_rs()
9450 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9451 long pos,
len, rslen;
9457 static ID keywords[1];
9462 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9466 if (!ENUM_ELEM(ary, str)) {
9474 if (!RSTRING_LEN(str))
goto end;
9476 ptr = subptr = RSTRING_PTR(str);
9478 len = RSTRING_LEN(str);
9480 rslen = RSTRING_LEN(rs);
9483 enc = rb_enc_get(str);
9485 enc = rb_enc_check(str, rs);
9490 const char *eol = NULL;
9492 while (subend < pend) {
9493 long chomp_rslen = 0;
9495 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9497 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9499 if (eol == subend)
break;
9503 chomp_rslen = -rslen;
9507 if (!subptr) subptr = subend;
9511 }
while (subend < pend);
9513 if (rslen == 0) chomp_rslen = 0;
9515 subend - subptr + (chomp ? chomp_rslen : rslen));
9516 if (ENUM_ELEM(ary, line)) {
9517 str_mod_check(str, ptr,
len);
9519 subptr = eol = NULL;
9524 rsptr = RSTRING_PTR(rs);
9525 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9534 rsptr = RSTRING_PTR(rs);
9535 rslen = RSTRING_LEN(rs);
9538 while (subptr < pend) {
9539 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9543 if (hit != adjusted) {
9547 subend = hit += rslen;
9550 subend = chomp_newline(subptr, subend, enc);
9557 if (ENUM_ELEM(ary, line)) {
9558 str_mod_check(str, ptr,
len);
9563 if (subptr != pend) {
9566 pend = chomp_newline(subptr, pend, enc);
9568 else if (pend - subptr >= rslen &&
9569 memcmp(pend - rslen, rsptr, rslen) == 0) {
9574 ENUM_ELEM(ary, line);
/* rb_str_each_line: delegates to rb_str_enumerate_lines() with ary = 0. */
9595rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9598 return rb_str_enumerate_lines(argc, argv, str, 0);
/* rb_str_lines: obtains `ary` from WANTARRAY (size hint 0) and delegates
 * to rb_str_enumerate_lines(). */
9653rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9655 VALUE ary = WANTARRAY(
"lines", 0);
9656 return rb_str_enumerate_lines(argc, argv, str, ary);
9670 for (i=0; i<RSTRING_LEN(str); i++) {
9671 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
/* rb_str_each_byte: delegates to rb_str_enumerate_bytes() with ary = 0. */
9689rb_str_each_byte(
VALUE str)
9692 return rb_str_enumerate_bytes(str, 0);
/* rb_str_bytes: obtains `ary` from WANTARRAY, sized to the byte length of
 * `str`, and delegates to rb_str_enumerate_bytes(). */
9704rb_str_bytes(
VALUE str)
9706 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9707 return rb_str_enumerate_bytes(str, ary);
9725 ptr = RSTRING_PTR(str);
9726 len = RSTRING_LEN(str);
9727 enc = rb_enc_get(str);
9730 for (i = 0; i <
len; i += n) {
9731 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9736 for (i = 0; i <
len; i += n) {
9737 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
/* rb_str_each_char: delegates to rb_str_enumerate_chars() with ary = 0. */
9758rb_str_each_char(
VALUE str)
9761 return rb_str_enumerate_chars(str, 0);
/* rb_str_chars: delegates to rb_str_enumerate_chars() with `ary`
 * (its initialisation is not visible in this listing). */
9773rb_str_chars(
VALUE str)
9776 return rb_str_enumerate_chars(str, ary);
/*
 * rb_str_enumerate_codepoints: enumerates the code points of `str`
 * (partial listing). Single-byte-optimizable strings are handed to
 * rb_str_enumerate_bytes(); otherwise code points are decoded with
 * rb_enc_codepoint_len().
 */
9780rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9785 const char *ptr, *end;
9788 if (single_byte_optimizable(str))
9789 return rb_str_enumerate_bytes(str, ary);
9792 ptr = RSTRING_PTR(str);
9794 enc = STR_ENC_GET(str);
9797 c = rb_enc_codepoint_len(ptr, end, &n, enc);
/* rb_str_each_codepoint: delegates to rb_str_enumerate_codepoints() with ary = 0. */
9818rb_str_each_codepoint(
VALUE str)
9821 return rb_str_enumerate_codepoints(str, 0);
/* rb_str_codepoints: delegates to rb_str_enumerate_codepoints() with `ary`
 * (its initialisation is not visible in this listing). */
9833rb_str_codepoints(
VALUE str)
9836 return rb_str_enumerate_codepoints(str, ary);
/*
 * Fragment that compiles the grapheme-cluster regexp "\X" for `enc` and
 * returns the compiled regex_t (presumably the body of
 * get_reg_grapheme_cluster(), whose header is not visible in this listing).
 *
 * The pattern source defaults to the ASCII bytes of "\X"; the CASE_UTF
 * cases replace it with the same two characters encoded as UTF-16/32 in
 * either byte order. A compile failure is reported through rb_fatal().
 */
9842 int encidx = rb_enc_to_index(enc);
9844 const OnigUChar source_ascii[] =
"\\X";
9845 const OnigUChar *source = source_ascii;
/* sizeof - 1 drops the string literal's trailing NUL */
9846 size_t source_len =
sizeof(source_ascii) - 1;
/* CHARS_16xx / CHARS_32xx expand a code unit into its bytes, high byte first (BE) or low byte first (LE) */
9849#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9850#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9851#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9852#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9853#define CASE_UTF(e) \
9854 case ENCINDEX_UTF_##e: { \
9855 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9856 source = source_UTF_##e; \
9857 source_len = sizeof(source_UTF_##e); \
9860 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9868 regex_t *reg_grapheme_cluster;
/* onig_new() receives the address of the regex_t pointer it fills in */
9870 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9871 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9873 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9874 onig_error_code_to_str(message, r, &einfo);
9875 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9878 return reg_grapheme_cluster;
9884 int encidx = rb_enc_to_index(enc);
9885 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9887 if (encidx == rb_utf8_encindex()) {
9888 if (!reg_grapheme_cluster_utf8) {
9889 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9892 return reg_grapheme_cluster_utf8;
9901 size_t grapheme_cluster_count = 0;
9903 const char *ptr, *end;
9905 if (!rb_enc_unicode_p(enc)) {
9909 bool cached_reg_grapheme_cluster =
true;
9910 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9911 if (!reg_grapheme_cluster) {
9912 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9913 cached_reg_grapheme_cluster =
false;
9916 ptr = RSTRING_PTR(str);
9920 OnigPosition
len = onig_match(reg_grapheme_cluster,
9921 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9922 (
const OnigUChar *)ptr, NULL, 0);
9923 if (
len <= 0)
break;
9924 grapheme_cluster_count++;
9928 if (!cached_reg_grapheme_cluster) {
9929 onig_free(reg_grapheme_cluster);
9932 return SIZET2NUM(grapheme_cluster_count);
/*
 * rb_str_enumerate_grapheme_clusters: enumerates grapheme clusters of
 * `str` (partial listing).
 *
 * Non-Unicode encodings are handed to rb_str_enumerate_chars(). Otherwise
 * a regex is taken from get_cached_reg_grapheme_cluster(), or built with
 * get_reg_grapheme_cluster() when that returns NULL; a regex built here is
 * released with onig_free() at the end. Matching stops at the first
 * non-positive match length.
 */
9936rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9940 const char *ptr0, *ptr, *end;
9942 if (!rb_enc_unicode_p(enc)) {
9943 return rb_str_enumerate_chars(str, ary);
/* tracks whether the regex is the shared cached one (must not be freed) */
9948 bool cached_reg_grapheme_cluster =
true;
9949 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9950 if (!reg_grapheme_cluster) {
9951 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9952 cached_reg_grapheme_cluster =
false;
9955 ptr0 = ptr = RSTRING_PTR(str);
9959 OnigPosition
len = onig_match(reg_grapheme_cluster,
9960 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9961 (
const OnigUChar *)ptr, NULL, 0);
9962 if (
len <= 0)
break;
9967 if (!cached_reg_grapheme_cluster) {
9968 onig_free(reg_grapheme_cluster);
/* rb_str_each_grapheme_cluster: delegates to rb_str_enumerate_grapheme_clusters() with ary = 0. */
9988rb_str_each_grapheme_cluster(
VALUE str)
9991 return rb_str_enumerate_grapheme_clusters(str, 0);
/* rb_str_grapheme_clusters: delegates to rb_str_enumerate_grapheme_clusters()
 * with `ary` (its initialisation is not visible in this listing). */
10003rb_str_grapheme_clusters(
VALUE str)
10006 return rb_str_enumerate_grapheme_clusters(str, ary);
/*
 * chopped_length: works out where `str` should end after removing its
 * last character (partial listing). Returns 0 for an empty string.
 * `p` is set to the start of the last character; when that character is
 * '\n' and the one before it is '\r', `p` is moved back to the '\r'.
 */
10010chopped_length(
VALUE str)
10013 const char *p, *p2, *beg, *end;
10015 beg = RSTRING_PTR(str);
10016 end = beg + RSTRING_LEN(str);
10017 if (beg >= end)
return 0;
10018 p = rb_enc_prev_char(beg, end, end, enc);
10020 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10021 p2 = rb_enc_prev_char(beg, p, end, enc);
10022 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
/*
 * rb_str_chop_bang: in-place chop (partial listing). After making `str`
 * modifiable, a non-empty string is shortened to chopped_length(str) and
 * re-terminated at the new end. The return value is not visible here.
 */
10040rb_str_chop_bang(
VALUE str)
10042 str_modify_keep_cr(str);
10043 if (RSTRING_LEN(str) > 0) {
10045 len = chopped_length(str);
10046 STR_SET_LEN(str,
len);
10047 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10066rb_str_chop(
VALUE str)
/*
 * smart_chomp: examines the tail of [p, e) for a line ending (partial
 * listing). Encodings with a minimum character length above 1 step back by
 * that length and test for '\r' with rb_enc_ascget(); the other visible
 * branch steps back one byte and compares the preceding byte with '\r'.
 */
10072smart_chomp(
VALUE str,
const char *e,
const char *p)
10075 if (rb_enc_mbminlen(enc) > 1) {
10080 pp = e - rb_enc_mbminlen(enc);
10083 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10091 if (--e > p && *(e-1) ==
'\r') {
10108 char *pp, *e, *rsptr;
10110 char *
const p = RSTRING_PTR(str);
10111 long len = RSTRING_LEN(str);
10113 if (
len == 0)
return 0;
10116 return smart_chomp(str, e, p);
10119 enc = rb_enc_get(str);
10122 if (rb_enc_mbminlen(enc) > 1) {
10127 pp -= rb_enc_mbminlen(enc);
10130 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10137 while (e > p && *(e-1) ==
'\n') {
10139 if (e > p && *(e-1) ==
'\r')
10145 if (rslen >
len)
return len;
10147 enc = rb_enc_get(rs);
10148 newline = rsptr[rslen-1];
10149 if (rslen == rb_enc_mbminlen(enc)) {
10151 if (newline ==
'\n')
10152 return smart_chomp(str, e, p);
10156 return smart_chomp(str, e, p);
10160 enc = rb_enc_check(str, rs);
10161 if (is_broken_string(rs)) {
10165 if (p[
len-1] == newline &&
10167 memcmp(rsptr, pp, rslen) == 0)) {
10168 if (at_char_boundary(p, pp, e, enc))
10169 return len - rslen;
/* chomp_rs: reads the record separator from argv[0] on the visible path;
 * handling of other argument counts is not visible in this listing. */
10181chomp_rs(
int argc,
const VALUE *argv)
10185 VALUE rs = argv[0];
10197 long olen = RSTRING_LEN(str);
10198 long len = chompped_length(str, rs);
10199 if (
len >= olen)
return Qnil;
10200 str_modify_keep_cr(str);
10201 STR_SET_LEN(str,
len);
10202 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/*
 * rb_str_chomp_bang: in-place chomp (partial listing). Checks that `str`
 * may be modified, returns Qnil early for an empty string when fewer than
 * two arguments were passed, resolves the separator with chomp_rs() and
 * hands the work to rb_str_chomp_string().
 */
10222rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10225 str_modifiable(str);
10226 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10227 rs = chomp_rs(argc, argv);
10229 return rb_str_chomp_string(str, rs);
/* rb_str_chomp: resolves the separator with chomp_rs(); the rest of the
 * body is not visible in this listing. */
10242rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10244 VALUE rs = chomp_rs(argc, argv);
/*
 * tr_setup_table_multi: builds one tr table from several selector strings
 * (partial listing). For each selector it checks encoding compatibility
 * with `str` and calls tr_setup_table(); the flag argument is true only
 * for the first selector (i == 0).
 */
10250tr_setup_table_multi(
char table[TR_TABLE_SIZE],
VALUE *tablep,
VALUE *ctablep,
10251 VALUE str,
int num_selectors,
VALUE *selectors)
10255 for (i=0; i<num_selectors; i++) {
10256 VALUE selector = selectors[i];
10260 enc = rb_enc_check(str, selector);
10261 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10268 const char *
const start = s;
10270 if (!s || s >= e)
return 0;
10273 if (single_byte_optimizable(str)) {
10274 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10279 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
/*
 * lstrip_offset_table: scans [s, e) from the front using a tr table
 * (partial listing). Returns 0 for a NULL or empty range. Code points are
 * decoded one at a time and the scan stops at the first one that
 * tr_find() rejects.
 */
10289lstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10290 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10292 const char *
const start = s;
10294 if (!s || s >= e)
return 0;
10299 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10301 if (!tr_find(cc, table, del, nodel))
break;
/*
 * rb_str_lstrip_bang: in-place left strip (partial listing). The leading
 * offset comes from lstrip_offset_table() on the branch that builds a
 * table from the arguments, or from lstrip_offset() otherwise. The
 * remaining bytes are moved to the front of the buffer, the length is
 * reduced and the terminator rewritten.
 */
10320rb_str_lstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10324 long olen, loffset;
10326 str_modify_keep_cr(str);
10327 enc = STR_ENC_GET(str);
10330 char table[TR_TABLE_SIZE];
10331 VALUE del = 0, nodel = 0;
10333 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10334 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10337 loffset = lstrip_offset(str, start, start+olen, enc);
10341 long len = olen-loffset;
10342 s = start + loffset;
/* memmove because source and destination overlap within the same buffer */
10343 memmove(start, s,
len);
10344 STR_SET_LEN(str,
len);
10345 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * rb_str_lstrip: non-destructive left strip (partial listing). Computes
 * the leading offset as in the in-place variant and returns a duplicate
 * of `str` when nothing would be removed (loffset <= 0).
 */
10380rb_str_lstrip(
int argc,
VALUE *argv,
VALUE str)
10387 char table[TR_TABLE_SIZE];
10388 VALUE del = 0, nodel = 0;
10390 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10391 loffset = lstrip_offset_table(str, start, start+
len, STR_ENC_GET(str), table, del, nodel);
10394 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10396 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10405 rb_str_check_dummy_enc(enc);
10409 if (!s || s >= e)
return 0;
10413 if (single_byte_optimizable(str)) {
10415 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10420 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
/*
 * rstrip_offset_table: scans [s, e) from the back using a tr table
 * (partial listing). Calls rb_str_check_dummy_enc(enc), returns 0 for a
 * NULL or empty range, then walks backwards one character at a time with
 * rb_enc_prev_char() until tr_find() rejects a character.
 */
10430rstrip_offset_table(
VALUE str,
const char *s,
const char *e,
rb_encoding *enc,
10431 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10436 rb_str_check_dummy_enc(enc);
10440 if (!s || s >= e)
return 0;
10444 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10446 if (!tr_find(c, table, del, nodel))
break;
/*
 * rb_str_rstrip_bang: in-place right strip (partial listing). The trailing
 * offset comes from rstrip_offset_table() on the table branch, or from
 * rstrip_offset() otherwise; the string is then shortened by that amount
 * and re-terminated.
 */
10466rb_str_rstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10470 long olen, roffset;
10472 str_modify_keep_cr(str);
10473 enc = STR_ENC_GET(str);
10476 char table[TR_TABLE_SIZE];
10477 VALUE del = 0, nodel = 0;
10479 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10480 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10483 roffset = rstrip_offset(str, start, start+olen, enc);
10486 long len = olen - roffset;
10488 STR_SET_LEN(str,
len);
10489 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * rb_str_rstrip: non-destructive right strip (partial listing). Computes
 * the trailing offset as in the in-place variant and returns a duplicate
 * of `str` when nothing would be removed (roffset <= 0).
 */
10523rb_str_rstrip(
int argc,
VALUE *argv,
VALUE str)
10527 long olen, roffset;
10529 enc = STR_ENC_GET(str);
10532 char table[TR_TABLE_SIZE];
10533 VALUE del = 0, nodel = 0;
10535 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10536 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10539 roffset = rstrip_offset(str, start, start+olen, enc);
10541 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * rb_str_strip_bang: in-place strip of both ends (partial listing).
 * The leading offset is computed first; the trailing offset is then
 * computed on the range that starts after the leading run. When either
 * offset is positive the content is moved to the front of the buffer,
 * the length is set and the terminator rewritten.
 */
10559rb_str_strip_bang(
int argc,
VALUE *argv,
VALUE str)
10562 long olen, loffset, roffset;
10565 str_modify_keep_cr(str);
10566 enc = STR_ENC_GET(str);
10570 char table[TR_TABLE_SIZE];
10571 VALUE del = 0, nodel = 0;
10573 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10574 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
/* the right-hand scan begins at start+loffset, i.e. after the leading run */
10575 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10578 loffset = lstrip_offset(str, start, start+olen, enc);
10579 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10582 if (loffset > 0 || roffset > 0) {
10583 long len = olen-roffset;
10586 memmove(start, start + loffset,
len);
10588 STR_SET_LEN(str,
len);
10589 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * rb_str_strip: non-destructive strip of both ends (partial listing).
 * Computes leading and trailing offsets as in the in-place variant and
 * returns a duplicate of `str` when both are <= 0.
 */
10624rb_str_strip(
int argc,
VALUE *argv,
VALUE str)
10627 long olen, loffset, roffset;
10633 char table[TR_TABLE_SIZE];
10634 VALUE del = 0, nodel = 0;
10636 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10637 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10638 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10641 loffset = lstrip_offset(str, start, start+olen, enc);
10642 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10645 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * scan_once: performs one pattern search on `str` starting at *start
 * (partial listing). The search is done by rb_pat_search(); on one visible
 * branch the match end is taken as pos + RSTRING_LEN(pat). *start is
 * advanced past `end` by one character length when the string extends
 * beyond `end`. The `regs` lines distinguish a match with no capture
 * groups (num_regs == 1) from one whose groups are iterated from index 1.
 */
10650scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10653 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10659 end = pos + RSTRING_LEN(pat);
10673 if (RSTRING_LEN(str) > end)
10674 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10683 if (!regs || regs->num_regs == 1) {
10689 for (
int i = 1; i < regs->num_regs; i++) {
10720 long last = -1, prev = 0;
10721 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10723 pat = get_pat_quoted(pat, 1);
10724 mustnot_broken(str);
10728 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10733 if (last >= 0) rb_pat_search(pat, str, last, 1);
10738 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10742 str_mod_check(str, p,
len);
10744 if (last >= 0) rb_pat_search(pat, str, last, 1);
/* rb_str_hex: converts `str` with rb_str_to_inum() using base 16 and FALSE as the last argument. */
10796rb_str_hex(
VALUE str)
10798 return rb_str_to_inum(str, 16, FALSE);
/* rb_str_oct: converts `str` with rb_str_to_inum() using base argument -8 and FALSE as the last argument. */
10882rb_str_oct(
VALUE str)
10884 return rb_str_to_inum(str, -8, FALSE);
10887#ifndef HAVE_CRYPT_R
10892 rb_nativethread_lock_t lock;
10893} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10962# define CRYPT_END() ALLOCV_END(databuf)
10965 extern char *crypt(
const char *,
const char *);
10966# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10969 const char *s, *saltp;
10972 char salt_8bit_clean[3];
10976 mustnot_wchar(str);
10977 mustnot_wchar(salt);
10979 saltp = RSTRING_PTR(salt);
10980 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10981 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10985 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10986 salt_8bit_clean[0] = saltp[0] & 0x7f;
10987 salt_8bit_clean[1] = saltp[1] & 0x7f;
10988 salt_8bit_clean[2] =
'\0';
10989 saltp = salt_8bit_clean;
10994# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10995 data->initialized = 0;
10997 res = crypt_r(s, saltp, data);
11000 res = crypt(s, saltp);
11015 size_t res_size = strlen(res)+1;
11016 tmp_buf =
ALLOCA_N(
char, res_size);
11017 memcpy(tmp_buf, res, res_size);
11054 char *ptr, *p, *pend;
11057 unsigned long sum0 = 0;
11062 ptr = p = RSTRING_PTR(str);
11063 len = RSTRING_LEN(str);
11069 str_mod_check(str, ptr,
len);
11072 sum0 += (
unsigned char)*p;
11083 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
11084 sum0 &= (((
unsigned long)1)<<bits)-1;
/*
 * rb_str_justify: shared implementation behind ljust / rjust / center
 * (partial listing). `jflag` is 'l', 'r' or another value ('c' in the
 * visible callers).
 *
 * Visible behaviour:
 *  - The pad string defaults to " "; a supplied pad must be non-empty in
 *    both bytes and characters, otherwise ArgumentError
 *    "zero width padding" is raised.
 *  - A negative width, or a width not exceeding the current character
 *    length, yields a duplicate of `str`.
 *  - Left padding is 0 for 'l', n for 'r', and n/2 otherwise.
 *  - The result size is computed step by step with comparisons against
 *    LONG_MAX, raising ArgumentError "argument too big" on overflow.
 *  - The buffer is filled with left padding, the original bytes, then
 *    right padding, and finally terminated and its length set.
 */
11104rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
11108 long width,
len, flen = 1, fclen = 1;
11111 const char *f =
" ";
11112 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11114 int singlebyte = 1, cr;
11118 enc = STR_ENC_GET(str);
11119 termlen = rb_enc_mbminlen(enc);
11123 enc = rb_enc_check(str, pad);
11124 f = RSTRING_PTR(pad);
/* flen is the pad length in bytes, fclen in characters */
11125 flen = RSTRING_LEN(pad);
11126 fclen = str_strlen(pad, enc);
11127 singlebyte = single_byte_optimizable(pad);
11128 if (flen == 0 || fclen == 0) {
11129 rb_raise(rb_eArgError,
"zero width padding");
11132 len = str_strlen(str, enc);
11133 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11135 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
/* byte lengths of the partial pad repetitions on the left and right */
11139 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11140 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11142 size = RSTRING_LEN(str);
/* each step is compared against LONG_MAX before the next one grows `len` */
11143 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11144 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11145 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11146 rb_raise(rb_eArgError,
"argument too big");
11150 p = RSTRING_PTR(res);
11152 memset(p, *f, llen);
11156 while (llen >= fclen) {
11162 memcpy(p, f, llen2);
11166 memcpy(p, RSTRING_PTR(str), size);
11169 memset(p, *f, rlen);
11173 while (rlen >= fclen) {
11179 memcpy(p, f, rlen2);
11183 TERM_FILL(p, termlen);
11184 STR_SET_LEN(res, p-RSTRING_PTR(res));
/* rb_str_ljust: calls rb_str_justify() with jflag 'l'. */
11205rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11207 return rb_str_justify(argc, argv, str,
'l');
/* rb_str_rjust: calls rb_str_justify() with jflag 'r'. */
11219rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11221 return rb_str_justify(argc, argv, str,
'r');
/* rb_str_center: calls rb_str_justify() with jflag 'c'. */
11234rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11236 return rb_str_justify(argc, argv, str,
'c');
11252 sep = get_pat_quoted(sep, 0);
11264 pos = rb_str_index(str, sep, 0);
11265 if (pos < 0)
goto failed;
11270 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11273 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11287 long pos = RSTRING_LEN(str);
11289 sep = get_pat_quoted(sep, 0);
11302 pos = rb_str_rindex(str, sep, pos);
11311 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11313 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
/*
 * rb_str_start_with: tests each argument as a prefix of `str` (partial
 * listing). One branch delegates to rb_reg_start_with_p(). For the string
 * branch: an empty candidate returns Qtrue immediately, a candidate longer
 * than `str` is skipped, and a candidate matches when the leading bytes
 * compare equal and the prefix ends on a character boundary.
 */
11325rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11329 for (i=0; i<argc; i++) {
11330 VALUE tmp = argv[i];
11332 if (rb_reg_start_with_p(tmp, str))
11336 const char *p, *s, *e;
11341 enc = rb_enc_check(str, tmp);
11342 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11343 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11344 p = RSTRING_PTR(str);
11347 if (!at_char_right_boundary(p, s, e, enc))
11349 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
/*
 * rb_str_end_with: tests each argument as a suffix of `str` (partial
 * listing). An empty candidate returns Qtrue immediately, a candidate
 * longer than `str` is skipped, and a candidate matches when the suffix
 * position `s` is on a character boundary and the bytes from `s` compare
 * equal.
 */
11365rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11369 for (i=0; i<argc; i++) {
11370 VALUE tmp = argv[i];
11371 const char *p, *s, *e;
11376 enc = rb_enc_check(str, tmp);
11377 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11378 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11379 p = RSTRING_PTR(str);
11382 if (!at_char_boundary(p, s, e, enc))
11384 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
/*
 * deleted_prefix_length: decides how many leading bytes of `str` a
 * matching `prefix` covers (partial listing).
 *
 * rb_enc_check() is applied unless the prefix is broken and both encodings
 * are ASCII-compatible. Returns 0 when the prefix is empty, longer than
 * `str`, or the leading bytes differ. Further checks cover a broken
 * prefix and whether the prefix ends on a character boundary in `str`.
 */
11400deleted_prefix_length(
VALUE str,
VALUE prefix)
11402 const char *strptr, *prefixptr;
11403 long olen, prefixlen;
11408 if (!is_broken_string(prefix) ||
11409 !rb_enc_asciicompat(enc) ||
11410 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11411 enc = rb_enc_check(str, prefix);
11415 prefixlen = RSTRING_LEN(prefix);
11416 if (prefixlen <= 0)
return 0;
11417 olen = RSTRING_LEN(str);
11418 if (olen < prefixlen)
return 0;
11419 strptr = RSTRING_PTR(str);
11420 prefixptr = RSTRING_PTR(prefix);
11421 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11422 if (is_broken_string(prefix)) {
11423 if (!is_broken_string(str)) {
11427 const char *strend = strptr + olen;
11428 const char *after_prefix = strptr + prefixlen;
11429 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
/* rb_str_delete_prefix_bang: makes `str` modifiable, then returns Qnil
 * when deleted_prefix_length() reports nothing to remove (partial listing). */
11450rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11453 str_modify_keep_cr(str);
11455 prefixlen = deleted_prefix_length(str, prefix);
11456 if (prefixlen <= 0)
return Qnil;
/* rb_str_delete_prefix: returns a duplicate of `str` when no prefix is
 * removable, otherwise the substring that follows the prefix. */
11470rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11474 prefixlen = deleted_prefix_length(str, prefix);
11475 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11477 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
/*
 * deleted_suffix_length: decides how many trailing bytes of `str` a
 * matching `suffix` covers (partial listing). Returns 0 when the suffix is
 * broken, empty, longer than `str`, does not match the trailing bytes, or
 * does not start on a character boundary.
 */
11490deleted_suffix_length(
VALUE str,
VALUE suffix)
11492 const char *strptr, *suffixptr;
11493 long olen, suffixlen;
11497 if (is_broken_string(suffix))
return 0;
11498 enc = rb_enc_check(str, suffix);
11501 suffixlen = RSTRING_LEN(suffix);
11502 if (suffixlen <= 0)
return 0;
11503 olen = RSTRING_LEN(str);
11504 if (olen < suffixlen)
return 0;
11505 strptr = RSTRING_PTR(str);
11506 suffixptr = RSTRING_PTR(suffix);
11507 const char *strend = strptr + olen;
11508 const char *before_suffix = strend - suffixlen;
11509 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11510 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
/*
 * rb_str_delete_suffix_bang: in-place suffix removal (partial listing).
 * Checks that `str` may be modified, returns Qnil when
 * deleted_suffix_length() reports nothing to remove, and otherwise
 * shortens the string by the suffix length and rewrites the terminator.
 */
11526rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11528 long olen, suffixlen,
len;
11529 str_modifiable(str);
11531 suffixlen = deleted_suffix_length(str, suffix);
11532 if (suffixlen <= 0)
return Qnil;
11534 olen = RSTRING_LEN(str);
11535 str_modify_keep_cr(str);
11536 len = olen - suffixlen;
11537 STR_SET_LEN(str,
len);
11538 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/* rb_str_delete_suffix: returns a duplicate of `str` when no suffix is
 * removable, otherwise the substring that precedes the suffix. */
11554rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11558 suffixlen = deleted_suffix_length(str, suffix);
11559 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11561 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11568 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
/* nil_setter_warning: emits a deprecation warning through
 * rb_warn_deprecated(), naming the variable identified by `id`. */
11574nil_setter_warning(
ID id)
11576 rb_warn_deprecated(
"non-nil '%"PRIsVALUE
"'", NULL, rb_id2str(
id));
11583 if (!
NIL_P(*var)) {
11584 nil_setter_warning(
id);
11591 val = rb_fs_check(val);
11594 "value of %"PRIsVALUE
" must be String or Regexp",
11598 nil_setter_warning(
id);
11615 str_modifiable(str);
11618 int idx = rb_enc_to_index(encoding);
11625 rb_enc_associate_index(str, idx);
11649 if (STR_EMBED_P(str)) {
11650 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11655 str_replace_shared_without_enc(str2, str);
11657 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
/* rb_str_valid_encoding_p: obtains the code range of `str` via
 * rb_enc_str_coderange(); the comparison on it is not visible in this listing. */
11687rb_str_valid_encoding_p(
VALUE str)
11689 int cr = rb_enc_str_coderange(str);
/* rb_str_is_ascii_only_p: obtains the code range of `str` via
 * rb_enc_str_coderange(); the comparison on it is not visible in this listing. */
11707rb_str_is_ascii_only_p(
VALUE str)
11709 int cr = rb_enc_str_coderange(str);
11717 static const char ellipsis[] =
"...";
11718 const long ellipsislen =
sizeof(ellipsis) - 1;
11720 const long blen = RSTRING_LEN(str);
11721 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11722 VALUE estr, ret = 0;
11725 if (
len * rb_enc_mbminlen(enc) >= blen ||
11729 else if (
len <= ellipsislen ||
11731 if (rb_enc_asciicompat(enc)) {
11733 rb_enc_associate(ret, enc);
11740 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11745 rb_enc_from_encoding(enc), 0,
Qnil);
11756 cr = rb_enc_str_coderange(str);
11758 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11764 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11783 if (enc == STR_ENC_GET(str)) {
11788 return enc_str_scrub(enc, str, repl, cr);
11796 const char *rep, *p, *e, *p1, *sp;
11802 rb_raise(rb_eArgError,
"both of block and replacement given");
11809 if (!
NIL_P(repl)) {
11810 repl = str_compat_and_valid(repl, enc);
11813 if (rb_enc_dummy_p(enc)) {
11816 encidx = rb_enc_to_index(enc);
11818#define DEFAULT_REPLACE_CHAR(str) do { \
11819 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11820 rep = replace; replen = (int)sizeof(replace); \
11823 slen = RSTRING_LEN(str);
11824 p = RSTRING_PTR(str);
11829 if (rb_enc_asciicompat(enc)) {
11835 else if (!
NIL_P(repl)) {
11836 rep = RSTRING_PTR(repl);
11837 replen = RSTRING_LEN(repl);
11840 else if (encidx == rb_utf8_encindex()) {
11841 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11845 DEFAULT_REPLACE_CHAR(
"?");
11850 p = search_nonascii(p, e);
11855 int ret = rb_enc_precise_mbclen(p, e, enc);
11874 if (e - p < clen) clen = e - p;
11881 for (; clen > 1; clen--) {
11882 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11893 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11894 str_mod_check(str, sp, slen);
11895 repl = str_compat_and_valid(repl, enc);
11902 p = search_nonascii(p, e);
11928 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11929 str_mod_check(str, sp, slen);
11930 repl = str_compat_and_valid(repl, enc);
11939 long mbminlen = rb_enc_mbminlen(enc);
11943 else if (!
NIL_P(repl)) {
11944 rep = RSTRING_PTR(repl);
11945 replen = RSTRING_LEN(repl);
11947 else if (encidx == ENCINDEX_UTF_16BE) {
11948 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11950 else if (encidx == ENCINDEX_UTF_16LE) {
11951 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11953 else if (encidx == ENCINDEX_UTF_32BE) {
11954 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11956 else if (encidx == ENCINDEX_UTF_32LE) {
11957 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11960 DEFAULT_REPLACE_CHAR(
"?");
11964 int ret = rb_enc_precise_mbclen(p, e, enc);
11977 if (e - p < clen) clen = e - p;
11978 if (clen <= mbminlen * 2) {
11983 for (; clen > mbminlen; clen-=mbminlen) {
11984 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11994 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11995 str_mod_check(str, sp, slen);
11996 repl = str_compat_and_valid(repl, enc);
12021 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
12022 str_mod_check(str, sp, slen);
12023 repl = str_compat_and_valid(repl, enc);
12063str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
12071static ID id_normalize;
12072static ID id_normalized_p;
12073static VALUE mUnicodeNormalize;
/*
 * unicode_normalize_common: shared dispatcher for the unicode_normalize
 * family (partial listing). On first use it requires
 * "unicode_normalize/normalize.rb" and records that in a function-local
 * static flag; it then calls method `id` on mUnicodeNormalize with
 * argc+1 arguments taken from `argv2`.
 */
12076unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
12078 static int UnicodeNormalizeRequired = 0;
12081 if (!UnicodeNormalizeRequired) {
12082 rb_require(
"unicode_normalize/normalize.rb");
12083 UnicodeNormalizeRequired = 1;
12087 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
/* rb_str_unicode_normalize: calls unicode_normalize_common() with id_normalize. */
12098rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
12100 return unicode_normalize_common(argc, argv, str, id_normalize);
/* rb_str_unicode_normalize_bang: replaces the contents of `str` with the
 * result of unicode_normalize_common() called with id_normalize. */
12114rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12116 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
/* rb_str_unicode_normalized_p: calls unicode_normalize_common() with id_normalized_p. */
12143rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12145 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12277#define sym_equal rb_obj_equal
12280sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12284 int c = rb_enc_precise_mbclen(s, send, enc);
12288 c = rb_enc_mbc_to_codepoint(s, send, enc);
12296rb_str_symname_p(
VALUE sym)
12301 rb_encoding *resenc = rb_default_internal_encoding();
12303 if (resenc == NULL) resenc = rb_default_external_encoding();
12304 enc = STR_ENC_GET(sym);
12305 ptr = RSTRING_PTR(sym);
12306 len = RSTRING_LEN(sym);
12307 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12315rb_str_quote_unprintable(
VALUE str)
12323 resenc = rb_default_internal_encoding();
12324 if (resenc == NULL) resenc = rb_default_external_encoding();
12325 enc = STR_ENC_GET(str);
12326 ptr = RSTRING_PTR(str);
12327 len = RSTRING_LEN(str);
12328 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12329 !sym_printable(ptr, ptr +
len, enc)) {
12330 return rb_str_escape(str);
12336rb_id_quote_unprintable(
ID id)
12338 VALUE str = rb_id2str(
id);
12339 if (!rb_str_symname_p(str)) {
12340 return rb_str_escape(str);
12358sym_inspect(
VALUE sym)
12365 if (!rb_str_symname_p(str)) {
12367 len = RSTRING_LEN(str);
12368 rb_str_resize(str,
len + 1);
12369 dest = RSTRING_PTR(str);
12370 memmove(dest + 1, dest,
len);
12374 VALUE orig_str = str;
12376 len = RSTRING_LEN(orig_str);
12377 str = rb_enc_str_new(0,
len + 1, enc);
12380 ptr = RSTRING_PTR(orig_str);
12381 dest = RSTRING_PTR(str);
12382 memcpy(dest + 1, ptr,
len);
12402rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12407 rb_raise(rb_eArgError,
"no receiver given");
12510 return rb_str_match(
rb_sym2str(sym), other);
12525sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12527 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12540sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12542 return rb_str_match_m_p(argc, argv, sym);
12560 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12571sym_length(
VALUE sym)
12585sym_empty(
VALUE sym)
12619sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12635sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12651sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12665sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12667 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12680sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12682 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12694sym_encoding(
VALUE sym)
12700string_for_symbol(
VALUE name)
12705 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12719 name = string_for_symbol(name);
12720 return rb_intern_str(name);
12729 name = string_for_symbol(name);
12753 return rb_fstring(str);
12759 struct RString fake_str = {RBASIC_INIT};
12760 int encidx = ENCINDEX_US_ASCII;
12763 encidx = ENCINDEX_ASCII_8BIT;
12766 VALUE str = setup_fake_str(&fake_str,
ptr,
len, encidx);
12768 return register_fstring(str,
true,
false);
12780 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12781 rb_enc_autoload(enc);
12784 struct RString fake_str = {RBASIC_INIT};
12785 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12791 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12792 rb_enc_autoload(enc);
12795 struct RString fake_str = {RBASIC_INIT};
12796 VALUE str = register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12807#if USE_YJIT || USE_ZJIT
12809rb_jit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12814 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12815 rb_str_buf_cat_byte(str, (
char) code);
12825fstring_set_class_i(
VALUE *str,
void *data)
12829 return ST_CONTINUE;
12837 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
13004 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Undefines the method called name in klass.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_cObject
Object class.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RString::@53::@55 embed
Embedded contents.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@53 as
String's specific fields.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
struct RString::@53::@54 heap
Strings that use separated memory region for contents use this pattern.
union RString::@53::@54::@56 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.