14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
46#include "ruby_assert.h"
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
/* Shorthand for entry `no` of the beg[] / end[] arrays reached through a
 * pointer named `regs`; a variable with that name must be in scope wherever
 * these macros are expanded. */
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
62#undef rb_usascii_str_new
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
126#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits, each aliased to one of the generic FL_USER bits. */
/* Set by str_store_precomputed_hash() after it copies the hash value just
 * past the string's terminator. */
127#define STR_PRECOMPUTED_HASH FL_USER4
/* Set by STR_SET_SHARED on the string that other strings share from. */
128#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED on the shared string when its RBASIC_CLASS is 0. */
129#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(). */
130#define STR_TMPLOCK FL_USER7
/* Set by str_new_static(); code that frees the heap buffer (e.g.
 * str_discard()) skips strings carrying this flag. */
131#define STR_NOFREE FL_USER18
/* Tested by STR_SET_SHARED, which does nothing for strings carrying it.
 * NOTE(review): presumably set by setup_fake_str() -- not visible here, confirm. */
132#define STR_FAKESTR FL_USER19
134#define STR_SET_NOEMBED(str) do {\
135 FL_SET((str), STR_NOEMBED);\
136 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clear STR_NOEMBED, STR_SHARED and STR_NOFREE on str; the counterpart of
 * STR_SET_NOEMBED, which sets STR_NOEMBED. */
138#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
140#define STR_SET_LEN(str, n) do { \
141 RSTRING(str)->len = (n); \
145str_encindex_fastpath(
int encindex)
149 case ENCINDEX_ASCII_8BIT:
151 case ENCINDEX_US_ASCII:
159str_enc_fastpath(
VALUE str)
/* Length of the terminator that follows str's contents: 1 when
 * str_enc_fastpath(str) is true, otherwise rb_enc_mbminlen() of the encoding
 * selected by ENCODING_GET(str). Note that str is evaluated more than once. */
164#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
165#define TERM_FILL(ptr, termlen) do {\
166 char *const term_fill_ptr = (ptr);\
167 const int term_fill_len = (termlen);\
168 *term_fill_ptr = '\0';\
169 if (UNLIKELY(term_fill_len > 1))\
170 memset(term_fill_ptr, 0, term_fill_len);\
173#define RESIZE_CAPA(str,capacity) do {\
174 const int termlen = TERM_LEN(str);\
175 RESIZE_CAPA_TERM(str,capacity,termlen);\
177#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
178 if (STR_EMBED_P(str)) {\
179 if (str_embed_capa(str) < capacity + termlen) {\
180 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
181 const long tlen = RSTRING_LEN(str);\
182 memcpy(tmp, RSTRING_PTR(str), tlen);\
183 RSTRING(str)->as.heap.ptr = tmp;\
184 RSTRING(str)->len = tlen;\
185 STR_SET_NOEMBED(str);\
186 RSTRING(str)->as.heap.aux.capa = (capacity);\
190 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
191 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
192 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
193 RSTRING(str)->as.heap.aux.capa = (capacity);\
197#define STR_SET_SHARED(str, shared_str) do { \
198 if (!FL_TEST(str, STR_FAKESTR)) { \
199 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
200 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
201 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
202 FL_SET((str), STR_SHARED); \
203 FL_SET((shared_str), STR_SHARED_ROOT); \
204 if (RBASIC_CLASS((shared_str)) == 0) \
205 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap buffer pointer of str (the as.heap member). */
209#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap capacity (as.heap.aux.capa) plus the terminator length; passed as the
 * old size when the heap buffer is reallocated or freed. */
210#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Alias for get_encoding(str). */
213#define STR_ENC_GET(str) get_encoding(str)
215#if !defined SHARABLE_MIDDLE_SUBSTRING
216# define SHARABLE_MIDDLE_SUBSTRING 0
218#if !SHARABLE_MIDDLE_SUBSTRING
/* Variant used when SHARABLE_MIDDLE_SUBSTRING is 0: true only when
 * beg + len equals end, i.e. the span reaches exactly to end. */
219#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
221#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226str_embed_capa(
VALUE str)
228 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
232rb_str_reembeddable_p(
VALUE str)
234 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
238rb_str_embed_size(
long capa)
244rb_str_size_as_embedded(
VALUE str)
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
256 real_size =
sizeof(
struct RString);
260 real_size +=
sizeof(st_index_t);
267STR_EMBEDDABLE_P(
long len,
long termlen)
269 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
274static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
275static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
277static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
278static inline void str_modifiable(
VALUE str);
283str_make_independent(
VALUE str)
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str),
len, 0L, termlen);
290static inline int str_dependent_p(
VALUE str);
293rb_str_make_independent(
VALUE str)
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
301rb_str_make_embedded(
VALUE str)
306 char *buf =
RSTRING(str)->as.heap.ptr;
310 STR_SET_LEN(str,
len);
313 memcpy(RSTRING_PTR(str), buf,
len);
317 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
321rb_debug_rstring_null_ptr(
const char *func)
323 fprintf(stderr,
"%s is returning NULL!! "
324 "SIGSEGV is highly expected to follow immediately.\n"
325 "If you could reproduce, attach your debugger here, "
326 "and look at the passed string.\n",
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
334get_encoding(
VALUE str)
340mustnot_broken(
VALUE str)
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348mustnot_wchar(
VALUE str)
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
358static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
360#if SIZEOF_LONG == SIZEOF_VOIDP
361#define PRECOMPUTED_FAKESTR_HASH 1
365#ifdef PRECOMPUTED_FAKESTR_HASH
367fstring_hash(
VALUE str)
371 return (st_index_t)
RSTRING(str)->as.heap.aux.capa;
378#define fstring_hash rb_str_hash
/* True when str does not have the FL_EXIVAR flag set and its class is
 * exactly rb_cString. */
386#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
388static inline st_index_t
389str_do_hash(
VALUE str)
391 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
393 if (e && !is_ascii_string(str)) {
400str_store_precomputed_hash(
VALUE str, st_index_t hash)
406 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
407 size_t free_bytes = str_embed_capa(str) - used_bytes;
411 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
413 FL_SET(str, STR_PRECOMPUTED_HASH);
421 bool force_precompute_hash;
425fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
434 if (rb_objspace_garbage_object_p(str)) {
449 long len = RSTRING_LEN(str);
450 long capa =
len +
sizeof(st_index_t);
451 int term_len = TERM_LEN(str);
453 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
455 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
456 STR_SET_LEN(new_str, RSTRING_LEN(str));
458 rb_enc_copy(new_str, str);
459 str_store_precomputed_hash(new_str, fstring_hash(str));
463 rb_enc_copy(new_str, str);
464#ifdef PRECOMPUTED_FAKESTR_HASH
465 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
466 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
480 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
483 if (STR_SHARED_P(str)) {
485 str_make_independent(str);
488 if (!BARE_STRING_P(str)) {
494 RBASIC(str)->flags |= RSTRING_FSTR;
496 *key = *value = arg->fstr = str;
509 if (
FL_TEST(str, RSTRING_FSTR))
512 bare = BARE_STRING_P(str);
514 if (STR_EMBED_P(str)) {
519 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
526 rb_str_resize(str, RSTRING_LEN(str));
528 fstr = register_fstring(str,
false,
false);
531 str_replace_shared_without_enc(str, fstr);
539register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
543 .force_precompute_hash = force_precompute_hash
546#if SIZEOF_VOIDP == SIZEOF_LONG
550 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
556 st_table *frozen_strings = rb_vm_fstring_table();
559 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
560 }
while (UNDEF_P(args.fstr));
573setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
588 return (
VALUE)fake_str;
597 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
606rb_fstring_new(
const char *ptr,
long len)
609 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
false,
false);
616 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
false,
false);
620rb_fstring_cstr(
const char *
ptr)
622 return rb_fstring_new(
ptr, strlen(
ptr));
626fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
636 const char *aptr, *bptr;
639 return (alen != blen ||
641 memcmp(aptr, bptr, alen) != 0);
645single_byte_optimizable(
VALUE str)
649 case ENCINDEX_ASCII_8BIT:
650 case ENCINDEX_US_ASCII:
672static inline const char *
673search_nonascii(
const char *p,
const char *e)
675 const uintptr_t *s, *t;
677#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
678# if SIZEOF_UINTPTR_T == 8
679# define NONASCII_MASK UINT64_C(0x8080808080808080)
680# elif SIZEOF_UINTPTR_T == 4
681# define NONASCII_MASK UINT32_C(0x80808080)
683# error "don't know what to do."
686# if SIZEOF_UINTPTR_T == 8
687# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
688# elif SIZEOF_UINTPTR_T == 4
689# define NONASCII_MASK 0x80808080UL
691# error "don't know what to do."
695 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
696#if !UNALIGNED_WORD_ACCESS
697 if ((uintptr_t)p % SIZEOF_VOIDP) {
698 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
703 case 7:
if (p[-7]&0x80)
return p-7;
704 case 6:
if (p[-6]&0x80)
return p-6;
705 case 5:
if (p[-5]&0x80)
return p-5;
706 case 4:
if (p[-4]&0x80)
return p-4;
708 case 3:
if (p[-3]&0x80)
return p-3;
709 case 2:
if (p[-2]&0x80)
return p-2;
710 case 1:
if (p[-1]&0x80)
return p-1;
715#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
716#define aligned_ptr(value) \
717 __builtin_assume_aligned((value), sizeof(uintptr_t))
719#define aligned_ptr(value) (uintptr_t *)(value)
722 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
725 if (*s & NONASCII_MASK) {
726#ifdef WORDS_BIGENDIAN
727 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
729 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
739 case 7:
if (e[-7]&0x80)
return e-7;
740 case 6:
if (e[-6]&0x80)
return e-6;
741 case 5:
if (e[-5]&0x80)
return e-5;
742 case 4:
if (e[-4]&0x80)
return e-4;
744 case 3:
if (e[-3]&0x80)
return e-3;
745 case 2:
if (e[-2]&0x80)
return e-2;
746 case 1:
if (e[-1]&0x80)
return e-1;
754 const char *e = p +
len;
756 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
758 p = search_nonascii(p, e);
762 if (rb_enc_asciicompat(enc)) {
763 p = search_nonascii(p, e);
766 int ret = rb_enc_precise_mbclen(p, e, enc);
770 p = search_nonascii(p, e);
776 int ret = rb_enc_precise_mbclen(p, e, enc);
792 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
795 p = search_nonascii(p, e);
799 else if (rb_enc_asciicompat(enc)) {
800 p = search_nonascii(p, e);
806 int ret = rb_enc_precise_mbclen(p, e, enc);
813 p = search_nonascii(p, e);
819 int ret = rb_enc_precise_mbclen(p, e, enc);
844 rb_enc_set_index(str1, rb_enc_get_index(str2));
852rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
857 str_enc_copy(dest, src);
858 if (RSTRING_LEN(dest) == 0) {
859 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
870 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
871 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
882rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
884 str_enc_copy(dest, src);
891 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
897 return enc_coderange_scan(str, enc);
906 cr = enc_coderange_scan(str, get_encoding(str));
913rb_enc_str_asciicompat(
VALUE str)
916 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
924 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
933str_mod_check(
VALUE s,
const char *p,
long len)
935 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
941str_capacity(
VALUE str,
const int termlen)
943 if (STR_EMBED_P(str)) {
944 return str_embed_capa(str) - termlen;
946 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
950 return RSTRING(str)->as.heap.aux.capa;
957 return str_capacity(str, TERM_LEN(str));
961must_not_null(
const char *
ptr)
964 rb_raise(rb_eArgError,
"NULL pointer given");
971 size_t size = rb_str_embed_size(
capa);
975 NEWOBJ_OF(str,
struct RString, klass,
982str_alloc_heap(
VALUE klass)
984 NEWOBJ_OF(str,
struct RString, klass,
991empty_str_alloc(
VALUE klass)
993 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
994 VALUE str = str_alloc_embed(klass, 0);
995 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1006 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1010 enc = rb_ascii8bit_encoding();
1013 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1015 int termlen = rb_enc_mbminlen(enc);
1017 if (STR_EMBEDDABLE_P(
len, termlen)) {
1018 str = str_alloc_embed(klass,
len + termlen);
1024 str = str_alloc_heap(klass);
1030 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1033 rb_enc_raw_set(str, enc);
1036 memcpy(RSTRING_PTR(str),
ptr,
len);
1039 STR_SET_LEN(str,
len);
1040 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1047 return str_enc_new(klass,
ptr,
len, rb_ascii8bit_encoding());
1082 __msan_unpoison_string(
ptr);
1102 if (rb_enc_mbminlen(enc) != 1) {
1103 rb_raise(rb_eArgError,
"wchar encoding given");
1105 return rb_enc_str_new(
ptr, strlen(
ptr), enc);
1109str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1114 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1118 str = str_enc_new(klass,
ptr,
len, rb_enc_from_index(encindex));
1121 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1122 str = str_alloc_heap(klass);
1126 RBASIC(str)->flags |= STR_NOFREE;
1127 rb_enc_associate_index(str, encindex);
1156static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1158 int ecflags,
VALUE ecopts);
1163 int encidx = rb_enc_to_index(enc);
1164 if (rb_enc_get_index(str) == encidx)
1165 return is_ascii_string(str);
1176 if (!to)
return str;
1177 if (!from) from = rb_enc_get(str);
1178 if (from == to)
return str;
1179 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1180 rb_is_ascii8bit_enc(to)) {
1181 if (STR_ENC_GET(str) != to) {
1183 rb_enc_associate(str, to);
1190 from, to, ecflags, ecopts);
1191 if (
NIL_P(newstr)) {
1199rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1204 olen = RSTRING_LEN(newstr);
1205 if (ofs < -olen || olen < ofs)
1207 if (ofs < 0) ofs += olen;
1209 STR_SET_LEN(newstr, ofs);
1213 rb_str_modify(newstr);
1214 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1222 STR_SET_LEN(str, 0);
1223 rb_enc_associate(str, enc);
1229str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1231 int ecflags,
VALUE ecopts)
1236 VALUE econv_wrapper;
1237 const unsigned char *start, *sp;
1238 unsigned char *dest, *dp;
1239 size_t converted_output = (size_t)ofs;
1244 RBASIC_CLEAR_CLASS(econv_wrapper);
1246 if (!ec)
return Qnil;
1249 sp = (
unsigned char*)
ptr;
1251 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1252 (dp = dest + converted_output),
1256 size_t converted_input = sp - start;
1257 size_t rest =
len - converted_input;
1258 converted_output = dp - dest;
1260 if (converted_input && converted_output &&
1261 rest < (LONG_MAX / converted_output)) {
1262 rest = (rest * converted_output) / converted_input;
1267 olen += rest < 2 ? 2 : rest;
1268 rb_str_resize(newstr, olen);
1275 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1277 rb_enc_associate(newstr, to);
1296 const int eidx = rb_enc_to_index(eenc);
1299 return rb_enc_str_new(
ptr,
len, eenc);
1303 if ((eidx == rb_ascii8bit_encindex()) ||
1304 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1308 ienc = rb_default_internal_encoding();
1309 if (!ienc || eenc == ienc) {
1310 return rb_enc_str_new(
ptr,
len, eenc);
1314 if ((eidx == rb_ascii8bit_encindex()) ||
1315 (eidx == rb_usascii_encindex()) ||
1316 (rb_enc_asciicompat(eenc) && !search_nonascii(
ptr,
ptr +
len))) {
1317 return rb_enc_str_new(
ptr,
len, ienc);
1320 str = rb_enc_str_new(NULL, 0, ienc);
1323 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1324 rb_str_initialize(str,
ptr,
len, eenc);
1332 int eidx = rb_enc_to_index(eenc);
1333 if (eidx == rb_usascii_encindex() &&
1334 !is_ascii_string(str)) {
1335 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1338 rb_enc_associate_index(str, eidx);
1397str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1399 const int termlen = TERM_LEN(str);
1404 if (str_embed_capa(str2) >=
len + termlen) {
1405 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1406 STR_SET_EMBED(str2);
1407 memcpy(ptr2, RSTRING_PTR(str),
len);
1408 TERM_FILL(ptr2+
len, termlen);
1412 if (STR_SHARED_P(str)) {
1413 root =
RSTRING(str)->as.heap.aux.shared;
1422 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1424 rb_fatal(
"about to free a possible shared root");
1426 char *ptr2 = STR_HEAP_PTR(str2);
1428 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1431 FL_SET(str2, STR_NOEMBED);
1433 STR_SET_SHARED(str2, root);
1436 STR_SET_LEN(str2,
len);
1444 str_replace_shared_without_enc(str2, str);
1445 rb_enc_cr_str_exact_copy(str2, str);
1452 return str_replace_shared(str_alloc_heap(klass), str);
1469rb_str_new_frozen_String(
VALUE orig)
1476rb_str_tmp_frozen_acquire(
VALUE orig)
1479 return str_new_frozen_buffer(0, orig, FALSE);
1483rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1485 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1486 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1488 VALUE str = str_alloc_heap(0);
1491 FL_SET(str, STR_SHARED_ROOT);
1493 size_t capa = str_capacity(orig, TERM_LEN(orig));
1499 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1500 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1507 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1508 RBASIC(orig)->flags &= ~STR_NOFREE;
1509 STR_SET_SHARED(orig, str);
1519rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1524 if (STR_EMBED_P(tmp)) {
1533 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1537 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1538 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1543 STR_SET_LEN(tmp, 0);
1551 return str_new_frozen_buffer(klass, orig, TRUE);
1560 VALUE str = str_alloc_heap(klass);
1561 STR_SET_LEN(str, RSTRING_LEN(orig));
1562 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1563 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1564 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1565 RBASIC(orig)->flags &= ~STR_NOFREE;
1566 STR_SET_SHARED(orig, str);
1573str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1577 long len = RSTRING_LEN(orig);
1578 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1579 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1581 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1582 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1588 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1589 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1595 if ((ofs > 0) || (rest > 0) ||
1598 str = str_new_shared(klass,
shared);
1600 RSTRING(str)->as.heap.ptr += ofs;
1601 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1609 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1610 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1612 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1613 STR_SET_LEN(str, RSTRING_LEN(orig));
1618 str = heap_str_make_shared(klass, orig);
1622 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1634str_new_empty_String(
VALUE str)
1637 rb_enc_copy(v, str);
/* Lower bound for a requested buffer capacity: code later in this file
 * raises any `capa` smaller than this to STR_BUF_MIN_SIZE. */
1641#define STR_BUF_MIN_SIZE 63
1646 if (STR_EMBEDDABLE_P(
capa, 1)) {
1654 RSTRING(str)->as.heap.ptr[0] =
'\0';
1674 return str_new(0, 0,
len);
1680 if (STR_EMBED_P(str)) {
1681 RB_DEBUG_COUNTER_INC(obj_str_embed);
1683 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1684 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1685 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1688 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1689 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1694rb_str_memsize(
VALUE str)
1696 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1697 return STR_HEAP_SIZE(str);
1707 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1710static inline void str_discard(
VALUE str);
1711static void str_shared_replace(
VALUE str,
VALUE str2);
1716 if (str != str2) str_shared_replace(str, str2);
1727 enc = STR_ENC_GET(str2);
1730 termlen = rb_enc_mbminlen(enc);
1732 STR_SET_LEN(str, RSTRING_LEN(str2));
1734 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1736 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1737 rb_enc_associate(str, enc);
1741 if (STR_EMBED_P(str2)) {
1743 long len = RSTRING_LEN(str2);
1746 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1747 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1748 RSTRING(str2)->as.heap.ptr = new_ptr;
1749 STR_SET_LEN(str2,
len);
1751 STR_SET_NOEMBED(str2);
1754 STR_SET_NOEMBED(str);
1756 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1758 if (
FL_TEST(str2, STR_SHARED)) {
1760 STR_SET_SHARED(str,
shared);
1763 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1767 STR_SET_EMBED(str2);
1768 RSTRING_PTR(str2)[0] = 0;
1769 STR_SET_LEN(str2, 0);
1770 rb_enc_associate(str, enc);
1784 return rb_obj_as_string_result(str, obj);
1800 len = RSTRING_LEN(str2);
1801 if (STR_SHARED_P(str2)) {
1804 STR_SET_NOEMBED(str);
1805 STR_SET_LEN(str,
len);
1806 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1807 STR_SET_SHARED(str,
shared);
1808 rb_enc_cr_str_exact_copy(str, str2);
1811 str_replace_shared(str, str2);
1820 size_t size = rb_str_embed_size(
capa);
1824 NEWOBJ_OF(str,
struct RString, klass,
1833 NEWOBJ_OF(str,
struct RString, klass,
1844 encidx = rb_enc_get_index(str);
1845 flags &= ~ENCODING_MASK;
1848 if (encidx) rb_enc_associate_index(dup, encidx);
1858 long len = RSTRING_LEN(str);
1863 STR_SET_LEN(dup, RSTRING_LEN(str));
1864 return str_duplicate_setup_encoding(str, dup, flags);
1873 root =
RSTRING(str)->as.heap.aux.shared;
1875 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1876 root = str = str_new_frozen(klass, str);
1882 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1883 FL_SET(root, STR_SHARED_ROOT);
1885 flags |= RSTRING_NOEMBED | STR_SHARED;
1887 STR_SET_LEN(dup, RSTRING_LEN(str));
1888 return str_duplicate_setup_encoding(str, dup, flags);
1894 if (STR_EMBED_P(str)) {
1895 return str_duplicate_setup_embed(klass, str, dup);
1898 return str_duplicate_setup_heap(klass, str, dup);
1906 if (STR_EMBED_P(str)) {
1907 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1910 dup = str_alloc_heap(klass);
1913 return str_duplicate_setup(klass, str, dup);
1924rb_str_dup_m(
VALUE str)
1926 if (LIKELY(BARE_STRING_P(str))) {
1937 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1944 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1948 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1949 str_duplicate_setup_embed(klass, str, new_str);
1952 new_str = ec_str_alloc_heap(ec, klass);
1953 str_duplicate_setup_heap(klass, str, new_str);
1962rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
1964 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
1966 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1983 static ID keyword_ids[2];
1984 VALUE orig, opt, venc, vcapa;
1989 if (!keyword_ids[0]) {
1990 keyword_ids[0] = rb_id_encoding();
1991 CONST_ID(keyword_ids[1],
"capacity");
1999 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2000 enc = rb_to_encoding(venc);
2002 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2005 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2007 if (
capa < STR_BUF_MIN_SIZE) {
2008 capa = STR_BUF_MIN_SIZE;
2012 len = RSTRING_LEN(orig);
2016 if (orig == str) n = 0;
2018 str_modifiable(str);
2019 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2021 const size_t size = (size_t)
capa + termlen;
2022 const char *
const old_ptr = RSTRING_PTR(str);
2023 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2024 char *new_ptr =
ALLOC_N(
char, size);
2025 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2026 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2028 RSTRING(str)->as.heap.ptr = new_ptr;
2030 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2031 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2032 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2034 STR_SET_LEN(str,
len);
2037 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2038 rb_enc_cr_str_exact_copy(str, orig);
2040 FL_SET(str, STR_NOEMBED);
2047 rb_enc_associate(str, enc);
2059rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2065 static ID keyword_ids[2];
2075 keyword_ids[0] = rb_id_encoding();
2076 CONST_ID(keyword_ids[1],
"capacity");
2078 encoding = kwargs[0];
2079 capacity = kwargs[1];
2088 if (UNDEF_P(encoding)) {
2090 encoding = rb_obj_encoding(orig);
2094 if (!UNDEF_P(encoding)) {
2095 enc = rb_to_encoding(encoding);
2099 if (UNDEF_P(capacity)) {
2101 VALUE empty_str = str_new(klass,
"", 0);
2103 rb_enc_associate(empty_str, enc);
2107 VALUE copy = str_duplicate(klass, orig);
2108 rb_enc_associate(copy, enc);
2121 if (orig_capa >
capa) {
2126 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2127 STR_SET_LEN(str, 0);
/* True for every byte whose top two bits are not 10, i.e. every byte that is
 * not a UTF-8 continuation byte (ASCII bytes count as lead bytes too).
 * enc_strlen() increments its length counter once per such byte. */
2138#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2153static inline uintptr_t
2154count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2159 d = (d>>6) | (~d>>7);
2160 d &= NONASCII_MASK >> 7;
2163#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2165 return rb_popcount_intptr(d);
2169# if SIZEOF_VOIDP == 8
2178enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
2184 long diff = (long)(e - p);
2185 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2190 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2191 const uintptr_t *s, *t;
2192 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2193 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2194 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2195 while (p < (
const char *)s) {
2196 if (is_utf8_lead_byte(*p))
len++;
2200 len += count_utf8_lead_bytes_with_word(s);
2203 p = (
const char *)s;
2206 if (is_utf8_lead_byte(*p))
len++;
2212 else if (rb_enc_asciicompat(enc)) {
2217 q = search_nonascii(p, e);
2223 p += rb_enc_fast_mbclen(p, e, enc);
2230 q = search_nonascii(p, e);
2236 p += rb_enc_mbclen(p, e, enc);
2243 for (c=0; p<e; c++) {
2244 p += rb_enc_mbclen(p, e, enc);
2259rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2267 long diff = (long)(e - p);
2268 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270 else if (rb_enc_asciicompat(enc)) {
2274 q = search_nonascii(p, e);
2282 ret = rb_enc_precise_mbclen(p, e, enc);
2297 for (c=0; p<e; c++) {
2298 ret = rb_enc_precise_mbclen(p, e, enc);
2305 if (p + rb_enc_mbminlen(enc) <= e)
2306 p += rb_enc_mbminlen(enc);
2322 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2323 if (!enc) enc = STR_ENC_GET(str);
2324 p = RSTRING_PTR(str);
2329 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2334 return enc_strlen(p, e, enc, cr);
2341 return str_strlen(str, NULL);
2355 return LONG2NUM(str_strlen(str, NULL));
2367rb_str_bytesize(
VALUE str)
2385rb_str_empty(
VALUE str)
2387 return RBOOL(RSTRING_LEN(str) == 0);
2405 char *ptr1, *ptr2, *ptr3;
2410 enc = rb_enc_check_str(str1, str2);
2413 termlen = rb_enc_mbminlen(enc);
2414 if (len1 > LONG_MAX - len2) {
2415 rb_raise(rb_eArgError,
"string size too big");
2417 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2418 ptr3 = RSTRING_PTR(str3);
2419 memcpy(ptr3, ptr1, len1);
2420 memcpy(ptr3+len1, ptr2, len2);
2421 TERM_FILL(&ptr3[len1+len2], termlen);
2437 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2440 int enc1 = rb_enc_get_index(str1);
2441 int enc2 = rb_enc_get_index(str2);
2446 else if (enc2 < 0) {
2449 else if (enc1 != enc2) {
2452 else if (len1 > LONG_MAX - len2) {
2485 rb_enc_copy(str2, str);
2490 rb_raise(rb_eArgError,
"negative argument");
2492 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2493 if (STR_EMBEDDABLE_P(
len, 1)) {
2495 memset(RSTRING_PTR(str2), 0,
len + 1);
2502 STR_SET_LEN(str2,
len);
2503 rb_enc_copy(str2, str);
2506 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2507 rb_raise(rb_eArgError,
"argument too big");
2510 len *= RSTRING_LEN(str);
2511 termlen = TERM_LEN(str);
2513 ptr2 = RSTRING_PTR(str2);
2515 n = RSTRING_LEN(str);
2516 memcpy(ptr2, RSTRING_PTR(str), n);
2517 while (n <=
len/2) {
2518 memcpy(ptr2 + n, ptr2, n);
2521 memcpy(ptr2 + n, ptr2,
len-n);
2523 STR_SET_LEN(str2,
len);
2524 TERM_FILL(&ptr2[
len], termlen);
2525 rb_enc_cr_str_copy_for_substr(str2, str);
2551 VALUE tmp = rb_check_array_type(arg);
2560rb_check_lockedtmp(
VALUE str)
2562 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags tested by str_modifiable(): only when one of them is set does it go
 * on to its chilled-string, tmplock and frozen checks. */
2569#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2571str_modifiable(
VALUE str)
2573 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2574 if (CHILLED_STRING_P(str)) {
2575 CHILLED_STRING_MUTATED(str);
2577 rb_check_lockedtmp(str);
2578 rb_check_frozen(str);
2583str_dependent_p(
VALUE str)
2585 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_UNMODIFIABLE_MASK plus STR_SHARED and STR_NOFREE. str_independent()
 * calls str_modifiable() and str_dependent_p() only when one of these flags
 * is set on the string. */
2595#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2597str_independent(
VALUE str)
2599 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2600 str_modifiable(str);
2601 return !str_dependent_p(str);
2607str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2615 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2620 STR_SET_LEN(str,
len);
2625 oldptr = RSTRING_PTR(str);
2627 memcpy(
ptr, oldptr,
len);
2629 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2632 STR_SET_NOEMBED(str);
2633 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2634 TERM_FILL(
ptr +
len, termlen);
2636 STR_SET_LEN(str,
len);
2643 if (!str_independent(str))
2644 str_make_independent(str);
2651 int termlen = TERM_LEN(str);
2652 long len = RSTRING_LEN(str);
2655 rb_raise(rb_eArgError,
"negative expanding string size");
2657 if (expand >= LONG_MAX -
len) {
2658 rb_raise(rb_eArgError,
"string size too big");
2661 if (!str_independent(str)) {
2662 str_make_independent_expand(str,
len, expand, termlen);
2664 else if (expand > 0) {
2665 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2672str_modify_keep_cr(
VALUE str)
2674 if (!str_independent(str))
2675 str_make_independent(str);
2682str_discard(
VALUE str)
2684 str_modifiable(str);
2685 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2686 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2687 RSTRING(str)->as.heap.ptr = 0;
2688 STR_SET_LEN(str, 0);
2695 int encindex = rb_enc_get_index(str);
2697 if (RB_UNLIKELY(encindex == -1)) {
2701 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2706 if (!rb_enc_asciicompat(enc)) {
2726 return RSTRING_PTR(str);
2730zero_filled(
const char *s,
int n)
2732 for (; n > 0; --n) {
2739str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2741 const char *e = s +
len;
2743 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2744 if (zero_filled(s, minlen))
return s;
2750str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2755 if (str_dependent_p(str)) {
2756 if (!zero_filled(s +
len, termlen))
2757 str_make_independent_expand(str,
len, 0L, termlen);
2760 TERM_FILL(s +
len, termlen);
2763 return RSTRING_PTR(str);
2767rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2769 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2770 long len = RSTRING_LEN(str);
2774 rb_check_lockedtmp(str);
2775 str_make_independent_expand(str,
len, 0L, termlen);
2777 else if (str_dependent_p(str)) {
2778 if (termlen > oldtermlen)
2779 str_make_independent_expand(str,
len, 0L, termlen);
2782 if (!STR_EMBED_P(str)) {
2787 if (termlen > oldtermlen) {
2788 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2796str_null_check(
VALUE str,
int *w)
2798 char *s = RSTRING_PTR(str);
2799 long len = RSTRING_LEN(str);
2801 const int minlen = rb_enc_mbminlen(enc);
2805 if (str_null_char(s,
len, minlen, enc)) {
2808 return str_fill_term(str, s,
len, minlen);
2811 if (!s || memchr(s, 0,
len)) {
2815 s = str_fill_term(str, s,
len, minlen);
2821rb_str_to_cstr(
VALUE str)
2824 return str_null_check(str, &w);
2832 char *s = str_null_check(str, &w);
2835 rb_raise(rb_eArgError,
"string contains null char");
2837 rb_raise(rb_eArgError,
"string contains null byte");
2843rb_str_fill_terminator(
VALUE str,
const int newminlen)
2845 char *s = RSTRING_PTR(str);
2846 long len = RSTRING_LEN(str);
2847 return str_fill_term(str, s,
len, newminlen);
2853 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2877str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2886 else if (rb_enc_asciicompat(enc)) {
2887 const char *p2, *e2;
2890 while (p < e && 0 < nth) {
2897 p2 = search_nonascii(p, e2);
2906 n = rb_enc_mbclen(p, e, enc);
2917 while (p < e && nth--) {
2918 p += rb_enc_mbclen(p, e, enc);
2929 return str_nth_len(p, e, &nth, enc);
2933str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2938 p = str_nth_len(p, e, &nth, enc);
2947str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2949 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2950 if (!pp)
return e - p;
2957 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
2958 STR_ENC_GET(str), single_byte_optimizable(str));
2963str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2966 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2967 const uintptr_t *s, *t;
2968 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2969 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2970 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2971 while (p < (
const char *)s) {
2972 if (is_utf8_lead_byte(*p)) nth--;
2976 nth -= count_utf8_lead_bytes_with_word(s);
2978 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2982 if (is_utf8_lead_byte(*p)) {
2983 if (nth == 0)
break;
2993str_utf8_offset(
const char *p,
const char *e,
long nth)
2995 const char *pp = str_utf8_nth(p, e, &nth);
3004 if (single_byte_optimizable(str) || pos < 0)
3007 char *p = RSTRING_PTR(str);
3008 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3013str_subseq(
VALUE str,
long beg,
long len)
3021 const int termlen = TERM_LEN(str);
3022 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3029 if (str_embed_capa(str2) >=
len + termlen) {
3030 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3031 STR_SET_EMBED(str2);
3032 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3033 TERM_FILL(ptr2+
len, termlen);
3035 STR_SET_LEN(str2,
len);
3039 str_replace_shared(str2, str);
3042 RSTRING(str2)->as.heap.ptr += beg;
3043 if (RSTRING_LEN(str2) >
len) {
3044 STR_SET_LEN(str2,
len);
3054 VALUE str2 = str_subseq(str, beg,
len);
3055 rb_enc_cr_str_copy_for_substr(str2, str);
3064 const long blen = RSTRING_LEN(str);
3066 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3068 if (
len < 0)
return 0;
3069 if (beg < 0 && -beg < 0)
return 0;
3073 if (single_byte_optimizable(str)) {
3074 if (beg > blen)
return 0;
3077 if (beg < 0)
return 0;
3079 if (
len > blen - beg)
3081 if (
len < 0)
return 0;
3086 if (
len > -beg)
len = -beg;
3090 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3093 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3099 slen = str_strlen(str, enc);
3101 if (beg < 0)
return 0;
3103 if (
len == 0)
goto end;
3106 else if (beg > 0 && beg > blen) {
3110 if (beg > str_strlen(str, enc))
return 0;
3115 enc == rb_utf8_encoding()) {
3116 p = str_utf8_nth(s, e, &beg);
3117 if (beg > 0)
return 0;
3118 len = str_utf8_offset(p, e,
len);
3124 p = s + beg * char_sz;
3128 else if (
len * char_sz > e - p)
3133 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3134 if (beg > 0)
return 0;
3138 len = str_offset(p, e,
len, enc, 0);
3146static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3151 return str_substr(str, beg,
len, TRUE);
3161str_substr(
VALUE str,
long beg,
long len,
int empty)
3165 if (!p)
return Qnil;
3166 if (!
len && !empty)
return Qnil;
3168 beg = p - RSTRING_PTR(str);
3170 VALUE str2 = str_subseq(str, beg,
len);
3171 rb_enc_cr_str_copy_for_substr(str2, str);
3179 if (CHILLED_STRING_P(str)) {
3184 rb_str_resize(str, RSTRING_LEN(str));
3200 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3230str_uminus(
VALUE str)
3235 return rb_fstring(str);
3239#define rb_str_dup_frozen rb_str_new_frozen
3244 if (
FL_TEST(str, STR_TMPLOCK)) {
3247 FL_SET(str, STR_TMPLOCK);
3254 if (!
FL_TEST(str, STR_TMPLOCK)) {
3272 const int termlen = TERM_LEN(str);
3274 str_modifiable(str);
3275 if (STR_SHARED_P(str)) {
3278 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3279 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3290 else if (
len > RSTRING_LEN(str)) {
3294 const char *
const new_end = RSTRING_PTR(str) +
len;
3304 else if (
len < RSTRING_LEN(str)) {
3312 STR_SET_LEN(str,
len);
3313 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3320 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3323 int independent = str_independent(str);
3324 long slen = RSTRING_LEN(str);
3325 const int termlen = TERM_LEN(str);
3327 if (slen >
len || (termlen != 1 && slen <
len)) {
3333 if (STR_EMBED_P(str)) {
3334 if (
len == slen)
return str;
3335 if (str_embed_capa(str) >=
len + termlen) {
3336 STR_SET_LEN(str,
len);
3340 str_make_independent_expand(str, slen,
len - slen, termlen);
3342 else if (str_embed_capa(str) >=
len + termlen) {
3343 char *
ptr = STR_HEAP_PTR(str);
3345 if (slen >
len) slen =
len;
3348 STR_SET_LEN(str,
len);
3349 if (independent) ruby_xfree(
ptr);
3352 else if (!independent) {
3353 if (
len == slen)
return str;
3354 str_make_independent_expand(str, slen,
len - slen, termlen);
3358 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3359 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3362 else if (
len == slen)
return str;
3363 STR_SET_LEN(str,
len);
3370str_ensure_available_capa(
VALUE str,
long len)
3372 str_modify_keep_cr(str);
3374 const int termlen = TERM_LEN(str);
3375 long olen = RSTRING_LEN(str);
3377 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3378 rb_raise(rb_eArgError,
"string sizes too big");
3381 long total = olen +
len;
3382 long capa = str_capacity(str, termlen);
3385 if (total >= LONG_MAX / 2) {
3388 while (total >
capa) {
3391 RESIZE_CAPA_TERM(str,
capa, termlen);
3396str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3399 str_modify_keep_cr(str);
3404 if (
len == 0)
return 0;
3406 long total, olen,
off = -1;
3408 const int termlen = TERM_LEN(str);
3411 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3415 long capa = str_capacity(str, termlen);
3417 if (olen > LONG_MAX -
len) {
3418 rb_raise(rb_eArgError,
"string sizes too big");
3422 if (total >= LONG_MAX / 2) {
3425 while (total >
capa) {
3428 RESIZE_CAPA_TERM(str,
capa, termlen);
3429 sptr = RSTRING_PTR(str);
3434 memcpy(sptr + olen,
ptr,
len);
3435 STR_SET_LEN(str, total);
3436 TERM_FILL(sptr + total, termlen);
3441#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3442#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3447 if (
len == 0)
return str;
3449 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3451 return str_buf_cat(str,
ptr,
len);
3462rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3467 if (UNLIKELY(!str_independent(str))) {
3468 str_make_independent(str);
3471 long string_length = -1;
3472 const int null_terminator_length = 1;
3477 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3478 rb_raise(rb_eArgError,
"string sizes too big");
3481 long string_capacity = str_capacity(str, null_terminator_length);
3487 if (LIKELY(string_capacity >= string_length + 1)) {
3489 sptr[string_length] = byte;
3490 STR_SET_LEN(str, string_length + 1);
3491 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3495 str_buf_cat(str, (
char *)&
byte, 1);
3511 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3522rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3523 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3532 if (str_encindex == ptr_encindex) {
3534 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3538 str_enc = rb_enc_from_index(str_encindex);
3539 ptr_enc = rb_enc_from_index(ptr_encindex);
3540 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3543 if (RSTRING_LEN(str) == 0) {
3546 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3552 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3561 *ptr_cr_ret = ptr_cr;
3563 if (str_encindex != ptr_encindex &&
3566 str_enc = rb_enc_from_index(str_encindex);
3567 ptr_enc = rb_enc_from_index(ptr_encindex);
3572 res_encindex = str_encindex;
3577 res_encindex = str_encindex;
3581 res_encindex = ptr_encindex;
3586 res_encindex = str_encindex;
3593 res_encindex = str_encindex;
3599 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3601 str_buf_cat(str,
ptr,
len);
3607 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3614 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3624 if (rb_enc_asciicompat(enc)) {
3625 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3631 unsigned int c = (
unsigned char)*
ptr;
3632 int len = rb_enc_codelen(c, enc);
3633 rb_enc_mbcput(c, buf, enc);
3634 rb_enc_cr_str_buf_cat(str, buf,
len,
3647 if (str_enc_fastpath(str)) {
3651 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3657 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3668 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3684rb_str_concat_literals(
size_t num,
const VALUE *strary)
3688 unsigned long len = 1;
3693 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3695 str_enc_copy_direct(str, strary[0]);
3697 for (i = s; i < num; ++i) {
3698 const VALUE v = strary[i];
3702 if (encidx != ENCINDEX_US_ASCII) {
3704 rb_enc_set_index(str, encidx);
3729rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3731 str_modifiable(str);
3736 else if (argc > 1) {
3739 rb_enc_copy(arg_str, str);
3740 for (i = 0; i < argc; i++) {
3773rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3775 long needed_capacity = 0;
3779 for (
int index = 0; index < argc; index++) {
3780 VALUE obj = argv[index];
3788 needed_capacity += RSTRING_LEN(obj);
3793 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3800 str_ensure_available_capa(str, needed_capacity);
3803 for (
int index = 0; index < argc; index++) {
3804 VALUE obj = argv[index];
3809 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3810 char byte = (char)(
NUM2INT(obj) & 0xFF);
3824 rb_bug(
"append_as_bytes arguments should have been validated");
3828 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3829 TERM_FILL(sptr, TERM_LEN(str));
3834 for (
int index = 0; index < argc; index++) {
3835 VALUE obj = argv[index];
3852 rb_bug(
"append_as_bytes arguments should have been validated");
3926 if (rb_num_to_uint(str2, &code) == 0) {
3939 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3942 rb_str_buf_cat_byte(str1, (
unsigned char)code);
3945 long pos = RSTRING_LEN(str1);
3950 switch (
len = rb_enc_codelen(code, enc)) {
3951 case ONIGERR_INVALID_CODE_POINT_VALUE:
3952 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3954 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3960 rb_enc_mbcput(code, buf, enc);
3961 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3962 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3964 rb_str_resize(str1, pos+
len);
3965 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
3978rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
3980 int encidx = rb_enc_to_index(enc);
3982 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3987 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3988 return ENCINDEX_ASCII_8BIT;
4011rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4013 str_modifiable(str);
4018 else if (argc > 1) {
4021 rb_enc_copy(arg_str, str);
4022 for (i = 0; i < argc; i++) {
4035 st_index_t precomputed_hash;
4036 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4038 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4039 return precomputed_hash;
4042 return str_do_hash(str);
4049 const char *ptr1, *ptr2;
4052 return (len1 != len2 ||
4054 memcmp(ptr1, ptr2, len1) != 0);
4068rb_str_hash_m(
VALUE str)
4074#define lesser(a,b) (((a)>(b))?(b):(a))
4082 if (RSTRING_LEN(str1) == 0)
return TRUE;
4083 if (RSTRING_LEN(str2) == 0)
return TRUE;
4086 if (idx1 == idx2)
return TRUE;
4091 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4095 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4105 const char *ptr1, *ptr2;
4108 if (str1 == str2)
return 0;
4111 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4120 if (len1 > len2)
return 1;
4123 if (retval > 0)
return 1;
4150 if (str1 == str2)
return Qtrue;
4157 return rb_str_eql_internal(str1, str2);
4181 if (str1 == str2)
return Qtrue;
4183 return rb_str_eql_internal(str1, str2);
4214 return rb_invcmp(str1, str2);
4256 return str_casecmp(str1, s);
4264 const char *p1, *p1end, *p2, *p2end;
4266 enc = rb_enc_compatible(str1, str2);
4271 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4272 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4273 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4274 while (p1 < p1end && p2 < p2end) {
4276 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4277 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4279 return INT2FIX(c1 < c2 ? -1 : 1);
4286 while (p1 < p1end && p2 < p2end) {
4287 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4288 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4290 if (0 <= c1 && 0 <= c2) {
4294 return INT2FIX(c1 < c2 ? -1 : 1);
4298 l1 = rb_enc_mbclen(p1, p1end, enc);
4299 l2 = rb_enc_mbclen(p2, p2end, enc);
4300 len = l1 < l2 ? l1 : l2;
4301 r = memcmp(p1, p2,
len);
4303 return INT2FIX(r < 0 ? -1 : 1);
4305 return INT2FIX(l1 < l2 ? -1 : 1);
4311 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4312 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4346 return str_casecmp_p(str1, s);
4353 VALUE folded_str1, folded_str2;
4354 VALUE fold_opt = sym_fold;
4356 enc = rb_enc_compatible(str1, str2);
4361 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4362 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4364 return rb_str_eql(folded_str1, folded_str2);
4368strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4369 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
4371 const char *search_start = str_ptr;
4372 long pos, search_len = str_len - offset;
4376 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4377 if (pos < 0)
return pos;
4379 if (t == search_start + pos)
break;
4380 search_len -= t - search_start;
4381 if (search_len <= 0)
return -1;
4382 offset += t - search_start;
4385 return pos + offset;
4389#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4390#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4393rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4395 const char *str_ptr, *str_ptr_end, *sub_ptr;
4396 long str_len, sub_len;
4399 enc = rb_enc_check(str, sub);
4400 if (is_broken_string(sub))
return -1;
4402 str_ptr = RSTRING_PTR(str);
4404 str_len = RSTRING_LEN(str);
4405 sub_ptr = RSTRING_PTR(sub);
4406 sub_len = RSTRING_LEN(sub);
4408 if (str_len < sub_len)
return -1;
4411 long str_len_char, sub_len_char;
4412 int single_byte = single_byte_optimizable(str);
4413 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4414 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4416 offset += str_len_char;
4417 if (offset < 0)
return -1;
4419 if (str_len_char - offset < sub_len_char)
return -1;
4420 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4423 if (sub_len == 0)
return offset;
4426 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4440rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4447 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4448 long slen = str_strlen(str, enc);
4450 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4462 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4463 enc, single_byte_optimizable(str));
4474 pos = rb_str_index(str, sub, pos);
4488str_ensure_byte_pos(
VALUE str,
long pos)
4490 if (!single_byte_optimizable(str)) {
4491 const char *s = RSTRING_PTR(str);
4493 const char *p = s + pos;
4494 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4496 "offset %ld does not land on character boundary", pos);
4543rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4549 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4550 long slen = RSTRING_LEN(str);
4552 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4563 str_ensure_byte_pos(str, pos);
4575 pos = rb_str_byteindex(str, sub, pos);
4576 if (pos >= 0)
return LONG2NUM(pos);
4583memrchr(
const char *search_str,
int chr,
long search_len)
4585 const char *ptr = search_str + search_len;
4586 while (ptr > search_str) {
4587 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4597 char *hit, *adjusted;
4599 long slen, searchlen;
4602 sbeg = RSTRING_PTR(str);
4603 slen = RSTRING_LEN(sub);
4604 if (slen == 0)
return s - sbeg;
4606 t = RSTRING_PTR(sub);
4608 searchlen = s - sbeg + 1;
4610 if (memcmp(s, t, slen) == 0) {
4615 hit = memrchr(sbeg, c, searchlen);
4618 if (hit != adjusted) {
4619 searchlen = adjusted - sbeg;
4622 if (memcmp(hit, t, slen) == 0)
4624 searchlen = adjusted - sbeg;
4625 }
while (searchlen > 0);
4639 enc = rb_enc_check(str, sub);
4640 if (is_broken_string(sub))
return -1;
4641 singlebyte = single_byte_optimizable(str);
4642 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4643 slen = str_strlen(sub, enc);
4646 if (
len < slen)
return -1;
4647 if (
len - pos < slen) pos =
len - slen;
4648 if (
len == 0)
return pos;
4650 sbeg = RSTRING_PTR(str);
4653 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4659 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4660 return str_rindex(str, sub, s, enc);
4721rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4726 long pos,
len = str_strlen(str, enc);
4728 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4730 if (pos < 0 && (pos +=
len) < 0) {
4736 if (pos >
len) pos =
len;
4744 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4745 enc, single_byte_optimizable(str));
4756 pos = rb_str_rindex(str, sub, pos);
4766rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4772 enc = rb_enc_check(str, sub);
4773 if (is_broken_string(sub))
return -1;
4774 len = RSTRING_LEN(str);
4775 slen = RSTRING_LEN(sub);
4778 if (
len < slen)
return -1;
4779 if (
len - pos < slen) pos =
len - slen;
4780 if (
len == 0)
return pos;
4782 sbeg = RSTRING_PTR(str);
4785 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4792 return str_rindex(str, sub, s, enc);
4857rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4861 long pos,
len = RSTRING_LEN(str);
4863 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4865 if (pos < 0 && (pos +=
len) < 0) {
4871 if (pos >
len) pos =
len;
4877 str_ensure_byte_pos(str, pos);
4889 pos = rb_str_byterindex(str, sub, pos);
4890 if (pos >= 0)
return LONG2NUM(pos);
4926 switch (OBJ_BUILTIN_TYPE(y)) {
4978rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4985 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5017rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5021 re = get_pat(argv[0]);
5022 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5031static enum neighbor_char
5037 if (rb_enc_mbminlen(enc) > 1) {
5039 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5041 return NEIGHBOR_NOT_CHAR;
5043 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5045 if (!l)
return NEIGHBOR_NOT_CHAR;
5046 if (l !=
len)
return NEIGHBOR_WRAPPED;
5047 rb_enc_mbcput(c, p, enc);
5048 r = rb_enc_precise_mbclen(p, p +
len, enc);
5050 return NEIGHBOR_NOT_CHAR;
5052 return NEIGHBOR_FOUND;
5055 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5058 return NEIGHBOR_WRAPPED;
5059 ++((
unsigned char*)p)[i];
5060 l = rb_enc_precise_mbclen(p, p+
len, enc);
5064 return NEIGHBOR_FOUND;
5067 memset(p+l, 0xff,
len-l);
5073 for (len2 =
len-1; 0 < len2; len2--) {
5074 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5078 memset(p+len2+1, 0xff,
len-(len2+1));
5083static enum neighbor_char
5088 if (rb_enc_mbminlen(enc) > 1) {
5090 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5092 return NEIGHBOR_NOT_CHAR;
5094 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5095 if (!c)
return NEIGHBOR_NOT_CHAR;
5098 if (!l)
return NEIGHBOR_NOT_CHAR;
5099 if (l !=
len)
return NEIGHBOR_WRAPPED;
5100 rb_enc_mbcput(c, p, enc);
5101 r = rb_enc_precise_mbclen(p, p +
len, enc);
5103 return NEIGHBOR_NOT_CHAR;
5105 return NEIGHBOR_FOUND;
5108 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5111 return NEIGHBOR_WRAPPED;
5112 --((
unsigned char*)p)[i];
5113 l = rb_enc_precise_mbclen(p, p+
len, enc);
5117 return NEIGHBOR_FOUND;
5120 memset(p+l, 0,
len-l);
5126 for (len2 =
len-1; 0 < len2; len2--) {
5127 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5131 memset(p+len2+1, 0,
len-(len2+1));
5145static enum neighbor_char
5146enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
5148 enum neighbor_char ret;
5152 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5156 const int max_gaps = 1;
5158 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5160 ctype = ONIGENC_CTYPE_DIGIT;
5162 ctype = ONIGENC_CTYPE_ALPHA;
5164 return NEIGHBOR_NOT_CHAR;
5167 for (
try = 0;
try <= max_gaps; ++
try) {
5168 ret = enc_succ_char(p,
len, enc);
5169 if (ret == NEIGHBOR_FOUND) {
5170 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5172 return NEIGHBOR_FOUND;
5179 ret = enc_pred_char(p,
len, enc);
5180 if (ret == NEIGHBOR_FOUND) {
5181 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5194 return NEIGHBOR_NOT_CHAR;
5197 if (ctype != ONIGENC_CTYPE_DIGIT) {
5199 return NEIGHBOR_WRAPPED;
5203 enc_succ_char(carry,
len, enc);
5204 return NEIGHBOR_WRAPPED;
5272 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5273 rb_enc_cr_str_copy_for_substr(str, orig);
5274 return str_succ(str);
5281 char *sbeg, *s, *e, *last_alnum = 0;
5282 int found_alnum = 0;
5284 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5285 long carry_pos = 0, carry_len = 1;
5286 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5288 slen = RSTRING_LEN(str);
5289 if (slen == 0)
return str;
5291 enc = STR_ENC_GET(str);
5292 sbeg = RSTRING_PTR(str);
5293 s = e = sbeg + slen;
5295 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5296 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5302 l = rb_enc_precise_mbclen(s, e, enc);
5303 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5304 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5305 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5307 case NEIGHBOR_NOT_CHAR:
5309 case NEIGHBOR_FOUND:
5311 case NEIGHBOR_WRAPPED:
5316 carry_pos = s - sbeg;
5321 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5322 enum neighbor_char neighbor;
5323 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5324 l = rb_enc_precise_mbclen(s, e, enc);
5325 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5326 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5328 neighbor = enc_succ_char(tmp, l, enc);
5330 case NEIGHBOR_FOUND:
5334 case NEIGHBOR_WRAPPED:
5337 case NEIGHBOR_NOT_CHAR:
5340 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5342 enc_succ_char(s, l, enc);
5344 if (!rb_enc_asciicompat(enc)) {
5345 MEMCPY(carry, s,
char, l);
5348 carry_pos = s - sbeg;
5352 RESIZE_CAPA(str, slen + carry_len);
5353 sbeg = RSTRING_PTR(str);
5354 s = sbeg + carry_pos;
5355 memmove(s + carry_len, s, slen - carry_pos);
5356 memmove(s, carry, carry_len);
5358 STR_SET_LEN(str, slen);
5359 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5373rb_str_succ_bang(
VALUE str)
5381all_digits_p(
const char *s,
long len)
5435 VALUE end, exclusive;
5439 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5445 VALUE current, after_end;
5452 enc = rb_enc_check(beg, end);
5453 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5455 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5456 char c = RSTRING_PTR(beg)[0];
5457 char e = RSTRING_PTR(end)[0];
5459 if (c > e || (excl && c == e))
return beg;
5461 VALUE str = rb_enc_str_new(&c, 1, enc);
5463 if ((*each)(str, arg))
break;
5464 if (!excl && c == e)
break;
5466 if (excl && c == e)
break;
5471 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5472 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5473 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5478 b = rb_str_to_inum(beg, 10, FALSE);
5479 e = rb_str_to_inum(end, 10, FALSE);
5486 if (excl && bi == ei)
break;
5487 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5492 ID op = excl ?
'<' : idLE;
5493 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5498 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5499 b = rb_funcallv(b, succ, 0, 0);
5506 if (n > 0 || (excl && n == 0))
return beg;
5508 after_end = rb_funcallv(end, succ, 0, 0);
5513 next = rb_funcallv(current, succ, 0, 0);
5514 if ((*each)(current, arg))
break;
5515 if (
NIL_P(next))
break;
5519 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5534 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5535 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5536 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5538 b = rb_str_to_inum(beg, 10, FALSE);
5544 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5552 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5553 b = rb_funcallv(b, succ, 0, 0);
5559 VALUE next = rb_funcallv(current, succ, 0, 0);
5560 if ((*each)(current, arg))
break;
5563 if (RSTRING_LEN(current) == 0)
5574 if (!
rb_equal(str, *argp))
return 0;
5588 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5589 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5590 rb_enc_asciicompat(STR_ENC_GET(val))) {
5591 const char *bp = RSTRING_PTR(beg);
5592 const char *ep = RSTRING_PTR(end);
5593 const char *vp = RSTRING_PTR(val);
5594 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5595 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5603 if (b <= v && v < e)
return Qtrue;
5604 return RBOOL(!
RTEST(exclusive) && v == e);
5611 all_digits_p(bp, RSTRING_LEN(beg)) &&
5612 all_digits_p(ep, RSTRING_LEN(end))) {
5617 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5619 return RBOOL(
NIL_P(val));
5642 return rb_str_subpat(str, indx,
INT2FIX(0));
5645 if (rb_str_index(str, indx, 0) != -1)
5651 long beg,
len = str_strlen(str, NULL);
5663 return str_substr(str, idx, 1, FALSE);
5682rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5686 return rb_str_subpat(str, argv[0], argv[1]);
5689 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5693 return rb_str_aref(str, argv[0]);
5699 char *ptr = RSTRING_PTR(str);
5700 long olen = RSTRING_LEN(str), nlen;
5702 str_modifiable(str);
5703 if (
len > olen)
len = olen;
5705 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5707 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5709 ptr =
RSTRING(str)->as.embed.ary;
5710 memmove(ptr, oldptr +
len, nlen);
5711 if (fl == STR_NOEMBED)
xfree(oldptr);
5714 if (!STR_SHARED_P(str)) {
5716 rb_enc_cr_str_exact_copy(shared, str);
5721 STR_SET_LEN(str, nlen);
5723 if (!SHARABLE_MIDDLE_SUBSTRING) {
5724 TERM_FILL(ptr + nlen, TERM_LEN(str));
5731rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5737 if (beg == 0 && vlen == 0) {
5742 str_modify_keep_cr(str);
5746 RESIZE_CAPA(str, slen + vlen -
len);
5747 sptr = RSTRING_PTR(str);
5756 memmove(sptr + beg + vlen,
5758 slen - (beg +
len));
5760 if (vlen < beg &&
len < 0) {
5764 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5767 STR_SET_LEN(str, slen);
5768 TERM_FILL(&sptr[slen], TERM_LEN(str));
5775 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5784 int singlebyte = single_byte_optimizable(str);
5790 enc = rb_enc_check(str, val);
5791 slen = str_strlen(str, enc);
5793 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5802 if (
len > slen - beg) {
5805 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5810 beg = p - RSTRING_PTR(str);
5812 rb_str_update_0(str, beg,
len, val);
5813 rb_enc_associate(str, enc);
5824 long start, end,
len;
5834 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5838 nth += regs->num_regs;
5848 enc = rb_enc_check_str(str, val);
5849 rb_str_update_0(str, start,
len, val);
5850 rb_enc_associate(str, enc);
5858 switch (
TYPE(indx)) {
5860 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5864 beg = rb_str_index(str, indx, 0);
5918rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5922 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5930 return rb_str_aset(str, argv[0], argv[1]);
5990rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
5998 str_modify_keep_cr(str);
6006 if ((nth += regs->num_regs) <= 0)
return Qnil;
6008 else if (nth >= regs->num_regs)
return Qnil;
6010 len = END(nth) - beg;
6013 else if (argc == 2) {
6022 beg = p - RSTRING_PTR(str);
6026 beg = rb_str_index(str, indx, 0);
6027 if (beg == -1)
return Qnil;
6028 len = RSTRING_LEN(indx);
6040 beg = p - RSTRING_PTR(str);
6049 beg = p - RSTRING_PTR(str);
6053 rb_enc_cr_str_copy_for_substr(result, str);
6061 char *sptr = RSTRING_PTR(str);
6062 long slen = RSTRING_LEN(str);
6063 if (beg +
len > slen)
6067 slen - (beg +
len));
6069 STR_SET_LEN(str, slen);
6070 TERM_FILL(&sptr[slen], TERM_LEN(str));
6081 switch (OBJ_BUILTIN_TYPE(pat)) {
6100get_pat_quoted(
VALUE pat,
int check)
6104 switch (OBJ_BUILTIN_TYPE(pat)) {
6118 if (check && is_broken_string(pat)) {
6125rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6128 pos = rb_str_byteindex(str, pat, pos);
6129 if (set_backref_str) {
6131 str = rb_str_new_frozen_String(str);
6132 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6141 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6161rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6175 hash = rb_check_hash_type(argv[1]);
6181 pat = get_pat_quoted(argv[0], 1);
6183 str_modifiable(str);
6184 beg = rb_pat_search(pat, str, 0, 1);
6198 end0 = beg0 + RSTRING_LEN(pat);
6207 if (iter || !
NIL_P(hash)) {
6208 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6214 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6217 str_mod_check(str, p,
len);
6218 rb_check_frozen(str);
6224 enc = rb_enc_compatible(str, repl);
6227 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6231 rb_enc_inspect_name(str_enc),
6232 rb_enc_inspect_name(STR_ENC_GET(repl)));
6234 enc = STR_ENC_GET(repl);
6237 rb_enc_associate(str, enc);
6247 rlen = RSTRING_LEN(repl);
6248 len = RSTRING_LEN(str);
6250 RESIZE_CAPA(str,
len + rlen - plen);
6252 p = RSTRING_PTR(str);
6254 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6256 rp = RSTRING_PTR(repl);
6257 memmove(p + beg0, rp, rlen);
6259 STR_SET_LEN(str,
len);
6260 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6289 rb_str_sub_bang(argc, argv, str);
6294str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6297 long beg, beg0, end0;
6298 long offset, blen, slen,
len, last;
6299 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6301 int need_backref = -1;
6311 hash = rb_check_hash_type(argv[1]);
6315 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6323 rb_error_arity(argc, 1, 2);
6326 pat = get_pat_quoted(argv[0], 1);
6327 beg = rb_pat_search(pat, str, 0, need_backref);
6329 if (bang)
return Qnil;
6334 blen = RSTRING_LEN(str) + 30;
6336 sp = RSTRING_PTR(str);
6337 slen = RSTRING_LEN(str);
6339 str_enc = STR_ENC_GET(str);
6340 rb_enc_associate(dest, str_enc);
6348 end0 = beg0 + RSTRING_LEN(pat);
6364 if (mode == FAST_MAP) {
6373 val = rb_hash_aref(hash, key);
6376 str_mod_check(str, sp, slen);
6381 else if (need_backref) {
6383 if (need_backref < 0) {
6384 need_backref = val != repl;
6391 len = beg0 - offset;
6405 if (RSTRING_LEN(str) <= end0)
break;
6406 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6408 offset = end0 +
len;
6410 cp = RSTRING_PTR(str) + offset;
6411 if (offset > RSTRING_LEN(str))
break;
6412 beg = rb_pat_search(pat, str, offset, need_backref);
6416 if (RSTRING_LEN(str) > offset) {
6419 rb_pat_search(pat, str, last, 1);
6421 str_shared_replace(str, dest);
6449rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6451 str_modify_keep_cr(str);
6452 return str_gsub(argc, argv, str, 1);
6475 return str_gsub(argc, argv, str, 0);
6493 str_modifiable(str);
6494 if (str == str2)
return str;
6498 return str_replace(str, str2);
6513rb_str_clear(
VALUE str)
6517 STR_SET_LEN(str, 0);
6518 RSTRING_PTR(str)[0] = 0;
6519 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6538rb_str_chr(
VALUE str)
6562 pos += RSTRING_LEN(str);
6563 if (pos < 0 || RSTRING_LEN(str) <= pos)
6566 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6585 long len = RSTRING_LEN(str);
6586 char *
ptr, *head, *left = 0;
6590 if (pos < -
len ||
len <= pos)
6597 char byte = (char)(
NUM2INT(w) & 0xFF);
6599 if (!str_independent(str))
6600 str_make_independent(str);
6601 enc = STR_ENC_GET(str);
6602 head = RSTRING_PTR(str);
6604 if (!STR_EMBED_P(str)) {
6611 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6619 width = rb_enc_precise_mbclen(left, head+
len, enc);
6621 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6637str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6639 long n = RSTRING_LEN(str);
6641 if (beg > n ||
len < 0)
return Qnil;
6644 if (beg < 0)
return Qnil;
6649 if (!empty)
return Qnil;
6653 VALUE str2 = str_subseq(str, beg,
len);
6655 str_enc_copy_direct(str2, str);
6657 if (RSTRING_LEN(str2) == 0) {
6658 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6692 long beg,
len = RSTRING_LEN(str);
6700 return str_byte_substr(str, beg,
len, TRUE);
6705 return str_byte_substr(str, idx, 1, FALSE);
6752rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6757 return str_byte_substr(str, beg,
len, TRUE);
6760 return str_byte_aref(str, argv[0]);
6764str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6766 long end, slen = RSTRING_LEN(str);
6769 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6778 if (*
len > slen - *beg) {
6782 str_ensure_byte_pos(str, *beg);
6783 str_ensure_byte_pos(str, end);
6808rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6810 long beg,
len, vbeg, vlen;
6815 if (!(argc == 2 || argc == 3 || argc == 5)) {
6816 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6820 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6821 rb_builtin_class_name(argv[0]));
6828 vlen = RSTRING_LEN(val);
6833 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6834 rb_builtin_class_name(argv[2]));
6846 vlen = RSTRING_LEN(val);
6854 str_check_beg_len(str, &beg, &
len);
6855 str_check_beg_len(val, &vbeg, &vlen);
6856 str_modify_keep_cr(str);
6859 rb_enc_associate(str, rb_enc_check(str, val));
6862 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6880rb_str_reverse(
VALUE str)
6887 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6888 enc = STR_ENC_GET(str);
6894 if (RSTRING_LEN(str) > 1) {
6895 if (single_byte_optimizable(str)) {
6902 int clen = rb_enc_fast_mbclen(s, e, enc);
6910 cr = rb_enc_asciicompat(enc) ?
6913 int clen = rb_enc_mbclen(s, e, enc);
6922 STR_SET_LEN(rev, RSTRING_LEN(str));
6923 str_enc_copy_direct(rev, str);
6943rb_str_reverse_bang(
VALUE str)
6945 if (RSTRING_LEN(str) > 1) {
6946 if (single_byte_optimizable(str)) {
6949 str_modify_keep_cr(str);
6950 s = RSTRING_PTR(str);
6959 str_shared_replace(str, rb_str_reverse(str));
6963 str_modify_keep_cr(str);
6988 i = rb_str_index(str, arg, 0);
6990 return RBOOL(i != -1);
7032 rb_raise(rb_eArgError,
"invalid radix %d", base);
7034 return rb_str_to_inum(str, base, FALSE);
7058rb_str_to_f(
VALUE str)
7073rb_str_to_s(
VALUE str)
7085 char s[RUBY_MAX_CHAR_LEN];
7086 int n = rb_enc_codelen(c, enc);
7088 rb_enc_mbcput(c, s, enc);
/* Size limit handed to snprintf() when formatting one escaped character.
 * The buffers used with it in this file are declared with
 * CHAR_ESC_LEN + 1 bytes.  The longest format used with it, "\\x{%X}",
 * needs 12 characters plus the terminating NUL for an 8-hex-digit value. */
7093#define CHAR_ESC_LEN 13
/* Formats the code point `c` as a textual escape into a local buffer.
 * The visible branches write either the character itself ("%c"),
 * "\\u%04X" when c < 0x10000, "\\u{%X}", "\\x%02X" or "\\x{%X}".
 * Every snprintf() is bounded by CHAR_ESC_LEN, and the length of the
 * formatted text is then measured with strlen().
 * NOTE(review): the conditions that choose between the "\\u" and "\\x"
 * forms (presumably driven by unicode_p) and the code that appends the
 * buffer to `result` are not visible in this excerpt -- confirm against
 * the full function. */
7096rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7098 char buf[CHAR_ESC_LEN + 1];
7106 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7108 else if (c < 0x10000) {
7109 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7112 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7117 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7120 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7123 l = (int)strlen(buf);
/* Maps a control character to the string literal spelling its backslash
 * escape.  The visible cases cover NUL, newline, carriage return, tab,
 * form feed, vertical tab (013), backspace (010), bell (007),
 * escape (033) and DEL (0x7f, rendered as "\\c?").
 * NOTE(review): the result for any other value of `c` is not visible in
 * this excerpt. */
7129ruby_escaped_char(
int c)
7132 case '\0':
return "\\0";
7133 case '\n':
return "\\n";
7134 case '\r':
return "\\r";
7135 case '\t':
return "\\t";
7136 case '\f':
return "\\f";
7137 case '\013':
return "\\v";
7138 case '\010':
return "\\b";
7139 case '\007':
return "\\a";
7140 case '\033':
return "\\e";
7141 case '\x7f':
return "\\c?";
7147rb_str_escape(
VALUE str)
7151 const char *p = RSTRING_PTR(str);
7153 const char *prev = p;
7154 char buf[CHAR_ESC_LEN + 1];
7156 int unicode_p = rb_enc_unicode_p(enc);
7157 int asciicompat = rb_enc_asciicompat(enc);
7162 int n = rb_enc_precise_mbclen(p, pend, enc);
7164 if (p > prev) str_buf_cat(result, prev, p - prev);
7165 n = rb_enc_mbminlen(enc);
7167 n = (int)(pend - p);
7169 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7170 str_buf_cat(result, buf, strlen(buf));
7176 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7178 cc = ruby_escaped_char(c);
7180 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7181 str_buf_cat(result, cc, strlen(cc));
7184 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7187 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7188 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7192 if (p > prev) str_buf_cat(result, prev, p - prev);
7216 const char *p, *pend, *prev;
7217 char buf[CHAR_ESC_LEN + 1];
7219 rb_encoding *resenc = rb_default_internal_encoding();
7220 int unicode_p = rb_enc_unicode_p(enc);
7221 int asciicompat = rb_enc_asciicompat(enc);
7223 if (resenc == NULL) resenc = rb_default_external_encoding();
7224 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7225 rb_enc_associate(result, resenc);
7226 str_buf_cat2(result,
"\"");
7234 n = rb_enc_precise_mbclen(p, pend, enc);
7236 if (p > prev) str_buf_cat(result, prev, p - prev);
7237 n = rb_enc_mbminlen(enc);
7239 n = (int)(pend - p);
7241 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7242 str_buf_cat(result, buf, strlen(buf));
7248 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7250 if ((asciicompat || unicode_p) &&
7251 (c ==
'"'|| c ==
'\\' ||
7256 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7257 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7258 str_buf_cat2(result,
"\\");
7259 if (asciicompat || enc == resenc) {
7265 case '\n': cc =
'n';
break;
7266 case '\r': cc =
'r';
break;
7267 case '\t': cc =
't';
break;
7268 case '\f': cc =
'f';
break;
7269 case '\013': cc =
'v';
break;
7270 case '\010': cc =
'b';
break;
7271 case '\007': cc =
'a';
break;
7272 case 033: cc =
'e';
break;
7273 default: cc = 0;
break;
7276 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7279 str_buf_cat(result, buf, 2);
7292 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7296 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7297 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7302 if (p > prev) str_buf_cat(result, prev, p - prev);
7303 str_buf_cat2(result,
"\"");
/* True when p has not reached e and the byte at p is '$', '@' or '{'.
 * The dump code in this file applies it to the position right after a '#'
 * to decide whether that '#' needs a backslash in front of it.
 * Both arguments are evaluated more than once, so pass side-effect-free
 * expressions. */
7308#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7328 int encidx = rb_enc_get_index(str);
7331 const char *p, *pend;
7334 int u8 = (encidx == rb_utf8_encindex());
7335 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7338 if (!rb_enc_asciicompat(enc)) {
7340 len += strlen(enc->name);
7343 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7346 unsigned char c = *p++;
7349 case '"':
case '\\':
7350 case '\n':
case '\r':
7351 case '\t':
case '\f':
7352 case '\013':
case '\010':
case '\007':
case '\033':
7357 clen = IS_EVSTR(p, pend) ? 2 : 1;
7365 if (u8 && c > 0x7F) {
7366 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7368 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7371 else if (cc <= 0xFFFFF)
7384 if (clen > LONG_MAX -
len) {
7391 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7392 q = RSTRING_PTR(result); qend = q +
len + 1;
7396 unsigned char c = *p++;
7398 if (c ==
'"' || c ==
'\\') {
7402 else if (c ==
'#') {
7403 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7406 else if (c ==
'\n') {
7410 else if (c ==
'\r') {
7414 else if (c ==
'\t') {
7418 else if (c ==
'\f') {
7422 else if (c ==
'\013') {
7426 else if (c ==
'\010') {
7430 else if (c ==
'\007') {
7434 else if (c ==
'\033') {
7444 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7446 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7449 snprintf(q, qend-q,
"u%04X", cc);
7451 snprintf(q, qend-q,
"u{%X}", cc);
7456 snprintf(q, qend-q,
"x%02X", c);
7462 if (!rb_enc_asciicompat(enc)) {
7463 snprintf(q, qend-q, nonascii_suffix, enc->name);
7464 encidx = rb_ascii8bit_encindex();
7467 rb_enc_associate_index(result, encidx);
7473unescape_ascii(
unsigned int c)
7497undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
7499 const char *s = *ss;
7503 unsigned char buf[6];
7521 *buf = unescape_ascii(*s);
7533 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7534 if (*penc != enc_utf8) {
7536 rb_enc_associate(undumped, enc_utf8);
7553 if (hexlen == 0 || hexlen > 6) {
7559 if (0xd800 <= c && c <= 0xdfff) {
7562 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7572 if (0xd800 <= c && c <= 0xdfff) {
7575 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7603static VALUE rb_str_is_ascii_only_p(
VALUE str);
7621str_undump(
VALUE str)
7623 const char *s = RSTRING_PTR(str);
7626 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7628 bool binary =
false;
7632 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7635 if (!str_null_check(str, &w)) {
7638 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7639 if (*s !=
'"')
goto invalid_format;
7657 static const char force_encoding_suffix[] =
".force_encoding(\"";
7658 static const char dup_suffix[] =
".dup";
7659 const char *encname;
7664 size =
sizeof(dup_suffix) - 1;
7665 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7667 size =
sizeof(force_encoding_suffix) - 1;
7668 if (s_end - s <= size)
goto invalid_format;
7669 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7673 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7677 s = memchr(s,
'"', s_end-s);
7679 if (!s)
goto invalid_format;
7680 if (s_end - s != 2)
goto invalid_format;
7681 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7683 encidx = rb_enc_find_index2(encname, (
long)size);
7687 rb_enc_associate_index(undumped, encidx);
7697 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7708 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7714 if (rb_enc_dummy_p(enc)) {
7721str_true_enc(
VALUE str)
7724 rb_str_check_dummy_enc(enc);
/* Merges the case-mapping option symbols found in argv into `flags`.
 *
 * Visible handling:
 *   - sym_turkic and sym_lithuanian set ONIGENC_CASE_FOLD_TURKISH_AZERI
 *     and ONIGENC_CASE_FOLD_LITHUANIAN; the two may be combined in
 *     either order, and any other second option raises ArgumentError
 *     ("invalid second option").
 *   - sym_ascii sets ONIGENC_CASE_ASCII_ONLY.
 *   - sym_fold is accepted only when `flags` requests downcasing without
 *     upcasing; otherwise ArgumentError is raised.
 *   - anything else raises ArgumentError ("invalid option"), and excess
 *     arguments raise "too many options".
 *
 * NOTE(review): the argc checks guarding each branch and the return
 * statement are not visible in this excerpt; callers assign the result
 * back to their own flags variable, so it presumably returns the updated
 * flags -- confirm. */
7728static OnigCaseFoldType
7729check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7734 rb_raise(rb_eArgError,
"too many options");
7735 if (argv[0]==sym_turkic) {
7736 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7738 if (argv[1]==sym_lithuanian)
7739 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7741 rb_raise(rb_eArgError,
"invalid second option");
7744 else if (argv[0]==sym_lithuanian) {
7745 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7747 if (argv[1]==sym_turkic)
7748 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7750 rb_raise(rb_eArgError,
"invalid second option");
7754 rb_raise(rb_eArgError,
"too many options");
7755 else if (argv[0]==sym_ascii)
7756 flags |= ONIGENC_CASE_ASCII_ONLY;
7757 else if (argv[0]==sym_fold) {
7758 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
/* DOWNCASE is known to be set here, so the XOR clears it while toggling
 * the FOLD bit. */
7759 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7761 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7764 rb_raise(rb_eArgError,
"invalid option");
7771 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/* Fixed number of extra bytes added to the capacity of every mapping
 * buffer that the case-mapping loop in this file allocates. */
7777#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Diagnostics switch: defaults to 0 unless defined beforehand.  When it
 * is nonzero the case-mapping code prints buffer statistics to stderr. */
7778#ifndef CASEMAP_DEBUG
7779# define CASEMAP_DEBUG 0
7787 OnigUChar space[FLEX_ARY_LEN];
/* Releases a chain of mapping buffers linked through their `next` field.
 * Each node's successor is read before the node itself is freed, so the
 * walk never touches freed memory.
 * NOTE(review): the size given to ruby_sized_xfree() is the node's `capa`
 * field alone; the allocation site is not visible in this excerpt --
 * confirm that this matches the size that was actually allocated. */
7791mapping_buffer_free(
void *p)
7795 while (current_buffer) {
7796 previous_buffer = current_buffer;
7797 current_buffer = current_buffer->next;
7798 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7804 {0, mapping_buffer_free,},
7805 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7813 const OnigUChar *source_current, *source_end;
7814 int target_length = 0;
7815 VALUE buffer_anchor;
7818 size_t buffer_count = 0;
7819 int buffer_length_or_invalid;
7821 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7823 source_current = (OnigUChar*)RSTRING_PTR(source);
7828 while (source_current < source_end) {
7830 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7831 if (CASEMAP_DEBUG) {
7832 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7835 *pre_buffer = current_buffer;
/* Point pre_buffer at the new node's `next` field so that the following
 * iteration links its buffer after this one. */
7836 pre_buffer = &current_buffer->next;
7837 current_buffer->next = NULL;
7838 current_buffer->capa =
capa;
7839 buffer_length_or_invalid = enc->case_map(flags,
7840 &source_current, source_end,
7841 current_buffer->space,
7842 current_buffer->space+current_buffer->capa,
7844 if (buffer_length_or_invalid < 0) {
7845 current_buffer =
DATA_PTR(buffer_anchor);
7847 mapping_buffer_free(current_buffer);
7848 rb_raise(rb_eArgError,
"input string invalid");
7850 target_length += current_buffer->used = buffer_length_or_invalid;
7852 if (CASEMAP_DEBUG) {
7853 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7856 if (buffer_count==1) {
7857 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7860 char *target_current;
7863 target_current = RSTRING_PTR(target);
7864 current_buffer =
DATA_PTR(buffer_anchor);
7865 while (current_buffer) {
7866 memcpy(target_current, current_buffer->space, current_buffer->used);
7867 target_current += current_buffer->used;
7868 current_buffer = current_buffer->next;
7871 current_buffer =
DATA_PTR(buffer_anchor);
7873 mapping_buffer_free(current_buffer);
7878 str_enc_copy_direct(target, source);
7887 const OnigUChar *source_current, *source_end;
7888 OnigUChar *target_current, *target_end;
7889 long old_length = RSTRING_LEN(source);
7890 int length_or_invalid;
7892 if (old_length == 0)
return Qnil;
7894 source_current = (OnigUChar*)RSTRING_PTR(source);
7896 if (source == target) {
7897 target_current = (OnigUChar*)source_current;
7898 target_end = (OnigUChar*)source_end;
7901 target_current = (OnigUChar*)RSTRING_PTR(target);
7905 length_or_invalid = onigenc_ascii_only_case_map(flags,
7906 &source_current, source_end,
7907 target_current, target_end, enc);
7908 if (length_or_invalid < 0)
7909 rb_raise(rb_eArgError,
"input string invalid");
7910 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7911 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7912 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7913 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7914 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7917 str_enc_copy(target, source);
/* Upcases the bytes of `str` in place, from RSTRING_PTR to RSTRING_END.
 * A byte in the range 'a'..'z' is replaced by the corresponding
 * 'A'..'Z' letter.
 * NOTE(review): the loop header, the update of `modified` and the return
 * statement are not visible in this excerpt; callers test the result to
 * decide whether to set ONIGENC_CASE_MODIFIED, so it presumably reports
 * whether any byte changed -- confirm. */
7923upcase_single(
VALUE str)
7925 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7926 bool modified =
false;
/* Read through unsigned char so bytes >= 0x80 never compare as negative. */
7929 unsigned int c = *(
unsigned char*)s;
7931 if (
'a' <= c && c <=
'z') {
7932 *s =
'A' + (c -
'a');
7960rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7963 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7965 flags = check_case_options(argc, argv, flags);
7966 str_modify_keep_cr(str);
7967 enc = str_true_enc(str);
7968 if (case_option_single_p(flags, enc, str)) {
7969 if (upcase_single(str))
7970 flags |= ONIGENC_CASE_MODIFIED;
7972 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7973 rb_str_ascii_casemap(str, str, &flags, enc);
7975 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7977 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7999rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
8002 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8005 flags = check_case_options(argc, argv, flags);
8006 enc = str_true_enc(str);
8007 if (case_option_single_p(flags, enc, str)) {
8008 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8009 str_enc_copy_direct(ret, str);
8012 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8014 rb_str_ascii_casemap(str, ret, &flags, enc);
8017 ret = rb_str_casemap(str, &flags, enc);
/* Downcases the bytes of `str` in place, from RSTRING_PTR to RSTRING_END.
 * A byte in the range 'A'..'Z' is replaced by the corresponding
 * 'a'..'z' letter.
 * NOTE(review): the loop header, the update of `modified` and the return
 * statement are not visible in this excerpt; one caller tests the result
 * to decide whether to set ONIGENC_CASE_MODIFIED, so it presumably
 * reports whether any byte changed -- confirm. */
8024downcase_single(
VALUE str)
8026 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8027 bool modified =
false;
/* Read through unsigned char so bytes >= 0x80 never compare as negative. */
8030 unsigned int c = *(
unsigned char*)s;
8032 if (
'A' <= c && c <=
'Z') {
8033 *s =
'a' + (c -
'A');
8062rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8065 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8067 flags = check_case_options(argc, argv, flags);
8068 str_modify_keep_cr(str);
8069 enc = str_true_enc(str);
8070 if (case_option_single_p(flags, enc, str)) {
8071 if (downcase_single(str))
8072 flags |= ONIGENC_CASE_MODIFIED;
8074 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8075 rb_str_ascii_casemap(str, str, &flags, enc);
8077 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8079 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8101rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8104 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8107 flags = check_case_options(argc, argv, flags);
8108 enc = str_true_enc(str);
8109 if (case_option_single_p(flags, enc, str)) {
8110 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8111 str_enc_copy_direct(ret, str);
8112 downcase_single(ret);
8114 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8116 rb_str_ascii_casemap(str, ret, &flags, enc);
8119 ret = rb_str_casemap(str, &flags, enc);
8147rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8150 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8152 flags = check_case_options(argc, argv, flags);
8153 str_modify_keep_cr(str);
8154 enc = str_true_enc(str);
8155 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8156 if (flags&ONIGENC_CASE_ASCII_ONLY)
8157 rb_str_ascii_casemap(str, str, &flags, enc);
8159 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8161 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8185rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8188 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8191 flags = check_case_options(argc, argv, flags);
8192 enc = str_true_enc(str);
8193 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8194 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8196 rb_str_ascii_casemap(str, ret, &flags, enc);
8199 ret = rb_str_casemap(str, &flags, enc);
8226rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8229 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8231 flags = check_case_options(argc, argv, flags);
8232 str_modify_keep_cr(str);
8233 enc = str_true_enc(str);
8234 if (flags&ONIGENC_CASE_ASCII_ONLY)
8235 rb_str_ascii_casemap(str, str, &flags, enc);
8237 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8239 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8263rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8266 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8269 flags = check_case_options(argc, argv, flags);
8270 enc = str_true_enc(str);
8271 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8272 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8274 rb_str_ascii_casemap(str, ret, &flags, enc);
8277 ret = rb_str_casemap(str, &flags, enc);
8282typedef unsigned char *USTR;
8286 unsigned int now, max;
8298 if (t->p == t->pend)
return -1;
8299 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8302 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8304 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8306 if (t->p < t->pend) {
8307 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8310 if (t->now < 0x80 && c < 0x80) {
8311 rb_raise(rb_eArgError,
8312 "invalid range \"%c-%c\" in string transliteration",
8316 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8320 else if (t->now < c) {
8329 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8330 if (t->now == t->max) {
8335 if (t->now < t->max) {
8351 const unsigned int errc = -1;
8352 unsigned int trans[256];
8354 struct tr trsrc, trrepl;
8356 unsigned int c, c0, last = 0;
8357 int modify = 0, i, l;
8358 unsigned char *s, *send;
8360 int singlebyte = single_byte_optimizable(str);
8364#define CHECK_IF_ASCII(c) \
8365 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8366 (cr = ENC_CODERANGE_VALID) : 0)
8370 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8371 if (RSTRING_LEN(repl) == 0) {
8372 return rb_str_delete_bang(1, &src, str);
8376 e1 = rb_enc_check(str, src);
8377 e2 = rb_enc_check(str, repl);
8382 enc = rb_enc_check(src, repl);
8384 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8385 if (RSTRING_LEN(src) > 1 &&
8386 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8387 trsrc.p + l < trsrc.pend) {
8391 trrepl.p = RSTRING_PTR(repl);
8392 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8393 trsrc.gen = trrepl.gen = 0;
8394 trsrc.now = trrepl.now = 0;
8395 trsrc.max = trrepl.max = 0;
8398 for (i=0; i<256; i++) {
8401 while ((c = trnext(&trsrc, enc)) != errc) {
8406 if (!hash) hash = rb_hash_new();
8410 while ((c = trnext(&trrepl, enc)) != errc)
8413 for (i=0; i<256; i++) {
8414 if (trans[i] != errc) {
8422 for (i=0; i<256; i++) {
8425 while ((c = trnext(&trsrc, enc)) != errc) {
8426 r = trnext(&trrepl, enc);
8427 if (r == errc) r = trrepl.now;
8430 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8433 if (!hash) hash = rb_hash_new();
8441 str_modify_keep_cr(str);
8442 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8443 termlen = rb_enc_mbminlen(enc);
8446 long offset, max = RSTRING_LEN(str);
8447 unsigned int save = -1;
8448 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8453 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8456 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8459 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8461 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8470 if (cflag) c = last;
8473 else if (cflag) c = errc;
8479 if (c != (
unsigned int)-1) {
8485 tlen = rb_enc_codelen(c, enc);
8491 if (enc != e1) may_modify = 1;
8493 if ((offset = t - buf) + tlen > max) {
8494 size_t MAYBE_UNUSED(old) = max + termlen;
8495 max = offset + tlen + (send - s);
8496 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8499 rb_enc_mbcput(c, t, enc);
8500 if (may_modify && memcmp(s, t, tlen) != 0) {
8506 if (!STR_EMBED_P(str)) {
8507 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8509 TERM_FILL((
char *)t, termlen);
8510 RSTRING(str)->as.heap.ptr = (
char *)buf;
8511 STR_SET_LEN(str, t - buf);
8512 STR_SET_NOEMBED(str);
8513 RSTRING(str)->as.heap.aux.capa = max;
8517 c = (
unsigned char)*s;
8518 if (trans[c] != errc) {
8535 long offset, max = (long)((send - s) * 1.2);
8536 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8541 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8544 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8547 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8549 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8557 if (cflag) c = last;
8560 else if (cflag) c = errc;
8564 c = cflag ? last : errc;
8567 tlen = rb_enc_codelen(c, enc);
8572 if (enc != e1) may_modify = 1;
8574 if ((offset = t - buf) + tlen > max) {
8575 size_t MAYBE_UNUSED(old) = max + termlen;
8576 max = offset + tlen + (long)((send - s) * 1.2);
8577 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8581 rb_enc_mbcput(c, t, enc);
8582 if (may_modify && memcmp(s, t, tlen) != 0) {
8590 if (!STR_EMBED_P(str)) {
8591 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8593 TERM_FILL((
char *)t, termlen);
8594 RSTRING(str)->as.heap.ptr = (
char *)buf;
8595 STR_SET_LEN(str, t - buf);
8596 STR_SET_NOEMBED(str);
8597 RSTRING(str)->as.heap.aux.capa = max;
8603 rb_enc_associate(str, enc);
8622 return tr_trans(str, src, repl, 0);
8669 tr_trans(str, src, repl, 0);
/* Number of per-byte entries in a tr lookup table: one for every value an
 * unsigned char can hold. */
8673#define TR_TABLE_MAX (UCHAR_MAX+1)
/* Full table size: the per-byte entries plus one trailing slot at index
 * TR_TABLE_MAX.  tr_setup_table() stores its negation flag in that slot,
 * and tr_find() uses it as the answer for code points that are not
 * resolved by the per-byte entries or the hash lookups. */
8674#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8676tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8679 const unsigned int errc = -1;
8680 char buf[TR_TABLE_MAX];
8683 VALUE table = 0, ptable = 0;
8684 int i, l, cflag = 0;
8686 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8687 tr.gen =
tr.now =
tr.max = 0;
8689 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8694 for (i=0; i<TR_TABLE_MAX; i++) {
8697 stable[TR_TABLE_MAX] = cflag;
8699 else if (stable[TR_TABLE_MAX] && !cflag) {
8700 stable[TR_TABLE_MAX] = 0;
8702 for (i=0; i<TR_TABLE_MAX; i++) {
8706 while ((c = trnext(&
tr, enc)) != errc) {
8707 if (c < TR_TABLE_MAX) {
8708 buf[(
unsigned char)c] = !cflag;
8713 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8716 table = ptable ? ptable : rb_hash_new();
8720 table = rb_hash_new();
8725 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8726 rb_hash_aset(table, key,
Qtrue);
8730 for (i=0; i<TR_TABLE_MAX; i++) {
8731 stable[i] = stable[i] && buf[i];
8733 if (!table && !cflag) {
/* Tests whether code point `c` is selected by a tr character set.
 * Code points below TR_TABLE_MAX are answered directly from `table`.
 * Larger code points are looked up in the `del` hash and, when it is
 * non-zero, the `nodel` hash; if neither lookup branch applies, the
 * trailing slot table[TR_TABLE_MAX] decides.
 * NOTE(review): the values returned inside the two hash-lookup branches,
 * and the guard around them, are not visible in this excerpt. */
8740tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8742 if (c < TR_TABLE_MAX) {
8743 return table[c] != 0;
8749 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8750 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8754 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8757 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8771rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8773 char squeez[TR_TABLE_SIZE];
8776 VALUE del = 0, nodel = 0;
8778 int i, ascompat, cr;
8780 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8782 for (i=0; i<argc; i++) {
8786 enc = rb_enc_check(str, s);
8787 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8790 str_modify_keep_cr(str);
8791 ascompat = rb_enc_asciicompat(enc);
8792 s = t = RSTRING_PTR(str);
8799 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8810 c = rb_enc_codepoint_len(s, send, &clen, enc);
8812 if (tr_find(c, squeez, del, nodel)) {
8816 if (t != s) rb_enc_mbcput(c, t, enc);
8823 TERM_FILL(t, TERM_LEN(str));
8824 STR_SET_LEN(str, t - RSTRING_PTR(str));
8827 if (modify)
return str;
8847rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8850 rb_str_delete_bang(argc, argv, str);
8864rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8866 char squeez[TR_TABLE_SIZE];
8868 VALUE del = 0, nodel = 0;
8869 unsigned char *s, *send, *t;
8871 int ascompat, singlebyte = single_byte_optimizable(str);
8875 enc = STR_ENC_GET(str);
8878 for (i=0; i<argc; i++) {
8882 enc = rb_enc_check(str, s);
8883 if (singlebyte && !single_byte_optimizable(s))
8885 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8889 str_modify_keep_cr(str);
8890 s = t = (
unsigned char *)RSTRING_PTR(str);
8891 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8894 ascompat = rb_enc_asciicompat(enc);
8898 unsigned int c = *s++;
8899 if (c != save || (argc > 0 && !squeez[c])) {
8909 if (ascompat && (c = *s) < 0x80) {
8910 if (c != save || (argc > 0 && !squeez[c])) {
8916 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8918 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8919 if (t != s) rb_enc_mbcput(c, t, enc);
8928 TERM_FILL((
char *)t, TERM_LEN(str));
8929 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8930 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8934 if (modify)
return str;
8957rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8960 rb_str_squeeze_bang(argc, argv, str);
8978 return tr_trans(str, src, repl, 1);
9001 tr_trans(str, src, repl, 1);
9030rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9032 char table[TR_TABLE_SIZE];
9034 VALUE del = 0, nodel = 0, tstr;
9044 enc = rb_enc_check(str, tstr);
9047 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9048 (ptstr = RSTRING_PTR(tstr),
9049 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9050 !is_broken_string(str)) {
9052 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9054 s = RSTRING_PTR(str);
9055 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9058 if (*(
unsigned char*)s++ == c) n++;
9064 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9065 for (i=1; i<argc; i++) {
9068 enc = rb_enc_check(str, tstr);
9069 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9072 s = RSTRING_PTR(str);
9073 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9075 ascompat = rb_enc_asciicompat(enc);
9079 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9087 c = rb_enc_codepoint_len(s, send, &clen, enc);
9088 if (tr_find(c, table, del, nodel)) {
9099rb_fs_check(
VALUE val)
9103 if (
NIL_P(val))
return 0;
/*
 * Lookup table with one entry per byte value (256 entries).
 * The only non-zero entries are indices 9 through 13 (first row) and
 * index 32 (first entry of the third row); every other entry is 0.
 * It is read through the ascii_isspace() macro.
 */
9108static const char isspacetable[256] = {
9109 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9111 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* Looks up c in isspacetable; the cast to unsigned char keeps the index in 0..255 even when c is a negative char. */
9127#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9130split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9132 if (empty_count >= 0 &&
len == 0) {
9133 return empty_count + 1;
9135 if (empty_count > 0) {
9139 rb_ary_push(result, str_new_empty_String(str));
9140 }
while (--empty_count > 0);
9144 rb_yield(str_new_empty_String(str));
9145 }
while (--empty_count > 0);
9150 rb_ary_push(result, str);
9159 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9163literal_split_pattern(
VALUE spat, split_type_t default_type)
9171 return SPLIT_TYPE_CHARS;
9173 else if (rb_enc_asciicompat(enc)) {
9174 if (
len == 1 && ptr[0] ==
' ') {
9175 return SPLIT_TYPE_AWK;
9180 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9181 return SPLIT_TYPE_AWK;
9184 return default_type;
9197rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9202 split_type_t split_type;
9203 long beg, end, i = 0, empty_count = -1;
9208 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9210 if (lim <= 0) limit =
Qnil;
9211 else if (lim == 1) {
9212 if (RSTRING_LEN(str) == 0)
9223 if (
NIL_P(limit) && !lim) empty_count = 0;
9225 enc = STR_ENC_GET(str);
9226 split_type = SPLIT_TYPE_REGEXP;
9228 spat = get_pat_quoted(spat, 0);
9230 else if (
NIL_P(spat = rb_fs)) {
9231 split_type = SPLIT_TYPE_AWK;
9233 else if (!(spat = rb_fs_check(spat))) {
9234 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9239 if (split_type != SPLIT_TYPE_AWK) {
9244 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9245 if (split_type == SPLIT_TYPE_AWK) {
9247 split_type = SPLIT_TYPE_STRING;
9252 mustnot_broken(spat);
9253 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Calls split_string(result, str, beg, len, empty_count) and stores its
 * return value back into empty_count. The names result, str and
 * empty_count are taken from the scope where the macro is expanded.
 */
9261#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9264 char *ptr = RSTRING_PTR(str);
9266 if (split_type == SPLIT_TYPE_AWK) {
9271 if (result) result = rb_ary_new();
9273 if (is_ascii_string(str)) {
9274 while (ptr < eptr) {
9275 c = (
unsigned char)*ptr++;
9277 if (ascii_isspace(c)) {
9283 if (!
NIL_P(limit) && lim <= i)
break;
9286 else if (ascii_isspace(c)) {
9287 SPLIT_STR(beg, end-beg);
9290 if (!
NIL_P(limit)) ++i;
9298 while (ptr < eptr) {
9301 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9310 if (!
NIL_P(limit) && lim <= i)
break;
9314 SPLIT_STR(beg, end-beg);
9317 if (!
NIL_P(limit)) ++i;
9325 else if (split_type == SPLIT_TYPE_STRING) {
9326 char *str_start = ptr;
9327 char *substr_start = ptr;
9328 char *sptr = RSTRING_PTR(spat);
9329 long slen = RSTRING_LEN(spat);
9331 if (result) result = rb_ary_new();
9332 mustnot_broken(str);
9333 enc = rb_enc_check(str, spat);
9334 while (ptr < eptr &&
9335 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9338 if (t != ptr + end) {
9342 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9345 if (!
NIL_P(limit) && lim <= ++i)
break;
9347 beg = ptr - str_start;
9349 else if (split_type == SPLIT_TYPE_CHARS) {
9350 char *str_start = ptr;
9353 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9354 mustnot_broken(str);
9355 enc = rb_enc_get(str);
9356 while (ptr < eptr &&
9357 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9358 SPLIT_STR(ptr - str_start, n);
9360 if (!
NIL_P(limit) && lim <= ++i)
break;
9362 beg = ptr - str_start;
9365 if (result) result = rb_ary_new();
9366 long len = RSTRING_LEN(str);
9374 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9379 if (start == end && BEG(0) == END(0)) {
9384 else if (last_null == 1) {
9385 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9392 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9398 SPLIT_STR(beg, end-beg);
9399 beg = start = END(0);
9403 for (idx=1; idx < regs->num_regs; idx++) {
9404 if (BEG(idx) == -1)
continue;
9405 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9407 if (!
NIL_P(limit) && lim <= ++i)
break;
9409 if (match) rb_match_unbusy(match);
9411 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9412 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9415 return result ? result : str;
9425 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to rb_ary_new_capa(size) when rb_block_given_p() is false,
 * and to 0 when a block is given. The first argument (m) is not used by
 * the expansion.
 */
9428#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9434 rb_ary_push(ary, e);
/* Shorthand that forwards both arguments unchanged to enumerator_element(). */
9443#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9446chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
9448 const char *prev = rb_enc_prev_char(p, e, e, enc);
9451 prev = rb_enc_prev_char(p, e, e, enc);
9452 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9464 RSTRING_LEN(rs) != 1 ||
9465 RSTRING_PTR(rs)[0] !=
'\n')) {
/* From this point on, every use of the name rb_rs expands to a call to get_rs(). */
9471#define rb_rs get_rs()
9478 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9479 long pos,
len, rslen;
9485 static ID keywords[1];
9490 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9494 if (!ENUM_ELEM(ary, str)) {
9502 if (!RSTRING_LEN(str))
goto end;
9504 ptr = subptr = RSTRING_PTR(str);
9506 len = RSTRING_LEN(str);
9508 rslen = RSTRING_LEN(rs);
9511 enc = rb_enc_get(str);
9513 enc = rb_enc_check(str, rs);
9518 const char *eol = NULL;
9520 while (subend < pend) {
9521 long chomp_rslen = 0;
9523 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9525 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9527 if (eol == subend)
break;
9531 chomp_rslen = -rslen;
9535 if (!subptr) subptr = subend;
9539 }
while (subend < pend);
9541 if (rslen == 0) chomp_rslen = 0;
9543 subend - subptr + (chomp ? chomp_rslen : rslen));
9544 if (ENUM_ELEM(ary, line)) {
9545 str_mod_check(str, ptr,
len);
9547 subptr = eol = NULL;
9552 rsptr = RSTRING_PTR(rs);
9553 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9562 rsptr = RSTRING_PTR(rs);
9563 rslen = RSTRING_LEN(rs);
9566 while (subptr < pend) {
9567 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9571 if (hit != adjusted) {
9575 subend = hit += rslen;
9578 subend = chomp_newline(subptr, subend, enc);
9585 if (ENUM_ELEM(ary, line)) {
9586 str_mod_check(str, ptr,
len);
9591 if (subptr != pend) {
9594 pend = chomp_newline(subptr, pend, enc);
9596 else if (pend - subptr >= rslen &&
9597 memcmp(pend - rslen, rsptr, rslen) == 0) {
9602 ENUM_ELEM(ary, line);
9623rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9626 return rb_str_enumerate_lines(argc, argv, str, 0);
9639rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9641 VALUE ary = WANTARRAY(
"lines", 0);
9642 return rb_str_enumerate_lines(argc, argv, str, ary);
9656 for (i=0; i<RSTRING_LEN(str); i++) {
9657 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9675rb_str_each_byte(
VALUE str)
9678 return rb_str_enumerate_bytes(str, 0);
9690rb_str_bytes(
VALUE str)
9692 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9693 return rb_str_enumerate_bytes(str, ary);
9711 ptr = RSTRING_PTR(str);
9712 len = RSTRING_LEN(str);
9713 enc = rb_enc_get(str);
9716 for (i = 0; i <
len; i += n) {
9717 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9722 for (i = 0; i <
len; i += n) {
9723 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9744rb_str_each_char(
VALUE str)
9747 return rb_str_enumerate_chars(str, 0);
9759rb_str_chars(
VALUE str)
9762 return rb_str_enumerate_chars(str, ary);
9766rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9771 const char *ptr, *end;
9774 if (single_byte_optimizable(str))
9775 return rb_str_enumerate_bytes(str, ary);
9778 ptr = RSTRING_PTR(str);
9780 enc = STR_ENC_GET(str);
9783 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9804rb_str_each_codepoint(
VALUE str)
9807 return rb_str_enumerate_codepoints(str, 0);
9819rb_str_codepoints(
VALUE str)
9822 return rb_str_enumerate_codepoints(str, ary);
9828 int encidx = rb_enc_to_index(enc);
9830 const OnigUChar source_ascii[] =
"\\X";
9831 const OnigUChar *source = source_ascii;
9832 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * Helpers that expand a code unit x into a comma-separated list of
 * OnigUChar values, for use inside an array initializer.
 *
 * CHARS_16BE(x): (x >> 8) first, then x          (two values)
 * CHARS_16LE(x): x first, then (x >> 8)          (two values)
 * CHARS_32BE(x): CHARS_16BE of (x >> 16), then CHARS_16BE of x   (four values)
 * CHARS_32LE(x): CHARS_16LE of x, then CHARS_16LE of (x >> 16)   (four values)
 *
 * Each value is cast to OnigUChar; x is evaluated more than once.
 */
9835#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9836#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9837#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9838#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9839#define CASE_UTF(e) \
9840 case ENCINDEX_UTF_##e: { \
9841 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9842 source = source_UTF_##e; \
9843 source_len = sizeof(source_UTF_##e); \
9846 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9854 regex_t *reg_grapheme_cluster;
9856 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9857 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9859 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9860 onig_error_code_to_str(message, r, &einfo);
9861 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9864 return reg_grapheme_cluster;
9870 int encidx = rb_enc_to_index(enc);
9871 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9873 if (encidx == rb_utf8_encindex()) {
9874 if (!reg_grapheme_cluster_utf8) {
9875 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9878 return reg_grapheme_cluster_utf8;
9887 size_t grapheme_cluster_count = 0;
9889 const char *ptr, *end;
9891 if (!rb_enc_unicode_p(enc)) {
9895 bool cached_reg_grapheme_cluster =
true;
9896 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9897 if (!reg_grapheme_cluster) {
9898 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9899 cached_reg_grapheme_cluster =
false;
9902 ptr = RSTRING_PTR(str);
9906 OnigPosition
len = onig_match(reg_grapheme_cluster,
9907 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9908 (
const OnigUChar *)ptr, NULL, 0);
9909 if (
len <= 0)
break;
9910 grapheme_cluster_count++;
9914 if (!cached_reg_grapheme_cluster) {
9915 onig_free(reg_grapheme_cluster);
9918 return SIZET2NUM(grapheme_cluster_count);
9922rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9926 const char *ptr0, *ptr, *end;
9928 if (!rb_enc_unicode_p(enc)) {
9929 return rb_str_enumerate_chars(str, ary);
9934 bool cached_reg_grapheme_cluster =
true;
9935 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9936 if (!reg_grapheme_cluster) {
9937 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9938 cached_reg_grapheme_cluster =
false;
9941 ptr0 = ptr = RSTRING_PTR(str);
9945 OnigPosition
len = onig_match(reg_grapheme_cluster,
9946 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9947 (
const OnigUChar *)ptr, NULL, 0);
9948 if (
len <= 0)
break;
9953 if (!cached_reg_grapheme_cluster) {
9954 onig_free(reg_grapheme_cluster);
9974rb_str_each_grapheme_cluster(
VALUE str)
9977 return rb_str_enumerate_grapheme_clusters(str, 0);
9989rb_str_grapheme_clusters(
VALUE str)
9992 return rb_str_enumerate_grapheme_clusters(str, ary);
9996chopped_length(
VALUE str)
9999 const char *p, *p2, *beg, *end;
10001 beg = RSTRING_PTR(str);
10002 end = beg + RSTRING_LEN(str);
10003 if (beg >= end)
return 0;
10004 p = rb_enc_prev_char(beg, end, end, enc);
10006 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
10007 p2 = rb_enc_prev_char(beg, p, end, enc);
10008 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10024rb_str_chop_bang(
VALUE str)
10026 str_modify_keep_cr(str);
10027 if (RSTRING_LEN(str) > 0) {
10029 len = chopped_length(str);
10030 STR_SET_LEN(str,
len);
10031 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10050rb_str_chop(
VALUE str)
10056smart_chomp(
VALUE str,
const char *e,
const char *p)
10059 if (rb_enc_mbminlen(enc) > 1) {
10064 pp = e - rb_enc_mbminlen(enc);
10067 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10075 if (--e > p && *(e-1) ==
'\r') {
10092 char *pp, *e, *rsptr;
10094 char *
const p = RSTRING_PTR(str);
10095 long len = RSTRING_LEN(str);
10097 if (
len == 0)
return 0;
10100 return smart_chomp(str, e, p);
10103 enc = rb_enc_get(str);
10106 if (rb_enc_mbminlen(enc) > 1) {
10111 pp -= rb_enc_mbminlen(enc);
10114 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10121 while (e > p && *(e-1) ==
'\n') {
10123 if (e > p && *(e-1) ==
'\r')
10129 if (rslen >
len)
return len;
10131 enc = rb_enc_get(rs);
10132 newline = rsptr[rslen-1];
10133 if (rslen == rb_enc_mbminlen(enc)) {
10135 if (newline ==
'\n')
10136 return smart_chomp(str, e, p);
10140 return smart_chomp(str, e, p);
10144 enc = rb_enc_check(str, rs);
10145 if (is_broken_string(rs)) {
10149 if (p[
len-1] == newline &&
10151 memcmp(rsptr, pp, rslen) == 0)) {
10152 if (at_char_boundary(p, pp, e, enc))
10153 return len - rslen;
10165chomp_rs(
int argc,
const VALUE *argv)
10169 VALUE rs = argv[0];
10181 long olen = RSTRING_LEN(str);
10182 long len = chompped_length(str, rs);
10183 if (
len >= olen)
return Qnil;
10184 str_modify_keep_cr(str);
10185 STR_SET_LEN(str,
len);
10186 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10203rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10206 str_modifiable(str);
10207 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10208 rs = chomp_rs(argc, argv);
10210 return rb_str_chomp_string(str, rs);
10223rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10225 VALUE rs = chomp_rs(argc, argv);
10233 const char *
const start = s;
10235 if (!s || s >= e)
return 0;
10238 if (single_byte_optimizable(str)) {
10239 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10244 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10264rb_str_lstrip_bang(
VALUE str)
10268 long olen, loffset;
10270 str_modify_keep_cr(str);
10271 enc = STR_ENC_GET(str);
10273 loffset = lstrip_offset(str, start, start+olen, enc);
10275 long len = olen-loffset;
10276 s = start + loffset;
10277 memmove(start, s,
len);
10278 STR_SET_LEN(str,
len);
10279 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10302rb_str_lstrip(
VALUE str)
10307 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10308 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10317 rb_str_check_dummy_enc(enc);
10321 if (!s || s >= e)
return 0;
10325 if (single_byte_optimizable(str)) {
10327 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10332 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10352rb_str_rstrip_bang(
VALUE str)
10356 long olen, roffset;
10358 str_modify_keep_cr(str);
10359 enc = STR_ENC_GET(str);
10361 roffset = rstrip_offset(str, start, start+olen, enc);
10363 long len = olen - roffset;
10365 STR_SET_LEN(str,
len);
10366 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10389rb_str_rstrip(
VALUE str)
10393 long olen, roffset;
10395 enc = STR_ENC_GET(str);
10397 roffset = rstrip_offset(str, start, start+olen, enc);
10399 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10415rb_str_strip_bang(
VALUE str)
10418 long olen, loffset, roffset;
10421 str_modify_keep_cr(str);
10422 enc = STR_ENC_GET(str);
10424 loffset = lstrip_offset(str, start, start+olen, enc);
10425 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10427 if (loffset > 0 || roffset > 0) {
10428 long len = olen-roffset;
10431 memmove(start, start + loffset,
len);
10433 STR_SET_LEN(str,
len);
10434 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10457rb_str_strip(
VALUE str)
10460 long olen, loffset, roffset;
10464 loffset = lstrip_offset(str, start, start+olen, enc);
10465 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10467 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10472scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10475 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10481 end = pos + RSTRING_LEN(pat);
10495 if (RSTRING_LEN(str) > end)
10496 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10505 if (!regs || regs->num_regs == 1) {
10511 for (
int i = 1; i < regs->num_regs; i++) {
10517 rb_ary_push(result, s);
10572 long last = -1, prev = 0;
10573 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10575 pat = get_pat_quoted(pat, 1);
10576 mustnot_broken(str);
10578 VALUE ary = rb_ary_new();
10580 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10583 rb_ary_push(ary, result);
10585 if (last >= 0) rb_pat_search(pat, str, last, 1);
10590 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10594 str_mod_check(str, p,
len);
10596 if (last >= 0) rb_pat_search(pat, str, last, 1);
10620rb_str_hex(
VALUE str)
10622 return rb_str_to_inum(str, 16, FALSE);
10647rb_str_oct(
VALUE str)
10649 return rb_str_to_inum(str, -8, FALSE);
10652#ifndef HAVE_CRYPT_R
10657 rb_nativethread_lock_t lock;
10658} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10661crypt_mutex_initialize(
void)
10732# define CRYPT_END() ALLOCV_END(databuf)
10734 extern char *crypt(
const char *,
const char *);
10735# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10738 const char *s, *saltp;
10741 char salt_8bit_clean[3];
10745 mustnot_wchar(str);
10746 mustnot_wchar(salt);
10748 saltp = RSTRING_PTR(salt);
10749 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10750 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10754 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10755 salt_8bit_clean[0] = saltp[0] & 0x7f;
10756 salt_8bit_clean[1] = saltp[1] & 0x7f;
10757 salt_8bit_clean[2] =
'\0';
10758 saltp = salt_8bit_clean;
10763# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10764 data->initialized = 0;
10766 res = crypt_r(s, saltp, data);
10768 crypt_mutex_initialize();
10770 res = crypt(s, saltp);
10811 char *ptr, *p, *pend;
10814 unsigned long sum0 = 0;
10819 ptr = p = RSTRING_PTR(str);
10820 len = RSTRING_LEN(str);
10826 str_mod_check(str, ptr,
len);
10829 sum0 += (
unsigned char)*p;
10840 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10841 sum0 &= (((
unsigned long)1)<<bits)-1;
10861rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10865 long width,
len, flen = 1, fclen = 1;
10868 const char *f =
" ";
10869 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10871 int singlebyte = 1, cr;
10875 enc = STR_ENC_GET(str);
10876 termlen = rb_enc_mbminlen(enc);
10880 enc = rb_enc_check(str, pad);
10881 f = RSTRING_PTR(pad);
10882 flen = RSTRING_LEN(pad);
10883 fclen = str_strlen(pad, enc);
10884 singlebyte = single_byte_optimizable(pad);
10885 if (flen == 0 || fclen == 0) {
10886 rb_raise(rb_eArgError,
"zero width padding");
10889 len = str_strlen(str, enc);
10890 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10892 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10896 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10897 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10899 size = RSTRING_LEN(str);
10900 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10901 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10902 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10903 rb_raise(rb_eArgError,
"argument too big");
10907 p = RSTRING_PTR(res);
10909 memset(p, *f, llen);
10913 while (llen >= fclen) {
10919 memcpy(p, f, llen2);
10923 memcpy(p, RSTRING_PTR(str), size);
10926 memset(p, *f, rlen);
10930 while (rlen >= fclen) {
10936 memcpy(p, f, rlen2);
10940 TERM_FILL(p, termlen);
10941 STR_SET_LEN(res, p-RSTRING_PTR(res));
10964rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10966 return rb_str_justify(argc, argv, str,
'l');
10980rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10982 return rb_str_justify(argc, argv, str,
'r');
10997rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10999 return rb_str_justify(argc, argv, str,
'c');
11015 sep = get_pat_quoted(sep, 0);
11027 pos = rb_str_index(str, sep, 0);
11028 if (pos < 0)
goto failed;
11033 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11036 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11050 long pos = RSTRING_LEN(str);
11052 sep = get_pat_quoted(sep, 0);
11065 pos = rb_str_rindex(str, sep, pos);
11074 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11076 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11088rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11092 for (i=0; i<argc; i++) {
11093 VALUE tmp = argv[i];
11095 if (rb_reg_start_with_p(tmp, str))
11099 const char *p, *s, *e;
11104 enc = rb_enc_check(str, tmp);
11105 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11106 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11107 p = RSTRING_PTR(str);
11110 if (!at_char_right_boundary(p, s, e, enc))
11112 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11128rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11132 for (i=0; i<argc; i++) {
11133 VALUE tmp = argv[i];
11134 const char *p, *s, *e;
11139 enc = rb_enc_check(str, tmp);
11140 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11141 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11142 p = RSTRING_PTR(str);
11145 if (!at_char_boundary(p, s, e, enc))
11147 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11163deleted_prefix_length(
VALUE str,
VALUE prefix)
11165 const char *strptr, *prefixptr;
11166 long olen, prefixlen;
11171 if (!is_broken_string(prefix) ||
11172 !rb_enc_asciicompat(enc) ||
11173 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11174 enc = rb_enc_check(str, prefix);
11178 prefixlen = RSTRING_LEN(prefix);
11179 if (prefixlen <= 0)
return 0;
11180 olen = RSTRING_LEN(str);
11181 if (olen < prefixlen)
return 0;
11182 strptr = RSTRING_PTR(str);
11183 prefixptr = RSTRING_PTR(prefix);
11184 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11185 if (is_broken_string(prefix)) {
11186 if (!is_broken_string(str)) {
11190 const char *strend = strptr + olen;
11191 const char *after_prefix = strptr + prefixlen;
11192 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11212rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11215 str_modify_keep_cr(str);
11217 prefixlen = deleted_prefix_length(str, prefix);
11218 if (prefixlen <= 0)
return Qnil;
11232rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11236 prefixlen = deleted_prefix_length(str, prefix);
11237 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11239 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11252deleted_suffix_length(
VALUE str,
VALUE suffix)
11254 const char *strptr, *suffixptr;
11255 long olen, suffixlen;
11259 if (is_broken_string(suffix))
return 0;
11260 enc = rb_enc_check(str, suffix);
11263 suffixlen = RSTRING_LEN(suffix);
11264 if (suffixlen <= 0)
return 0;
11265 olen = RSTRING_LEN(str);
11266 if (olen < suffixlen)
return 0;
11267 strptr = RSTRING_PTR(str);
11268 suffixptr = RSTRING_PTR(suffix);
11269 const char *strend = strptr + olen;
11270 const char *before_suffix = strend - suffixlen;
11271 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11272 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11287rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11289 long olen, suffixlen,
len;
11290 str_modifiable(str);
11292 suffixlen = deleted_suffix_length(str, suffix);
11293 if (suffixlen <= 0)
return Qnil;
11295 olen = RSTRING_LEN(str);
11296 str_modify_keep_cr(str);
11297 len = olen - suffixlen;
11298 STR_SET_LEN(str,
len);
11299 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11315rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11319 suffixlen = deleted_suffix_length(str, suffix);
11320 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11322 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11329 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11337 val = rb_fs_check(val);
11340 "value of %"PRIsVALUE
" must be String or Regexp",
11344 rb_warn_deprecated(
"'$;'", NULL);
11361 str_modifiable(str);
11364 int idx = rb_enc_to_index(encoding);
11371 rb_enc_associate_index(str, idx);
11395 if (STR_EMBED_P(str)) {
11396 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11401 str_replace_shared_without_enc(str2, str);
11403 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11436rb_str_valid_encoding_p(
VALUE str)
11456rb_str_is_ascii_only_p(
VALUE str)
11466 static const char ellipsis[] =
"...";
11467 const long ellipsislen =
sizeof(ellipsis) - 1;
11469 const long blen = RSTRING_LEN(str);
11470 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11471 VALUE estr, ret = 0;
11474 if (
len * rb_enc_mbminlen(enc) >= blen ||
11478 else if (
len <= ellipsislen ||
11480 if (rb_enc_asciicompat(enc)) {
11482 rb_enc_associate(ret, enc);
11489 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11494 rb_enc_from_encoding(enc), 0,
Qnil);
11507 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11513 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11532 if (enc == STR_ENC_GET(str)) {
11537 return enc_str_scrub(enc, str, repl, cr);
11545 const char *rep, *p, *e, *p1, *sp;
11551 rb_raise(rb_eArgError,
"both of block and replacement given");
11558 if (!
NIL_P(repl)) {
11559 repl = str_compat_and_valid(repl, enc);
11562 if (rb_enc_dummy_p(enc)) {
11565 encidx = rb_enc_to_index(enc);
11567#define DEFAULT_REPLACE_CHAR(str) do { \
11568 static const char replace[sizeof(str)-1] = str; \
11569 rep = replace; replen = (int)sizeof(replace); \
11572 slen = RSTRING_LEN(str);
11573 p = RSTRING_PTR(str);
11578 if (rb_enc_asciicompat(enc)) {
11584 else if (!
NIL_P(repl)) {
11585 rep = RSTRING_PTR(repl);
11586 replen = RSTRING_LEN(repl);
11589 else if (encidx == rb_utf8_encindex()) {
11590 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11594 DEFAULT_REPLACE_CHAR(
"?");
11599 p = search_nonascii(p, e);
11604 int ret = rb_enc_precise_mbclen(p, e, enc);
11623 if (e - p < clen) clen = e - p;
11630 for (; clen > 1; clen--) {
11631 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11642 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11643 str_mod_check(str, sp, slen);
11644 repl = str_compat_and_valid(repl, enc);
11651 p = search_nonascii(p, e);
11677 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11678 str_mod_check(str, sp, slen);
11679 repl = str_compat_and_valid(repl, enc);
11688 long mbminlen = rb_enc_mbminlen(enc);
11692 else if (!
NIL_P(repl)) {
11693 rep = RSTRING_PTR(repl);
11694 replen = RSTRING_LEN(repl);
11696 else if (encidx == ENCINDEX_UTF_16BE) {
11697 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11699 else if (encidx == ENCINDEX_UTF_16LE) {
11700 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11702 else if (encidx == ENCINDEX_UTF_32BE) {
11703 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11705 else if (encidx == ENCINDEX_UTF_32LE) {
11706 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11709 DEFAULT_REPLACE_CHAR(
"?");
11713 int ret = rb_enc_precise_mbclen(p, e, enc);
11726 if (e - p < clen) clen = e - p;
11727 if (clen <= mbminlen * 2) {
11732 for (; clen > mbminlen; clen-=mbminlen) {
11733 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11743 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11744 str_mod_check(str, sp, slen);
11745 repl = str_compat_and_valid(repl, enc);
11770 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11771 str_mod_check(str, sp, slen);
11772 repl = str_compat_and_valid(repl, enc);
11808str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11816static ID id_normalize;
11817static ID id_normalized_p;
11818static VALUE mUnicodeNormalize;
11821unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11823 static int UnicodeNormalizeRequired = 0;
11826 if (!UnicodeNormalizeRequired) {
11827 rb_require(
"unicode_normalize/normalize.rb");
11828 UnicodeNormalizeRequired = 1;
11832 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11869rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11871 return unicode_normalize_common(argc, argv, str, id_normalize);
11885rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11887 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11914rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11916 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/* sym_equal is just another name for rb_obj_equal. */
12048#define sym_equal rb_obj_equal
12051sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
12055 int c = rb_enc_precise_mbclen(s, send, enc);
12059 c = rb_enc_mbc_to_codepoint(s, send, enc);
12067rb_str_symname_p(
VALUE sym)
12072 rb_encoding *resenc = rb_default_internal_encoding();
12074 if (resenc == NULL) resenc = rb_default_external_encoding();
12075 enc = STR_ENC_GET(sym);
12076 ptr = RSTRING_PTR(sym);
12077 len = RSTRING_LEN(sym);
12078 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12086rb_str_quote_unprintable(
VALUE str)
12094 resenc = rb_default_internal_encoding();
12095 if (resenc == NULL) resenc = rb_default_external_encoding();
12096 enc = STR_ENC_GET(str);
12097 ptr = RSTRING_PTR(str);
12098 len = RSTRING_LEN(str);
12099 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12100 !sym_printable(ptr, ptr +
len, enc)) {
12101 return rb_str_escape(str);
12107rb_id_quote_unprintable(
ID id)
12109 VALUE str = rb_id2str(
id);
12110 if (!rb_str_symname_p(str)) {
12111 return rb_str_escape(str);
12129sym_inspect(
VALUE sym)
12136 if (!rb_str_symname_p(str)) {
12138 len = RSTRING_LEN(str);
12139 rb_str_resize(str,
len + 1);
12140 dest = RSTRING_PTR(str);
12141 memmove(dest + 1, dest,
len);
12145 VALUE orig_str = str;
12147 len = RSTRING_LEN(orig_str);
12148 str = rb_enc_str_new(0,
len + 1, enc);
12151 ptr = RSTRING_PTR(orig_str);
12152 dest = RSTRING_PTR(str);
12153 memcpy(dest + 1, ptr,
len);
12173rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12178 rb_raise(rb_eArgError,
"no receiver given");
12275 return rb_str_match(
rb_sym2str(sym), other);
12290sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12292 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12305sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12307 return rb_str_match_m_p(argc, argv, sym);
12325 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12336sym_length(
VALUE sym)
12350sym_empty(
VALUE sym)
12384sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12400sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12416sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12430sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12432 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12445sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12447 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12459sym_encoding(
VALUE sym)
12465string_for_symbol(
VALUE name)
12470 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12484 name = string_for_symbol(name);
12485 return rb_intern_str(name);
12494 name = string_for_symbol(name);
12518 return rb_fstring(str);
12525 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12537 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12538 rb_enc_autoload(enc);
12542 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
false);
12548 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12549 rb_enc_autoload(enc);
12553 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc),
true,
true);
12564rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12569 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12570 rb_str_buf_cat_byte(str, (
char) code);
12584 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12750 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy; the check is performed only when RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n" (a single newline string).
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current process.
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable to each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.