Ruby 4.0.0dev (2025-12-06 revision 0346206d3eab2a8e659be0dd52aea6fc7b0ebb06)
string.c (0346206d3eab2a8e659be0dd52aea6fc7b0ebb06)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthand for regs->beg[no] and regs->end[no]; a variable named `regs`
 * must be in scope wherever these macros are expanded. */
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* String-specific names for the FL_USERn flag bits described in the
 * "Flags of RString" comment. */
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Mark `str` as non-embedded and clear the shared / shared-root / borrowed
 * flags in the same step. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Mark `str` as embedded by clearing the noembed, shared and nofree flags. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` into the length field of `str`; nothing else is touched. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length for `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length (rb_enc_mbminlen) of the string's encoding. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write the terminator at `ptr`: always one NUL byte, and `termlen` zero
 * bytes when termlen is greater than 1. Each argument is evaluated once. */
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * length derived from the string's encoding (see RESIZE_CAPA_TERM). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *  - Embedded string: nothing happens while the embedded area is large
 *    enough; otherwise the contents are copied into a freshly allocated heap
 *    buffer and the string is switched to non-embedded.
 *  - Heap string: the buffer is reallocated in place; the string must not be
 *    shared (asserted).
 * `capacity` and `termlen` are parenthesized at every use so that expression
 * arguments cannot change the meaning of the size comparison. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Make `str` share the buffer of `shared_str`: records `shared_str` in
 * str's aux.shared slot through the write barrier, flags `str` as
 * STR_SHARED and `shared_str` as STR_SHARED_ROOT. Nothing is done for fake
 * strings. A root whose class is 0 is additionally flagged STR_BORROWED. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
211 } \
212} while (0)
213
/* Heap buffer pointer, and heap buffer size (capa plus terminator length). */
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
216/* TODO: include the terminator size in capa. */
217
/* Encoding object of `str` (see get_encoding). */
218#define STR_ENC_GET(str) get_encoding(str)
219
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING left
 * at its default of 0, true only when beg + len == end; otherwise always 1. */
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
222#endif
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#else
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
227#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/* True when the GC can allocate an object large enough to embed `len`
 * content bytes plus `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(needed);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/* Prints a diagnostic on stderr saying that `func` is about to return NULL
 * and that a crash is expected to follow. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char message[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, message, func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
/* Defined only when `long` and pointers have the same size. Code guarded by
 * it keeps a fake string's hash in the `capa` field (see
 * fstring_concurrent_set_hash). */
367#if SIZEOF_LONG == SIZEOF_VOIDP
368#define PRECOMPUTED_FAKESTR_HASH 1
369#else
370#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
/* The concurrent set that register_fstring() looks strings up in and
 * inserts them into; created by Init_fstring_table(). */
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options handed from register_fstring() to fstring_concurrent_set_create()
 * through the set's opaque `data` pointer. */
struct fstr_create_arg {
    bool copy;                  /* copy the bytes of a fake string instead of referencing them */
    bool force_precompute_hash; /* try to allocate the copy with room for a stored hash */
};
487
/* Create callback of the fstring table (wired in as `.create` of
 * fstring_concurrent_set_funcs). `data` is a struct fstr_create_arg.
 * Produces the object to insert for key `str` and returns it with the
 * coderange of the key restored and RSTRING_FSTR set.
 *
 *  - Fake-string key: a real String is created, either as a copy of the
 *    bytes (arg->copy) or via str_new_static() over the same pointer, and is
 *    frozen. Copies may get a stored precomputed hash when there is room.
 *  - Real-string key: replaced by str_new_frozen() when not frozen or
 *    chilled, made independent when shared, and replaced again when it is
 *    not a bare String. */
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
/* capa reserves room for the hash stored behind the terminator */
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
/* reuse the hash kept in the fake string's capa when the copy has room */
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
/* restore the coderange captured from the key before it was replaced */
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
/* Callback table given to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and entry creation; no free callback is installed. */
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
617 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
618
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
620
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
622
623 FL_UNSET(obj, RSTRING_FSTR);
624}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
/* Returns a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * Leading bytes up to a word boundary (only when unaligned word access is
 * not allowed) and the trailing bytes are tested one at a time through
 * fall-through switches; the middle is tested one uintptr_t at a time
 * against NONASCII_MASK, and the matching byte inside a word is located
 * with a leading/trailing zero count depending on endianness. */
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const uintptr_t *s, *t;
720
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
/* Word-wise phase: entered when at least one pointer-sized chunk remains,
 * or unconditionally when unaligned word access is permitted. */
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (uintptr_t *)(value)
764#endif
765 s = aligned_ptr(p);
766 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
768 for (;s < t; s++) {
769 if (*s & NONASCII_MASK) {
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
/* Tail phase: the remaining bytes are checked individually, earliest
 * first, by falling through the cases. */
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
946{
947 int cr = ENC_CODERANGE(str);
948
949 if (cr == ENC_CODERANGE_UNKNOWN) {
950 cr = enc_coderange_scan(str, get_encoding(str));
951 ENC_CODERANGE_SET(str, cr);
952 }
953 return cr;
954}
955
956static inline bool
957rb_enc_str_asciicompat(VALUE str)
958{
959 int encindex = ENCODING_GET_INLINED(str);
960 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
961}
962
963int
965{
966 switch(ENC_CODERANGE(str)) {
968 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
970 return true;
971 default:
972 return false;
973 }
974}
975
976static inline void
977str_mod_check(VALUE s, const char *p, long len)
978{
979 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
980 rb_raise(rb_eRuntimeError, "string modified");
981 }
982}
983
984static size_t
985str_capacity(VALUE str, const int termlen)
986{
987 if (STR_EMBED_P(str)) {
988 return str_embed_capa(str) - termlen;
989 }
990 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
991 return RSTRING(str)->len;
992 }
993 else {
994 return RSTRING(str)->as.heap.aux.capa;
995 }
996}
997
998size_t
1000{
1001 return str_capacity(str, TERM_LEN(str));
1002}
1003
1004static inline void
1005must_not_null(const char *ptr)
1006{
1007 if (!ptr) {
1008 rb_raise(rb_eArgError, "NULL pointer given");
1009 }
1010}
1011
1012static inline VALUE
1013str_alloc_embed(VALUE klass, size_t capa)
1014{
1015 size_t size = rb_str_embed_size(capa, 0);
1016 RUBY_ASSERT(size > 0);
1017 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1018
1019 NEWOBJ_OF(str, struct RString, klass,
1021
1022 str->len = 0;
1023 str->as.embed.ary[0] = 0;
1024
1025 return (VALUE)str;
1026}
1027
1028static inline VALUE
1029str_alloc_heap(VALUE klass)
1030{
1031 NEWOBJ_OF(str, struct RString, klass,
1032 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1033
1034 str->len = 0;
1035 str->as.heap.aux.capa = 0;
1036 str->as.heap.ptr = NULL;
1037
1038 return (VALUE)str;
1039}
1040
1041static inline VALUE
1042empty_str_alloc(VALUE klass)
1043{
1044 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1045 VALUE str = str_alloc_embed(klass, 0);
1046 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1048 return str;
1049}
1050
1051static VALUE
1052str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1053{
1054 VALUE str;
1055
1056 if (len < 0) {
1057 rb_raise(rb_eArgError, "negative string size (or size too big)");
1058 }
1059
1060 if (enc == NULL) {
1061 enc = rb_ascii8bit_encoding();
1062 }
1063
1064 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1065
1066 int termlen = rb_enc_mbminlen(enc);
1067
1068 if (STR_EMBEDDABLE_P(len, termlen)) {
1069 str = str_alloc_embed(klass, len + termlen);
1070 if (len == 0) {
1071 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1072 }
1073 }
1074 else {
1075 str = str_alloc_heap(klass);
1076 RSTRING(str)->as.heap.aux.capa = len;
1077 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1078 * integer overflow. If we can STATIC_ASSERT that, the following
1079 * mul_add_mul can be reverted to a simple ALLOC_N. */
1080 RSTRING(str)->as.heap.ptr =
1081 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1082 }
1083
1084 rb_enc_raw_set(str, enc);
1085
1086 if (ptr) {
1087 memcpy(RSTRING_PTR(str), ptr, len);
1088 }
1089 else {
1090 memset(RSTRING_PTR(str), 0, len);
1091 }
1092
1093 STR_SET_LEN(str, len);
1094 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1095 return str;
1096}
1097
1098static VALUE
1099str_new(VALUE klass, const char *ptr, long len)
1100{
1101 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1102}
1103
1104VALUE
1105rb_str_new(const char *ptr, long len)
1106{
1107 return str_new(rb_cString, ptr, len);
1108}
1109
1110VALUE
1111rb_usascii_str_new(const char *ptr, long len)
1112{
1113 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1114}
1115
1116VALUE
1117rb_utf8_str_new(const char *ptr, long len)
1118{
1119 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1120}
1121
1122VALUE
1123rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1124{
1125 return str_enc_new(rb_cString, ptr, len, enc);
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1133 * memory regions, and that cannot be detected by the MSAN. Just
1134 * trust the programmer that the argument passed here is a sane C
1135 * string. */
1136 __msan_unpoison_string(ptr);
1137 return rb_str_new(ptr, strlen(ptr));
1138}
1139
1140VALUE
1142{
1143 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1144}
1145
1146VALUE
1148{
1149 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1150}
1151
1152VALUE
1154{
1155 must_not_null(ptr);
1156 if (rb_enc_mbminlen(enc) != 1) {
1157 rb_raise(rb_eArgError, "wchar encoding given");
1158 }
1159 return rb_enc_str_new(ptr, strlen(ptr), enc);
1160}
1161
1162static VALUE
1163str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1164{
1165 VALUE str;
1166
1167 if (len < 0) {
1168 rb_raise(rb_eArgError, "negative string size (or size too big)");
1169 }
1170
1171 if (!ptr) {
1172 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1173 }
1174 else {
1175 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1176 str = str_alloc_heap(klass);
1177 RSTRING(str)->len = len;
1178 RSTRING(str)->as.heap.ptr = (char *)ptr;
1179 RSTRING(str)->as.heap.aux.capa = len;
1180 RBASIC(str)->flags |= STR_NOFREE;
1181 rb_enc_associate_index(str, encindex);
1182 }
1183 return str;
1184}
1185
1186VALUE
1187rb_str_new_static(const char *ptr, long len)
1188{
1189 return str_new_static(rb_cString, ptr, len, 0);
1190}
1191
1192VALUE
1194{
1195 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1196}
1197
1198VALUE
1200{
1201 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1202}
1203
1204VALUE
1206{
1207 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1208}
1209
1210static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1211 rb_encoding *from, rb_encoding *to,
1212 int ecflags, VALUE ecopts);
1213
1214static inline bool
1215is_enc_ascii_string(VALUE str, rb_encoding *enc)
1216{
1217 int encidx = rb_enc_to_index(enc);
1218 if (rb_enc_get_index(str) == encidx)
1219 return is_ascii_string(str);
1220 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1221}
1222
1223VALUE
1224rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1225{
1226 long len;
1227 const char *ptr;
1228 VALUE newstr;
1229
1230 if (!to) return str;
1231 if (!from) from = rb_enc_get(str);
1232 if (from == to) return str;
1233 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1234 rb_is_ascii8bit_enc(to)) {
1235 if (STR_ENC_GET(str) != to) {
1236 str = rb_str_dup(str);
1237 rb_enc_associate(str, to);
1238 }
1239 return str;
1240 }
1241
1242 RSTRING_GETMEM(str, ptr, len);
1243 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1244 from, to, ecflags, ecopts);
1245 if (NIL_P(newstr)) {
1246 /* some error, return original */
1247 return str;
1248 }
1249 return newstr;
1250}
1251
1252VALUE
1253rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1254 rb_encoding *from, int ecflags, VALUE ecopts)
1255{
1256 long olen;
1257
1258 olen = RSTRING_LEN(newstr);
1259 if (ofs < -olen || olen < ofs)
1260 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1261 if (ofs < 0) ofs += olen;
1262 if (!from) {
1263 STR_SET_LEN(newstr, ofs);
1264 return rb_str_cat(newstr, ptr, len);
1265 }
1266
1267 rb_str_modify(newstr);
1268 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1269 rb_enc_get(newstr),
1270 ecflags, ecopts);
1271}
1272
1273VALUE
1274rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1275{
1276 STR_SET_LEN(str, 0);
1277 rb_enc_associate(str, enc);
1278 rb_str_cat(str, ptr, len);
1279 return str;
1280}
1281
1282static VALUE
1283str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1284 rb_encoding *from, rb_encoding *to,
1285 int ecflags, VALUE ecopts)
1286{
1287 rb_econv_t *ec;
1289 long olen;
1290 VALUE econv_wrapper;
1291 const unsigned char *start, *sp;
1292 unsigned char *dest, *dp;
1293 size_t converted_output = (size_t)ofs;
1294
1295 olen = rb_str_capacity(newstr);
1296
1297 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1298 RBASIC_CLEAR_CLASS(econv_wrapper);
1299 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1300 if (!ec) return Qnil;
1301 DATA_PTR(econv_wrapper) = ec;
1302
1303 sp = (unsigned char*)ptr;
1304 start = sp;
1305 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1306 (dp = dest + converted_output),
1307 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1309 /* destination buffer short */
1310 size_t converted_input = sp - start;
1311 size_t rest = len - converted_input;
1312 converted_output = dp - dest;
1313 rb_str_set_len(newstr, converted_output);
1314 if (converted_input && converted_output &&
1315 rest < (LONG_MAX / converted_output)) {
1316 rest = (rest * converted_output) / converted_input;
1317 }
1318 else {
1319 rest = olen;
1320 }
1321 olen += rest < 2 ? 2 : rest;
1322 rb_str_resize(newstr, olen);
1323 }
1324 DATA_PTR(econv_wrapper) = 0;
1325 RB_GC_GUARD(econv_wrapper);
1326 rb_econv_close(ec);
1327 switch (ret) {
1328 case econv_finished:
1329 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1330 rb_str_set_len(newstr, len);
1331 rb_enc_associate(newstr, to);
1332 return newstr;
1333
1334 default:
1335 return Qnil;
1336 }
1337}
1338
1339VALUE
1341{
1342 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1343}
1344
1345VALUE
1347{
1348 rb_encoding *ienc;
1349 VALUE str;
1350 const int eidx = rb_enc_to_index(eenc);
1351
1352 if (!ptr) {
1353 return rb_enc_str_new(ptr, len, eenc);
1354 }
1355
1356 /* ASCII-8BIT case, no conversion */
1357 if ((eidx == rb_ascii8bit_encindex()) ||
1358 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1359 return rb_str_new(ptr, len);
1360 }
1361 /* no default_internal or same encoding, no conversion */
1362 ienc = rb_default_internal_encoding();
1363 if (!ienc || eenc == ienc) {
1364 return rb_enc_str_new(ptr, len, eenc);
1365 }
1366 /* ASCII compatible, and ASCII only string, no conversion in
1367 * default_internal */
1368 if ((eidx == rb_ascii8bit_encindex()) ||
1369 (eidx == rb_usascii_encindex()) ||
1370 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1371 return rb_enc_str_new(ptr, len, ienc);
1372 }
1373 /* convert from the given encoding to default_internal */
1374 str = rb_enc_str_new(NULL, 0, ienc);
1375 /* when the conversion failed for some reason, just ignore the
1376 * default_internal and result in the given encoding as-is. */
1377 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1378 rb_str_initialize(str, ptr, len, eenc);
1379 }
1380 return str;
1381}
1382
1383VALUE
1384rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1385{
1386 int eidx = rb_enc_to_index(eenc);
1387 if (eidx == rb_usascii_encindex() &&
1388 !is_ascii_string(str)) {
1389 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1390 return str;
1391 }
1392 rb_enc_associate_index(str, eidx);
1393 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1394}
1395
1396VALUE
1397rb_external_str_new(const char *ptr, long len)
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1406}
1407
1408VALUE
1409rb_locale_str_new(const char *ptr, long len)
1410{
1411 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1424}
1425
1426VALUE
1428{
1429 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1430}
1431
1432VALUE
1434{
1435 return rb_str_export_to_enc(str, rb_default_external_encoding());
1436}
1437
1438VALUE
1440{
1441 return rb_str_export_to_enc(str, rb_locale_encoding());
1442}
1443
1444VALUE
1446{
1447 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1448}
1449
1450static VALUE
1451str_replace_shared_without_enc(VALUE str2, VALUE str)
1452{
1453 const int termlen = TERM_LEN(str);
1454 char *ptr;
1455 long len;
1456
1457 RSTRING_GETMEM(str, ptr, len);
1458 if (str_embed_capa(str2) >= len + termlen) {
1459 char *ptr2 = RSTRING(str2)->as.embed.ary;
1460 STR_SET_EMBED(str2);
1461 memcpy(ptr2, RSTRING_PTR(str), len);
1462 TERM_FILL(ptr2+len, termlen);
1463 }
1464 else {
1465 VALUE root;
1466 if (STR_SHARED_P(str)) {
1467 root = RSTRING(str)->as.heap.aux.shared;
1468 RSTRING_GETMEM(str, ptr, len);
1469 }
1470 else {
1471 root = rb_str_new_frozen(str);
1472 RSTRING_GETMEM(root, ptr, len);
1473 }
1474 RUBY_ASSERT(OBJ_FROZEN(root));
1475
1476 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1477 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1478 rb_fatal("about to free a possible shared root");
1479 }
1480 char *ptr2 = STR_HEAP_PTR(str2);
1481 if (ptr2 != ptr) {
1482 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1483 }
1484 }
1485 FL_SET(str2, STR_NOEMBED);
1486 RSTRING(str2)->as.heap.ptr = ptr;
1487 STR_SET_SHARED(str2, root);
1488 }
1489
1490 STR_SET_LEN(str2, len);
1491
1492 return str2;
1493}
1494
1495static VALUE
1496str_replace_shared(VALUE str2, VALUE str)
1497{
1498 str_replace_shared_without_enc(str2, str);
1499 rb_enc_cr_str_exact_copy(str2, str);
1500 return str2;
1501}
1502
1503static VALUE
1504str_new_shared(VALUE klass, VALUE str)
1505{
1506 return str_replace_shared(str_alloc_heap(klass), str);
1507}
1508
1509VALUE
1511{
1512 return str_new_shared(rb_obj_class(str), str);
1513}
1514
1515VALUE
1517{
1518 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1519 return str_new_frozen(rb_obj_class(orig), orig);
1520}
1521
1522static VALUE
1523rb_str_new_frozen_String(VALUE orig)
1524{
1525 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1526 return str_new_frozen(rb_cString, orig);
1527}
1528
1529
1530VALUE
1531rb_str_frozen_bare_string(VALUE orig)
1532{
1533 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1534 return str_new_frozen(rb_cString, orig);
1535}
1536
1537VALUE
1538rb_str_tmp_frozen_acquire(VALUE orig)
1539{
1540 if (OBJ_FROZEN_RAW(orig)) return orig;
1541 return str_new_frozen_buffer(0, orig, FALSE);
1542}
1543
1544VALUE
1545rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1546{
1547 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1548 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1549
1550 VALUE str = str_alloc_heap(0);
1551 OBJ_FREEZE(str);
1552 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1553 FL_SET(str, STR_SHARED_ROOT);
1554
1555 size_t capa = str_capacity(orig, TERM_LEN(orig));
1556
1557 /* If the string is embedded then we want to create a copy that is heap
1558 * allocated. If the string is shared then the shared root must be
1559 * embedded, so we want to create a copy. If the string is a shared root
1560 * then it must be embedded, so we want to create a copy. */
1561 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1562 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1563 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1564 }
1565 else {
1566 /* orig must be heap allocated and not shared, so we can safely transfer
1567 * the pointer to str. */
1568 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1569 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1570 RBASIC(orig)->flags &= ~STR_NOFREE;
1571 STR_SET_SHARED(orig, str);
1572 if (RB_OBJ_SHAREABLE_P(orig)) {
1573 RB_OBJ_SET_SHAREABLE(str);
1574 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1575 }
1576 }
1577
1578 RSTRING(str)->len = RSTRING(orig)->len;
1579 RSTRING(str)->as.heap.aux.capa = capa;
1580
1581 return str;
1582}
1583
1584void
1585rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1586{
1587 if (RBASIC_CLASS(tmp) != 0)
1588 return;
1589
1590 if (STR_EMBED_P(tmp)) {
1592 }
1593 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1594 !OBJ_FROZEN_RAW(orig)) {
1595 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1596
1597 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1598 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1599 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1600
1601 /* Unshare orig since the root (tmp) only has this one child. */
1602 FL_UNSET_RAW(orig, STR_SHARED);
1603 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1604 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1606
1607 /* Make tmp embedded and empty so it is safe for sweeping. */
1608 STR_SET_EMBED(tmp);
1609 STR_SET_LEN(tmp, 0);
1610 }
1611 }
1612}
1613
1614static VALUE
1615str_new_frozen(VALUE klass, VALUE orig)
1616{
1617 return str_new_frozen_buffer(klass, orig, TRUE);
1618}
1619
1620static VALUE
1621heap_str_make_shared(VALUE klass, VALUE orig)
1622{
1623 RUBY_ASSERT(!STR_EMBED_P(orig));
1624 RUBY_ASSERT(!STR_SHARED_P(orig));
1626
1627 VALUE str = str_alloc_heap(klass);
1628 STR_SET_LEN(str, RSTRING_LEN(orig));
1629 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1630 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1631 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1632 RBASIC(orig)->flags &= ~STR_NOFREE;
1633 STR_SET_SHARED(orig, str);
1634 if (klass == 0)
1635 FL_UNSET_RAW(str, STR_BORROWED);
1636 return str;
1637}
1638
1639static VALUE
1640str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1641{
1642 VALUE str;
1643
1644 long len = RSTRING_LEN(orig);
1645 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1646 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1647
1648 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1649 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1650 RUBY_ASSERT(STR_EMBED_P(str));
1651 }
1652 else {
1653 if (FL_TEST_RAW(orig, STR_SHARED)) {
1654 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1655 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1656 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1657 RUBY_ASSERT(ofs >= 0);
1658 RUBY_ASSERT(rest >= 0);
1659 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1661
1662 if ((ofs > 0) || (rest > 0) ||
1663 (klass != RBASIC(shared)->klass) ||
1664 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1665 str = str_new_shared(klass, shared);
1666 RUBY_ASSERT(!STR_EMBED_P(str));
1667 RSTRING(str)->as.heap.ptr += ofs;
1668 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1669 }
1670 else {
1671 if (RBASIC_CLASS(shared) == 0)
1672 FL_SET_RAW(shared, STR_BORROWED);
1673 return shared;
1674 }
1675 }
1676 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1677 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1678 STR_SET_EMBED(str);
1679 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1680 STR_SET_LEN(str, RSTRING_LEN(orig));
1681 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1682 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1683 }
1684 else {
1685 if (RB_OBJ_SHAREABLE_P(orig)) {
1686 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 }
1688 else {
1689 str = heap_str_make_shared(klass, orig);
1690 }
1691 }
1692 }
1693
1694 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1695 OBJ_FREEZE(str);
1696 return str;
1697}
1698
1699VALUE
1700rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1701{
1702 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1703}
1704
1705static VALUE
1706str_new_empty_String(VALUE str)
1707{
1708 VALUE v = rb_str_new(0, 0);
1709 rb_enc_copy(v, str);
1710 return v;
1711}
1712
1713#define STR_BUF_MIN_SIZE 63
1714
1715VALUE
1717{
1718 if (STR_EMBEDDABLE_P(capa, 1)) {
1719 return str_alloc_embed(rb_cString, capa + 1);
1720 }
1721
1722 VALUE str = str_alloc_heap(rb_cString);
1723
1724 RSTRING(str)->as.heap.aux.capa = capa;
1725 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1726 RSTRING(str)->as.heap.ptr[0] = '\0';
1727
1728 return str;
1729}
1730
1731VALUE
1733{
1734 VALUE str;
1735 long len = strlen(ptr);
1736
1737 str = rb_str_buf_new(len);
1738 rb_str_buf_cat(str, ptr, len);
1739
1740 return str;
1741}
1742
1743VALUE
1745{
1746 return str_new(0, 0, len);
1747}
1748
1749void
1751{
1752 if (STR_EMBED_P(str)) {
1753 RB_DEBUG_COUNTER_INC(obj_str_embed);
1754 }
1755 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1756 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1757 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1758 }
1759 else {
1760 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1761 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1762 }
1763}
1764
1765size_t
1766rb_str_memsize(VALUE str)
1767{
1768 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1769 return STR_HEAP_SIZE(str);
1770 }
1771 else {
1772 return 0;
1773 }
1774}
1775
1776VALUE
1778{
1779 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1780}
1781
1782static inline void str_discard(VALUE str);
1783static void str_shared_replace(VALUE str, VALUE str2);
1784
1785void
1787{
1788 if (str != str2) str_shared_replace(str, str2);
1789}
1790
1791static void
1792str_shared_replace(VALUE str, VALUE str2)
1793{
1794 rb_encoding *enc;
1795 int cr;
1796 int termlen;
1797
1798 RUBY_ASSERT(str2 != str);
1799 enc = STR_ENC_GET(str2);
1800 cr = ENC_CODERANGE(str2);
1801 str_discard(str);
1802 termlen = rb_enc_mbminlen(enc);
1803
1804 STR_SET_LEN(str, RSTRING_LEN(str2));
1805
1806 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1807 STR_SET_EMBED(str);
1808 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812 else {
1813 if (STR_EMBED_P(str2)) {
1814 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1815 long len = RSTRING_LEN(str2);
1816 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1817
1818 char *new_ptr = ALLOC_N(char, len + termlen);
1819 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1820 RSTRING(str2)->as.heap.ptr = new_ptr;
1821 STR_SET_LEN(str2, len);
1822 RSTRING(str2)->as.heap.aux.capa = len;
1823 STR_SET_NOEMBED(str2);
1824 }
1825
1826 STR_SET_NOEMBED(str);
1827 FL_UNSET(str, STR_SHARED);
1828 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1829
1830 if (FL_TEST(str2, STR_SHARED)) {
1831 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1832 STR_SET_SHARED(str, shared);
1833 }
1834 else {
1835 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1836 }
1837
1838 /* abandon str2 */
1839 STR_SET_EMBED(str2);
1840 RSTRING_PTR(str2)[0] = 0;
1841 STR_SET_LEN(str2, 0);
1842 rb_enc_associate(str, enc);
1843 ENC_CODERANGE_SET(str, cr);
1844 }
1845}
1846
1847VALUE
1849{
1850 VALUE str;
1851
1852 if (RB_TYPE_P(obj, T_STRING)) {
1853 return obj;
1854 }
1855 str = rb_funcall(obj, idTo_s, 0);
1856 return rb_obj_as_string_result(str, obj);
1857}
1858
1859VALUE
1860rb_obj_as_string_result(VALUE str, VALUE obj)
1861{
1862 if (!RB_TYPE_P(str, T_STRING))
1863 return rb_any_to_s(obj);
1864 return str;
1865}
1866
1867static VALUE
1868str_replace(VALUE str, VALUE str2)
1869{
1870 long len;
1871
1872 len = RSTRING_LEN(str2);
1873 if (STR_SHARED_P(str2)) {
1874 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1876 STR_SET_NOEMBED(str);
1877 STR_SET_LEN(str, len);
1878 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1879 STR_SET_SHARED(str, shared);
1880 rb_enc_cr_str_exact_copy(str, str2);
1881 }
1882 else {
1883 str_replace_shared(str, str2);
1884 }
1885
1886 return str;
1887}
1888
1889static inline VALUE
1890ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1891{
1892 size_t size = rb_str_embed_size(capa, 0);
1893 RUBY_ASSERT(size > 0);
1894 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1895
1896 NEWOBJ_OF(str, struct RString, klass,
1898
1899 str->len = 0;
1900
1901 return (VALUE)str;
1902}
1903
1904static inline VALUE
1905ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1906{
1907 NEWOBJ_OF(str, struct RString, klass,
1908 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1909
1910 str->as.heap.aux.capa = 0;
1911 str->as.heap.ptr = NULL;
1912
1913 return (VALUE)str;
1914}
1915
1916static inline VALUE
1917str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1918{
1919 int encidx = 0;
1920 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1921 encidx = rb_enc_get_index(str);
1922 flags &= ~ENCODING_MASK;
1923 }
1924 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1925 if (encidx) rb_enc_associate_index(dup, encidx);
1926 return dup;
1927}
1928
1929static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1930
1931static inline VALUE
1932str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1933{
1934 VALUE flags = FL_TEST_RAW(str, flag_mask);
1935 long len = RSTRING_LEN(str);
1936
1937 RUBY_ASSERT(STR_EMBED_P(dup));
1938 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1939 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1940 STR_SET_LEN(dup, RSTRING_LEN(str));
1941 return str_duplicate_setup_encoding(str, dup, flags);
1942}
1943
1944static inline VALUE
1945str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1946{
1947 VALUE flags = FL_TEST_RAW(str, flag_mask);
1948 VALUE root = str;
1949 if (FL_TEST_RAW(str, STR_SHARED)) {
1950 root = RSTRING(str)->as.heap.aux.shared;
1951 }
1952 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1953 root = str = str_new_frozen(klass, str);
1954 flags = FL_TEST_RAW(str, flag_mask);
1955 }
1956 RUBY_ASSERT(!STR_SHARED_P(root));
1958
1959 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1960 FL_SET(root, STR_SHARED_ROOT);
1961 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1962 flags |= RSTRING_NOEMBED | STR_SHARED;
1963
1964 STR_SET_LEN(dup, RSTRING_LEN(str));
1965 return str_duplicate_setup_encoding(str, dup, flags);
1966}
1967
1968static inline VALUE
1969str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1970{
1971 if (STR_EMBED_P(str)) {
1972 return str_duplicate_setup_embed(klass, str, dup);
1973 }
1974 else {
1975 return str_duplicate_setup_heap(klass, str, dup);
1976 }
1977}
1978
1979static inline VALUE
1980str_duplicate(VALUE klass, VALUE str)
1981{
1982 VALUE dup;
1983 if (STR_EMBED_P(str)) {
1984 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1985 }
1986 else {
1987 dup = str_alloc_heap(klass);
1988 }
1989
1990 return str_duplicate_setup(klass, str, dup);
1991}
1992
1993VALUE
1995{
1996 return str_duplicate(rb_obj_class(str), str);
1997}
1998
1999/* :nodoc: */
2000VALUE
2001rb_str_dup_m(VALUE str)
2002{
2003 if (LIKELY(BARE_STRING_P(str))) {
2004 return str_duplicate(rb_cString, str);
2005 }
2006 else {
2007 return rb_obj_dup(str);
2008 }
2009}
2010
2011VALUE
2013{
2014 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2015 return str_duplicate(rb_cString, str);
2016}
2017
2018VALUE
2019rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2020{
2021 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2022 VALUE new_str, klass = rb_cString;
2023
2024 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2025 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2026 str_duplicate_setup_embed(klass, str, new_str);
2027 }
2028 else {
2029 new_str = ec_str_alloc_heap(ec, klass);
2030 str_duplicate_setup_heap(klass, str, new_str);
2031 }
2032 if (chilled) {
2033 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2034 }
2035 return new_str;
2036}
2037
2038VALUE
2039rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2040{
2041 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2042 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2043 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2044 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2045 return rb_str_freeze(str);
2046}
2047
2048/*
2049 * The documentation block below uses an include (instead of inline text)
2050 * because the included text has non-ASCII characters (which are not allowed in a C file).
2051 */
2052
2053/*
2054 *
2055 * call-seq:
2056 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2057 *
2058 * :include: doc/string/new.rdoc
2059 *
2060 */
2061
2062static VALUE
2063rb_str_init(int argc, VALUE *argv, VALUE str)
2064{
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, venc, vcapa;
2067 VALUE kwargs[2];
2068 rb_encoding *enc = 0;
2069 int n;
2070
2071 if (!keyword_ids[0]) {
2072 keyword_ids[0] = rb_id_encoding();
2073 CONST_ID(keyword_ids[1], "capacity");
2074 }
2075
2076 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2077 if (!NIL_P(opt)) {
2078 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2079 venc = kwargs[0];
2080 vcapa = kwargs[1];
2081 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2082 enc = rb_to_encoding(venc);
2083 }
2084 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2085 long capa = NUM2LONG(vcapa);
2086 long len = 0;
2087 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2088
2089 if (capa < STR_BUF_MIN_SIZE) {
2090 capa = STR_BUF_MIN_SIZE;
2091 }
2092 if (n == 1) {
2093 StringValue(orig);
2094 len = RSTRING_LEN(orig);
2095 if (capa < len) {
2096 capa = len;
2097 }
2098 if (orig == str) n = 0;
2099 }
2100 str_modifiable(str);
2101 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2102 /* make noembed always */
2103 const size_t size = (size_t)capa + termlen;
2104 const char *const old_ptr = RSTRING_PTR(str);
2105 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2106 char *new_ptr = ALLOC_N(char, size);
2107 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2108 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2109 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2110 RSTRING(str)->as.heap.ptr = new_ptr;
2111 }
2112 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2113 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2114 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2115 }
2116 STR_SET_LEN(str, len);
2117 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2118 if (n == 1) {
2119 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2120 rb_enc_cr_str_exact_copy(str, orig);
2121 }
2122 FL_SET(str, STR_NOEMBED);
2123 RSTRING(str)->as.heap.aux.capa = capa;
2124 }
2125 else if (n == 1) {
2126 rb_str_replace(str, orig);
2127 }
2128 if (enc) {
2129 rb_enc_associate(str, enc);
2131 }
2132 }
2133 else if (n == 1) {
2134 rb_str_replace(str, orig);
2135 }
2136 return str;
2137}
2138
2139/* :nodoc: */
2140static VALUE
2141rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2142{
2143 if (klass != rb_cString) {
2144 return rb_class_new_instance_pass_kw(argc, argv, klass);
2145 }
2146
2147 static ID keyword_ids[2];
2148 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2149 VALUE kwargs[2];
2150 rb_encoding *enc = NULL;
2151
2152 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2153 if (NIL_P(opt)) {
2154 return rb_class_new_instance_pass_kw(argc, argv, klass);
2155 }
2156
2157 keyword_ids[0] = rb_id_encoding();
2158 CONST_ID(keyword_ids[1], "capacity");
2159 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2160 encoding = kwargs[0];
2161 capacity = kwargs[1];
2162
2163 if (n == 1) {
2164 orig = StringValue(orig);
2165 }
2166 else {
2167 orig = Qnil;
2168 }
2169
2170 if (UNDEF_P(encoding)) {
2171 if (!NIL_P(orig)) {
2172 encoding = rb_obj_encoding(orig);
2173 }
2174 }
2175
2176 if (!UNDEF_P(encoding)) {
2177 enc = rb_to_encoding(encoding);
2178 }
2179
2180 // If capacity is nil, we're basically just duping `orig`.
2181 if (UNDEF_P(capacity)) {
2182 if (NIL_P(orig)) {
2183 VALUE empty_str = str_new(klass, "", 0);
2184 if (enc) {
2185 rb_enc_associate(empty_str, enc);
2186 }
2187 return empty_str;
2188 }
2189 VALUE copy = str_duplicate(klass, orig);
2190 rb_enc_associate(copy, enc);
2191 ENC_CODERANGE_CLEAR(copy);
2192 return copy;
2193 }
2194
2195 long capa = 0;
2196 capa = NUM2LONG(capacity);
2197 if (capa < 0) {
2198 capa = 0;
2199 }
2200
2201 if (!NIL_P(orig)) {
2202 long orig_capa = rb_str_capacity(orig);
2203 if (orig_capa > capa) {
2204 capa = orig_capa;
2205 }
2206 }
2207
2208 VALUE str = str_enc_new(klass, NULL, capa, enc);
2209 STR_SET_LEN(str, 0);
2210 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2211
2212 if (!NIL_P(orig)) {
2213 rb_str_buf_append(str, orig);
2214 }
2215
2216 return str;
2217}
2218
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    uintptr_t lead_bits = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead_bits);
#else
    lead_bits += (lead_bits >> 8);
    lead_bits += (lead_bits >> 16);
# if SIZEOF_VOIDP == 8
    lead_bits += (lead_bits >> 32);
# endif
    return (lead_bits & 0xF);
#endif
}
#endif
2258
2259static inline long
2260enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2261{
2262 long c;
2263 const char *q;
2264
2265 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266 long diff = (long)(e - p);
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268 }
2269#ifdef NONASCII_MASK
2270 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2271 uintptr_t len = 0;
2272 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2273 const uintptr_t *s, *t;
2274 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2275 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2276 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2277 while (p < (const char *)s) {
2278 if (is_utf8_lead_byte(*p)) len++;
2279 p++;
2280 }
2281 while (s < t) {
2282 len += count_utf8_lead_bytes_with_word(s);
2283 s++;
2284 }
2285 p = (const char *)s;
2286 }
2287 while (p < e) {
2288 if (is_utf8_lead_byte(*p)) len++;
2289 p++;
2290 }
2291 return (long)len;
2292 }
2293#endif
2294 else if (rb_enc_asciicompat(enc)) {
2295 c = 0;
2296 if (ENC_CODERANGE_CLEAN_P(cr)) {
2297 while (p < e) {
2298 if (ISASCII(*p)) {
2299 q = search_nonascii(p, e);
2300 if (!q)
2301 return c + (e - p);
2302 c += q - p;
2303 p = q;
2304 }
2305 p += rb_enc_fast_mbclen(p, e, enc);
2306 c++;
2307 }
2308 }
2309 else {
2310 while (p < e) {
2311 if (ISASCII(*p)) {
2312 q = search_nonascii(p, e);
2313 if (!q)
2314 return c + (e - p);
2315 c += q - p;
2316 p = q;
2317 }
2318 p += rb_enc_mbclen(p, e, enc);
2319 c++;
2320 }
2321 }
2322 return c;
2323 }
2324
2325 for (c=0; p<e; c++) {
2326 p += rb_enc_mbclen(p, e, enc);
2327 }
2328 return c;
2329}
2330
2331long
2332rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2333{
2334 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2335}
2336
2337/* To get strlen with cr
2338 * Note that given cr is not used.
2339 */
2340long
2341rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2342{
2343 long c;
2344 const char *q;
2345 int ret;
2346
2347 *cr = 0;
2348 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2349 long diff = (long)(e - p);
2350 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2351 }
2352 else if (rb_enc_asciicompat(enc)) {
2353 c = 0;
2354 while (p < e) {
2355 if (ISASCII(*p)) {
2356 q = search_nonascii(p, e);
2357 if (!q) {
2358 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2359 return c + (e - p);
2360 }
2361 c += q - p;
2362 p = q;
2363 }
2364 ret = rb_enc_precise_mbclen(p, e, enc);
2365 if (MBCLEN_CHARFOUND_P(ret)) {
2366 *cr |= ENC_CODERANGE_VALID;
2367 p += MBCLEN_CHARFOUND_LEN(ret);
2368 }
2369 else {
2371 p++;
2372 }
2373 c++;
2374 }
2375 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2376 return c;
2377 }
2378
2379 for (c=0; p<e; c++) {
2380 ret = rb_enc_precise_mbclen(p, e, enc);
2381 if (MBCLEN_CHARFOUND_P(ret)) {
2382 *cr |= ENC_CODERANGE_VALID;
2383 p += MBCLEN_CHARFOUND_LEN(ret);
2384 }
2385 else {
2387 if (p + rb_enc_mbminlen(enc) <= e)
2388 p += rb_enc_mbminlen(enc);
2389 else
2390 p = e;
2391 }
2392 }
2393 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2394 return c;
2395}
2396
2397/* enc must be str's enc or rb_enc_check(str, str2) */
2398static long
2399str_strlen(VALUE str, rb_encoding *enc)
2400{
2401 const char *p, *e;
2402 int cr;
2403
2404 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2405 if (!enc) enc = STR_ENC_GET(str);
2406 p = RSTRING_PTR(str);
2407 e = RSTRING_END(str);
2408 cr = ENC_CODERANGE(str);
2409
2410 if (cr == ENC_CODERANGE_UNKNOWN) {
2411 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2412 if (cr) ENC_CODERANGE_SET(str, cr);
2413 return n;
2414 }
2415 else {
2416 return enc_strlen(p, e, enc, cr);
2417 }
2418}
2419
2420long
2422{
2423 return str_strlen(str, NULL);
2424}
2425
2426/*
2427 * call-seq:
2428 * length -> integer
2429 *
2430 * :include: doc/string/length.rdoc
2431 *
2432 */
2433
2434VALUE
2436{
2437 return LONG2NUM(str_strlen(str, NULL));
2438}
2439
2440/*
2441 * call-seq:
2442 * bytesize -> integer
2443 *
2444 * :include: doc/string/bytesize.rdoc
2445 *
2446 */
2447
2448VALUE
2449rb_str_bytesize(VALUE str)
2450{
2451 return LONG2NUM(RSTRING_LEN(str));
2452}
2453
2454/*
2455 * call-seq:
2456 * empty? -> true or false
2457 *
2458 * Returns whether the length of +self+ is zero:
2459 *
2460 * 'hello'.empty? # => false
2461 * ' '.empty? # => false
2462 * ''.empty? # => true
2463 *
2464 * Related: see {Querying}[rdoc-ref:String@Querying].
2465 */
2466
2467static VALUE
2468rb_str_empty(VALUE str)
2469{
2470 return RBOOL(RSTRING_LEN(str) == 0);
2471}
2472
2473/*
2474 * call-seq:
2475 * self + other_string -> new_string
2476 *
2477 * Returns a new string containing +other_string+ concatenated to +self+:
2478 *
2479 * 'Hello from ' + self.to_s # => "Hello from main"
2480 *
2481 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2482 */
2483
2484VALUE
2486{
2487 VALUE str3;
2488 rb_encoding *enc;
2489 char *ptr1, *ptr2, *ptr3;
2490 long len1, len2;
2491 int termlen;
2492
2493 StringValue(str2);
2494 enc = rb_enc_check_str(str1, str2);
2495 RSTRING_GETMEM(str1, ptr1, len1);
2496 RSTRING_GETMEM(str2, ptr2, len2);
2497 termlen = rb_enc_mbminlen(enc);
2498 if (len1 > LONG_MAX - len2) {
2499 rb_raise(rb_eArgError, "string size too big");
2500 }
2501 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2502 ptr3 = RSTRING_PTR(str3);
2503 memcpy(ptr3, ptr1, len1);
2504 memcpy(ptr3+len1, ptr2, len2);
2505 TERM_FILL(&ptr3[len1+len2], termlen);
2506
2507 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2509 RB_GC_GUARD(str1);
2510 RB_GC_GUARD(str2);
2511 return str3;
2512}
2513
2514/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2515VALUE
2516rb_str_opt_plus(VALUE str1, VALUE str2)
2517{
2520 long len1, len2;
2521 MAYBE_UNUSED(char) *ptr1, *ptr2;
2522 RSTRING_GETMEM(str1, ptr1, len1);
2523 RSTRING_GETMEM(str2, ptr2, len2);
2524 int enc1 = rb_enc_get_index(str1);
2525 int enc2 = rb_enc_get_index(str2);
2526
2527 if (enc1 < 0) {
2528 return Qundef;
2529 }
2530 else if (enc2 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc1 != enc2) {
2534 return Qundef;
2535 }
2536 else if (len1 > LONG_MAX - len2) {
2537 return Qundef;
2538 }
2539 else {
2540 return rb_str_plus(str1, str2);
2541 }
2542
2543}
2544
2545/*
2546 * call-seq:
2547 * self * n -> new_string
2548 *
2549 * Returns a new string containing +n+ copies of +self+:
2550 *
2551 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2552 * 'No!' * 0 # => ""
2553 *
2554 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2555 */
2556
2557VALUE
2559{
2560 VALUE str2;
2561 long n, len;
2562 char *ptr2;
2563 int termlen;
2564
2565 if (times == INT2FIX(1)) {
2566 return str_duplicate(rb_cString, str);
2567 }
2568 if (times == INT2FIX(0)) {
2569 str2 = str_alloc_embed(rb_cString, 0);
2570 rb_enc_copy(str2, str);
2571 return str2;
2572 }
2573 len = NUM2LONG(times);
2574 if (len < 0) {
2575 rb_raise(rb_eArgError, "negative argument");
2576 }
2577 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2578 if (STR_EMBEDDABLE_P(len, 1)) {
2579 str2 = str_alloc_embed(rb_cString, len + 1);
2580 memset(RSTRING_PTR(str2), 0, len + 1);
2581 }
2582 else {
2583 str2 = str_alloc_heap(rb_cString);
2584 RSTRING(str2)->as.heap.aux.capa = len;
2585 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2586 }
2587 STR_SET_LEN(str2, len);
2588 rb_enc_copy(str2, str);
2589 return str2;
2590 }
2591 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2592 rb_raise(rb_eArgError, "argument too big");
2593 }
2594
2595 len *= RSTRING_LEN(str);
2596 termlen = TERM_LEN(str);
2597 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2598 ptr2 = RSTRING_PTR(str2);
2599 if (len) {
2600 n = RSTRING_LEN(str);
2601 memcpy(ptr2, RSTRING_PTR(str), n);
2602 while (n <= len/2) {
2603 memcpy(ptr2 + n, ptr2, n);
2604 n *= 2;
2605 }
2606 memcpy(ptr2 + n, ptr2, len-n);
2607 }
2608 STR_SET_LEN(str2, len);
2609 TERM_FILL(&ptr2[len], termlen);
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2611
2612 return str2;
2613}
2614
2615/*
2616 * call-seq:
2617 * self % object -> new_string
2618 *
2619 * Returns the result of formatting +object+ into the format specifications
2620 * contained in +self+
2621 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2622 *
2623 * '%05d' % 123 # => "00123"
2624 *
2625 * If +self+ contains multiple format specifications,
2626 * +object+ must be an array or hash containing the objects to be formatted:
2627 *
2628 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2629 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2630 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2631 *
2632 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2633 */
2634
2635static VALUE
2636rb_str_format_m(VALUE str, VALUE arg)
2637{
2638 VALUE tmp = rb_check_array_type(arg);
2639
2640 if (!NIL_P(tmp)) {
2641 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2642 }
2643 return rb_str_format(1, &arg, str);
2644}
2645
2646static inline void
2647rb_check_lockedtmp(VALUE str)
2648{
2649 if (FL_TEST(str, STR_TMPLOCK)) {
2650 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2651 }
2652}
2653
2654// If none of these flags are set, we know we have an modifiable string.
2655// If any is set, we need to do more detailed checks.
2656#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2657static inline void
2658str_modifiable(VALUE str)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2663 if (CHILLED_STRING_P(str)) {
2664 CHILLED_STRING_MUTATED(str);
2665 }
2666 rb_check_lockedtmp(str);
2667 rb_check_frozen(str);
2668 }
2669}
2670
2671static inline int
2672str_dependent_p(VALUE str)
2673{
2674 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2675 return FALSE;
2676 }
2677 else {
2678 return TRUE;
2679 }
2680}
2681
2682// If none of these flags are set, we know we have an independent string.
2683// If any is set, we need to do more detailed checks.
2684#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2685static inline int
2686str_independent(VALUE str)
2687{
2688 RUBY_ASSERT(ruby_thread_has_gvl_p());
2689
2690 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2691 str_modifiable(str);
2692 return !str_dependent_p(str);
2693 }
2694 return TRUE;
2695}
2696
/* Give str a buffer of its own holding its first len bytes, sized for
 * len + expand bytes plus a termlen-byte terminator.
 *
 * When str is currently not embedded and the target size fits the embedded
 * capacity, str is switched to the embedded representation.  Otherwise a
 * fresh heap buffer is allocated, STR_SHARED and STR_NOFREE are cleared,
 * and the capacity is recorded as len + expand.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    RUBY_ASSERT(ruby_thread_has_gvl_p());

    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* With a negative expand the capacity is below len: keep only what fits. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* The heap pointer is read before the switch to embedded
         * (presumably because the two layouts overlap -- confirm). */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Release the old buffer only when STR_NOEMBED is set and neither
     * STR_SHARED nor STR_NOFREE is. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2732
2733void
2734rb_str_modify(VALUE str)
2735{
2736 if (!str_independent(str))
2737 str_make_independent(str);
2739}
2740
2741void
2743{
2744 RUBY_ASSERT(ruby_thread_has_gvl_p());
2745
2746 int termlen = TERM_LEN(str);
2747 long len = RSTRING_LEN(str);
2748
2749 if (expand < 0) {
2750 rb_raise(rb_eArgError, "negative expanding string size");
2751 }
2752 if (expand >= LONG_MAX - len) {
2753 rb_raise(rb_eArgError, "string size too big");
2754 }
2755
2756 if (!str_independent(str)) {
2757 str_make_independent_expand(str, len, expand, termlen);
2758 }
2759 else if (expand > 0) {
2760 RESIZE_CAPA_TERM(str, len + expand, termlen);
2761 }
2763}
2764
2765/* As rb_str_modify(), but don't clear coderange */
2766static void
2767str_modify_keep_cr(VALUE str)
2768{
2769 if (!str_independent(str))
2770 str_make_independent(str);
2772 /* Force re-scan later */
2774}
2775
2776static inline void
2777str_discard(VALUE str)
2778{
2779 str_modifiable(str);
2780 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2781 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2782 RSTRING(str)->as.heap.ptr = 0;
2783 STR_SET_LEN(str, 0);
2784 }
2785}
2786
2787void
2789{
2790 int encindex = rb_enc_get_index(str);
2791
2792 if (RB_UNLIKELY(encindex == -1)) {
2793 rb_raise(rb_eTypeError, "not encoding capable object");
2794 }
2795
2796 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2797 return;
2798 }
2799
2800 rb_encoding *enc = rb_enc_from_index(encindex);
2801 if (!rb_enc_asciicompat(enc)) {
2802 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2803 }
2804}
2805
2806VALUE
2808{
2809 RUBY_ASSERT(ruby_thread_has_gvl_p());
2810
2811 VALUE s = *ptr;
2812 if (!RB_TYPE_P(s, T_STRING)) {
2813 s = rb_str_to_str(s);
2814 *ptr = s;
2815 }
2816 return s;
2817}
2818
2819char *
2821{
2822 VALUE str = rb_string_value(ptr);
2823 return RSTRING_PTR(str);
2824}
2825
/* Return 1 when the first n bytes at s are all zero (also when n <= 0),
 * 0 as soon as a non-zero byte is seen. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2834
2835static const char *
2836str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2837{
2838 const char *e = s + len;
2839
2840 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2841 if (zero_filled(s, minlen)) return s;
2842 }
2843 return 0;
2844}
2845
2846static char *
2847str_fill_term(VALUE str, char *s, long len, int termlen)
2848{
2849 /* This function assumes that (capa + termlen) bytes of memory
2850 * is allocated, like many other functions in this file.
2851 */
2852 if (str_dependent_p(str)) {
2853 if (!zero_filled(s + len, termlen))
2854 str_make_independent_expand(str, len, 0L, termlen);
2855 }
2856 else {
2857 TERM_FILL(s + len, termlen);
2858 return s;
2859 }
2860 return RSTRING_PTR(str);
2861}
2862
/* Adjust the buffer of str for a terminator width changing from
 * oldtermlen to termlen bytes.  The content and length are kept. */
void
rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
{
    /* Total bytes available, old terminator included. */
    long capa = str_capacity(str, oldtermlen) + oldtermlen;
    long len = RSTRING_LEN(str);

    RUBY_ASSERT(capa >= len);
    if (capa - len < termlen) {
        /* No room for the new terminator: reallocate (refused while the
         * string is temporarily locked). */
        rb_check_lockedtmp(str);
        str_make_independent_expand(str, len, 0L, termlen);
    }
    else if (str_dependent_p(str)) {
        /* A dependent buffer is only copied when the terminator grows. */
        if (termlen > oldtermlen)
            str_make_independent_expand(str, len, 0L, termlen);
    }
    else {
        if (!STR_EMBED_P(str)) {
            /* modify capa instead of realloc */
            RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
            RSTRING(str)->as.heap.aux.capa = capa - termlen;
        }
        if (termlen > oldtermlen) {
            TERM_FILL(RSTRING_PTR(str) + len, termlen);
        }
    }

    return;
}
2891
2892static char *
2893str_null_check(VALUE str, int *w)
2894{
2895 char *s = RSTRING_PTR(str);
2896 long len = RSTRING_LEN(str);
2897 rb_encoding *enc = rb_enc_get(str);
2898 const int minlen = rb_enc_mbminlen(enc);
2899
2900 if (minlen > 1) {
2901 *w = 1;
2902 if (str_null_char(s, len, minlen, enc)) {
2903 return NULL;
2904 }
2905 return str_fill_term(str, s, len, minlen);
2906 }
2907 *w = 0;
2908 if (!s || memchr(s, 0, len)) {
2909 return NULL;
2910 }
2911 if (s[len]) {
2912 s = str_fill_term(str, s, len, minlen);
2913 }
2914 return s;
2915}
2916
2917char *
2918rb_str_to_cstr(VALUE str)
2919{
2920 int w;
2921 return str_null_check(str, &w);
2922}
2923
2924char *
2926{
2927 VALUE str = rb_string_value(ptr);
2928 int w;
2929 char *s = str_null_check(str, &w);
2930 if (!s) {
2931 if (w) {
2932 rb_raise(rb_eArgError, "string contains null char");
2933 }
2934 rb_raise(rb_eArgError, "string contains null byte");
2935 }
2936 return s;
2937}
2938
2939char *
2940rb_str_fill_terminator(VALUE str, const int newminlen)
2941{
2942 char *s = RSTRING_PTR(str);
2943 long len = RSTRING_LEN(str);
2944 return str_fill_term(str, s, len, newminlen);
2945}
2946
2947VALUE
2949{
2950 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2951 return str;
2952}
2953
2954/*
2955 * call-seq:
2956 * String.try_convert(object) -> object, new_string, or nil
2957 *
2958 * Attempts to convert the given +object+ to a string.
2959 *
2960 * If +object+ is already a string, returns +object+, unmodified.
2961 *
2962 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2963 * calls <tt>object.to_str</tt> and returns the result.
2964 *
2965 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2966 *
2967 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2968 */
2969static VALUE
2970rb_str_s_try_convert(VALUE dummy, VALUE str)
2971{
2972 return rb_check_string_type(str);
2973}
2974
/* Move forward from p by *nthp characters of encoding enc, never
 * returning a position past e.  The local counter is written back to
 * *nthp on return; how it is updated differs per branch (see below).
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* Single-byte encoding: plain pointer arithmetic; nth is not changed. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: nth is not changed either. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2: where we would land if every remaining character were one byte. */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Skip an ASCII run, bounded by e2. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* Generic walk.  Note the post-decrement: when the loop stops
         * because nth reached 0, nth is left at -1. */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
3024
3025char*
3026rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3027{
3028 return str_nth_len(p, e, &nth, enc);
3029}
3030
3031static char*
3032str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3033{
3034 if (singlebyte)
3035 p += nth;
3036 else {
3037 p = str_nth_len(p, e, &nth, enc);
3038 }
3039 if (!p) return 0;
3040 if (p > e) p = e;
3041 return (char *)p;
3042}
3043
3044/* char offset to byte offset */
3045static long
3046str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3047{
3048 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3049 if (!pp) return e - p;
3050 return pp - p;
3051}
3052
3053long
3054rb_str_offset(VALUE str, long pos)
3055{
3056 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3057 STR_ENC_GET(str), single_byte_optimizable(str));
3058}
3059
3060#ifdef NONASCII_MASK
/* Find the position of the *nthp-th lead byte (as judged by
 * is_utf8_lead_byte()) in [p, e).  The number of lead bytes still
 * outstanding is written back to *nthp.
 * NOTE(review): callers in this chunk only use this on strings whose
 * coderange is ENC_CODERANGE_VALID -- presumably a precondition. */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Word-at-a-time scan, taken only when both the buffer and the count are
     * larger than two pointer widths. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up, t: e rounded down, to a word boundary. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Unaligned head, byte by byte. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Whole words, while at least one word's worth of count remains. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Remainder, byte by byte; stop on the lead byte reached with nth == 0. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
3090
/* Distance in bytes from p to the position str_utf8_nth() reports for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long counter = nth;
    const char *const hit = str_utf8_nth(p, e, &counter);

    return hit - p;
}
3097#endif
3098
3099/* byte offset to char offset */
3100long
3101rb_str_sublen(VALUE str, long pos)
3102{
3103 if (single_byte_optimizable(str) || pos < 0)
3104 return pos;
3105 else {
3106 char *p = RSTRING_PTR(str);
3107 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3108 }
3109}
3110
/* Build a new String from the len bytes of str starting at byte offset
 * beg (both asserted non-negative and within str).  The bytes are copied
 * unless SHARABLE_SUBSTRING_P() allows sharing and the result does not fit
 * the embedded capacity, in which case the result shares str's buffer. */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        /* Not sharable: plain copy. */
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough to embed: copy into the new object itself. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share the buffer, then narrow the view to [beg, beg+len). */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
3149
3150VALUE
3151rb_str_subseq(VALUE str, long beg, long len)
3152{
3153 VALUE str2 = str_subseq(str, beg, len);
3154 rb_enc_cr_str_copy_for_substr(str2, str);
3155 return str2;
3156}
3157
/* Translate a character-based range (beg, *lenp) on str into a pointer
 * into str's buffer and a byte length.  A negative beg counts from the end.
 * Returns the start pointer and stores the byte length in *lenp, or
 * returns 0 (leaving *lenp untouched) when the range is out of bounds or
 * *lenp is negative. */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    const long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    /* Rejects a negative beg whose negation is still negative. */
    if (beg < 0 && -beg < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* Characters are bytes: clamp directly. */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        if (len > -beg) len = -beg;
        if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
            (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
            /* Valid string and a range near the end: walk backwards from
             * the end instead of counting the whole string. */
            beg = -beg;
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* Convert to a non-negative character index via the full length. */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > blen) {
        /* A character index beyond the byte length is out of range. */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
             enc == rb_utf8_encoding()) {
        /* Valid UTF-8: use the lead-byte counting helpers. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: scale by the character size. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* Landed on the end: valid only when no characters were left over. */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
3244
3245static VALUE str_substr(VALUE str, long beg, long len, int empty);
3246
3247VALUE
3248rb_str_substr(VALUE str, long beg, long len)
3249{
3250 return str_substr(str, beg, len, TRUE);
3251}
3252
3253VALUE
3254rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3255{
3256 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3257}
3258
3259static VALUE
3260str_substr(VALUE str, long beg, long len, int empty)
3261{
3262 char *p = rb_str_subpos(str, beg, &len);
3263
3264 if (!p) return Qnil;
3265 if (!len && !empty) return Qnil;
3266
3267 beg = p - RSTRING_PTR(str);
3268
3269 VALUE str2 = str_subseq(str, beg, len);
3270 rb_enc_cr_str_copy_for_substr(str2, str);
3271 return str2;
3272}
3273
3274/* :nodoc: */
3275VALUE
3277{
3278 if (CHILLED_STRING_P(str)) {
3279 FL_UNSET_RAW(str, STR_CHILLED);
3280 }
3281
3282 if (OBJ_FROZEN(str)) return str;
3283 rb_str_resize(str, RSTRING_LEN(str));
3284 return rb_obj_freeze(str);
3285}
3286
3287/*
3288 * call-seq:
3289 * +string -> new_string or self
3290 *
3291 * Returns +self+ if +self+ is not frozen and can be mutated
3292 * without warning issuance.
3293 *
3294 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3295 *
3296 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3297 */
3298static VALUE
3299str_uplus(VALUE str)
3300{
3301 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3302 return rb_str_dup(str);
3303 }
3304 else {
3305 return str;
3306 }
3307}
3308
3309/*
3310 * call-seq:
3311 * -self -> frozen_string
3312 *
3313 * Returns a frozen string equal to +self+.
3314 *
3315 * The returned string is +self+ if and only if all of the following are true:
3316 *
3317 * - +self+ is already frozen.
3318 * - +self+ is an instance of \String (rather than of a subclass of \String)
3319 * - +self+ has no instance variables set on it.
3320 *
3321 * Otherwise, the returned string is a frozen copy of +self+.
3322 *
3323 * Returning +self+, when possible, saves duplicating +self+;
3324 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3325 *
3326 * It may also save duplicating other, already-existing, strings:
3327 *
3328 * s0 = 'foo'
3329 * s1 = 'foo'
3330 * s0.object_id == s1.object_id # => false
3331 * (-s0).object_id == (-s1).object_id # => true
3332 *
3333 * Note that method #-@ is convenient for defining a constant:
3334 *
3335 * FileName = -'config/database.yml'
3336 *
3337 * While its alias #dedup is better suited for chaining:
3338 *
3339 * 'foo'.dedup.gsub!('o')
3340 *
3341 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3342 */
3343static VALUE
3344str_uminus(VALUE str)
3345{
3346 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3347 str = rb_str_dup(str);
3348 }
3349 return rb_fstring(str);
3350}
3351
/* rb_str_dup_frozen is declared as an alias of rb_str_new_frozen; the
 * macro below additionally maps the name onto rb_str_new_frozen for the
 * code that follows. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3354
3355VALUE
3357{
3358 rb_check_frozen(str);
3359 if (FL_TEST(str, STR_TMPLOCK)) {
3360 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3361 }
3362 FL_SET(str, STR_TMPLOCK);
3363 return str;
3364}
3365
3366VALUE
3368{
3369 rb_check_frozen(str);
3370 if (!FL_TEST(str, STR_TMPLOCK)) {
3371 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3372 }
3373 FL_UNSET(str, STR_TMPLOCK);
3374 return str;
3375}
3376
3377VALUE
3378rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3379{
3380 rb_str_locktmp(str);
3381 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3382}
3383
3384void
3386{
3387 RUBY_ASSERT(ruby_thread_has_gvl_p());
3388
3389 long capa;
3390 const int termlen = TERM_LEN(str);
3391
3392 str_modifiable(str);
3393 if (STR_SHARED_P(str)) {
3394 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3395 }
3396 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3397 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3398 }
3399
3400 int cr = ENC_CODERANGE(str);
3401 if (len == 0) {
3402 /* Empty string does not contain non-ASCII */
3404 }
3405 else if (cr == ENC_CODERANGE_UNKNOWN) {
3406 /* Leave unknown. */
3407 }
3408 else if (len > RSTRING_LEN(str)) {
3409 if (ENC_CODERANGE_CLEAN_P(cr)) {
3410 /* Update the coderange regarding the extended part. */
3411 const char *const prev_end = RSTRING_END(str);
3412 const char *const new_end = RSTRING_PTR(str) + len;
3413 rb_encoding *enc = rb_enc_get(str);
3414 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3415 ENC_CODERANGE_SET(str, cr);
3416 }
3417 else if (cr == ENC_CODERANGE_BROKEN) {
3418 /* May be valid now, by appended part. */
3420 }
3421 }
3422 else if (len < RSTRING_LEN(str)) {
3423 if (cr != ENC_CODERANGE_7BIT) {
3424 /* ASCII-only string is keeping after truncated. Valid
3425 * and broken may be invalid or valid, leave unknown. */
3427 }
3428 }
3429
3430 STR_SET_LEN(str, len);
3431 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3432}
3433
3434VALUE
3435rb_str_resize(VALUE str, long len)
3436{
3437 if (len < 0) {
3438 rb_raise(rb_eArgError, "negative string size (or size too big)");
3439 }
3440
3441 int independent = str_independent(str);
3442 long slen = RSTRING_LEN(str);
3443 const int termlen = TERM_LEN(str);
3444
3445 if (slen > len || (termlen != 1 && slen < len)) {
3447 }
3448
3449 {
3450 long capa;
3451 if (STR_EMBED_P(str)) {
3452 if (len == slen) return str;
3453 if (str_embed_capa(str) >= len + termlen) {
3454 STR_SET_LEN(str, len);
3455 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3456 return str;
3457 }
3458 str_make_independent_expand(str, slen, len - slen, termlen);
3459 }
3460 else if (str_embed_capa(str) >= len + termlen) {
3461 char *ptr = STR_HEAP_PTR(str);
3462 STR_SET_EMBED(str);
3463 if (slen > len) slen = len;
3464 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3465 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3466 STR_SET_LEN(str, len);
3467 if (independent) ruby_xfree(ptr);
3468 return str;
3469 }
3470 else if (!independent) {
3471 if (len == slen) return str;
3472 str_make_independent_expand(str, slen, len - slen, termlen);
3473 }
3474 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3475 (capa - len) > (len < 1024 ? len : 1024)) {
3476 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3477 (size_t)len + termlen, STR_HEAP_SIZE(str));
3478 RSTRING(str)->as.heap.aux.capa = len;
3479 }
3480 else if (len == slen) return str;
3481 STR_SET_LEN(str, len);
3482 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3483 }
3484 return str;
3485}
3486
3487static void
3488str_ensure_available_capa(VALUE str, long len)
3489{
3490 str_modify_keep_cr(str);
3491
3492 const int termlen = TERM_LEN(str);
3493 long olen = RSTRING_LEN(str);
3494
3495 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498
3499 long total = olen + len;
3500 long capa = str_capacity(str, termlen);
3501
3502 if (capa < total) {
3503 if (total >= LONG_MAX / 2) {
3504 capa = total;
3505 }
3506 while (total > capa) {
3507 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3508 }
3509 RESIZE_CAPA_TERM(str, capa, termlen);
3510 }
3511}
3512
/* Append len bytes at ptr to str, growing the buffer as needed, and return
 * str.  With keep_cr the coderange is kept (str_modify_keep_cr());
 * otherwise rb_str_modify() is used.  ptr may point into str's own buffer.
 * Raises ArgumentError when the combined size would overflow a long.
 * NOTE(review): for len == 0 this returns 0 rather than str -- confirm no
 * caller relies on the return value in that case. */
static VALUE
str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
{
    if (keep_cr) {
        str_modify_keep_cr(str);
    }
    else {
        rb_str_modify(str);
    }
    if (len == 0) return 0;

    long total, olen, off = -1;
    char *sptr;
    const int termlen = TERM_LEN(str);

    RSTRING_GETMEM(str, sptr, olen);
    /* Source lies inside str itself: remember it as an offset, because
     * the buffer may move when it is resized below. */
    if (ptr >= sptr && ptr <= sptr + olen) {
        off = ptr - sptr;
    }

    long capa = str_capacity(str, termlen);

    if (olen > LONG_MAX - len) {
        rb_raise(rb_eArgError, "string sizes too big");
    }
    total = olen + len;
    if (capa < total) {
        if (total >= LONG_MAX / 2) {
            capa = total;
        }
        while (total > capa) {
            capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
        }
        RESIZE_CAPA_TERM(str, capa, termlen);
        sptr = RSTRING_PTR(str);
    }
    /* Re-derive the source pointer from the saved offset. */
    if (off != -1) {
        ptr = sptr + off;
    }
    memcpy(sptr + olen, ptr, len);
    STR_SET_LEN(str, total);
    TERM_FILL(sptr + total, termlen); /* sentinel */

    return str;
}
3558
/* Shorthands for str_buf_cat4() with keep_cr == false.
 * str_buf_cat2() takes the length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3561
3562VALUE
3563rb_str_cat(VALUE str, const char *ptr, long len)
3564{
3565 if (len == 0) return str;
3566 if (len < 0) {
3567 rb_raise(rb_eArgError, "negative string size (or size too big)");
3568 }
3569 return str_buf_cat(str, ptr, len);
3570}
3571
3572VALUE
3573rb_str_cat_cstr(VALUE str, const char *ptr)
3574{
3575 must_not_null(ptr);
3576 return rb_str_buf_cat(str, ptr, strlen(ptr));
3577}
3578
3579static void
3580rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3581{
3582 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3583
3584 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3585 if (UNLIKELY(!str_independent(str))) {
3586 str_make_independent(str);
3587 }
3588
3589 long string_length = -1;
3590 const int null_terminator_length = 1;
3591 char *sptr;
3592 RSTRING_GETMEM(str, sptr, string_length);
3593
3594 // Ensure the resulting string wouldn't be too long.
3595 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3596 rb_raise(rb_eArgError, "string sizes too big");
3597 }
3598
3599 long string_capacity = str_capacity(str, null_terminator_length);
3600
3601 // Get the code range before any modifications since those might clear the code range.
3602 int cr = ENC_CODERANGE(str);
3603
3604 // Check if the string has spare string_capacity to write the new byte.
3605 if (LIKELY(string_capacity >= string_length + 1)) {
3606 // In fast path we can write the new byte and note the string's new length.
3607 sptr[string_length] = byte;
3608 STR_SET_LEN(str, string_length + 1);
3609 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3610 }
3611 else {
3612 // If there's not enough string_capacity, make a call into the general string concatenation function.
3613 str_buf_cat(str, (char *)&byte, 1);
3614 }
3615
3616 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3617 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3618 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3619 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3620 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3621 if (ISASCII(byte)) {
3623 }
3624 else {
3626
3627 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3628 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3629 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3630 }
3631 }
3632 }
3633}
3634
/* Alternative names: rb_str_buf_cat for rb_str_cat; rb_str_buf_cat2 and
 * rb_str_cat2 for rb_str_cat_cstr. */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3638
3639static VALUE
3640rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3641 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3642{
3643 int str_encindex = ENCODING_GET(str);
3644 int res_encindex;
3645 int str_cr, res_cr;
3646 rb_encoding *str_enc, *ptr_enc;
3647
3648 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3649
3650 if (str_encindex == ptr_encindex) {
3651 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3652 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3653 }
3654 }
3655 else {
3656 str_enc = rb_enc_from_index(str_encindex);
3657 ptr_enc = rb_enc_from_index(ptr_encindex);
3658 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3659 if (len == 0)
3660 return str;
3661 if (RSTRING_LEN(str) == 0) {
3662 rb_str_buf_cat(str, ptr, len);
3663 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3664 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3665 return str;
3666 }
3667 goto incompatible;
3668 }
3669 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3670 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3671 }
3672 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3673 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3674 str_cr = rb_enc_str_coderange(str);
3675 }
3676 }
3677 }
3678 if (ptr_cr_ret)
3679 *ptr_cr_ret = ptr_cr;
3680
3681 if (str_encindex != ptr_encindex &&
3682 str_cr != ENC_CODERANGE_7BIT &&
3683 ptr_cr != ENC_CODERANGE_7BIT) {
3684 str_enc = rb_enc_from_index(str_encindex);
3685 ptr_enc = rb_enc_from_index(ptr_encindex);
3686 goto incompatible;
3687 }
3688
3689 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3690 res_encindex = str_encindex;
3691 res_cr = ENC_CODERANGE_UNKNOWN;
3692 }
3693 else if (str_cr == ENC_CODERANGE_7BIT) {
3694 if (ptr_cr == ENC_CODERANGE_7BIT) {
3695 res_encindex = str_encindex;
3696 res_cr = ENC_CODERANGE_7BIT;
3697 }
3698 else {
3699 res_encindex = ptr_encindex;
3700 res_cr = ptr_cr;
3701 }
3702 }
3703 else if (str_cr == ENC_CODERANGE_VALID) {
3704 res_encindex = str_encindex;
3705 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3706 res_cr = str_cr;
3707 else
3708 res_cr = ptr_cr;
3709 }
3710 else { /* str_cr == ENC_CODERANGE_BROKEN */
3711 res_encindex = str_encindex;
3712 res_cr = str_cr;
3713 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3714 }
3715
3716 if (len < 0) {
3717 rb_raise(rb_eArgError, "negative string size (or size too big)");
3718 }
3719 str_buf_cat(str, ptr, len);
3720 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3721 return str;
3722
3723 incompatible:
3724 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3725 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3727}
3728
3729VALUE
3730rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3731{
3732 return rb_enc_cr_str_buf_cat(str, ptr, len,
3733 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3734}
3735
3736VALUE
3738{
3739 /* ptr must reference NUL terminated ASCII string. */
3740 int encindex = ENCODING_GET(str);
3741 rb_encoding *enc = rb_enc_from_index(encindex);
3742 if (rb_enc_asciicompat(enc)) {
3743 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3744 encindex, ENC_CODERANGE_7BIT, 0);
3745 }
3746 else {
3747 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3748 while (*ptr) {
3749 unsigned int c = (unsigned char)*ptr;
3750 int len = rb_enc_codelen(c, enc);
3751 rb_enc_mbcput(c, buf, enc);
3752 rb_enc_cr_str_buf_cat(str, buf, len,
3753 encindex, ENC_CODERANGE_VALID, 0);
3754 ptr++;
3755 }
3756 return str;
3757 }
3758}
3759
3760VALUE
3762{
3763 int str2_cr = rb_enc_str_coderange(str2);
3764
3765 if (str_enc_fastpath(str)) {
3766 switch (str2_cr) {
3767 case ENC_CODERANGE_7BIT:
3768 // If RHS is 7bit we can do simple concatenation
3769 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3770 RB_GC_GUARD(str2);
3771 return str;
3773 // If RHS is valid, we can do simple concatenation if encodings are the same
3774 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3775 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3776 int str_cr = ENC_CODERANGE(str);
3777 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3778 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3779 }
3780 RB_GC_GUARD(str2);
3781 return str;
3782 }
3783 }
3784 }
3785
3786 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3787 ENCODING_GET(str2), str2_cr, &str2_cr);
3788
3789 ENC_CODERANGE_SET(str2, str2_cr);
3790
3791 return str;
3792}
3793
3794VALUE
3796{
3797 StringValue(str2);
3798 return rb_str_buf_append(str, str2);
3799}
3800
3801VALUE
3802rb_str_concat_literals(size_t num, const VALUE *strary)
3803{
3804 VALUE str;
3805 size_t i, s = 0;
3806 unsigned long len = 1;
3807
3808 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3809 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3810
3811 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3812 str = rb_str_buf_new(len);
3813 str_enc_copy_direct(str, strary[0]);
3814
3815 for (i = s; i < num; ++i) {
3816 const VALUE v = strary[i];
3817 int encidx = ENCODING_GET(v);
3818
3819 rb_str_buf_append(str, v);
3820 if (encidx != ENCINDEX_US_ASCII) {
3821 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3822 rb_enc_set_index(str, encidx);
3823 }
3824 }
3825 return str;
3826}
3827
3828/*
3829 * call-seq:
3830 * concat(*objects) -> string
3831 *
3832 * :include: doc/string/concat.rdoc
3833 */
3834static VALUE
3835rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3836{
3837 str_modifiable(str);
3838
3839 if (argc == 1) {
3840 return rb_str_concat(str, argv[0]);
3841 }
3842 else if (argc > 1) {
3843 int i;
3844 VALUE arg_str = rb_str_tmp_new(0);
3845 rb_enc_copy(arg_str, str);
3846 for (i = 0; i < argc; i++) {
3847 rb_str_concat(arg_str, argv[i]);
3848 }
3849 rb_str_buf_append(str, arg_str);
3850 }
3851
3852 return str;
3853}
3854
3855/*
3856 * call-seq:
3857 * append_as_bytes(*objects) -> self
3858 *
3859 * Concatenates each object in +objects+ into +self+; returns +self+;
3860 * performs no encoding validation or conversion:
3861 *
3862 * s = 'foo'
3863 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3864 * s.valid_encoding? # => false
3865 * s.append_as_bytes("\xAC 12")
3866 * s.valid_encoding? # => true
3867 *
3868 * When a given object is an integer,
3869 * the value is considered an 8-bit byte;
3870 * if the integer occupies more than one byte (i.e., is greater than 255),
3871 * appends only the low-order byte (similar to String#setbyte):
3872 *
3873 * s = ""
3874 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3875 * s.bytesize # => 2
3876 *
3877 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3878 */
3879
3880VALUE
3881rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3882{
3883 long needed_capacity = 0;
3884 volatile VALUE t0;
3885 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3886
3887 for (int index = 0; index < argc; index++) {
3888 VALUE obj = argv[index];
3889 enum ruby_value_type type = types[index] = rb_type(obj);
3890 switch (type) {
3891 case T_FIXNUM:
3892 case T_BIGNUM:
3893 needed_capacity++;
3894 break;
3895 case T_STRING:
3896 needed_capacity += RSTRING_LEN(obj);
3897 break;
3898 default:
3899 rb_raise(
3901 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3902 rb_obj_class(obj)
3903 );
3904 break;
3905 }
3906 }
3907
3908 str_ensure_available_capa(str, needed_capacity);
3909 char *sptr = RSTRING_END(str);
3910
3911 for (int index = 0; index < argc; index++) {
3912 VALUE obj = argv[index];
3913 enum ruby_value_type type = types[index];
3914 switch (type) {
3915 case T_FIXNUM:
3916 case T_BIGNUM: {
3917 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3918 char byte = (char)(NUM2INT(obj) & 0xFF);
3919 *sptr = byte;
3920 sptr++;
3921 break;
3922 }
3923 case T_STRING: {
3924 const char *ptr;
3925 long len;
3926 RSTRING_GETMEM(obj, ptr, len);
3927 memcpy(sptr, ptr, len);
3928 sptr += len;
3929 break;
3930 }
3931 default:
3932 rb_bug("append_as_bytes arguments should have been validated");
3933 }
3934 }
3935
3936 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3937 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3938
3939 int cr = ENC_CODERANGE(str);
3940 switch (cr) {
3941 case ENC_CODERANGE_7BIT: {
3942 for (int index = 0; index < argc; index++) {
3943 VALUE obj = argv[index];
3944 enum ruby_value_type type = types[index];
3945 switch (type) {
3946 case T_FIXNUM:
3947 case T_BIGNUM: {
3948 if (!ISASCII(NUM2INT(obj))) {
3949 goto clear_cr;
3950 }
3951 break;
3952 }
3953 case T_STRING: {
3954 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3955 goto clear_cr;
3956 }
3957 break;
3958 }
3959 default:
3960 rb_bug("append_as_bytes arguments should have been validated");
3961 }
3962 }
3963 break;
3964 }
3966 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3967 goto keep_cr;
3968 }
3969 else {
3970 goto clear_cr;
3971 }
3972 break;
3973 default:
3974 goto clear_cr;
3975 break;
3976 }
3977
3978 RB_GC_GUARD(t0);
3979
3980 clear_cr:
3981 // If no fast path was hit, we clear the coderange.
3982 // append_as_bytes is predominently meant to be used in
3983 // buffering situation, hence it's likely the coderange
3984 // will never be scanned, so it's not worth spending time
3985 // precomputing the coderange except for simple and common
3986 // situations.
3988 keep_cr:
3989 return str;
3990}
3991
3992/*
3993 * call-seq:
3994 * self << object -> self
3995 *
3996 * Appends a string representation of +object+ to +self+;
3997 * returns +self+.
3998 *
3999 * If +object+ is a string, appends it to +self+:
4000 *
4001 * s = 'foo'
4002 * s << 'bar' # => "foobar"
4003 * s # => "foobar"
4004 *
4005 * If +object+ is an integer,
4006 * its value is considered a codepoint;
4007 * converts the value to a character before concatenating:
4008 *
4009 * s = 'foo'
4010 * s << 33 # => "foo!"
4011 *
4012 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4013 * and the encoding of +self+ is Encoding::US_ASCII,
4014 * changes the encoding to Encoding::ASCII_8BIT:
4015 *
4016 * s = 'foo'.encode(Encoding::US_ASCII)
4017 * s.encoding # => #<Encoding:US-ASCII>
4018 * s << 0xff # => "foo\xFF"
4019 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4020 *
4021 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4022 *
4023 * s = 'foo'
4024 * s.encoding # => <Encoding:UTF-8>
4025 * s << 0x00110000 # 1114112 out of char range (RangeError)
4026 * s = 'foo'.encode(Encoding::EUC_JP)
4027 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4028 *
4029 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4030 */
4031VALUE
4033{
4034 unsigned int code;
4035 rb_encoding *enc = STR_ENC_GET(str1);
4036 int encidx;
4037
4038 if (RB_INTEGER_TYPE_P(str2)) {
4039 if (rb_num_to_uint(str2, &code) == 0) {
4040 }
4041 else if (FIXNUM_P(str2)) {
4042 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4043 }
4044 else {
4045 rb_raise(rb_eRangeError, "bignum out of char range");
4046 }
4047 }
4048 else {
4049 return rb_str_append(str1, str2);
4050 }
4051
4052 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4053
4054 if (encidx >= 0) {
4055 rb_str_buf_cat_byte(str1, (unsigned char)code);
4056 }
4057 else {
4058 long pos = RSTRING_LEN(str1);
4059 int cr = ENC_CODERANGE(str1);
4060 int len;
4061 char *buf;
4062
4063 switch (len = rb_enc_codelen(code, enc)) {
4064 case ONIGERR_INVALID_CODE_POINT_VALUE:
4065 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4066 break;
4067 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4068 case 0:
4069 rb_raise(rb_eRangeError, "%u out of char range", code);
4070 break;
4071 }
4072 buf = ALLOCA_N(char, len + 1);
4073 rb_enc_mbcput(code, buf, enc);
4074 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4075 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4076 }
4077 rb_str_resize(str1, pos+len);
4078 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4079 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4081 }
4082 else if (cr == ENC_CODERANGE_BROKEN) {
4084 }
4085 ENC_CODERANGE_SET(str1, cr);
4086 }
4087 return str1;
4088}
4089
4090int
4091rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4092{
4093 int encidx = rb_enc_to_index(enc);
4094
4095 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4096 /* US-ASCII automatically extended to ASCII-8BIT */
4097 if (code > 0xFF) {
4098 rb_raise(rb_eRangeError, "%u out of char range", code);
4099 }
4100 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4101 return ENCINDEX_ASCII_8BIT;
4102 }
4103 return encidx;
4104 }
4105 else {
4106 return -1;
4107 }
4108}
4109
4110/*
4111 * call-seq:
4112 * prepend(*other_strings) -> self
4113 *
4114 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4115 *
4116 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4117 *
4118 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4119 *
4120 */
4121
4122static VALUE
4123rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4124{
4125 str_modifiable(str);
4126
4127 if (argc == 1) {
4128 rb_str_update(str, 0L, 0L, argv[0]);
4129 }
4130 else if (argc > 1) {
4131 int i;
4132 VALUE arg_str = rb_str_tmp_new(0);
4133 rb_enc_copy(arg_str, str);
4134 for (i = 0; i < argc; i++) {
4135 rb_str_append(arg_str, argv[i]);
4136 }
4137 rb_str_update(str, 0L, 0L, arg_str);
4138 }
4139
4140 return str;
4141}
4142
4143st_index_t
4145{
4146 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4147 st_index_t precomputed_hash;
4148 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4149
4150 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4151 return precomputed_hash;
4152 }
4153
4154 return str_do_hash(str);
4155}
4156
4157int
4159{
4160 long len1, len2;
4161 const char *ptr1, *ptr2;
4162 RSTRING_GETMEM(str1, ptr1, len1);
4163 RSTRING_GETMEM(str2, ptr2, len2);
4164 return (len1 != len2 ||
4165 !rb_str_comparable(str1, str2) ||
4166 memcmp(ptr1, ptr2, len1) != 0);
4167}
4168
4169/*
4170 * call-seq:
4171 * hash -> integer
4172 *
4173 * :include: doc/string/hash.rdoc
4174 *
4175 */
4176
4177static VALUE
4178rb_str_hash_m(VALUE str)
4179{
4180 st_index_t hval = rb_str_hash(str);
4181 return ST2FIX(hval);
4182}
4183
/* Smaller of two values; each argument may be evaluated more than once. */
#define lesser(a,b) (((a) < (b)) ? (a) : (b))
4185
4186int
4188{
4189 int idx1, idx2;
4190 int rc1, rc2;
4191
4192 if (RSTRING_LEN(str1) == 0) return TRUE;
4193 if (RSTRING_LEN(str2) == 0) return TRUE;
4194 idx1 = ENCODING_GET(str1);
4195 idx2 = ENCODING_GET(str2);
4196 if (idx1 == idx2) return TRUE;
4197 rc1 = rb_enc_str_coderange(str1);
4198 rc2 = rb_enc_str_coderange(str2);
4199 if (rc1 == ENC_CODERANGE_7BIT) {
4200 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4201 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4202 return TRUE;
4203 }
4204 if (rc2 == ENC_CODERANGE_7BIT) {
4205 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4206 return TRUE;
4207 }
4208 return FALSE;
4209}
4210
4211int
4213{
4214 long len1, len2;
4215 const char *ptr1, *ptr2;
4216 int retval;
4217
4218 if (str1 == str2) return 0;
4219 RSTRING_GETMEM(str1, ptr1, len1);
4220 RSTRING_GETMEM(str2, ptr2, len2);
4221 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4222 if (len1 == len2) {
4223 if (!rb_str_comparable(str1, str2)) {
4224 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4225 return 1;
4226 return -1;
4227 }
4228 return 0;
4229 }
4230 if (len1 > len2) return 1;
4231 return -1;
4232 }
4233 if (retval > 0) return 1;
4234 return -1;
4235}
4236
4237/*
4238 * call-seq:
4239 * self == object -> true or false
4240 *
4241 * Returns whether +object+ is equal to +self+.
4242 *
4243 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4244 *
4245 * s = 'foo'
4246 * s == 'foo' # => true
4247 * s == 'food' # => false
4248 * s == 'FOO' # => false
4249 *
4250 * Returns +false+ if the two strings' encodings are not compatible:
4251 *
4252 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4253 *
4254 * When +object+ is not a string:
4255 *
4256 * - If +object+ responds to method <tt>to_str</tt>,
4257 * <tt>object == self</tt> is called and its return value is returned.
4258 * - If +object+ does not respond to <tt>to_str</tt>,
4259 * +false+ is returned.
4260 *
4261 * Related: {Comparing}[rdoc-ref:String@Comparing].
4262 */
4263
4264VALUE
4266{
4267 if (str1 == str2) return Qtrue;
4268 if (!RB_TYPE_P(str2, T_STRING)) {
4269 if (!rb_respond_to(str2, idTo_str)) {
4270 return Qfalse;
4271 }
4272 return rb_equal(str2, str1);
4273 }
4274 return rb_str_eql_internal(str1, str2);
4275}
4276
4277/*
4278 * call-seq:
4279 * eql?(object) -> true or false
4280 *
4281 * :include: doc/string/eql_p.rdoc
4282 *
4283 */
4284
4285VALUE
4286rb_str_eql(VALUE str1, VALUE str2)
4287{
4288 if (str1 == str2) return Qtrue;
4289 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4290 return rb_str_eql_internal(str1, str2);
4291}
4292
4293/*
4294 * call-seq:
4295 * self <=> other_string -> -1, 0, 1, or nil
4296 *
4297 * Compares +self+ and +other_string+, returning:
4298 *
4299 * - -1 if +other_string+ is larger.
4300 * - 0 if the two are equal.
4301 * - 1 if +other_string+ is smaller.
4302 * - +nil+ if the two are incomparable.
4303 *
4304 * Examples:
4305 *
4306 * 'foo' <=> 'foo' # => 0
4307 * 'foo' <=> 'food' # => -1
4308 * 'food' <=> 'foo' # => 1
4309 * 'FOO' <=> 'foo' # => -1
4310 * 'foo' <=> 'FOO' # => 1
4311 * 'foo' <=> 1 # => nil
4312 *
4313 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4314 */
4315
4316static VALUE
4317rb_str_cmp_m(VALUE str1, VALUE str2)
4318{
4319 int result;
4320 VALUE s = rb_check_string_type(str2);
4321 if (NIL_P(s)) {
4322 return rb_invcmp(str1, str2);
4323 }
4324 result = rb_str_cmp(str1, s);
4325 return INT2FIX(result);
4326}
4327
4328static VALUE str_casecmp(VALUE str1, VALUE str2);
4329static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4330
4331/*
4332 * call-seq:
4333 * casecmp(other_string) -> -1, 0, 1, or nil
4334 *
4335 * Ignoring case, compares +self+ and +other_string+; returns:
4336 *
4337 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4338 * - 0 if the two are equal.
4339 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4340 * - +nil+ if the two are incomparable.
4341 *
4342 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4343 *
4344 * Examples:
4345 *
4346 * 'foo'.casecmp('goo') # => -1
4347 * 'goo'.casecmp('foo') # => 1
4348 * 'foo'.casecmp('food') # => -1
4349 * 'food'.casecmp('foo') # => 1
4350 * 'FOO'.casecmp('foo') # => 0
4351 * 'foo'.casecmp('FOO') # => 0
4352 * 'foo'.casecmp(1) # => nil
4353 *
4354 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4355 */
4356
4357static VALUE
4358rb_str_casecmp(VALUE str1, VALUE str2)
4359{
4360 VALUE s = rb_check_string_type(str2);
4361 if (NIL_P(s)) {
4362 return Qnil;
4363 }
4364 return str_casecmp(str1, s);
4365}
4366
4367static VALUE
4368str_casecmp(VALUE str1, VALUE str2)
4369{
4370 long len;
4371 rb_encoding *enc;
4372 const char *p1, *p1end, *p2, *p2end;
4373
4374 enc = rb_enc_compatible(str1, str2);
4375 if (!enc) {
4376 return Qnil;
4377 }
4378
4379 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4380 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4381 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4382 while (p1 < p1end && p2 < p2end) {
4383 if (*p1 != *p2) {
4384 unsigned int c1 = TOLOWER(*p1 & 0xff);
4385 unsigned int c2 = TOLOWER(*p2 & 0xff);
4386 if (c1 != c2)
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4388 }
4389 p1++;
4390 p2++;
4391 }
4392 }
4393 else {
4394 while (p1 < p1end && p2 < p2end) {
4395 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4396 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4397
4398 if (0 <= c1 && 0 <= c2) {
4399 c1 = TOLOWER(c1);
4400 c2 = TOLOWER(c2);
4401 if (c1 != c2)
4402 return INT2FIX(c1 < c2 ? -1 : 1);
4403 }
4404 else {
4405 int r;
4406 l1 = rb_enc_mbclen(p1, p1end, enc);
4407 l2 = rb_enc_mbclen(p2, p2end, enc);
4408 len = l1 < l2 ? l1 : l2;
4409 r = memcmp(p1, p2, len);
4410 if (r != 0)
4411 return INT2FIX(r < 0 ? -1 : 1);
4412 if (l1 != l2)
4413 return INT2FIX(l1 < l2 ? -1 : 1);
4414 }
4415 p1 += l1;
4416 p2 += l2;
4417 }
4418 }
4419 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4420 if (p1 == p1end) return INT2FIX(-1);
4421 return INT2FIX(1);
4422}
4423
4424/*
4425 * call-seq:
4426 * casecmp?(other_string) -> true, false, or nil
4427 *
4428 * Returns +true+ if +self+ and +other_string+ are equal after
4429 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4430 *
4431 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4432 *
4433 * Examples:
4434 *
4435 * 'foo'.casecmp?('goo') # => false
4436 * 'goo'.casecmp?('foo') # => false
4437 * 'foo'.casecmp?('food') # => false
4438 * 'food'.casecmp?('foo') # => false
4439 * 'FOO'.casecmp?('foo') # => true
4440 * 'foo'.casecmp?('FOO') # => true
4441 * 'foo'.casecmp?(1) # => nil
4442 *
4443 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4444 */
4445
4446static VALUE
4447rb_str_casecmp_p(VALUE str1, VALUE str2)
4448{
4449 VALUE s = rb_check_string_type(str2);
4450 if (NIL_P(s)) {
4451 return Qnil;
4452 }
4453 return str_casecmp_p(str1, s);
4454}
4455
4456static VALUE
4457str_casecmp_p(VALUE str1, VALUE str2)
4458{
4459 rb_encoding *enc;
4460 VALUE folded_str1, folded_str2;
4461 VALUE fold_opt = sym_fold;
4462
4463 enc = rb_enc_compatible(str1, str2);
4464 if (!enc) {
4465 return Qnil;
4466 }
4467
4468 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4469 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4470
4471 return rb_str_eql(folded_str1, folded_str2);
4472}
4473
4474static long
4475strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4476 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4477{
4478 const char *search_start = str_ptr;
4479 long pos, search_len = str_len - offset;
4480
4481 for (;;) {
4482 const char *t;
4483 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4484 if (pos < 0) return pos;
4485 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4486 if (t == search_start + pos) break;
4487 search_len -= t - search_start;
4488 if (search_len <= 0) return -1;
4489 offset += t - search_start;
4490 search_start = t;
4491 }
4492 return pos + offset;
4493}
4494
/*
 * Both macros forward to rb_strseq_index; the last argument is its
 * in_byte flag: 0 for rb_str_index, 1 for rb_str_byteindex.
 */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4498
4499static long
4500rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4501{
4502 const char *str_ptr, *str_ptr_end, *sub_ptr;
4503 long str_len, sub_len;
4504 rb_encoding *enc;
4505
4506 enc = rb_enc_check(str, sub);
4507 if (is_broken_string(sub)) return -1;
4508
4509 str_ptr = RSTRING_PTR(str);
4510 str_ptr_end = RSTRING_END(str);
4511 str_len = RSTRING_LEN(str);
4512 sub_ptr = RSTRING_PTR(sub);
4513 sub_len = RSTRING_LEN(sub);
4514
4515 if (str_len < sub_len) return -1;
4516
4517 if (offset != 0) {
4518 long str_len_char, sub_len_char;
4519 int single_byte = single_byte_optimizable(str);
4520 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4521 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4522 if (offset < 0) {
4523 offset += str_len_char;
4524 if (offset < 0) return -1;
4525 }
4526 if (str_len_char - offset < sub_len_char) return -1;
4527 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4528 str_ptr += offset;
4529 }
4530 if (sub_len == 0) return offset;
4531
4532 /* need proceed one character at a time */
4533 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4534}
4535
4536
4537/*
4538 * call-seq:
4539 * index(pattern, offset = 0) -> integer or nil
4540 *
4541 * :include: doc/string/index.rdoc
4542 *
4543 */
4544
4545static VALUE
4546rb_str_index_m(int argc, VALUE *argv, VALUE str)
4547{
4548 VALUE sub;
4549 VALUE initpos;
4550 rb_encoding *enc = STR_ENC_GET(str);
4551 long pos;
4552
4553 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4554 long slen = str_strlen(str, enc); /* str's enc */
4555 pos = NUM2LONG(initpos);
4556 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4557 if (RB_TYPE_P(sub, T_REGEXP)) {
4559 }
4560 return Qnil;
4561 }
4562 }
4563 else {
4564 pos = 0;
4565 }
4566
4567 if (RB_TYPE_P(sub, T_REGEXP)) {
4568 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4569 enc, single_byte_optimizable(str));
4570
4571 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4572 VALUE match = rb_backref_get();
4573 struct re_registers *regs = RMATCH_REGS(match);
4574 pos = rb_str_sublen(str, BEG(0));
4575 return LONG2NUM(pos);
4576 }
4577 }
4578 else {
4579 StringValue(sub);
4580 pos = rb_str_index(str, sub, pos);
4581 if (pos >= 0) {
4582 pos = rb_str_sublen(str, pos);
4583 return LONG2NUM(pos);
4584 }
4585 }
4586 return Qnil;
4587}
4588
4589/* Ensure that the given pos is a valid character boundary.
4590 * Note that in this function, "character" means a code point
4591 * (Unicode scalar value), not a grapheme cluster.
4592 */
4593static void
4594str_ensure_byte_pos(VALUE str, long pos)
4595{
4596 if (!single_byte_optimizable(str)) {
4597 const char *s = RSTRING_PTR(str);
4598 const char *e = RSTRING_END(str);
4599 const char *p = s + pos;
4600 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4601 rb_raise(rb_eIndexError,
4602 "offset %ld does not land on character boundary", pos);
4603 }
4604 }
4605}
4606
4607/*
4608 * call-seq:
4609 * byteindex(object, offset = 0) -> integer or nil
4610 *
4611 * Returns the 0-based integer index of a substring of +self+
4612 * specified by +object+ (a string or Regexp) and +offset+,
4613 * or +nil+ if there is no such substring;
4614 * the returned index is the count of _bytes_ (not characters).
4615 *
4616 * When +object+ is a string,
4617 * returns the index of the first found substring equal to +object+:
4618 *
4619 * s = 'foo' # => "foo"
4620 * s.size # => 3 # Three 1-byte characters.
4621 * s.bytesize # => 3 # Three bytes.
4622 * s.byteindex('f') # => 0
4623 * s.byteindex('o') # => 1
4624 * s.byteindex('oo') # => 1
4625 * s.byteindex('ooo') # => nil
4626 *
4627 * When +object+ is a Regexp,
4628 * returns the index of the first found substring matching +object+;
4629 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4630 *
4631 * s = 'foo'
4632 * s.byteindex(/f/) # => 0
4633 * $~ # => #<MatchData "f">
4634 * s.byteindex(/o/) # => 1
4635 * s.byteindex(/oo/) # => 1
4636 * s.byteindex(/ooo/) # => nil
4637 * $~ # => nil
4638 *
4639 * \Integer argument +offset+, if given, specifies the 0-based index
4640 * of the byte where searching is to begin.
4641 *
4642 * When +offset+ is non-negative,
4643 * searching begins at byte position +offset+:
4644 *
4645 * s = 'foo'
4646 * s.byteindex('o', 1) # => 1
4647 * s.byteindex('o', 2) # => 2
4648 * s.byteindex('o', 3) # => nil
4649 *
4650 * When +offset+ is negative, counts backward from the end of +self+:
4651 *
4652 * s = 'foo'
4653 * s.byteindex('o', -1) # => 2
4654 * s.byteindex('o', -2) # => 1
4655 * s.byteindex('o', -3) # => 1
4656 * s.byteindex('o', -4) # => nil
4657 *
4658 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4659 *
4660 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4661 * s.size # => 2 # Two 3-byte characters.
4662 * s.bytesize # => 6 # Six bytes.
4663 * s.byteindex("\uFFFF") # => 0
4664 * s.byteindex("\uFFFF", 1) # Raises IndexError
4665 * s.byteindex("\uFFFF", 2) # Raises IndexError
4666 * s.byteindex("\uFFFF", 3) # => 3
4667 * s.byteindex("\uFFFF", 4) # Raises IndexError
4668 * s.byteindex("\uFFFF", 5) # Raises IndexError
4669 * s.byteindex("\uFFFF", 6) # => nil
4670 *
4671 * Related: see {Querying}[rdoc-ref:String@Querying].
4672 */
4673
4674static VALUE
4675rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4676{
4677 VALUE sub;
4678 VALUE initpos;
4679 long pos;
4680
4681 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4682 long slen = RSTRING_LEN(str);
4683 pos = NUM2LONG(initpos);
4684 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4685 if (RB_TYPE_P(sub, T_REGEXP)) {
4687 }
4688 return Qnil;
4689 }
4690 }
4691 else {
4692 pos = 0;
4693 }
4694
4695 str_ensure_byte_pos(str, pos);
4696
4697 if (RB_TYPE_P(sub, T_REGEXP)) {
4698 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4699 VALUE match = rb_backref_get();
4700 struct re_registers *regs = RMATCH_REGS(match);
4701 pos = BEG(0);
4702 return LONG2NUM(pos);
4703 }
4704 }
4705 else {
4706 StringValue(sub);
4707 pos = rb_str_byteindex(str, sub, pos);
4708 if (pos >= 0) return LONG2NUM(pos);
4709 }
4710 return Qnil;
4711}
4712
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3): returns a pointer to the last
 * occurrence of the byte +chr+ within the first +search_len+ bytes of
 * +search_str+, or NULL when the byte does not occur (or +search_len+ is
 * not positive).
 *
 * As with the libc function, +chr+ is converted to unsigned char before
 * comparing, so a value that came from a plain (possibly signed) char
 * still matches the corresponding byte.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
4725
4726static long
4727str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4728{
4729 char *hit, *adjusted;
4730 int c;
4731 long slen, searchlen;
4732 char *sbeg, *e, *t;
4733
4734 sbeg = RSTRING_PTR(str);
4735 slen = RSTRING_LEN(sub);
4736 if (slen == 0) return s - sbeg;
4737 e = RSTRING_END(str);
4738 t = RSTRING_PTR(sub);
4739 c = *t & 0xff;
4740 searchlen = s - sbeg + 1;
4741
4742 if (memcmp(s, t, slen) == 0) {
4743 return s - sbeg;
4744 }
4745
4746 do {
4747 hit = memrchr(sbeg, c, searchlen);
4748 if (!hit) break;
4749 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4750 if (hit != adjusted) {
4751 searchlen = adjusted - sbeg;
4752 continue;
4753 }
4754 if (memcmp(hit, t, slen) == 0)
4755 return hit - sbeg;
4756 searchlen = adjusted - sbeg;
4757 } while (searchlen > 0);
4758
4759 return -1;
4760}
4761
4762/* found index in byte */
4763static long
4764rb_str_rindex(VALUE str, VALUE sub, long pos)
4765{
4766 long len, slen;
4767 char *sbeg, *s;
4768 rb_encoding *enc;
4769 int singlebyte;
4770
4771 enc = rb_enc_check(str, sub);
4772 if (is_broken_string(sub)) return -1;
4773 singlebyte = single_byte_optimizable(str);
4774 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4775 slen = str_strlen(sub, enc); /* rb_enc_check */
4776
4777 /* substring longer than string */
4778 if (len < slen) return -1;
4779 if (len - pos < slen) pos = len - slen;
4780 if (len == 0) return pos;
4781
4782 sbeg = RSTRING_PTR(str);
4783
4784 if (pos == 0) {
4785 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4786 return 0;
4787 else
4788 return -1;
4789 }
4790
4791 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4792 return str_rindex(str, sub, s, enc);
4793}
4794
4795/*
4796 * call-seq:
4797 * rindex(pattern, offset = self.length) -> integer or nil
4798 *
4799 * :include:doc/string/rindex.rdoc
4800 *
4801 */
4802
4803static VALUE
4804rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4805{
4806 VALUE sub;
4807 VALUE initpos;
4808 rb_encoding *enc = STR_ENC_GET(str);
4809 long pos, len = str_strlen(str, enc); /* str's enc */
4810
4811 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4812 pos = NUM2LONG(initpos);
4813 if (pos < 0 && (pos += len) < 0) {
4814 if (RB_TYPE_P(sub, T_REGEXP)) {
4816 }
4817 return Qnil;
4818 }
4819 if (pos > len) pos = len;
4820 }
4821 else {
4822 pos = len;
4823 }
4824
4825 if (RB_TYPE_P(sub, T_REGEXP)) {
4826 /* enc = rb_enc_check(str, sub); */
4827 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4828 enc, single_byte_optimizable(str));
4829
4830 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4831 VALUE match = rb_backref_get();
4832 struct re_registers *regs = RMATCH_REGS(match);
4833 pos = rb_str_sublen(str, BEG(0));
4834 return LONG2NUM(pos);
4835 }
4836 }
4837 else {
4838 StringValue(sub);
4839 pos = rb_str_rindex(str, sub, pos);
4840 if (pos >= 0) {
4841 pos = rb_str_sublen(str, pos);
4842 return LONG2NUM(pos);
4843 }
4844 }
4845 return Qnil;
4846}
4847
4848static long
4849rb_str_byterindex(VALUE str, VALUE sub, long pos)
4850{
4851 long len, slen;
4852 char *sbeg, *s;
4853 rb_encoding *enc;
4854
4855 enc = rb_enc_check(str, sub);
4856 if (is_broken_string(sub)) return -1;
4857 len = RSTRING_LEN(str);
4858 slen = RSTRING_LEN(sub);
4859
4860 /* substring longer than string */
4861 if (len < slen) return -1;
4862 if (len - pos < slen) pos = len - slen;
4863 if (len == 0) return pos;
4864
4865 sbeg = RSTRING_PTR(str);
4866
4867 if (pos == 0) {
4868 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4869 return 0;
4870 else
4871 return -1;
4872 }
4873
4874 s = sbeg + pos;
4875 return str_rindex(str, sub, s, enc);
4876}
4877
4878/*
4879 * call-seq:
4880 * byterindex(object, offset = self.bytesize) -> integer or nil
4881 *
4882 * Returns the 0-based integer index of a substring of +self+
4883 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4884 * or +nil+ if there is no such substring;
4885 * the returned index is the count of _bytes_ (not characters).
4886 *
4887 * When +object+ is a string,
4888 * returns the index of the _last_ found substring equal to +object+:
4889 *
4890 * s = 'foo' # => "foo"
4891 * s.size # => 3 # Three 1-byte characters.
4892 * s.bytesize # => 3 # Three bytes.
4893 * s.byterindex('f') # => 0
 *   s.byterindex('o') # => 2
 *   s.byterindex('oo') # => 1
 *   s.byterindex('ooo') # => nil
4897 *
4898 * When +object+ is a Regexp,
4899 * returns the index of the last found substring matching +object+;
4900 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4901 *
4902 * s = 'foo'
4903 * s.byterindex(/f/) # => 0
4904 * $~ # => #<MatchData "f">
4905 * s.byterindex(/o/) # => 2
4906 * s.byterindex(/oo/) # => 1
4907 * s.byterindex(/ooo/) # => nil
4908 * $~ # => nil
4909 *
4910 * The last match means starting at the possible last position,
4911 * not the last of the longest matches:
4912 *
4913 * s = 'foo'
4914 * s.byterindex(/o+/) # => 2
4915 * $~ #=> #<MatchData "o">
4916 *
4917 * To get the last longest match, use a negative lookbehind:
4918 *
4919 * s = 'foo'
4920 * s.byterindex(/(?<!o)o+/) # => 1
4921 * $~ # => #<MatchData "oo">
4922 *
4923 * Or use method #byteindex with negative lookahead:
4924 *
4925 * s = 'foo'
4926 * s.byteindex(/o+(?!.*o)/) # => 1
4927 * $~ #=> #<MatchData "oo">
4928 *
4929 * \Integer argument +offset+, if given, specifies the 0-based index
4930 * of the byte where searching is to end.
4931 *
4932 * When +offset+ is non-negative,
4933 * searching ends at byte position +offset+:
4934 *
4935 * s = 'foo'
4936 * s.byterindex('o', 0) # => nil
4937 * s.byterindex('o', 1) # => 1
4938 * s.byterindex('o', 2) # => 2
4939 * s.byterindex('o', 3) # => 2
4940 *
4941 * When +offset+ is negative, counts backward from the end of +self+:
4942 *
4943 * s = 'foo'
4944 * s.byterindex('o', -1) # => 2
4945 * s.byterindex('o', -2) # => 1
4946 * s.byterindex('o', -3) # => nil
4947 *
4948 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4949 *
4950 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4951 * s.size # => 2 # Two 3-byte characters.
4952 * s.bytesize # => 6 # Six bytes.
4953 * s.byterindex("\uFFFF") # => 3
4954 * s.byterindex("\uFFFF", 1) # Raises IndexError
4955 * s.byterindex("\uFFFF", 2) # Raises IndexError
4956 * s.byterindex("\uFFFF", 3) # => 3
4957 * s.byterindex("\uFFFF", 4) # Raises IndexError
4958 * s.byterindex("\uFFFF", 5) # Raises IndexError
4959 * s.byterindex("\uFFFF", 6) # => nil
4960 *
4961 * Related: see {Querying}[rdoc-ref:String@Querying].
4962 */
4963
4964static VALUE
4965rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4966{
4967 VALUE sub;
4968 VALUE initpos;
4969 long pos, len = RSTRING_LEN(str);
4970
4971 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4972 pos = NUM2LONG(initpos);
4973 if (pos < 0 && (pos += len) < 0) {
4974 if (RB_TYPE_P(sub, T_REGEXP)) {
4976 }
4977 return Qnil;
4978 }
4979 if (pos > len) pos = len;
4980 }
4981 else {
4982 pos = len;
4983 }
4984
4985 str_ensure_byte_pos(str, pos);
4986
4987 if (RB_TYPE_P(sub, T_REGEXP)) {
4988 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4989 VALUE match = rb_backref_get();
4990 struct re_registers *regs = RMATCH_REGS(match);
4991 pos = BEG(0);
4992 return LONG2NUM(pos);
4993 }
4994 }
4995 else {
4996 StringValue(sub);
4997 pos = rb_str_byterindex(str, sub, pos);
4998 if (pos >= 0) return LONG2NUM(pos);
4999 }
5000 return Qnil;
5001}
5002
5003/*
5004 * call-seq:
5005 * self =~ object -> integer or nil
5006 *
5007 * When +object+ is a Regexp, returns the index of the first substring in +self+
5008 * matched by +object+,
5009 * or +nil+ if no match is found;
5010 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5011 *
5012 * 'foo' =~ /f/ # => 0
5013 * $~ # => #<MatchData "f">
5014 * 'foo' =~ /o/ # => 1
5015 * $~ # => #<MatchData "o">
5016 * 'foo' =~ /x/ # => nil
5017 * $~ # => nil
5018 *
5019 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5020 * (see Regexp#=~):
5021 *
5022 * number = nil
5023 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5024 * number # => nil # Not assigned.
5025 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5026 * number # => "9" # Assigned.
5027 *
5028 * If +object+ is not a Regexp, returns the value
5029 * returned by <tt>object =~ self</tt>.
5030 *
5031 * Related: see {Querying}[rdoc-ref:String@Querying].
5032 */
5033
5034static VALUE
5035rb_str_match(VALUE x, VALUE y)
5036{
5037 switch (OBJ_BUILTIN_TYPE(y)) {
5038 case T_STRING:
5039 rb_raise(rb_eTypeError, "type mismatch: String given");
5040
5041 case T_REGEXP:
5042 return rb_reg_match(y, x);
5043
5044 default:
5045 return rb_funcall(y, idEqTilde, 1, x);
5046 }
5047}
5048
5049
5050static VALUE get_pat(VALUE);
5051
5052
5053/*
5054 * call-seq:
5055 * match(pattern, offset = 0) -> matchdata or nil
5056 * match(pattern, offset = 0) {|matchdata| ... } -> object
5057 *
5058 * Creates a MatchData object based on +self+ and the given arguments;
5059 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5060 *
5061 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5062 *
5063 * regexp = Regexp.new(pattern)
5064 *
5065 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5066 * (see Regexp#match):
5067 *
5068 * matchdata = regexp.match(self[offset..])
5069 *
5070 * With no block given, returns the computed +matchdata+ or +nil+:
5071 *
5072 * 'foo'.match('f') # => #<MatchData "f">
5073 * 'foo'.match('o') # => #<MatchData "o">
5074 * 'foo'.match('x') # => nil
5075 * 'foo'.match('f', 1) # => nil
5076 * 'foo'.match('o', 1) # => #<MatchData "o">
5077 *
5078 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5079 * returns the block's return value:
5080 *
5081 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5082 *
5083 * With a block given and +nil+ +matchdata+, does not call the block:
5084 *
5085 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5086 *
5087 * Related: see {Querying}[rdoc-ref:String@Querying].
5088 */
5089
5090static VALUE
5091rb_str_match_m(int argc, VALUE *argv, VALUE str)
5092{
5093 VALUE re, result;
5094 if (argc < 1)
5095 rb_check_arity(argc, 1, 2);
5096 re = argv[0];
5097 argv[0] = str;
5098 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5099 if (!NIL_P(result) && rb_block_given_p()) {
5100 return rb_yield(result);
5101 }
5102 return result;
5103}
5104
5105/*
5106 * call-seq:
5107 * match?(pattern, offset = 0) -> true or false
5108 *
5109 * Returns whether a match is found for +self+ and the given arguments;
5110 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5111 *
5112 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5113 *
5114 * regexp = Regexp.new(pattern)
5115 *
5116 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5117 * +false+ otherwise:
5118 *
5119 * 'foo'.match?(/o/) # => true
5120 * 'foo'.match?('o') # => true
5121 * 'foo'.match?(/x/) # => false
5122 * 'foo'.match?('f', 1) # => false
5123 * 'foo'.match?('o', 1) # => true
5124 *
5125 * Related: see {Querying}[rdoc-ref:String@Querying].
5126 */
5127
5128static VALUE
5129rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5130{
5131 VALUE re;
5132 rb_check_arity(argc, 1, 2);
5133 re = get_pat(argv[0]);
5134 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5135}
5136
/* Result of stepping one character to its neighbor (enc_succ_char, enc_pred_char). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the bytes do not form a valid character */
    NEIGHBOR_FOUND,     /* a neighbor of the same byte length was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the values of this byte length */
};
5142
/*
 * Overwrites the len-byte character at p with the next character of the
 * same byte length in enc.
 *
 * Returns NEIGHBOR_FOUND when a valid successor was stored,
 * NEIGHBOR_WRAPPED when no successor of that length exists, and
 * NEIGHBOR_NOT_CHAR when the bytes are not a valid character
 * (wide-character encodings only).
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        /* step the code point itself and re-encode it */
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * incrementing until they form a valid character of length len. */
    while (1) {
        /* propagate the carry: trailing 0xff bytes roll over to 0x00 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED; /* every byte was 0xff */
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: set the tail to 0xff so
                 * the next iteration carries into the leading bytes */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and set the
             * bytes after it to 0xff, skipping values that share the
             * invalid prefix */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
5194
/*
 * Overwrites the len-byte character at p with the previous character of
 * the same byte length in enc.
 *
 * Returns NEIGHBOR_FOUND when a valid predecessor was stored,
 * NEIGHBOR_WRAPPED when no predecessor of that length exists, and
 * NEIGHBOR_NOT_CHAR when the bytes are not a valid character or the
 * code point is already 0 (wide-character encodings only).
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        if (!c) return NEIGHBOR_NOT_CHAR; /* nothing precedes code point 0 */
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * decrementing until they form a valid character of length len. */
    while (1) {
        /* propagate the borrow: trailing 0x00 bytes roll over to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED; /* every byte was 0x00 */
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: zero the tail so the
                 * next iteration borrows from the leading bytes */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and zero the
             * bytes after it, skipping values that share the invalid
             * prefix */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
5247
/*
  Overwrites the len-byte character at +p+ with the succeeding letter or
  digit in +enc+ and returns NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
  When NEIGHBOR_WRAPPED is returned, +p+ holds the first character of the
  range and the carried-out letter is stored into +carry+.
  Assumes that each range is contiguous and that mbclen never changes
  within a range.
  NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
  alphabetic, or if its range has only one character.
 */
static enum neighbor_char
enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
{
    enum neighbor_char ret;
    unsigned int c;
    int ctype;
    int range;
    char save[ONIGENC_CODE_TO_MBC_MAXLEN];

    /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
    int try;
    const int max_gaps = 1;

    /* classify the character; only digits and alphabetics are handled */
    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
        ctype = ONIGENC_CTYPE_DIGIT;
    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
        ctype = ONIGENC_CTYPE_ALPHA;
    else
        return NEIGHBOR_NOT_CHAR;

    /* try the plain successor, stepping over at most max_gaps characters
     * that do not belong to the same class */
    MEMCPY(save, p, char, len);
    for (try = 0; try <= max_gaps; ++try) {
        ret = enc_succ_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (rb_enc_isctype(c, ctype, enc))
                return NEIGHBOR_FOUND;
        }
    }
    /* no successor in this class: restore the character and walk
     * backward to the first member of its range, counting its size */
    MEMCPY(p, save, char, len);
    range = 1;
    while (1) {
        MEMCPY(save, p, char, len);
        ret = enc_pred_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (!rb_enc_isctype(c, ctype, enc)) {
                /* stepped out of the class: undo the last step */
                MEMCPY(p, save, char, len);
                break;
            }
        }
        else {
            MEMCPY(p, save, char, len);
            break;
        }
        range++;
    }
    if (range == 1) {
        return NEIGHBOR_NOT_CHAR;
    }

    if (ctype != ONIGENC_CTYPE_DIGIT) {
        /* letters carry out the first letter of the range */
        MEMCPY(carry, p, char, len);
        return NEIGHBOR_WRAPPED;
    }

    /* digits carry out the successor of the first digit of the range */
    MEMCPY(carry, p, char, len);
    enc_succ_char(carry, len, enc);
    return NEIGHBOR_WRAPPED;
}
5318
5319
5320static VALUE str_succ(VALUE str);
5321
5322/*
5323 * call-seq:
5324 * succ -> new_str
5325 *
5326 * :include: doc/string/succ.rdoc
5327 *
5328 */
5329
5330VALUE
5332{
5333 VALUE str;
5334 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5335 rb_enc_cr_str_copy_for_substr(str, orig);
5336 return str_succ(str);
5337}
5338
5339static VALUE
5340str_succ(VALUE str)
5341{
5342 rb_encoding *enc;
5343 char *sbeg, *s, *e, *last_alnum = 0;
5344 int found_alnum = 0;
5345 long l, slen;
5346 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5347 long carry_pos = 0, carry_len = 1;
5348 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5349
5350 slen = RSTRING_LEN(str);
5351 if (slen == 0) return str;
5352
5353 enc = STR_ENC_GET(str);
5354 sbeg = RSTRING_PTR(str);
5355 s = e = sbeg + slen;
5356
5357 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5358 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5359 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5360 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5361 break;
5362 }
5363 }
5364 l = rb_enc_precise_mbclen(s, e, enc);
5365 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5366 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5367 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5368 switch (neighbor) {
5369 case NEIGHBOR_NOT_CHAR:
5370 continue;
5371 case NEIGHBOR_FOUND:
5372 return str;
5373 case NEIGHBOR_WRAPPED:
5374 last_alnum = s;
5375 break;
5376 }
5377 found_alnum = 1;
5378 carry_pos = s - sbeg;
5379 carry_len = l;
5380 }
5381 if (!found_alnum) { /* str contains no alnum */
5382 s = e;
5383 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5384 enum neighbor_char neighbor;
5385 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5386 l = rb_enc_precise_mbclen(s, e, enc);
5387 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5388 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5389 MEMCPY(tmp, s, char, l);
5390 neighbor = enc_succ_char(tmp, l, enc);
5391 switch (neighbor) {
5392 case NEIGHBOR_FOUND:
5393 MEMCPY(s, tmp, char, l);
5394 return str;
5395 break;
5396 case NEIGHBOR_WRAPPED:
5397 MEMCPY(s, tmp, char, l);
5398 break;
5399 case NEIGHBOR_NOT_CHAR:
5400 break;
5401 }
5402 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5403 /* wrapped to \0...\0. search next valid char. */
5404 enc_succ_char(s, l, enc);
5405 }
5406 if (!rb_enc_asciicompat(enc)) {
5407 MEMCPY(carry, s, char, l);
5408 carry_len = l;
5409 }
5410 carry_pos = s - sbeg;
5411 }
5413 }
5414 RESIZE_CAPA(str, slen + carry_len);
5415 sbeg = RSTRING_PTR(str);
5416 s = sbeg + carry_pos;
5417 memmove(s + carry_len, s, slen - carry_pos);
5418 memmove(s, carry, carry_len);
5419 slen += carry_len;
5420 STR_SET_LEN(str, slen);
5421 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5423 return str;
5424}
5425
5426
5427/*
5428 * call-seq:
5429 * succ! -> self
5430 *
5431 * Like String#succ, but modifies +self+ in place; returns +self+.
5432 *
5433 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5434 */
5435
5436static VALUE
5437rb_str_succ_bang(VALUE str)
5438{
5439 rb_str_modify(str);
5440 str_succ(str);
5441 return str;
5442}
5443
/* Returns 1 if each of the first len bytes of s is a digit, 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5453
5454static int
5455str_upto_i(VALUE str, VALUE arg)
5456{
5457 rb_yield(str);
5458 return 0;
5459}
5460
5461/*
5462 * call-seq:
5463 * upto(other_string, exclusive = false) {|string| ... } -> self
5464 * upto(other_string, exclusive = false) -> new_enumerator
5465 *
5466 * :include: doc/string/upto.rdoc
5467 *
5468 */
5469
5470static VALUE
5471rb_str_upto(int argc, VALUE *argv, VALUE beg)
5472{
5473 VALUE end, exclusive;
5474
5475 rb_scan_args(argc, argv, "11", &end, &exclusive);
5476 RETURN_ENUMERATOR(beg, argc, argv);
5477 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5478}
5479
5480VALUE
5481rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5482{
5483 VALUE current, after_end;
5484 ID succ;
5485 int n, ascii;
5486 rb_encoding *enc;
5487
5488 CONST_ID(succ, "succ");
5489 StringValue(end);
5490 enc = rb_enc_check(beg, end);
5491 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5492 /* single character */
5493 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5494 char c = RSTRING_PTR(beg)[0];
5495 char e = RSTRING_PTR(end)[0];
5496
5497 if (c > e || (excl && c == e)) return beg;
5498 for (;;) {
5499 VALUE str = rb_enc_str_new(&c, 1, enc);
5501 if ((*each)(str, arg)) break;
5502 if (!excl && c == e) break;
5503 c++;
5504 if (excl && c == e) break;
5505 }
5506 return beg;
5507 }
5508 /* both edges are all digits */
5509 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5510 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5511 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5512 VALUE b, e;
5513 int width;
5514
5515 width = RSTRING_LENINT(beg);
5516 b = rb_str_to_inum(beg, 10, FALSE);
5517 e = rb_str_to_inum(end, 10, FALSE);
5518 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5519 long bi = FIX2LONG(b);
5520 long ei = FIX2LONG(e);
5521 rb_encoding *usascii = rb_usascii_encoding();
5522
5523 while (bi <= ei) {
5524 if (excl && bi == ei) break;
5525 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5526 bi++;
5527 }
5528 }
5529 else {
5530 ID op = excl ? '<' : idLE;
5531 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5532
5533 args[0] = INT2FIX(width);
5534 while (rb_funcall(b, op, 1, e)) {
5535 args[1] = b;
5536 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5537 b = rb_funcallv(b, succ, 0, 0);
5538 }
5539 }
5540 return beg;
5541 }
5542 /* normal case */
5543 n = rb_str_cmp(beg, end);
5544 if (n > 0 || (excl && n == 0)) return beg;
5545
5546 after_end = rb_funcallv(end, succ, 0, 0);
5547 current = str_duplicate(rb_cString, beg);
5548 while (!rb_str_equal(current, after_end)) {
5549 VALUE next = Qnil;
5550 if (excl || !rb_str_equal(current, end))
5551 next = rb_funcallv(current, succ, 0, 0);
5552 if ((*each)(current, arg)) break;
5553 if (NIL_P(next)) break;
5554 current = next;
5555 StringValue(current);
5556 if (excl && rb_str_equal(current, end)) break;
5557 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5558 break;
5559 }
5560
5561 return beg;
5562}
5563
5564VALUE
5565rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5566{
5567 VALUE current;
5568 ID succ;
5569
5570 CONST_ID(succ, "succ");
5571 /* both edges are all digits */
5572 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5573 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5574 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5575 int width = RSTRING_LENINT(beg);
5576 b = rb_str_to_inum(beg, 10, FALSE);
5577 if (FIXNUM_P(b)) {
5578 long bi = FIX2LONG(b);
5579 rb_encoding *usascii = rb_usascii_encoding();
5580
5581 while (FIXABLE(bi)) {
5582 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5583 bi++;
5584 }
5585 b = LONG2NUM(bi);
5586 }
5587 args[0] = INT2FIX(width);
5588 while (1) {
5589 args[1] = b;
5590 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5591 b = rb_funcallv(b, succ, 0, 0);
5592 }
5593 }
5594 /* normal case */
5595 current = str_duplicate(rb_cString, beg);
5596 while (1) {
5597 VALUE next = rb_funcallv(current, succ, 0, 0);
5598 if ((*each)(current, arg)) break;
5599 current = next;
5600 StringValue(current);
5601 if (RSTRING_LEN(current) == 0)
5602 break;
5603 }
5604
5605 return beg;
5606}
5607
5608static int
5609include_range_i(VALUE str, VALUE arg)
5610{
5611 VALUE *argp = (VALUE *)arg;
5612 if (!rb_equal(str, *argp)) return 0;
5613 *argp = Qnil;
5614 return 1;
5615}
5616
5617VALUE
5618rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5619{
5620 beg = rb_str_new_frozen(beg);
5621 StringValue(end);
5622 end = rb_str_new_frozen(end);
5623 if (NIL_P(val)) return Qfalse;
5624 val = rb_check_string_type(val);
5625 if (NIL_P(val)) return Qfalse;
5626 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5627 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5628 rb_enc_asciicompat(STR_ENC_GET(val))) {
5629 const char *bp = RSTRING_PTR(beg);
5630 const char *ep = RSTRING_PTR(end);
5631 const char *vp = RSTRING_PTR(val);
5632 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5633 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5634 return Qfalse;
5635 else {
5636 char b = *bp;
5637 char e = *ep;
5638 char v = *vp;
5639
5640 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5641 if (b <= v && v < e) return Qtrue;
5642 return RBOOL(!RTEST(exclusive) && v == e);
5643 }
5644 }
5645 }
5646#if 0
5647 /* both edges are all digits */
5648 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5649 all_digits_p(bp, RSTRING_LEN(beg)) &&
5650 all_digits_p(ep, RSTRING_LEN(end))) {
5651 /* TODO */
5652 }
5653#endif
5654 }
5655 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5656
5657 return RBOOL(NIL_P(val));
5658}
5659
5660static VALUE
5661rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5662{
5663 if (rb_reg_search(re, str, 0, 0) >= 0) {
5664 VALUE match = rb_backref_get();
5665 int nth = rb_reg_backref_number(match, backref);
5666 return rb_reg_nth_match(nth, match);
5667 }
5668 return Qnil;
5669}
5670
5671static VALUE
5672rb_str_aref(VALUE str, VALUE indx)
5673{
5674 long idx;
5675
5676 if (FIXNUM_P(indx)) {
5677 idx = FIX2LONG(indx);
5678 }
5679 else if (RB_TYPE_P(indx, T_REGEXP)) {
5680 return rb_str_subpat(str, indx, INT2FIX(0));
5681 }
5682 else if (RB_TYPE_P(indx, T_STRING)) {
5683 if (rb_str_index(str, indx, 0) != -1)
5684 return str_duplicate(rb_cString, indx);
5685 return Qnil;
5686 }
5687 else {
5688 /* check if indx is Range */
5689 long beg, len = str_strlen(str, NULL);
5690 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5691 case Qfalse:
5692 break;
5693 case Qnil:
5694 return Qnil;
5695 default:
5696 return rb_str_substr(str, beg, len);
5697 }
5698 idx = NUM2LONG(indx);
5699 }
5700
5701 return str_substr(str, idx, 1, FALSE);
5702}
5703
5704
5705/*
5706 * call-seq:
5707 * self[index] -> new_string or nil
5708 * self[start, length] -> new_string or nil
5709 * self[range] -> new_string or nil
5710 * self[regexp, capture = 0] -> new_string or nil
5711 * self[substring] -> new_string or nil
5712 *
5713 * :include: doc/string/aref.rdoc
5714 *
5715 */
5716
5717static VALUE
5718rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5719{
5720 if (argc == 2) {
5721 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5722 return rb_str_subpat(str, argv[0], argv[1]);
5723 }
5724 else {
5725 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5726 }
5727 }
5728 rb_check_arity(argc, 1, 2);
5729 return rb_str_aref(str, argv[0]);
5730}
5731
5732VALUE
5734{
5735 char *ptr = RSTRING_PTR(str);
5736 long olen = RSTRING_LEN(str), nlen;
5737
5738 str_modifiable(str);
5739 if (len > olen) len = olen;
5740 nlen = olen - len;
5741 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5742 char *oldptr = ptr;
5743 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5744 STR_SET_EMBED(str);
5745 ptr = RSTRING(str)->as.embed.ary;
5746 memmove(ptr, oldptr + len, nlen);
5747 if (fl == STR_NOEMBED) xfree(oldptr);
5748 }
5749 else {
5750 if (!STR_SHARED_P(str)) {
5751 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5752 rb_enc_cr_str_exact_copy(shared, str);
5753 OBJ_FREEZE(shared);
5754 }
5755 ptr = RSTRING(str)->as.heap.ptr += len;
5756 }
5757 STR_SET_LEN(str, nlen);
5758
5759 if (!SHARABLE_MIDDLE_SUBSTRING) {
5760 TERM_FILL(ptr + nlen, TERM_LEN(str));
5761 }
5763 return str;
5764}
5765
5766static void
5767rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5768{
5769 char *sptr;
5770 long slen;
5771 int cr;
5772
5773 if (beg == 0 && vlen == 0) {
5774 rb_str_drop_bytes(str, len);
5775 return;
5776 }
5777
5778 str_modify_keep_cr(str);
5779 RSTRING_GETMEM(str, sptr, slen);
5780 if (len < vlen) {
5781 /* expand string */
5782 RESIZE_CAPA(str, slen + vlen - len);
5783 sptr = RSTRING_PTR(str);
5784 }
5785
5787 cr = rb_enc_str_coderange(val);
5788 else
5790
5791 if (vlen != len) {
5792 memmove(sptr + beg + vlen,
5793 sptr + beg + len,
5794 slen - (beg + len));
5795 }
5796 if (vlen < beg && len < 0) {
5797 MEMZERO(sptr + slen, char, -len);
5798 }
5799 if (vlen > 0) {
5800 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5801 }
5802 slen += vlen - len;
5803 STR_SET_LEN(str, slen);
5804 TERM_FILL(&sptr[slen], TERM_LEN(str));
5805 ENC_CODERANGE_SET(str, cr);
5806}
5807
5808static inline void
5809rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5810{
5811 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5812}
5813
5814void
5815rb_str_update(VALUE str, long beg, long len, VALUE val)
5816{
5817 long slen;
5818 char *p, *e;
5819 rb_encoding *enc;
5820 int singlebyte = single_byte_optimizable(str);
5821 int cr;
5822
5823 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5824
5825 StringValue(val);
5826 enc = rb_enc_check(str, val);
5827 slen = str_strlen(str, enc); /* rb_enc_check */
5828
5829 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5830 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5831 }
5832 if (beg < 0) {
5833 beg += slen;
5834 }
5835 RUBY_ASSERT(beg >= 0);
5836 RUBY_ASSERT(beg <= slen);
5837
5838 if (len > slen - beg) {
5839 len = slen - beg;
5840 }
5841 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5842 if (!p) p = RSTRING_END(str);
5843 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5844 if (!e) e = RSTRING_END(str);
5845 /* error check */
5846 beg = p - RSTRING_PTR(str); /* physical position */
5847 len = e - p; /* physical length */
5848 rb_str_update_0(str, beg, len, val);
5849 rb_enc_associate(str, enc);
5851 if (cr != ENC_CODERANGE_BROKEN)
5852 ENC_CODERANGE_SET(str, cr);
5853}
5854
5855static void
5856rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5857{
5858 int nth;
5859 VALUE match;
5860 long start, end, len;
5861 rb_encoding *enc;
5862 struct re_registers *regs;
5863
5864 if (rb_reg_search(re, str, 0, 0) < 0) {
5865 rb_raise(rb_eIndexError, "regexp not matched");
5866 }
5867 match = rb_backref_get();
5868 nth = rb_reg_backref_number(match, backref);
5869 regs = RMATCH_REGS(match);
5870 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5871 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5872 }
5873 if (nth < 0) {
5874 nth += regs->num_regs;
5875 }
5876
5877 start = BEG(nth);
5878 if (start == -1) {
5879 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5880 }
5881 end = END(nth);
5882 len = end - start;
5883 StringValue(val);
5884 enc = rb_enc_check_str(str, val);
5885 rb_str_update_0(str, start, len, val);
5886 rb_enc_associate(str, enc);
5887}
5888
/*
 * Two-argument form of String#[]=: replaces the part of str selected by
 * indx (a Regexp, String, Range, or integer index) with val and returns
 * val.
 */
static VALUE
rb_str_aset(VALUE str, VALUE indx, VALUE val)
{
    long idx, beg;

    switch (TYPE(indx)) {
      case T_REGEXP:
        /* replace the whole match (capture 0) */
        rb_str_subpat_set(str, indx, INT2FIX(0), val);
        return val;

      case T_STRING:
        beg = rb_str_index(str, indx, 0);
        if (beg < 0) {
            rb_raise(rb_eIndexError, "string not matched");
        }
        /* rb_str_update() takes character positions and lengths */
        beg = rb_str_sublen(str, beg);
        rb_str_update(str, beg, str_strlen(indx, NULL), val);
        return val;

      default:
        /* check if indx is Range */
        {
            long beg, len;
            if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
                rb_str_update(str, beg, len, val);
                return val;
            }
        }
        /* not a Range: treat indx as an integer index */
        /* FALLTHROUGH */

      case T_FIXNUM:
        idx = NUM2LONG(indx);
        rb_str_update(str, idx, 1, val);
        return val;
    }
}
5925
5926/*
5927 * call-seq:
5928 * self[index] = other_string -> new_string
5929 * self[start, length] = other_string -> new_string
5930 * self[range] = other_string -> new_string
5931 * self[regexp, capture = 0] = other_string -> new_string
5932 * self[substring] = other_string -> new_string
5933 *
5934 * :include: doc/string/aset.rdoc
5935 *
5936 */
5937
5938static VALUE
5939rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5940{
5941 if (argc == 3) {
5942 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5943 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5944 }
5945 else {
5946 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5947 }
5948 return argv[2];
5949 }
5950 rb_check_arity(argc, 2, 3);
5951 return rb_str_aset(str, argv[0], argv[1]);
5952}
5953
5954/*
5955 * call-seq:
5956 * insert(offset, other_string) -> self
5957 *
5958 * :include: doc/string/insert.rdoc
5959 *
5960 */
5961
5962static VALUE
5963rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5964{
5965 long pos = NUM2LONG(idx);
5966
5967 if (pos == -1) {
5968 return rb_str_append(str, str2);
5969 }
5970 else if (pos < 0) {
5971 pos++;
5972 }
5973 rb_str_update(str, pos, 0, str2);
5974 return str;
5975}
5976
5977
5978/*
5979 * call-seq:
5980 * slice!(index) -> new_string or nil
5981 * slice!(start, length) -> new_string or nil
5982 * slice!(range) -> new_string or nil
5983 * slice!(regexp, capture = 0) -> new_string or nil
5984 * slice!(substring) -> new_string or nil
5985 *
5986 * Like String#[] (and its alias String#slice), except that:
5987 *
5988 * - Performs substitutions in +self+ (not in a copy of +self+).
5989 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5990 *
5991 * A few examples:
5992 *
5993 * s = 'hello'
5994 * s.slice!('e') # => "e"
5995 * s # => "hllo"
5996 * s.slice!('e') # => nil
5997 * s # => "hllo"
5998 *
5999 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6000 */
6001
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Flow: each argument form computes a byte range (beg, len) and jumps
     * to one of three labels:
     *   num_index - beg/len are still in characters and must be converted;
     *   subseq    - beg/len are in bytes; copy the removed part as result;
     *   squash    - result is already set; just delete the bytes from str.
     */
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* optional second argument selects a capture group */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* (start, length) in characters */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* single character at an integer index (len is 1 here) */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        /* the result is a copy of the argument, not of the matched bytes */
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: treat indx as an integer index */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* convert the character range into a byte range */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* copy out the bytes that are about to be removed */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* close the gap left by the removed bytes */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6088
6089static VALUE
6090get_pat(VALUE pat)
6091{
6092 VALUE val;
6093
6094 switch (OBJ_BUILTIN_TYPE(pat)) {
6095 case T_REGEXP:
6096 return pat;
6097
6098 case T_STRING:
6099 break;
6100
6101 default:
6102 val = rb_check_string_type(pat);
6103 if (NIL_P(val)) {
6104 Check_Type(pat, T_REGEXP);
6105 }
6106 pat = val;
6107 }
6108
6109 return rb_reg_regcomp(pat);
6110}
6111
6112static VALUE
6113get_pat_quoted(VALUE pat, int check)
6114{
6115 VALUE val;
6116
6117 switch (OBJ_BUILTIN_TYPE(pat)) {
6118 case T_REGEXP:
6119 return pat;
6120
6121 case T_STRING:
6122 break;
6123
6124 default:
6125 val = rb_check_string_type(pat);
6126 if (NIL_P(val)) {
6127 Check_Type(pat, T_REGEXP);
6128 }
6129 pat = val;
6130 }
6131 if (check && is_broken_string(pat)) {
6132 rb_exc_raise(rb_reg_check_preprocess(pat));
6133 }
6134 return pat;
6135}
6136
6137static long
6138rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6139{
6140 if (BUILTIN_TYPE(pat) == T_STRING) {
6141 pos = rb_str_byteindex(str, pat, pos);
6142 if (set_backref_str) {
6143 if (pos >= 0) {
6144 str = rb_str_new_frozen_String(str);
6145 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6146 if (match) {
6147 *match = match_data;
6148 }
6149 }
6150 else {
6152 }
6153 }
6154 return pos;
6155 }
6156 else {
6157 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6158 }
6159}
6160
6161static long
6162rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6163{
6164 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6165}
6166
6167
6168/*
6169 * call-seq:
6170 * sub!(pattern, replacement) -> self or nil
6171 * sub!(pattern) {|match| ... } -> self or nil
6172 *
6173 * Like String#sub, except that:
6174 *
 *  - Changes are made to +self+, not to a copy of +self+.
6176 * - Returns +self+ if any changes are made, +nil+ otherwise.
6177 *
6178 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6179 */
6180
6181static VALUE
6182rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6183{
6184 VALUE pat, repl, hash = Qnil;
6185 int iter = 0;
6186 long plen;
6187 int min_arity = rb_block_given_p() ? 1 : 2;
6188 long beg;
6189
6190 rb_check_arity(argc, min_arity, 2);
6191 if (argc == 1) {
6192 iter = 1;
6193 }
6194 else {
6195 repl = argv[1];
6196 hash = rb_check_hash_type(argv[1]);
6197 if (NIL_P(hash)) {
6198 StringValue(repl);
6199 }
6200 }
6201
6202 pat = get_pat_quoted(argv[0], 1);
6203
6204 str_modifiable(str);
6205 beg = rb_pat_search(pat, str, 0, 1);
6206 if (beg >= 0) {
6207 rb_encoding *enc;
6208 int cr = ENC_CODERANGE(str);
6209 long beg0, end0;
6210 VALUE match, match0 = Qnil;
6211 struct re_registers *regs;
6212 char *p, *rp;
6213 long len, rlen;
6214
6215 match = rb_backref_get();
6216 regs = RMATCH_REGS(match);
6217 if (RB_TYPE_P(pat, T_STRING)) {
6218 beg0 = beg;
6219 end0 = beg0 + RSTRING_LEN(pat);
6220 match0 = pat;
6221 }
6222 else {
6223 beg0 = BEG(0);
6224 end0 = END(0);
6225 if (iter) match0 = rb_reg_nth_match(0, match);
6226 }
6227
6228 if (iter || !NIL_P(hash)) {
6229 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6230
6231 if (iter) {
6232 repl = rb_obj_as_string(rb_yield(match0));
6233 }
6234 else {
6235 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6236 repl = rb_obj_as_string(repl);
6237 }
6238 str_mod_check(str, p, len);
6239 rb_check_frozen(str);
6240 }
6241 else {
6242 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6243 }
6244
6245 enc = rb_enc_compatible(str, repl);
6246 if (!enc) {
6247 rb_encoding *str_enc = STR_ENC_GET(str);
6248 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6249 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6250 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6251 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6252 rb_enc_inspect_name(str_enc),
6253 rb_enc_inspect_name(STR_ENC_GET(repl)));
6254 }
6255 enc = STR_ENC_GET(repl);
6256 }
6257 rb_str_modify(str);
6258 rb_enc_associate(str, enc);
6260 int cr2 = ENC_CODERANGE(repl);
6261 if (cr2 == ENC_CODERANGE_BROKEN ||
6262 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6264 else
6265 cr = cr2;
6266 }
6267 plen = end0 - beg0;
6268 rlen = RSTRING_LEN(repl);
6269 len = RSTRING_LEN(str);
6270 if (rlen > plen) {
6271 RESIZE_CAPA(str, len + rlen - plen);
6272 }
6273 p = RSTRING_PTR(str);
6274 if (rlen != plen) {
6275 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6276 }
6277 rp = RSTRING_PTR(repl);
6278 memmove(p + beg0, rp, rlen);
6279 len += rlen - plen;
6280 STR_SET_LEN(str, len);
6281 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6282 ENC_CODERANGE_SET(str, cr);
6283
6284 RB_GC_GUARD(match);
6285
6286 return str;
6287 }
6288 return Qnil;
6289}
6290
6291
6292/*
6293 * call-seq:
6294 * sub(pattern, replacement) -> new_string
6295 * sub(pattern) {|match| ... } -> new_string
6296 *
6297 * :include: doc/string/sub.rdoc
6298 */
6299
6300static VALUE
6301rb_str_sub(int argc, VALUE *argv, VALUE str)
6302{
6303 str = str_duplicate(rb_cString, str);
6304 rb_str_sub_bang(argc, argv, str);
6305 return str;
6306}
6307
6308static VALUE
6309str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6310{
6311 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6312 long beg, beg0, end0;
6313 long offset, blen, slen, len, last;
6314 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6315 char *sp, *cp;
6316 int need_backref_str = -1;
6317 rb_encoding *str_enc;
6318
6319 switch (argc) {
6320 case 1:
6321 RETURN_ENUMERATOR(str, argc, argv);
6322 mode = ITER;
6323 break;
6324 case 2:
6325 repl = argv[1];
6326 hash = rb_check_hash_type(argv[1]);
6327 if (NIL_P(hash)) {
6328 StringValue(repl);
6329 }
6330 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6331 mode = FAST_MAP;
6332 }
6333 else {
6334 mode = MAP;
6335 }
6336 break;
6337 default:
6338 rb_error_arity(argc, 1, 2);
6339 }
6340
6341 pat = get_pat_quoted(argv[0], 1);
6342 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6343
6344 if (beg < 0) {
6345 if (bang) return Qnil; /* no match, no substitution */
6346 return str_duplicate(rb_cString, str);
6347 }
6348
6349 offset = 0;
6350 blen = RSTRING_LEN(str) + 30; /* len + margin */
6351 dest = rb_str_buf_new(blen);
6352 sp = RSTRING_PTR(str);
6353 slen = RSTRING_LEN(str);
6354 cp = sp;
6355 str_enc = STR_ENC_GET(str);
6356 rb_enc_associate(dest, str_enc);
6357 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6358
6359 do {
6360 struct re_registers *regs = RMATCH_REGS(match);
6361 if (RB_TYPE_P(pat, T_STRING)) {
6362 beg0 = beg;
6363 end0 = beg0 + RSTRING_LEN(pat);
6364 match0 = pat;
6365 }
6366 else {
6367 beg0 = BEG(0);
6368 end0 = END(0);
6369 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6370 }
6371
6372 if (mode != STR) {
6373 if (mode == ITER) {
6374 val = rb_obj_as_string(rb_yield(match0));
6375 }
6376 else {
6377 struct RString fake_str = {RBASIC_INIT};
6378 VALUE key;
6379 if (mode == FAST_MAP) {
6380 // It is safe to use a fake_str here because we established that it won't escape,
6381 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6382 // default proc.
6383 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6384 }
6385 else {
6386 key = rb_str_subseq(str, beg0, end0 - beg0);
6387 }
6388 val = rb_hash_aref(hash, key);
6389 val = rb_obj_as_string(val);
6390 }
6391 str_mod_check(str, sp, slen);
6392 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6393 rb_raise(rb_eRuntimeError, "block should not cheat");
6394 }
6395 }
6396 else if (need_backref_str) {
6397 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6398 if (need_backref_str < 0) {
6399 need_backref_str = val != repl;
6400 }
6401 }
6402 else {
6403 val = repl;
6404 }
6405
6406 len = beg0 - offset; /* copy pre-match substr */
6407 if (len) {
6408 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6409 }
6410
6411 rb_str_buf_append(dest, val);
6412
6413 last = offset;
6414 offset = end0;
6415 if (beg0 == end0) {
6416 /*
6417 * Always consume at least one character of the input string
6418 * in order to prevent infinite loops.
6419 */
6420 if (RSTRING_LEN(str) <= end0) break;
6421 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6422 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6423 offset = end0 + len;
6424 }
6425 cp = RSTRING_PTR(str) + offset;
6426 if (offset > RSTRING_LEN(str)) break;
6427
6428 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6429 if (mode != FAST_MAP && mode != STR) {
6430 match = Qnil;
6431 }
6432 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6433
6434 RB_GC_GUARD(match);
6435 } while (beg >= 0);
6436
6437 if (RSTRING_LEN(str) > offset) {
6438 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6439 }
6440 rb_pat_search0(pat, str, last, 1, &match);
6441 if (bang) {
6442 str_shared_replace(str, dest);
6443 }
6444 else {
6445 str = dest;
6446 }
6447
6448 return str;
6449}
6450
6451
6452/*
6453 * call-seq:
6454 * gsub!(pattern, replacement) -> self or nil
6455 * gsub!(pattern) {|match| ... } -> self or nil
6456 * gsub!(pattern) -> an_enumerator
6457 *
6458 * Like String#gsub, except that:
6459 *
6460 * - Performs substitutions in +self+ (not in a copy of +self+).
 *  - Returns +self+ if any substitutions are made, +nil+ otherwise.
6462 *
6463 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6464 */
6465
6466static VALUE
6467rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6468{
6469 str_modify_keep_cr(str);
6470 return str_gsub(argc, argv, str, 1);
6471}
6472
6473
6474/*
6475 * call-seq:
6476 * gsub(pattern, replacement) -> new_string
6477 * gsub(pattern) {|match| ... } -> new_string
6478 * gsub(pattern) -> enumerator
6479 *
6480 * Returns a copy of +self+ with zero or more substrings replaced.
6481 *
6482 * Argument +pattern+ may be a string or a Regexp;
6483 * argument +replacement+ may be a string or a Hash.
6484 * Varying types for the argument values makes this method very versatile.
6485 *
6486 * Below are some simple examples;
6487 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6488 *
6489 * With arguments +pattern+ and string +replacement+ given,
6490 * replaces each matching substring with the given +replacement+ string:
6491 *
6492 * s = 'abracadabra'
6493 * s.gsub('ab', 'AB') # => "ABracadABra"
6494 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6495 *
6496 * With arguments +pattern+ and hash +replacement+ given,
6497 * replaces each matching substring with a value from the given +replacement+ hash,
6498 * or removes it:
6499 *
6500 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6501 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6502 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6503 *
6504 * With argument +pattern+ and a block given,
6505 * calls the block with each matching substring;
6506 * replaces that substring with the block's return value:
6507 *
6508 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6509 * # => "ABrACADABrA"
6510 *
6511 * With argument +pattern+ and no block given,
6512 * returns a new Enumerator.
6513 *
6514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6515 */
6516
6517static VALUE
6518rb_str_gsub(int argc, VALUE *argv, VALUE str)
6519{
6520 return str_gsub(argc, argv, str, 0);
6521}
6522
6523
6524/*
6525 * call-seq:
6526 * replace(other_string) -> self
6527 *
6528 * Replaces the contents of +self+ with the contents of +other_string+;
6529 * returns +self+:
6530 *
6531 * s = 'foo' # => "foo"
6532 * s.replace('bar') # => "bar"
6533 *
6534 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6535 */
6536
6537VALUE
6539{
6540 str_modifiable(str);
6541 if (str == str2) return str;
6542
6543 StringValue(str2);
6544 str_discard(str);
6545 return str_replace(str, str2);
6546}
6547
6548/*
6549 * call-seq:
6550 * clear -> self
6551 *
6552 * Removes the contents of +self+:
6553 *
6554 * s = 'foo'
6555 * s.clear # => ""
6556 * s # => ""
6557 *
6558 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6559 */
6560
6561static VALUE
6562rb_str_clear(VALUE str)
6563{
6564 str_discard(str);
6565 STR_SET_EMBED(str);
6566 STR_SET_LEN(str, 0);
6567 RSTRING_PTR(str)[0] = 0;
6568 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6570 else
6572 return str;
6573}
6574
6575/*
6576 * call-seq:
6577 * chr -> string
6578 *
6579 * :include: doc/string/chr.rdoc
6580 *
6581 */
6582
6583static VALUE
6584rb_str_chr(VALUE str)
6585{
6586 return rb_str_substr(str, 0, 1);
6587}
6588
6589/*
6590 * call-seq:
6591 * getbyte(index) -> integer or nil
6592 *
6593 * :include: doc/string/getbyte.rdoc
6594 *
6595 */
6596VALUE
6597rb_str_getbyte(VALUE str, VALUE index)
6598{
6599 long pos = NUM2LONG(index);
6600
6601 if (pos < 0)
6602 pos += RSTRING_LEN(str);
6603 if (pos < 0 || RSTRING_LEN(str) <= pos)
6604 return Qnil;
6605
6606 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6607}
6608
6609/*
6610 * call-seq:
6611 * setbyte(index, integer) -> integer
6612 *
6613 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6614 * returns +integer+:
6615 *
6616 * s = 'xyzzy'
6617 * s.setbyte(2, 129) # => 129
6618 * s # => "xy\x81zy"
6619 *
6620 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6621 */
6622VALUE
6623rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6624{
6625 long pos = NUM2LONG(index);
6626 long len = RSTRING_LEN(str);
6627 char *ptr, *head, *left = 0;
6628 rb_encoding *enc;
6629 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6630
6631 if (pos < -len || len <= pos)
6632 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6633 if (pos < 0)
6634 pos += len;
6635
6636 VALUE v = rb_to_int(value);
6637 VALUE w = rb_int_and(v, INT2FIX(0xff));
6638 char byte = (char)(NUM2INT(w) & 0xFF);
6639
6640 if (!str_independent(str))
6641 str_make_independent(str);
6642 enc = STR_ENC_GET(str);
6643 head = RSTRING_PTR(str);
6644 ptr = &head[pos];
6645 if (!STR_EMBED_P(str)) {
6646 cr = ENC_CODERANGE(str);
6647 switch (cr) {
6648 case ENC_CODERANGE_7BIT:
6649 left = ptr;
6650 *ptr = byte;
6651 if (ISASCII(byte)) goto end;
6652 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6653 if (!MBCLEN_CHARFOUND_P(nlen))
6655 else
6657 goto end;
6659 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6660 width = rb_enc_precise_mbclen(left, head+len, enc);
6661 *ptr = byte;
6662 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6663 if (!MBCLEN_CHARFOUND_P(nlen))
6665 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6667 goto end;
6668 }
6669 }
6671 *ptr = byte;
6672
6673 end:
6674 return value;
6675}
6676
6677static VALUE
6678str_byte_substr(VALUE str, long beg, long len, int empty)
6679{
6680 long n = RSTRING_LEN(str);
6681
6682 if (beg > n || len < 0) return Qnil;
6683 if (beg < 0) {
6684 beg += n;
6685 if (beg < 0) return Qnil;
6686 }
6687 if (len > n - beg)
6688 len = n - beg;
6689 if (len <= 0) {
6690 if (!empty) return Qnil;
6691 len = 0;
6692 }
6693
6694 VALUE str2 = str_subseq(str, beg, len);
6695
6696 str_enc_copy_direct(str2, str);
6697
6698 if (RSTRING_LEN(str2) == 0) {
6699 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6701 else
6703 }
6704 else {
6705 switch (ENC_CODERANGE(str)) {
6706 case ENC_CODERANGE_7BIT:
6708 break;
6709 default:
6711 break;
6712 }
6713 }
6714
6715 return str2;
6716}
6717
6718VALUE
6719rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6720{
6721 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6722}
6723
6724static VALUE
6725str_byte_aref(VALUE str, VALUE indx)
6726{
6727 long idx;
6728 if (FIXNUM_P(indx)) {
6729 idx = FIX2LONG(indx);
6730 }
6731 else {
6732 /* check if indx is Range */
6733 long beg, len = RSTRING_LEN(str);
6734
6735 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6736 case Qfalse:
6737 break;
6738 case Qnil:
6739 return Qnil;
6740 default:
6741 return str_byte_substr(str, beg, len, TRUE);
6742 }
6743
6744 idx = NUM2LONG(indx);
6745 }
6746 return str_byte_substr(str, idx, 1, FALSE);
6747}
6748
6749/*
6750 * call-seq:
6751 * byteslice(offset, length = 1) -> string or nil
6752 * byteslice(range) -> string or nil
6753 *
6754 * :include: doc/string/byteslice.rdoc
6755 */
6756
6757static VALUE
6758rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6759{
6760 if (argc == 2) {
6761 long beg = NUM2LONG(argv[0]);
6762 long len = NUM2LONG(argv[1]);
6763 return str_byte_substr(str, beg, len, TRUE);
6764 }
6765 rb_check_arity(argc, 1, 2);
6766 return str_byte_aref(str, argv[0]);
6767}
6768
6769static void
6770str_check_beg_len(VALUE str, long *beg, long *len)
6771{
6772 long end, slen = RSTRING_LEN(str);
6773
6774 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6775 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6776 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6777 }
6778 if (*beg < 0) {
6779 *beg += slen;
6780 }
6781 RUBY_ASSERT(*beg >= 0);
6782 RUBY_ASSERT(*beg <= slen);
6783
6784 if (*len > slen - *beg) {
6785 *len = slen - *beg;
6786 }
6787 end = *beg + *len;
6788 str_ensure_byte_pos(str, *beg);
6789 str_ensure_byte_pos(str, end);
6790}
6791
6792/*
6793 * call-seq:
6794 * bytesplice(offset, length, str) -> self
6795 * bytesplice(offset, length, str, str_offset, str_length) -> self
6796 * bytesplice(range, str) -> self
6797 * bytesplice(range, str, str_range) -> self
6798 *
6799 * :include: doc/string/bytesplice.rdoc
6800 */
6801
6802static VALUE
6803rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6804{
6805 long beg, len, vbeg, vlen;
6806 VALUE val;
6807 int cr;
6808
6809 rb_check_arity(argc, 2, 5);
6810 if (!(argc == 2 || argc == 3 || argc == 5)) {
6811 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6812 }
6813 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6814 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6815 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6816 rb_builtin_class_name(argv[0]));
6817 }
6818 val = argv[1];
6819 StringValue(val);
6820 if (argc == 2) {
6821 /* bytesplice(range, str) */
6822 vbeg = 0;
6823 vlen = RSTRING_LEN(val);
6824 }
6825 else {
6826 /* bytesplice(range, str, str_range) */
6827 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6828 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6829 rb_builtin_class_name(argv[2]));
6830 }
6831 }
6832 }
6833 else {
6834 beg = NUM2LONG(argv[0]);
6835 len = NUM2LONG(argv[1]);
6836 val = argv[2];
6837 StringValue(val);
6838 if (argc == 3) {
6839 /* bytesplice(index, length, str) */
6840 vbeg = 0;
6841 vlen = RSTRING_LEN(val);
6842 }
6843 else {
6844 /* bytesplice(index, length, str, str_index, str_length) */
6845 vbeg = NUM2LONG(argv[3]);
6846 vlen = NUM2LONG(argv[4]);
6847 }
6848 }
6849 str_check_beg_len(str, &beg, &len);
6850 str_check_beg_len(val, &vbeg, &vlen);
6851 str_modify_keep_cr(str);
6852
6853 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6854 rb_enc_associate(str, rb_enc_check(str, val));
6855 }
6856
6857 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6859 if (cr != ENC_CODERANGE_BROKEN)
6860 ENC_CODERANGE_SET(str, cr);
6861 return str;
6862}
6863
6864/*
6865 * call-seq:
6866 * reverse -> new_string
6867 *
6868 * Returns a new string with the characters from +self+ in reverse order.
6869 *
6870 * 'drawer'.reverse # => "reward"
6871 * 'reviled'.reverse # => "deliver"
6872 * 'stressed'.reverse # => "desserts"
6873 * 'semordnilaps'.reverse # => "spalindromes"
6874 *
6875 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6876 */
6877
6878static VALUE
6879rb_str_reverse(VALUE str)
6880{
6881 rb_encoding *enc;
6882 VALUE rev;
6883 char *s, *e, *p;
6884 int cr;
6885
6886 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6887 enc = STR_ENC_GET(str);
6888 rev = rb_str_new(0, RSTRING_LEN(str));
6889 s = RSTRING_PTR(str); e = RSTRING_END(str);
6890 p = RSTRING_END(rev);
6891 cr = ENC_CODERANGE(str);
6892
6893 if (RSTRING_LEN(str) > 1) {
6894 if (single_byte_optimizable(str)) {
6895 while (s < e) {
6896 *--p = *s++;
6897 }
6898 }
6899 else if (cr == ENC_CODERANGE_VALID) {
6900 while (s < e) {
6901 int clen = rb_enc_fast_mbclen(s, e, enc);
6902
6903 p -= clen;
6904 memcpy(p, s, clen);
6905 s += clen;
6906 }
6907 }
6908 else {
6909 cr = rb_enc_asciicompat(enc) ?
6911 while (s < e) {
6912 int clen = rb_enc_mbclen(s, e, enc);
6913
6914 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6915 p -= clen;
6916 memcpy(p, s, clen);
6917 s += clen;
6918 }
6919 }
6920 }
6921 STR_SET_LEN(rev, RSTRING_LEN(str));
6922 str_enc_copy_direct(rev, str);
6923 ENC_CODERANGE_SET(rev, cr);
6924
6925 return rev;
6926}
6927
6928
6929/*
6930 * call-seq:
6931 * reverse! -> self
6932 *
6933 * Returns +self+ with its characters reversed:
6934 *
6935 * 'drawer'.reverse! # => "reward"
6936 * 'reviled'.reverse! # => "deliver"
6937 * 'stressed'.reverse! # => "desserts"
6938 * 'semordnilaps'.reverse! # => "spalindromes"
6939 *
6940 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6941 */
6942
6943static VALUE
6944rb_str_reverse_bang(VALUE str)
6945{
6946 if (RSTRING_LEN(str) > 1) {
6947 if (single_byte_optimizable(str)) {
6948 char *s, *e, c;
6949
6950 str_modify_keep_cr(str);
6951 s = RSTRING_PTR(str);
6952 e = RSTRING_END(str) - 1;
6953 while (s < e) {
6954 c = *s;
6955 *s++ = *e;
6956 *e-- = c;
6957 }
6958 }
6959 else {
6960 str_shared_replace(str, rb_str_reverse(str));
6961 }
6962 }
6963 else {
6964 str_modify_keep_cr(str);
6965 }
6966 return str;
6967}
6968
6969
6970/*
6971 * call-seq:
6972 * include?(other_string) -> true or false
6973 *
6974 * Returns whether +self+ contains +other_string+:
6975 *
6976 * s = 'bar'
6977 * s.include?('ba') # => true
6978 * s.include?('ar') # => true
6979 * s.include?('bar') # => true
6980 * s.include?('a') # => true
6981 * s.include?('') # => true
6982 * s.include?('foo') # => false
6983 *
6984 * Related: see {Querying}[rdoc-ref:String@Querying].
6985 */
6986
6987VALUE
6988rb_str_include(VALUE str, VALUE arg)
6989{
6990 long i;
6991
6992 StringValue(arg);
6993 i = rb_str_index(str, arg, 0);
6994
6995 return RBOOL(i != -1);
6996}
6997
6998
6999/*
7000 * call-seq:
7001 * to_i(base = 10) -> integer
7002 *
7003 * Returns the result of interpreting leading characters in +self+
7004 * as an integer in the given +base+;
7005 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7006 *
7007 * '123456'.to_i # => 123456
7008 * '123def'.to_i(16) # => 1195503
7009 *
 *  With +base+ zero given, +self+ may contain a leading prefix
 *  that specifies the actual base:
7012 *
7013 * '123def'.to_i(0) # => 123
7014 * '0123def'.to_i(0) # => 83
7015 * '0b123def'.to_i(0) # => 1
7016 * '0o123def'.to_i(0) # => 83
7017 * '0d123def'.to_i(0) # => 123
7018 * '0x123def'.to_i(0) # => 1195503
7019 *
7020 * Characters past a leading valid number (in the given +base+) are ignored:
7021 *
7022 * '12.345'.to_i # => 12
7023 * '12345'.to_i(2) # => 1
7024 *
7025 * Returns zero if there is no leading valid number:
7026 *
7027 * 'abcdef'.to_i # => 0
7028 * '2'.to_i(2) # => 0
7029 *
7030 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7031 */
7032
7033static VALUE
7034rb_str_to_i(int argc, VALUE *argv, VALUE str)
7035{
7036 int base = 10;
7037
7038 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7039 rb_raise(rb_eArgError, "invalid radix %d", base);
7040 }
7041 return rb_str_to_inum(str, base, FALSE);
7042}
7043
7044
7045/*
7046 * call-seq:
7047 * to_f -> float
7048 *
7049 * Returns the result of interpreting leading characters in +self+ as a Float:
7050 *
7051 * '3.14159'.to_f # => 3.14159
7052 * '1.234e-2'.to_f # => 0.01234
7053 *
7054 * Characters past a leading valid number are ignored:
7055 *
7056 * '3.14 (pi to two places)'.to_f # => 3.14
7057 *
7058 * Returns zero if there is no leading valid number:
7059 *
7060 * 'abcdef'.to_f # => 0.0
7061 *
7062 * See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7063 */
7064
7065static VALUE
7066rb_str_to_f(VALUE str)
7067{
7068 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7069}
7070
7071
7072/*
7073 * call-seq:
7074 * to_s -> self or new_string
7075 *
 *  Returns +self+ if +self+ is an instance of +String+ itself,
 *  or a +String+ copy of +self+ if +self+ is an instance of a subclass of +String+.
7078 *
7079 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7080 */
7081
7082static VALUE
7083rb_str_to_s(VALUE str)
7084{
7085 if (rb_obj_class(str) != rb_cString) {
7086 return str_duplicate(rb_cString, str);
7087 }
7088 return str;
7089}
7090
#if 0
/*
 * Compiled out by the enclosing "#if 0".
 * Encodes code point c in enc and appends the resulting bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7102
7103#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7104
7105int
7106rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7107{
7108 char buf[CHAR_ESC_LEN + 1];
7109 int l;
7110
7111#if SIZEOF_INT > 4
7112 c &= 0xffffffff;
7113#endif
7114 if (unicode_p) {
7115 if (c < 0x7F && ISPRINT(c)) {
7116 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7117 }
7118 else if (c < 0x10000) {
7119 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7120 }
7121 else {
7122 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7123 }
7124 }
7125 else {
7126 if (c < 0x100) {
7127 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7128 }
7129 else {
7130 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7131 }
7132 }
7133 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7134 rb_str_buf_cat(result, buf, l);
7135 return l;
7136}
7137
/*
 * Returns the backslash escape sequence (as a C string) for the control
 * character +c+, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    /* indexed by character code; entries not listed are NULL */
    static const char *const escapes[0x80] = {
        ['\0'] = "\\0",
        ['\n'] = "\\n",
        ['\r'] = "\\r",
        ['\t'] = "\\t",
        ['\f'] = "\\f",
        [013]  = "\\v",
        [010]  = "\\b",
        [007]  = "\\a",
        [033]  = "\\e",
        [0x7f] = "\\c?",
    };

    if (c < 0 || c >= 0x80) {
        return NULL;
    }
    return escapes[c];
}
7155
7156VALUE
7157rb_str_escape(VALUE str)
7158{
7159 int encidx = ENCODING_GET(str);
7160 rb_encoding *enc = rb_enc_from_index(encidx);
7161 const char *p = RSTRING_PTR(str);
7162 const char *pend = RSTRING_END(str);
7163 const char *prev = p;
7164 char buf[CHAR_ESC_LEN + 1];
7165 VALUE result = rb_str_buf_new(0);
7166 int unicode_p = rb_enc_unicode_p(enc);
7167 int asciicompat = rb_enc_asciicompat(enc);
7168
7169 while (p < pend) {
7170 unsigned int c;
7171 const char *cc;
7172 int n = rb_enc_precise_mbclen(p, pend, enc);
7173 if (!MBCLEN_CHARFOUND_P(n)) {
7174 if (p > prev) str_buf_cat(result, prev, p - prev);
7175 n = rb_enc_mbminlen(enc);
7176 if (pend < p + n)
7177 n = (int)(pend - p);
7178 while (n--) {
7179 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7180 str_buf_cat(result, buf, strlen(buf));
7181 prev = ++p;
7182 }
7183 continue;
7184 }
7185 n = MBCLEN_CHARFOUND_LEN(n);
7186 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7187 p += n;
7188 cc = ruby_escaped_char(c);
7189 if (cc) {
7190 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7191 str_buf_cat(result, cc, strlen(cc));
7192 prev = p;
7193 }
7194 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7195 }
7196 else {
7197 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7198 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7199 prev = p;
7200 }
7201 }
7202 if (p > prev) str_buf_cat(result, prev, p - prev);
7203 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7204
7205 return result;
7206}
7207
7208/*
7209 * call-seq:
7210 * inspect -> string
7211 *
7212 * :include: doc/string/inspect.rdoc
7213 *
7214 */
7215
7216VALUE
7218{
7219 int encidx = ENCODING_GET(str);
7220 rb_encoding *enc = rb_enc_from_index(encidx);
7221 const char *p, *pend, *prev;
7222 char buf[CHAR_ESC_LEN + 1];
7223 VALUE result = rb_str_buf_new(0);
7224 rb_encoding *resenc = rb_default_internal_encoding();
7225 int unicode_p = rb_enc_unicode_p(enc);
7226 int asciicompat = rb_enc_asciicompat(enc);
7227
7228 if (resenc == NULL) resenc = rb_default_external_encoding();
7229 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7230 rb_enc_associate(result, resenc);
7231 str_buf_cat2(result, "\"");
7232
7233 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7234 prev = p;
7235 while (p < pend) {
7236 unsigned int c, cc;
7237 int n;
7238
7239 n = rb_enc_precise_mbclen(p, pend, enc);
7240 if (!MBCLEN_CHARFOUND_P(n)) {
7241 if (p > prev) str_buf_cat(result, prev, p - prev);
7242 n = rb_enc_mbminlen(enc);
7243 if (pend < p + n)
7244 n = (int)(pend - p);
7245 while (n--) {
7246 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7247 str_buf_cat(result, buf, strlen(buf));
7248 prev = ++p;
7249 }
7250 continue;
7251 }
7252 n = MBCLEN_CHARFOUND_LEN(n);
7253 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7254 p += n;
7255 if ((asciicompat || unicode_p) &&
7256 (c == '"'|| c == '\\' ||
7257 (c == '#' &&
7258 p < pend &&
7259 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7260 (cc = rb_enc_codepoint(p,pend,enc),
7261 (cc == '$' || cc == '@' || cc == '{'))))) {
7262 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7263 str_buf_cat2(result, "\\");
7264 if (asciicompat || enc == resenc) {
7265 prev = p - n;
7266 continue;
7267 }
7268 }
7269 switch (c) {
7270 case '\n': cc = 'n'; break;
7271 case '\r': cc = 'r'; break;
7272 case '\t': cc = 't'; break;
7273 case '\f': cc = 'f'; break;
7274 case '\013': cc = 'v'; break;
7275 case '\010': cc = 'b'; break;
7276 case '\007': cc = 'a'; break;
7277 case 033: cc = 'e'; break;
7278 default: cc = 0; break;
7279 }
7280 if (cc) {
7281 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7282 buf[0] = '\\';
7283 buf[1] = (char)cc;
7284 str_buf_cat(result, buf, 2);
7285 prev = p;
7286 continue;
7287 }
7288 /* The special casing of 0x85 (NEXT_LINE) here is because
7289 * Oniguruma historically treats it as printable, but it
7290 * doesn't match the print POSIX bracket class or character
7291 * property in regexps.
7292 *
7293 * See Ruby Bug #16842 for details:
7294 * https://bugs.ruby-lang.org/issues/16842
7295 */
7296 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7297 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7298 continue;
7299 }
7300 else {
7301 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7302 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7303 prev = p;
7304 continue;
7305 }
7306 }
7307 if (p > prev) str_buf_cat(result, prev, p - prev);
7308 str_buf_cat2(result, "\"");
7309
7310 return result;
7311}
7312
/*
 * True when p lies before e and points at '$', '@' or '{'.  Used below to
 * decide whether a preceding '#' has to be written with a backslash.
 * Note that p is evaluated more than once.
 */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7314
7315/*
7316 * call-seq:
7317 * dump -> new_string
7318 *
7319 * :include: doc/string/dump.rdoc
7320 *
7321 */
7322
7323VALUE
7325{
7326 int encidx = rb_enc_get_index(str);
7327 rb_encoding *enc = rb_enc_from_index(encidx);
7328 long len;
7329 const char *p, *pend;
7330 char *q, *qend;
7331 VALUE result;
7332 int u8 = (encidx == rb_utf8_encindex());
7333 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7334
7335 len = 2; /* "" */
7336 if (!rb_enc_asciicompat(enc)) {
7337 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7338 len += strlen(enc->name);
7339 }
7340
7341 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7342 while (p < pend) {
7343 int clen;
7344 unsigned char c = *p++;
7345
7346 switch (c) {
7347 case '"': case '\\':
7348 case '\n': case '\r':
7349 case '\t': case '\f':
7350 case '\013': case '\010': case '\007': case '\033':
7351 clen = 2;
7352 break;
7353
7354 case '#':
7355 clen = IS_EVSTR(p, pend) ? 2 : 1;
7356 break;
7357
7358 default:
7359 if (ISPRINT(c)) {
7360 clen = 1;
7361 }
7362 else {
7363 if (u8 && c > 0x7F) { /* \u notation */
7364 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7365 if (MBCLEN_CHARFOUND_P(n)) {
7366 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7367 if (cc <= 0xFFFF)
7368 clen = 6; /* \uXXXX */
7369 else if (cc <= 0xFFFFF)
7370 clen = 9; /* \u{XXXXX} */
7371 else
7372 clen = 10; /* \u{XXXXXX} */
7373 p += MBCLEN_CHARFOUND_LEN(n)-1;
7374 break;
7375 }
7376 }
7377 clen = 4; /* \xNN */
7378 }
7379 break;
7380 }
7381
7382 if (clen > LONG_MAX - len) {
7383 rb_raise(rb_eRuntimeError, "string size too big");
7384 }
7385 len += clen;
7386 }
7387
7388 result = rb_str_new(0, len);
7389 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7390 q = RSTRING_PTR(result); qend = q + len + 1;
7391
7392 *q++ = '"';
7393 while (p < pend) {
7394 unsigned char c = *p++;
7395
7396 if (c == '"' || c == '\\') {
7397 *q++ = '\\';
7398 *q++ = c;
7399 }
7400 else if (c == '#') {
7401 if (IS_EVSTR(p, pend)) *q++ = '\\';
7402 *q++ = '#';
7403 }
7404 else if (c == '\n') {
7405 *q++ = '\\';
7406 *q++ = 'n';
7407 }
7408 else if (c == '\r') {
7409 *q++ = '\\';
7410 *q++ = 'r';
7411 }
7412 else if (c == '\t') {
7413 *q++ = '\\';
7414 *q++ = 't';
7415 }
7416 else if (c == '\f') {
7417 *q++ = '\\';
7418 *q++ = 'f';
7419 }
7420 else if (c == '\013') {
7421 *q++ = '\\';
7422 *q++ = 'v';
7423 }
7424 else if (c == '\010') {
7425 *q++ = '\\';
7426 *q++ = 'b';
7427 }
7428 else if (c == '\007') {
7429 *q++ = '\\';
7430 *q++ = 'a';
7431 }
7432 else if (c == '\033') {
7433 *q++ = '\\';
7434 *q++ = 'e';
7435 }
7436 else if (ISPRINT(c)) {
7437 *q++ = c;
7438 }
7439 else {
7440 *q++ = '\\';
7441 if (u8) {
7442 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7443 if (MBCLEN_CHARFOUND_P(n)) {
7444 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7445 p += n;
7446 if (cc <= 0xFFFF)
7447 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7448 else
7449 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7450 q += strlen(q);
7451 continue;
7452 }
7453 }
7454 snprintf(q, qend-q, "x%02X", c);
7455 q += 3;
7456 }
7457 }
7458 *q++ = '"';
7459 *q = '\0';
7460 if (!rb_enc_asciicompat(enc)) {
7461 snprintf(q, qend-q, nonascii_suffix, enc->name);
7462 encidx = rb_ascii8bit_encindex();
7463 }
7464 /* result from dump is ASCII */
7465 rb_enc_associate_index(result, encidx);
7467 return result;
7468}
7469
/*
 * Maps the letter of a simple backslash escape (n, r, t, f, v, b, a, e) to
 * the control character it stands for.  Any other value yields -1, so the
 * function never falls off its end without returning.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* undump_after_backslash only passes the letters listed above */
        return -1;
    }
}
7493
7494static void
7495undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7496{
7497 const char *s = *ss;
7498 unsigned int c;
7499 int codelen;
7500 size_t hexlen;
7501 unsigned char buf[6];
7502 static rb_encoding *enc_utf8 = NULL;
7503
7504 switch (*s) {
7505 case '\\':
7506 case '"':
7507 case '#':
7508 rb_str_cat(undumped, s, 1); /* cat itself */
7509 s++;
7510 break;
7511 case 'n':
7512 case 'r':
7513 case 't':
7514 case 'f':
7515 case 'v':
7516 case 'b':
7517 case 'a':
7518 case 'e':
7519 *buf = unescape_ascii(*s);
7520 rb_str_cat(undumped, (char *)buf, 1);
7521 s++;
7522 break;
7523 case 'u':
7524 if (*binary) {
7525 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7526 }
7527 *utf8 = true;
7528 if (++s >= s_end) {
7529 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7530 }
7531 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7532 if (*penc != enc_utf8) {
7533 *penc = enc_utf8;
7534 rb_enc_associate(undumped, enc_utf8);
7535 }
7536 if (*s == '{') { /* handle \u{...} form */
7537 s++;
7538 for (;;) {
7539 if (s >= s_end) {
7540 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7541 }
7542 if (*s == '}') {
7543 s++;
7544 break;
7545 }
7546 if (ISSPACE(*s)) {
7547 s++;
7548 continue;
7549 }
7550 c = scan_hex(s, s_end-s, &hexlen);
7551 if (hexlen == 0 || hexlen > 6) {
7552 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7553 }
7554 if (c > 0x10ffff) {
7555 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7556 }
7557 if (0xd800 <= c && c <= 0xdfff) {
7558 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7559 }
7560 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7561 rb_str_cat(undumped, (char *)buf, codelen);
7562 s += hexlen;
7563 }
7564 }
7565 else { /* handle \uXXXX form */
7566 c = scan_hex(s, 4, &hexlen);
7567 if (hexlen != 4) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7569 }
7570 if (0xd800 <= c && c <= 0xdfff) {
7571 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7572 }
7573 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7574 rb_str_cat(undumped, (char *)buf, codelen);
7575 s += hexlen;
7576 }
7577 break;
7578 case 'x':
7579 if (*utf8) {
7580 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7581 }
7582 *binary = true;
7583 if (++s >= s_end) {
7584 rb_raise(rb_eRuntimeError, "invalid hex escape");
7585 }
7586 *buf = scan_hex(s, 2, &hexlen);
7587 if (hexlen != 2) {
7588 rb_raise(rb_eRuntimeError, "invalid hex escape");
7589 }
7590 rb_str_cat(undumped, (char *)buf, 1);
7591 s += hexlen;
7592 break;
7593 default:
7594 rb_str_cat(undumped, s-1, 2);
7595 s++;
7596 }
7597
7598 *ss = s;
7599}
7600
7601static VALUE rb_str_is_ascii_only_p(VALUE str);
7602
7603/*
7604 * call-seq:
7605 * undump -> new_string
7606 *
7607 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7608 *
7609 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7610 */
7611
7612static VALUE
7613str_undump(VALUE str)
7614{
7615 const char *s = RSTRING_PTR(str);
7616 const char *s_end = RSTRING_END(str);
7617 rb_encoding *enc = rb_enc_get(str);
7618 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7619 bool utf8 = false;
7620 bool binary = false;
7621 int w;
7622
7624 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7625 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7626 }
7627 if (!str_null_check(str, &w)) {
7628 rb_raise(rb_eRuntimeError, "string contains null byte");
7629 }
7630 if (RSTRING_LEN(str) < 2) goto invalid_format;
7631 if (*s != '"') goto invalid_format;
7632
7633 /* strip '"' at the start */
7634 s++;
7635
7636 for (;;) {
7637 if (s >= s_end) {
7638 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7639 }
7640
7641 if (*s == '"') {
7642 /* epilogue */
7643 s++;
7644 if (s == s_end) {
7645 /* ascii compatible dumped string */
7646 break;
7647 }
7648 else {
7649 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7650 static const char dup_suffix[] = ".dup";
7651 const char *encname;
7652 int encidx;
7653 ptrdiff_t size;
7654
7655 /* check separately for strings dumped by older versions */
7656 size = sizeof(dup_suffix) - 1;
7657 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7658
7659 size = sizeof(force_encoding_suffix) - 1;
7660 if (s_end - s <= size) goto invalid_format;
7661 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7662 s += size;
7663
7664 if (utf8) {
7665 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7666 }
7667
7668 encname = s;
7669 s = memchr(s, '"', s_end-s);
7670 size = s - encname;
7671 if (!s) goto invalid_format;
7672 if (s_end - s != 2) goto invalid_format;
7673 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7674
7675 encidx = rb_enc_find_index2(encname, (long)size);
7676 if (encidx < 0) {
7677 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7678 }
7679 rb_enc_associate_index(undumped, encidx);
7680 }
7681 break;
7682 }
7683
7684 if (*s == '\\') {
7685 s++;
7686 if (s >= s_end) {
7687 rb_raise(rb_eRuntimeError, "invalid escape");
7688 }
7689 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7690 }
7691 else {
7692 rb_str_cat(undumped, s++, 1);
7693 }
7694 }
7695
7696 RB_GC_GUARD(str);
7697
7698 return undumped;
7699invalid_format:
7700 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7701}
7702
7703static void
7704rb_str_check_dummy_enc(rb_encoding *enc)
7705{
7706 if (rb_enc_dummy_p(enc)) {
7707 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7708 rb_enc_name(enc));
7709 }
7710}
7711
7712static rb_encoding *
7713str_true_enc(VALUE str)
7714{
7715 rb_encoding *enc = STR_ENC_GET(str);
7716 rb_str_check_dummy_enc(enc);
7717 return enc;
7718}
7719
7720static OnigCaseFoldType
7721check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7722{
7723 if (argc==0)
7724 return flags;
7725 if (argc>2)
7726 rb_raise(rb_eArgError, "too many options");
7727 if (argv[0]==sym_turkic) {
7728 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7729 if (argc==2) {
7730 if (argv[1]==sym_lithuanian)
7731 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7732 else
7733 rb_raise(rb_eArgError, "invalid second option");
7734 }
7735 }
7736 else if (argv[0]==sym_lithuanian) {
7737 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7738 if (argc==2) {
7739 if (argv[1]==sym_turkic)
7740 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7741 else
7742 rb_raise(rb_eArgError, "invalid second option");
7743 }
7744 }
7745 else if (argc>1)
7746 rb_raise(rb_eArgError, "too many options");
7747 else if (argv[0]==sym_ascii)
7748 flags |= ONIGENC_CASE_ASCII_ONLY;
7749 else if (argv[0]==sym_fold) {
7750 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7751 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7752 else
7753 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7754 }
7755 else
7756 rb_raise(rb_eArgError, "invalid option");
7757 return flags;
7758}
7759
7760static inline bool
7761case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7762{
7763 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7764 return true;
7765 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7766}
7767
/* Slack added to the capacity of every case-mapping buffer.  16 bytes should
 * be enough to absorb the length increase of any single character; the value
 * used here is a little larger than that. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Non-zero enables the tuning output written to stderr by the case mappers. */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7773
7774struct mapping_buffer;
7775typedef struct mapping_buffer {
7776 size_t capa;
7777 size_t used;
7778 struct mapping_buffer *next;
7779 OnigUChar space[FLEX_ARY_LEN];
7781
7782static void
7783mapping_buffer_free(void *p)
7784{
7785 mapping_buffer *previous_buffer;
7786 mapping_buffer *current_buffer = p;
7787 while (current_buffer) {
7788 previous_buffer = current_buffer;
7789 current_buffer = current_buffer->next;
7790 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7791 }
7792}
7793
7794static const rb_data_type_t mapping_buffer_type = {
7795 "mapping_buffer",
7796 {0, mapping_buffer_free,},
7797 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7798};
7799
7800static VALUE
7801rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7802{
7803 VALUE target;
7804
7805 const OnigUChar *source_current, *source_end;
7806 int target_length = 0;
7807 VALUE buffer_anchor;
7808 mapping_buffer *current_buffer = 0;
7809 mapping_buffer **pre_buffer;
7810 size_t buffer_count = 0;
7811 int buffer_length_or_invalid;
7812
7813 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7814
7815 source_current = (OnigUChar*)RSTRING_PTR(source);
7816 source_end = (OnigUChar*)RSTRING_END(source);
7817
7818 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7819 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7820 while (source_current < source_end) {
7821 /* increase multiplier using buffer count to converge quickly */
7822 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7823 if (CASEMAP_DEBUG) {
7824 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7825 }
7826 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7827 *pre_buffer = current_buffer;
7828 pre_buffer = &current_buffer->next;
7829 current_buffer->next = NULL;
7830 current_buffer->capa = capa;
7831 buffer_length_or_invalid = enc->case_map(flags,
7832 &source_current, source_end,
7833 current_buffer->space,
7834 current_buffer->space+current_buffer->capa,
7835 enc);
7836 if (buffer_length_or_invalid < 0) {
7837 current_buffer = DATA_PTR(buffer_anchor);
7838 DATA_PTR(buffer_anchor) = 0;
7839 mapping_buffer_free(current_buffer);
7840 rb_raise(rb_eArgError, "input string invalid");
7841 }
7842 target_length += current_buffer->used = buffer_length_or_invalid;
7843 }
7844 if (CASEMAP_DEBUG) {
7845 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7846 }
7847
7848 if (buffer_count==1) {
7849 target = rb_str_new((const char*)current_buffer->space, target_length);
7850 }
7851 else {
7852 char *target_current;
7853
7854 target = rb_str_new(0, target_length);
7855 target_current = RSTRING_PTR(target);
7856 current_buffer = DATA_PTR(buffer_anchor);
7857 while (current_buffer) {
7858 memcpy(target_current, current_buffer->space, current_buffer->used);
7859 target_current += current_buffer->used;
7860 current_buffer = current_buffer->next;
7861 }
7862 }
7863 current_buffer = DATA_PTR(buffer_anchor);
7864 DATA_PTR(buffer_anchor) = 0;
7865 mapping_buffer_free(current_buffer);
7866
7867 RB_GC_GUARD(buffer_anchor);
7868
7869 /* TODO: check about string terminator character */
7870 str_enc_copy_direct(target, source);
7871 /*ENC_CODERANGE_SET(mapped, cr);*/
7872
7873 return target;
7874}
7875
7876static VALUE
7877rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7878{
7879 const OnigUChar *source_current, *source_end;
7880 OnigUChar *target_current, *target_end;
7881 long old_length = RSTRING_LEN(source);
7882 int length_or_invalid;
7883
7884 if (old_length == 0) return Qnil;
7885
7886 source_current = (OnigUChar*)RSTRING_PTR(source);
7887 source_end = (OnigUChar*)RSTRING_END(source);
7888 if (source == target) {
7889 target_current = (OnigUChar*)source_current;
7890 target_end = (OnigUChar*)source_end;
7891 }
7892 else {
7893 target_current = (OnigUChar*)RSTRING_PTR(target);
7894 target_end = (OnigUChar*)RSTRING_END(target);
7895 }
7896
7897 length_or_invalid = onigenc_ascii_only_case_map(flags,
7898 &source_current, source_end,
7899 target_current, target_end, enc);
7900 if (length_or_invalid < 0)
7901 rb_raise(rb_eArgError, "input string invalid");
7902 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7903 fprintf(stderr, "problem with rb_str_ascii_casemap"
7904 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7905 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7906 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7907 }
7908
7909 str_enc_copy(target, source);
7910
7911 return target;
7912}
7913
7914static bool
7915upcase_single(VALUE str)
7916{
7917 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7918 bool modified = false;
7919
7920 while (s < send) {
7921 unsigned int c = *(unsigned char*)s;
7922
7923 if ('a' <= c && c <= 'z') {
7924 *s = 'A' + (c - 'a');
7925 modified = true;
7926 }
7927 s++;
7928 }
7929 return modified;
7930}
7931
7932/*
7933 * call-seq:
7934 * upcase!(mapping) -> self or nil
7935 *
7936 * Like String#upcase, except that:
7937 *
7938 * - Changes character casings in +self+ (not in a copy of +self+).
7939 * - Returns +self+ if any changes are made, +nil+ otherwise.
7940 *
7941 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7942 */
7943
7944static VALUE
7945rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7946{
7947 rb_encoding *enc;
7948 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7949
7950 flags = check_case_options(argc, argv, flags);
7951 str_modify_keep_cr(str);
7952 enc = str_true_enc(str);
7953 if (case_option_single_p(flags, enc, str)) {
7954 if (upcase_single(str))
7955 flags |= ONIGENC_CASE_MODIFIED;
7956 }
7957 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7958 rb_str_ascii_casemap(str, str, &flags, enc);
7959 else
7960 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7961
7962 if (ONIGENC_CASE_MODIFIED&flags) return str;
7963 return Qnil;
7964}
7965
7966
7967/*
7968 * call-seq:
7969 * upcase(mapping = :ascii) -> new_string
7970 *
7971 * :include: doc/string/upcase.rdoc
7972 */
7973
7974static VALUE
7975rb_str_upcase(int argc, VALUE *argv, VALUE str)
7976{
7977 rb_encoding *enc;
7978 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7979 VALUE ret;
7980
7981 flags = check_case_options(argc, argv, flags);
7982 enc = str_true_enc(str);
7983 if (case_option_single_p(flags, enc, str)) {
7984 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7985 str_enc_copy_direct(ret, str);
7986 upcase_single(ret);
7987 }
7988 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7989 ret = rb_str_new(0, RSTRING_LEN(str));
7990 rb_str_ascii_casemap(str, ret, &flags, enc);
7991 }
7992 else {
7993 ret = rb_str_casemap(str, &flags, enc);
7994 }
7995
7996 return ret;
7997}
7998
7999static bool
8000downcase_single(VALUE str)
8001{
8002 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8003 bool modified = false;
8004
8005 while (s < send) {
8006 unsigned int c = *(unsigned char*)s;
8007
8008 if ('A' <= c && c <= 'Z') {
8009 *s = 'a' + (c - 'A');
8010 modified = true;
8011 }
8012 s++;
8013 }
8014
8015 return modified;
8016}
8017
8018/*
8019 * call-seq:
8020 * downcase!(mapping) -> self or nil
8021 *
8022 * Like String#downcase, except that:
8023 *
8024 * - Changes character casings in +self+ (not in a copy of +self+).
8025 * - Returns +self+ if any changes are made, +nil+ otherwise.
8026 *
8027 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8028 */
8029
8030static VALUE
8031rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8032{
8033 rb_encoding *enc;
8034 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8035
8036 flags = check_case_options(argc, argv, flags);
8037 str_modify_keep_cr(str);
8038 enc = str_true_enc(str);
8039 if (case_option_single_p(flags, enc, str)) {
8040 if (downcase_single(str))
8041 flags |= ONIGENC_CASE_MODIFIED;
8042 }
8043 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8044 rb_str_ascii_casemap(str, str, &flags, enc);
8045 else
8046 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8047
8048 if (ONIGENC_CASE_MODIFIED&flags) return str;
8049 return Qnil;
8050}
8051
8052
8053/*
8054 * call-seq:
8055 * downcase(mapping = :ascii) -> new_string
8056 *
8057 * :include: doc/string/downcase.rdoc
8058 *
8059 */
8060
8061static VALUE
8062rb_str_downcase(int argc, VALUE *argv, VALUE str)
8063{
8064 rb_encoding *enc;
8065 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8066 VALUE ret;
8067
8068 flags = check_case_options(argc, argv, flags);
8069 enc = str_true_enc(str);
8070 if (case_option_single_p(flags, enc, str)) {
8071 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8072 str_enc_copy_direct(ret, str);
8073 downcase_single(ret);
8074 }
8075 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8076 ret = rb_str_new(0, RSTRING_LEN(str));
8077 rb_str_ascii_casemap(str, ret, &flags, enc);
8078 }
8079 else {
8080 ret = rb_str_casemap(str, &flags, enc);
8081 }
8082
8083 return ret;
8084}
8085
8086
8087/*
8088 * call-seq:
8089 * capitalize!(mapping = :ascii) -> self or nil
8090 *
8091 * Like String#capitalize, except that:
8092 *
8093 * - Changes character casings in +self+ (not in a copy of +self+).
8094 * - Returns +self+ if any changes are made, +nil+ otherwise.
8095 *
8096 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8097 */
8098
8099static VALUE
8100rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8101{
8102 rb_encoding *enc;
8103 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8104
8105 flags = check_case_options(argc, argv, flags);
8106 str_modify_keep_cr(str);
8107 enc = str_true_enc(str);
8108 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8109 if (flags&ONIGENC_CASE_ASCII_ONLY)
8110 rb_str_ascii_casemap(str, str, &flags, enc);
8111 else
8112 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8113
8114 if (ONIGENC_CASE_MODIFIED&flags) return str;
8115 return Qnil;
8116}
8117
8118
8119/*
8120 * call-seq:
8121 * capitalize(mapping = :ascii) -> new_string
8122 *
8123 * :include: doc/string/capitalize.rdoc
8124 *
8125 */
8126
8127static VALUE
8128rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8129{
8130 rb_encoding *enc;
8131 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8132 VALUE ret;
8133
8134 flags = check_case_options(argc, argv, flags);
8135 enc = str_true_enc(str);
8136 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8137 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8138 ret = rb_str_new(0, RSTRING_LEN(str));
8139 rb_str_ascii_casemap(str, ret, &flags, enc);
8140 }
8141 else {
8142 ret = rb_str_casemap(str, &flags, enc);
8143 }
8144 return ret;
8145}
8146
8147
8148/*
8149 * call-seq:
8150 * swapcase!(mapping) -> self or nil
8151 *
8152 * Like String#swapcase, except that:
8153 *
 *  - Changes are made to +self+, not to a copy of +self+.
8155 * - Returns +self+ if any changes are made, +nil+ otherwise.
8156 *
8157 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8158 */
8159
8160static VALUE
8161rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8162{
8163 rb_encoding *enc;
8164 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8165
8166 flags = check_case_options(argc, argv, flags);
8167 str_modify_keep_cr(str);
8168 enc = str_true_enc(str);
8169 if (flags&ONIGENC_CASE_ASCII_ONLY)
8170 rb_str_ascii_casemap(str, str, &flags, enc);
8171 else
8172 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8173
8174 if (ONIGENC_CASE_MODIFIED&flags) return str;
8175 return Qnil;
8176}
8177
8178
8179/*
8180 * call-seq:
8181 * swapcase(mapping = :ascii) -> new_string
8182 *
8183 * :include: doc/string/swapcase.rdoc
8184 *
8185 */
8186
8187static VALUE
8188rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8189{
8190 rb_encoding *enc;
8191 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8192 VALUE ret;
8193
8194 flags = check_case_options(argc, argv, flags);
8195 enc = str_true_enc(str);
8196 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8197 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8198 ret = rb_str_new(0, RSTRING_LEN(str));
8199 rb_str_ascii_casemap(str, ret, &flags, enc);
8200 }
8201 else {
8202 ret = rb_str_casemap(str, &flags, enc);
8203 }
8204 return ret;
8205}
8206
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector, as consumed by trnext(). */
struct tr {
    int gen;            /* non-zero while the codepoints of a range are being generated */
    unsigned int now;   /* codepoint produced most recently */
    unsigned int max;   /* last codepoint of the range being generated */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
8214
8215static unsigned int
8216trnext(struct tr *t, rb_encoding *enc)
8217{
8218 int n;
8219
8220 for (;;) {
8221 nextpart:
8222 if (!t->gen) {
8223 if (t->p == t->pend) return -1;
8224 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8225 t->p += n;
8226 }
8227 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8228 t->p += n;
8229 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8230 t->p += n;
8231 if (t->p < t->pend) {
8232 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8233 t->p += n;
8234 if (t->now > c) {
8235 if (t->now < 0x80 && c < 0x80) {
8236 rb_raise(rb_eArgError,
8237 "invalid range \"%c-%c\" in string transliteration",
8238 t->now, c);
8239 }
8240 else {
8241 rb_raise(rb_eArgError, "invalid range in string transliteration");
8242 }
8243 continue; /* not reached */
8244 }
8245 else if (t->now < c) {
8246 t->gen = 1;
8247 t->max = c;
8248 }
8249 }
8250 }
8251 return t->now;
8252 }
8253 else {
8254 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8255 if (t->now == t->max) {
8256 t->gen = 0;
8257 goto nextpart;
8258 }
8259 }
8260 if (t->now < t->max) {
8261 return t->now;
8262 }
8263 else {
8264 t->gen = 0;
8265 return t->max;
8266 }
8267 }
8268 }
8269}
8270
8271static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8272
8273static VALUE
8274tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8275{
8276 const unsigned int errc = -1;
8277 unsigned int trans[256];
8278 rb_encoding *enc, *e1, *e2;
8279 struct tr trsrc, trrepl;
8280 int cflag = 0;
8281 unsigned int c, c0, last = 0;
8282 int modify = 0, i, l;
8283 unsigned char *s, *send;
8284 VALUE hash = 0;
8285 int singlebyte = single_byte_optimizable(str);
8286 int termlen;
8287 int cr;
8288
8289#define CHECK_IF_ASCII(c) \
8290 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8291 (cr = ENC_CODERANGE_VALID) : 0)
8292
8293 StringValue(src);
8294 StringValue(repl);
8295 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8296 if (RSTRING_LEN(repl) == 0) {
8297 return rb_str_delete_bang(1, &src, str);
8298 }
8299
8300 cr = ENC_CODERANGE(str);
8301 e1 = rb_enc_check(str, src);
8302 e2 = rb_enc_check(str, repl);
8303 if (e1 == e2) {
8304 enc = e1;
8305 }
8306 else {
8307 enc = rb_enc_check(src, repl);
8308 }
8309 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8310 if (RSTRING_LEN(src) > 1 &&
8311 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8312 trsrc.p + l < trsrc.pend) {
8313 cflag = 1;
8314 trsrc.p += l;
8315 }
8316 trrepl.p = RSTRING_PTR(repl);
8317 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8318 trsrc.gen = trrepl.gen = 0;
8319 trsrc.now = trrepl.now = 0;
8320 trsrc.max = trrepl.max = 0;
8321
8322 if (cflag) {
8323 for (i=0; i<256; i++) {
8324 trans[i] = 1;
8325 }
8326 while ((c = trnext(&trsrc, enc)) != errc) {
8327 if (c < 256) {
8328 trans[c] = errc;
8329 }
8330 else {
8331 if (!hash) hash = rb_hash_new();
8332 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8333 }
8334 }
8335 while ((c = trnext(&trrepl, enc)) != errc)
8336 /* retrieve last replacer */;
8337 last = trrepl.now;
8338 for (i=0; i<256; i++) {
8339 if (trans[i] != errc) {
8340 trans[i] = last;
8341 }
8342 }
8343 }
8344 else {
8345 unsigned int r;
8346
8347 for (i=0; i<256; i++) {
8348 trans[i] = errc;
8349 }
8350 while ((c = trnext(&trsrc, enc)) != errc) {
8351 r = trnext(&trrepl, enc);
8352 if (r == errc) r = trrepl.now;
8353 if (c < 256) {
8354 trans[c] = r;
8355 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8356 }
8357 else {
8358 if (!hash) hash = rb_hash_new();
8359 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8360 }
8361 }
8362 }
8363
8364 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8365 cr = ENC_CODERANGE_7BIT;
8366 str_modify_keep_cr(str);
8367 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8368 termlen = rb_enc_mbminlen(enc);
8369 if (sflag) {
8370 int clen, tlen;
8371 long offset, max = RSTRING_LEN(str);
8372 unsigned int save = -1;
8373 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8374
8375 while (s < send) {
8376 int may_modify = 0;
8377
8378 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8379 if (!MBCLEN_CHARFOUND_P(r)) {
8380 xfree(buf);
8381 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8382 }
8383 clen = MBCLEN_CHARFOUND_LEN(r);
8384 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8385
8386 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8387
8388 s += clen;
8389 if (c < 256) {
8390 c = trans[c];
8391 }
8392 else if (hash) {
8393 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8394 if (NIL_P(tmp)) {
8395 if (cflag) c = last;
8396 else c = errc;
8397 }
8398 else if (cflag) c = errc;
8399 else c = NUM2INT(tmp);
8400 }
8401 else {
8402 c = errc;
8403 }
8404 if (c != (unsigned int)-1) {
8405 if (save == c) {
8406 CHECK_IF_ASCII(c);
8407 continue;
8408 }
8409 save = c;
8410 tlen = rb_enc_codelen(c, enc);
8411 modify = 1;
8412 }
8413 else {
8414 save = -1;
8415 c = c0;
8416 if (enc != e1) may_modify = 1;
8417 }
8418 if ((offset = t - buf) + tlen > max) {
8419 size_t MAYBE_UNUSED(old) = max + termlen;
8420 max = offset + tlen + (send - s);
8421 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8422 t = buf + offset;
8423 }
8424 rb_enc_mbcput(c, t, enc);
8425 if (may_modify && memcmp(s, t, tlen) != 0) {
8426 modify = 1;
8427 }
8428 CHECK_IF_ASCII(c);
8429 t += tlen;
8430 }
8431 if (!STR_EMBED_P(str)) {
8432 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8433 }
8434 TERM_FILL((char *)t, termlen);
8435 RSTRING(str)->as.heap.ptr = (char *)buf;
8436 STR_SET_LEN(str, t - buf);
8437 STR_SET_NOEMBED(str);
8438 RSTRING(str)->as.heap.aux.capa = max;
8439 }
8440 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8441 while (s < send) {
8442 c = (unsigned char)*s;
8443 if (trans[c] != errc) {
8444 if (!cflag) {
8445 c = trans[c];
8446 *s = c;
8447 modify = 1;
8448 }
8449 else {
8450 *s = last;
8451 modify = 1;
8452 }
8453 }
8454 CHECK_IF_ASCII(c);
8455 s++;
8456 }
8457 }
8458 else {
8459 int clen, tlen;
8460 long offset, max = (long)((send - s) * 1.2);
8461 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8462
8463 while (s < send) {
8464 int may_modify = 0;
8465
8466 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8467 if (!MBCLEN_CHARFOUND_P(r)) {
8468 xfree(buf);
8469 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8470 }
8471 clen = MBCLEN_CHARFOUND_LEN(r);
8472 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8473
8474 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8475
8476 if (c < 256) {
8477 c = trans[c];
8478 }
8479 else if (hash) {
8480 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8481 if (NIL_P(tmp)) {
8482 if (cflag) c = last;
8483 else c = errc;
8484 }
8485 else if (cflag) c = errc;
8486 else c = NUM2INT(tmp);
8487 }
8488 else {
8489 c = cflag ? last : errc;
8490 }
8491 if (c != errc) {
8492 tlen = rb_enc_codelen(c, enc);
8493 modify = 1;
8494 }
8495 else {
8496 c = c0;
8497 if (enc != e1) may_modify = 1;
8498 }
8499 if ((offset = t - buf) + tlen > max) {
8500 size_t MAYBE_UNUSED(old) = max + termlen;
8501 max = offset + tlen + (long)((send - s) * 1.2);
8502 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8503 t = buf + offset;
8504 }
8505 if (s != t) {
8506 rb_enc_mbcput(c, t, enc);
8507 if (may_modify && memcmp(s, t, tlen) != 0) {
8508 modify = 1;
8509 }
8510 }
8511 CHECK_IF_ASCII(c);
8512 s += clen;
8513 t += tlen;
8514 }
8515 if (!STR_EMBED_P(str)) {
8516 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8517 }
8518 TERM_FILL((char *)t, termlen);
8519 RSTRING(str)->as.heap.ptr = (char *)buf;
8520 STR_SET_LEN(str, t - buf);
8521 STR_SET_NOEMBED(str);
8522 RSTRING(str)->as.heap.aux.capa = max;
8523 }
8524
8525 if (modify) {
8526 if (cr != ENC_CODERANGE_BROKEN)
8527 ENC_CODERANGE_SET(str, cr);
8528 rb_enc_associate(str, enc);
8529 return str;
8530 }
8531 return Qnil;
8532}
8533
8534
8535/*
8536 * call-seq:
8537 * tr!(selector, replacements) -> self or nil
8538 *
8539 * Like String#tr, except:
8540 *
8541 * - Performs substitutions in +self+ (not in a copy of +self+).
8542 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8543 *
8544 * Related: {Modifying}[rdoc-ref:String@Modifying].
8545 */
8546
8547static VALUE
8548rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8549{
8550 return tr_trans(str, src, repl, 0);
8551}
8552
8553
8554/*
8555 * call-seq:
8556 * tr(selector, replacements) -> new_string
8557 *
8558 * Returns a copy of +self+ with each character specified by string +selector+
8559 * translated to the corresponding character in string +replacements+.
8560 * The correspondence is _positional_:
8561 *
8562 * - Each occurrence of the first character specified by +selector+
8563 * is translated to the first character in +replacements+.
8564 * - Each occurrence of the second character specified by +selector+
8565 * is translated to the second character in +replacements+.
8566 * - And so on.
8567 *
8568 * Example:
8569 *
8570 * 'hello'.tr('el', 'ip') #=> "hippo"
8571 *
8572 * If +replacements+ is shorter than +selector+,
8573 * it is implicitly padded with its own last character:
8574 *
8575 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8576 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8577 *
8578 * Arguments +selector+ and +replacements+ must be valid character selectors
8579 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8580 * and may use any of its valid forms, including negation, ranges, and escapes:
8581 *
8582 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8583 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8584 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8585 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8586 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8587 *
8588 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8589 */
8590
8591static VALUE
8592rb_str_tr(VALUE str, VALUE src, VALUE repl)
8593{
8594 str = str_duplicate(rb_cString, str);
8595 tr_trans(str, src, repl, 0);
8596 return str;
8597}
8598
/* Number of code points covered by the flat table: 0..UCHAR_MAX. */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* Flat table length: one flag per low code point plus one trailing slot. */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string +str+ into the lookup state used by tr_find().
 *
 * stable   - flat flag table.  Entries below TR_TABLE_MAX are per code point;
 *            stable[TR_TABLE_MAX] stays set only while every selector seen so
 *            far started with '^'.
 * first    - non-zero for the first selector of a call sequence; the flat
 *            table is then reset to all ones before being narrowed.
 * tablep   - hash (or 0) for code points >= TR_TABLE_MAX coming from
 *            non-negated selectors.
 * ctablep  - hash (or 0) for code points >= TR_TABLE_MAX coming from negated
 *            selectors.
 * enc      - encoding used to decode +str+.
 *
 * Each call ANDs the flat table with the flags of this selector, so several
 * selectors combine as an intersection.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;   /* value of trnext() that ends the scan */
    char buf[TR_TABLE_MAX];         /* flags contributed by this selector only */
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* A leading '^' negates the selector, but only if more bytes follow it. */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        /* a non-negated selector clears the "all negated so far" slot */
        stable[TR_TABLE_MAX] = 0;
    }
    /* Start from "nothing selected" (or "everything selected" when negated). */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* Pick the hash for this selector lazily, on the first wide code point. */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* negated selectors keep accumulating into the same hash */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* non-negated selectors get a fresh hash; the old one is
                     * consulted below through ptable */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            /*
             * With no previous hash every key is recorded.  Otherwise a
             * non-negated key is recorded only if the previous hash had it,
             * and a negated key only if the previous hash did not.
             */
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* Narrow the accumulated flat table by this selector's flags. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* A non-negated selector that produced no hash drops the wide-character hash. */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
8663
8664
8665static int
8666tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8667{
8668 if (c < TR_TABLE_MAX) {
8669 return table[c] != 0;
8670 }
8671 else {
8672 VALUE v = UINT2NUM(c);
8673
8674 if (del) {
8675 if (!NIL_P(rb_hash_lookup(del, v)) &&
8676 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8677 return TRUE;
8678 }
8679 }
8680 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8681 return FALSE;
8682 }
8683 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8684 }
8685}
8686
8687/*
8688 * call-seq:
8689 * delete!(*selectors) -> self or nil
8690 *
8691 * Like String#delete, but modifies +self+ in place;
8692 * returns +self+ if any characters were deleted, +nil+ otherwise.
8693 *
8694 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8695 */
8696
8697static VALUE
8698rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8699{
8700 char squeez[TR_TABLE_SIZE];
8701 rb_encoding *enc = 0;
8702 char *s, *send, *t;
8703 VALUE del = 0, nodel = 0;
8704 int modify = 0;
8705 int i, ascompat, cr;
8706
8707 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8709 for (i=0; i<argc; i++) {
8710 VALUE s = argv[i];
8711
8712 StringValue(s);
8713 enc = rb_enc_check(str, s);
8714 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8715 }
8716
8717 str_modify_keep_cr(str);
8718 ascompat = rb_enc_asciicompat(enc);
8719 s = t = RSTRING_PTR(str);
8720 send = RSTRING_END(str);
8721 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8722 while (s < send) {
8723 unsigned int c;
8724 int clen;
8725
8726 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8727 if (squeez[c]) {
8728 modify = 1;
8729 }
8730 else {
8731 if (t != s) *t = c;
8732 t++;
8733 }
8734 s++;
8735 }
8736 else {
8737 c = rb_enc_codepoint_len(s, send, &clen, enc);
8738
8739 if (tr_find(c, squeez, del, nodel)) {
8740 modify = 1;
8741 }
8742 else {
8743 if (t != s) rb_enc_mbcput(c, t, enc);
8744 t += clen;
8746 }
8747 s += clen;
8748 }
8749 }
8750 TERM_FILL(t, TERM_LEN(str));
8751 STR_SET_LEN(str, t - RSTRING_PTR(str));
8752 ENC_CODERANGE_SET(str, cr);
8753
8754 if (modify) return str;
8755 return Qnil;
8756}
8757
8758
8759/*
8760 * call-seq:
8761 * delete(*selectors) -> new_string
8762 *
8763 * :include: doc/string/delete.rdoc
8764 *
8765 */
8766
8767static VALUE
8768rb_str_delete(int argc, VALUE *argv, VALUE str)
8769{
8770 str = str_duplicate(rb_cString, str);
8771 rb_str_delete_bang(argc, argv, str);
8772 return str;
8773}
8774
8775
8776/*
8777 * call-seq:
8778 * squeeze!(*selectors) -> self or nil
8779 *
8780 * Like String#squeeze, except that:
8781 *
8782 * - Characters are squeezed in +self+ (not in a copy of +self+).
8783 * - Returns +self+ if any changes are made, +nil+ otherwise.
8784 *
8785 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8786 */
8787
/*
 * Implements String#squeeze!: collapses runs of the same character in +str+
 * into a single occurrence, in place.  With no selectors every character is
 * eligible; otherwise only characters selected by the intersection of the
 * selector strings in +argv+ are squeezed.
 *
 * Returns +str+ when its length changed, Qnil otherwise (including when the
 * receiver is empty).
 */
static VALUE
rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
{
    char squeez[TR_TABLE_SIZE];
    rb_encoding *enc = 0;
    VALUE del = 0, nodel = 0;
    unsigned char *s, *send, *t;
    int i, modify = 0;
    int ascompat, singlebyte = single_byte_optimizable(str);
    unsigned int save;

    if (argc == 0) {
        /* no selectors: `squeez` is never consulted below (argc > 0 guards it) */
        enc = STR_ENC_GET(str);
    }
    else {
        for (i=0; i<argc; i++) {
            VALUE s = argv[i];

            StringValue(s);
            enc = rb_enc_check(str, s);
            /* the byte-wise fast path needs every selector to qualify as well */
            if (singlebyte && !single_byte_optimizable(s))
                singlebyte = 0;
            tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
        }
    }

    str_modify_keep_cr(str);
    /* s reads, t writes; both walk the same buffer */
    s = t = (unsigned char *)RSTRING_PTR(str);
    if (!s || RSTRING_LEN(str) == 0) return Qnil;
    send = (unsigned char *)RSTRING_END(str);
    save = -1;  /* last character written; -1 means none yet */
    ascompat = rb_enc_asciicompat(enc);

    if (singlebyte) {
        while (s < send) {
            unsigned int c = *s++;
            /* keep c unless it repeats the previous kept character and is squeezable */
            if (c != save || (argc > 0 && !squeez[c])) {
                *t++ = save = c;
            }
        }
    }
    else {
        while (s < send) {
            unsigned int c;
            int clen;

            if (ascompat && (c = *s) < 0x80) {
                if (c != save || (argc > 0 && !squeez[c])) {
                    *t++ = save = c;
                }
                s++;
            }
            else {
                c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);

                if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
                    if (t != s) rb_enc_mbcput(c, t, enc);
                    save = c;
                    t += clen;
                }
                s += clen;
            }
        }
    }

    TERM_FILL((char *)t, TERM_LEN(str));
    /* a shorter result is the only evidence that something was squeezed */
    if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
        STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
        modify = 1;
    }

    if (modify) return str;
    return Qnil;
}
8862
8863
8864/*
8865 * call-seq:
8866 * squeeze(*selectors) -> new_string
8867 *
8868 * :include: doc/string/squeeze.rdoc
8869 *
8870 */
8871
8872static VALUE
8873rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8874{
8875 str = str_duplicate(rb_cString, str);
8876 rb_str_squeeze_bang(argc, argv, str);
8877 return str;
8878}
8879
8880
8881/*
8882 * call-seq:
8883 * tr_s!(selector, replacements) -> self or nil
8884 *
8885 * Like String#tr_s, except:
8886 *
8887 * - Modifies +self+ in place (not a copy of +self+).
8888 * - Returns +self+ if any changes were made, +nil+ otherwise.
8889 *
8890 * Related: {Modifying}[rdoc-ref:String@Modifying].
8891 */
8892
8893static VALUE
8894rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8895{
8896 return tr_trans(str, src, repl, 1);
8897}
8898
8899
8900/*
8901 * call-seq:
8902 * tr_s(selector, replacements) -> new_string
8903 *
8904 * Like String#tr, except:
8905 *
8906 * - Also squeezes the modified portions of the translated string;
8907 * see String#squeeze.
8908 * - Returns the translated and squeezed string.
8909 *
8910 * Examples:
8911 *
8912 * 'hello'.tr_s('l', 'r') #=> "hero"
8913 * 'hello'.tr_s('el', '-') #=> "h-o"
8914 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8915 *
8916 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8917 *
8918 */
8919
8920static VALUE
8921rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8922{
8923 str = str_duplicate(rb_cString, str);
8924 tr_trans(str, src, repl, 1);
8925 return str;
8926}
8927
8928
8929/*
8930 * call-seq:
8931 * count(*selectors) -> integer
8932 *
8933 * :include: doc/string/count.rdoc
8934 */
8935
8936static VALUE
8937rb_str_count(int argc, VALUE *argv, VALUE str)
8938{
8939 char table[TR_TABLE_SIZE];
8940 rb_encoding *enc = 0;
8941 VALUE del = 0, nodel = 0, tstr;
8942 char *s, *send;
8943 int i;
8944 int ascompat;
8945 size_t n = 0;
8946
8948
8949 tstr = argv[0];
8950 StringValue(tstr);
8951 enc = rb_enc_check(str, tstr);
8952 if (argc == 1) {
8953 const char *ptstr;
8954 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8955 (ptstr = RSTRING_PTR(tstr),
8956 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8957 !is_broken_string(str)) {
8958 int clen;
8959 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8960
8961 s = RSTRING_PTR(str);
8962 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8963 send = RSTRING_END(str);
8964 while (s < send) {
8965 if (*(unsigned char*)s++ == c) n++;
8966 }
8967 return SIZET2NUM(n);
8968 }
8969 }
8970
8971 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8972 for (i=1; i<argc; i++) {
8973 tstr = argv[i];
8974 StringValue(tstr);
8975 enc = rb_enc_check(str, tstr);
8976 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8977 }
8978
8979 s = RSTRING_PTR(str);
8980 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8981 send = RSTRING_END(str);
8982 ascompat = rb_enc_asciicompat(enc);
8983 while (s < send) {
8984 unsigned int c;
8985
8986 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8987 if (table[c]) {
8988 n++;
8989 }
8990 s++;
8991 }
8992 else {
8993 int clen;
8994 c = rb_enc_codepoint_len(s, send, &clen, enc);
8995 if (tr_find(c, table, del, nodel)) {
8996 n++;
8997 }
8998 s += clen;
8999 }
9000 }
9001
9002 return SIZET2NUM(n);
9003}
9004
9005static VALUE
9006rb_fs_check(VALUE val)
9007{
9008 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9009 val = rb_check_string_type(val);
9010 if (NIL_P(val)) return 0;
9011 }
9012 return val;
9013}
9014
/*
 * Byte-indexed lookup table: non-zero for bytes 0x09..0x0d and 0x20,
 * zero for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9035
9036static long
9037split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9038{
9039 if (empty_count >= 0 && len == 0) {
9040 return empty_count + 1;
9041 }
9042 if (empty_count > 0) {
9043 /* make different substrings */
9044 if (result) {
9045 do {
9046 rb_ary_push(result, str_new_empty_String(str));
9047 } while (--empty_count > 0);
9048 }
9049 else {
9050 do {
9051 rb_yield(str_new_empty_String(str));
9052 } while (--empty_count > 0);
9053 }
9054 }
9055 str = rb_str_subseq(str, beg, len);
9056 if (result) {
9057 rb_ary_push(result, str);
9058 }
9059 else {
9060 rb_yield(str);
9061 }
9062 return empty_count;
9063}
9064
/* Strategy chosen by rb_str_split_m for cutting the receiver. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9068
9069static split_type_t
9070literal_split_pattern(VALUE spat, split_type_t default_type)
9071{
9072 rb_encoding *enc = STR_ENC_GET(spat);
9073 const char *ptr;
9074 long len;
9075 RSTRING_GETMEM(spat, ptr, len);
9076 if (len == 0) {
9077 /* Special case - split into chars */
9078 return SPLIT_TYPE_CHARS;
9079 }
9080 else if (rb_enc_asciicompat(enc)) {
9081 if (len == 1 && ptr[0] == ' ') {
9082 return SPLIT_TYPE_AWK;
9083 }
9084 }
9085 else {
9086 int l;
9087 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9088 return SPLIT_TYPE_AWK;
9089 }
9090 }
9091 return default_type;
9092}
9093
9094/*
9095 * call-seq:
9096 * split(field_sep = $;, limit = 0) -> array_of_substrings
9097 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9098 *
9099 * :include: doc/string/split.rdoc
9100 *
9101 */
9102
9103static VALUE
9104rb_str_split_m(int argc, VALUE *argv, VALUE str)
9105{
9106 rb_encoding *enc;
9107 VALUE spat;
9108 VALUE limit;
9109 split_type_t split_type;
9110 long beg, end, i = 0, empty_count = -1;
9111 int lim = 0;
9112 VALUE result, tmp;
9113
9114 result = rb_block_given_p() ? Qfalse : Qnil;
9115 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9116 lim = NUM2INT(limit);
9117 if (lim <= 0) limit = Qnil;
9118 else if (lim == 1) {
9119 if (RSTRING_LEN(str) == 0)
9120 return result ? rb_ary_new2(0) : str;
9121 tmp = str_duplicate(rb_cString, str);
9122 if (!result) {
9123 rb_yield(tmp);
9124 return str;
9125 }
9126 return rb_ary_new3(1, tmp);
9127 }
9128 i = 1;
9129 }
9130 if (NIL_P(limit) && !lim) empty_count = 0;
9131
9132 enc = STR_ENC_GET(str);
9133 split_type = SPLIT_TYPE_REGEXP;
9134 if (!NIL_P(spat)) {
9135 spat = get_pat_quoted(spat, 0);
9136 }
9137 else if (NIL_P(spat = rb_fs)) {
9138 split_type = SPLIT_TYPE_AWK;
9139 }
9140 else if (!(spat = rb_fs_check(spat))) {
9141 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9142 }
9143 else {
9144 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9145 }
9146 if (split_type != SPLIT_TYPE_AWK) {
9147 switch (BUILTIN_TYPE(spat)) {
9148 case T_REGEXP:
9149 rb_reg_options(spat); /* check if uninitialized */
9150 tmp = RREGEXP_SRC(spat);
9151 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9152 if (split_type == SPLIT_TYPE_AWK) {
9153 spat = tmp;
9154 split_type = SPLIT_TYPE_STRING;
9155 }
9156 break;
9157
9158 case T_STRING:
9159 mustnot_broken(spat);
9160 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9161 break;
9162
9163 default:
9165 }
9166 }
9167
9168#define SPLIT_STR(beg, len) ( \
9169 empty_count = split_string(result, str, beg, len, empty_count), \
9170 str_mod_check(str, str_start, str_len))
9171
9172 beg = 0;
9173 char *ptr = RSTRING_PTR(str);
9174 char *const str_start = ptr;
9175 const long str_len = RSTRING_LEN(str);
9176 char *const eptr = str_start + str_len;
9177 if (split_type == SPLIT_TYPE_AWK) {
9178 char *bptr = ptr;
9179 int skip = 1;
9180 unsigned int c;
9181
9182 if (result) result = rb_ary_new();
9183 end = beg;
9184 if (is_ascii_string(str)) {
9185 while (ptr < eptr) {
9186 c = (unsigned char)*ptr++;
9187 if (skip) {
9188 if (ascii_isspace(c)) {
9189 beg = ptr - bptr;
9190 }
9191 else {
9192 end = ptr - bptr;
9193 skip = 0;
9194 if (!NIL_P(limit) && lim <= i) break;
9195 }
9196 }
9197 else if (ascii_isspace(c)) {
9198 SPLIT_STR(beg, end-beg);
9199 skip = 1;
9200 beg = ptr - bptr;
9201 if (!NIL_P(limit)) ++i;
9202 }
9203 else {
9204 end = ptr - bptr;
9205 }
9206 }
9207 }
9208 else {
9209 while (ptr < eptr) {
9210 int n;
9211
9212 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9213 ptr += n;
9214 if (skip) {
9215 if (rb_isspace(c)) {
9216 beg = ptr - bptr;
9217 }
9218 else {
9219 end = ptr - bptr;
9220 skip = 0;
9221 if (!NIL_P(limit) && lim <= i) break;
9222 }
9223 }
9224 else if (rb_isspace(c)) {
9225 SPLIT_STR(beg, end-beg);
9226 skip = 1;
9227 beg = ptr - bptr;
9228 if (!NIL_P(limit)) ++i;
9229 }
9230 else {
9231 end = ptr - bptr;
9232 }
9233 }
9234 }
9235 }
9236 else if (split_type == SPLIT_TYPE_STRING) {
9237 char *substr_start = ptr;
9238 char *sptr = RSTRING_PTR(spat);
9239 long slen = RSTRING_LEN(spat);
9240
9241 if (result) result = rb_ary_new();
9242 mustnot_broken(str);
9243 enc = rb_enc_check(str, spat);
9244 while (ptr < eptr &&
9245 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9246 /* Check we are at the start of a char */
9247 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9248 if (t != ptr + end) {
9249 ptr = t;
9250 continue;
9251 }
9252 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9253 str_mod_check(spat, sptr, slen);
9254 ptr += end + slen;
9255 substr_start = ptr;
9256 if (!NIL_P(limit) && lim <= ++i) break;
9257 }
9258 beg = ptr - str_start;
9259 }
9260 else if (split_type == SPLIT_TYPE_CHARS) {
9261 int n;
9262
9263 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9264 mustnot_broken(str);
9265 enc = rb_enc_get(str);
9266 while (ptr < eptr &&
9267 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9268 SPLIT_STR(ptr - str_start, n);
9269 ptr += n;
9270 if (!NIL_P(limit) && lim <= ++i) break;
9271 }
9272 beg = ptr - str_start;
9273 }
9274 else {
9275 if (result) result = rb_ary_new();
9276 long len = RSTRING_LEN(str);
9277 long start = beg;
9278 long idx;
9279 int last_null = 0;
9280 struct re_registers *regs;
9281 VALUE match = 0;
9282
9283 for (; rb_reg_search(spat, str, start, 0) >= 0;
9284 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9285 match = rb_backref_get();
9286 if (!result) rb_match_busy(match);
9287 regs = RMATCH_REGS(match);
9288 end = BEG(0);
9289 if (start == end && BEG(0) == END(0)) {
9290 if (!ptr) {
9291 SPLIT_STR(0, 0);
9292 break;
9293 }
9294 else if (last_null == 1) {
9295 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9296 beg = start;
9297 }
9298 else {
9299 if (start == len)
9300 start++;
9301 else
9302 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9303 last_null = 1;
9304 continue;
9305 }
9306 }
9307 else {
9308 SPLIT_STR(beg, end-beg);
9309 beg = start = END(0);
9310 }
9311 last_null = 0;
9312
9313 for (idx=1; idx < regs->num_regs; idx++) {
9314 if (BEG(idx) == -1) continue;
9315 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9316 }
9317 if (!NIL_P(limit) && lim <= ++i) break;
9318 }
9319 if (match) rb_match_unbusy(match);
9320 }
9321 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9322 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9323 }
9324
9325 return result ? result : str;
9326}
9327
9328VALUE
9329rb_str_split(VALUE str, const char *sep0)
9330{
9331 VALUE sep;
9332
9333 StringValue(str);
9334 sep = rb_str_new_cstr(sep0);
9335 return rb_str_split_m(1, &sep, str);
9336}
9337
9338#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9339
9340static inline int
9341enumerator_element(VALUE ary, VALUE e)
9342{
9343 if (ary) {
9344 rb_ary_push(ary, e);
9345 return 0;
9346 }
9347 else {
9348 rb_yield(e);
9349 return 1;
9350 }
9351}
9352
9353#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9354
9355static const char *
9356chomp_newline(const char *p, const char *e, rb_encoding *enc)
9357{
9358 const char *prev = rb_enc_prev_char(p, e, e, enc);
9359 if (rb_enc_is_newline(prev, e, enc)) {
9360 e = prev;
9361 prev = rb_enc_prev_char(p, e, e, enc);
9362 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9363 e = prev;
9364 }
9365 return e;
9366}
9367
9368static VALUE
9369get_rs(void)
9370{
9371 VALUE rs = rb_rs;
9372 if (!NIL_P(rs) &&
9373 (!RB_TYPE_P(rs, T_STRING) ||
9374 RSTRING_LEN(rs) != 1 ||
9375 RSTRING_PTR(rs)[0] != '\n')) {
9376 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9377 }
9378 return rs;
9379}
9380
9381#define rb_rs get_rs()
9382
/*
 * Shared worker for String#each_line and String#lines.
 *
 * argv holds the optional record separator and the optional chomp: keyword.
 * Each line is pushed onto +ary+ when it is non-zero and yielded otherwise.
 * Returns +ary+ when collecting and the original receiver when yielding.
 *
 * Three modes are handled: a nil separator (the whole string is one line),
 * an empty separator (paragraph mode) and a non-empty separator.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* from here on chomp is a plain C truth value */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    if (NIL_P(rs)) {
        /* nil separator: the receiver itself is the single line */
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over a frozen string so the pointers below are taken from it,
     * not from the receiver */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;     /* end of the newline seen just before subend */
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of a leading '\r', or 0 when there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a newline right after a newline ends the paragraph */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* first non-newline character starts the paragraph */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            /* rslen is 0 here unless the inner loop left through the break above */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                /* the line was yielded: verify str against the saved pointer/length */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* remember whether the separator is exactly one newline character */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* the default separator is re-encoded for non-ASCII-compatible receivers */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* ignore byte-level hits that do not start on a character boundary */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing text after the last separator */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9522
9523/*
9524 * call-seq:
9525 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9526 * each_line(record_separator = $/, chomp: false) -> enumerator
9527 *
9528 * :include: doc/string/each_line.rdoc
9529 *
9530 */
9531
9532static VALUE
9533rb_str_each_line(int argc, VALUE *argv, VALUE str)
9534{
9535 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9536 return rb_str_enumerate_lines(argc, argv, str, 0);
9537}
9538
9539/*
9540 * call-seq:
9541 * lines(record_separator = $/, chomp: false) -> array_of_strings
9542 *
9543 * Returns substrings ("lines") of +self+
9544 * according to the given arguments:
9545 *
9546 * s = <<~EOT
9547 * This is the first line.
9548 * This is line two.
9549 *
9550 * This is line four.
9551 * This is line five.
9552 * EOT
9553 *
9554 * With the default argument values:
9555 *
9556 * $/ # => "\n"
9557 * s.lines
9558 * # =>
9559 * ["This is the first line.\n",
9560 * "This is line two.\n",
9561 * "\n",
9562 * "This is line four.\n",
9563 * "This is line five.\n"]
9564 *
9565 * With a different +record_separator+:
9566 *
9567 * record_separator = ' is '
9568 * s.lines(record_separator)
9569 * # =>
9570 * ["This is ",
9571 * "the first line.\nThis is ",
9572 * "line two.\n\nThis is ",
9573 * "line four.\nThis is ",
9574 * "line five.\n"]
9575 *
9576 * With keyword argument +chomp+ as +true+,
9577 * removes the trailing newline from each line:
9578 *
9579 * s.lines(chomp: true)
9580 * # =>
9581 * ["This is the first line.",
9582 * "This is line two.",
9583 * "",
9584 * "This is line four.",
9585 * "This is line five."]
9586 *
9587 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9588 */
9589
9590static VALUE
9591rb_str_lines(int argc, VALUE *argv, VALUE str)
9592{
9593 VALUE ary = WANTARRAY("lines", 0);
9594 return rb_str_enumerate_lines(argc, argv, str, ary);
9595}
9596
9597static VALUE
9598rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9599{
9600 return LONG2FIX(RSTRING_LEN(str));
9601}
9602
9603static VALUE
9604rb_str_enumerate_bytes(VALUE str, VALUE ary)
9605{
9606 long i;
9607
9608 for (i=0; i<RSTRING_LEN(str); i++) {
9609 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9610 }
9611 if (ary)
9612 return ary;
9613 else
9614 return str;
9615}
9616
9617/*
9618 * call-seq:
9619 * each_byte {|byte| ... } -> self
9620 * each_byte -> enumerator
9621 *
9622 * :include: doc/string/each_byte.rdoc
9623 *
9624 */
9625
9626static VALUE
9627rb_str_each_byte(VALUE str)
9628{
9629 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9630 return rb_str_enumerate_bytes(str, 0);
9631}
9632
9633/*
9634 * call-seq:
9635 * bytes -> array_of_bytes
9636 *
9637 * :include: doc/string/bytes.rdoc
9638 *
9639 */
9640
9641static VALUE
9642rb_str_bytes(VALUE str)
9643{
9644 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9645 return rb_str_enumerate_bytes(str, ary);
9646}
9647
9648static VALUE
9649rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9650{
9651 return rb_str_length(str);
9652}
9653
9654static VALUE
9655rb_str_enumerate_chars(VALUE str, VALUE ary)
9656{
9657 VALUE orig = str;
9658 long i, len, n;
9659 const char *ptr;
9660 rb_encoding *enc;
9661
9662 str = rb_str_new_frozen(str);
9663 ptr = RSTRING_PTR(str);
9664 len = RSTRING_LEN(str);
9665 enc = rb_enc_get(str);
9666
9668 for (i = 0; i < len; i += n) {
9669 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9670 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9671 }
9672 }
9673 else {
9674 for (i = 0; i < len; i += n) {
9675 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9676 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9677 }
9678 }
9679 RB_GC_GUARD(str);
9680 if (ary)
9681 return ary;
9682 else
9683 return orig;
9684}
9685
9686/*
9687 * call-seq:
9688 * each_char {|char| ... } -> self
9689 * each_char -> enumerator
9690 *
9691 * :include: doc/string/each_char.rdoc
9692 *
9693 */
9694
9695static VALUE
9696rb_str_each_char(VALUE str)
9697{
9698 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9699 return rb_str_enumerate_chars(str, 0);
9700}
9701
9702/*
9703 * call-seq:
9704 * chars -> array_of_characters
9705 *
9706 * :include: doc/string/chars.rdoc
9707 *
9708 */
9709
9710static VALUE
9711rb_str_chars(VALUE str)
9712{
9713 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9714 return rb_str_enumerate_chars(str, ary);
9715}
9716
9717static VALUE
9718rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9719{
9720 VALUE orig = str;
9721 int n;
9722 unsigned int c;
9723 const char *ptr, *end;
9724 rb_encoding *enc;
9725
9726 if (single_byte_optimizable(str))
9727 return rb_str_enumerate_bytes(str, ary);
9728
9729 str = rb_str_new_frozen(str);
9730 ptr = RSTRING_PTR(str);
9731 end = RSTRING_END(str);
9732 enc = STR_ENC_GET(str);
9733
9734 while (ptr < end) {
9735 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9736 ENUM_ELEM(ary, UINT2NUM(c));
9737 ptr += n;
9738 }
9739 RB_GC_GUARD(str);
9740 if (ary)
9741 return ary;
9742 else
9743 return orig;
9744}
9745
9746/*
9747 * call-seq:
9748 * each_codepoint {|codepoint| ... } -> self
9749 * each_codepoint -> enumerator
9750 *
9751 * :include: doc/string/each_codepoint.rdoc
9752 *
9753 */
9754
9755static VALUE
9756rb_str_each_codepoint(VALUE str)
9757{
9758 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9759 return rb_str_enumerate_codepoints(str, 0);
9760}
9761
9762/*
9763 * call-seq:
9764 * codepoints -> array_of_integers
9765 *
9766 * :include: doc/string/codepoints.rdoc
9767 *
9768 */
9769
9770static VALUE
9771rb_str_codepoints(VALUE str)
9772{
9773 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9774 return rb_str_enumerate_codepoints(str, ary);
9775}
9776
/*
 * Compiles the pattern "\X" for encoding +enc+ with onig_new() and returns
 * the new regex.  Compilation failure is fatal (rb_fatal).
 *
 * Every call builds a fresh regex_t.  Callers in this file either keep the
 * result in a static cache or release it with onig_free().
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    /* default spelling of the pattern: the two bytes '\\' and 'X' */
    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    size_t source_len = sizeof(source_ascii) - 1;

    /*
     * For the UTF-16 and UTF-32 encodings the pattern is spelled out in that
     * encoding's code units, in the matching byte order.
     */
    switch (encidx) {
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* a non-zero result is an error code; turn it into a message and abort */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9818
9819static regex_t *
9820get_cached_reg_grapheme_cluster(rb_encoding *enc)
9821{
9822 int encidx = rb_enc_to_index(enc);
9823 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9824
9825 if (encidx == rb_utf8_encindex()) {
9826 if (!reg_grapheme_cluster_utf8) {
9827 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9828 }
9829
9830 return reg_grapheme_cluster_utf8;
9831 }
9832
9833 return NULL;
9834}
9835
9836static VALUE
9837rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9838{
9839 size_t grapheme_cluster_count = 0;
9840 rb_encoding *enc = get_encoding(str);
9841 const char *ptr, *end;
9842
9843 if (!rb_enc_unicode_p(enc)) {
9844 return rb_str_length(str);
9845 }
9846
9847 bool cached_reg_grapheme_cluster = true;
9848 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9849 if (!reg_grapheme_cluster) {
9850 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9851 cached_reg_grapheme_cluster = false;
9852 }
9853
9854 ptr = RSTRING_PTR(str);
9855 end = RSTRING_END(str);
9856
9857 while (ptr < end) {
9858 OnigPosition len = onig_match(reg_grapheme_cluster,
9859 (const OnigUChar *)ptr, (const OnigUChar *)end,
9860 (const OnigUChar *)ptr, NULL, 0);
9861 if (len <= 0) break;
9862 grapheme_cluster_count++;
9863 ptr += len;
9864 }
9865
9866 if (!cached_reg_grapheme_cluster) {
9867 onig_free(reg_grapheme_cluster);
9868 }
9869
9870 return SIZET2NUM(grapheme_cluster_count);
9871}
9872
9873static VALUE
9874rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9875{
9876 VALUE orig = str;
9877 rb_encoding *enc = get_encoding(str);
9878 const char *ptr0, *ptr, *end;
9879
9880 if (!rb_enc_unicode_p(enc)) {
9881 return rb_str_enumerate_chars(str, ary);
9882 }
9883
9884 if (!ary) str = rb_str_new_frozen(str);
9885
9886 bool cached_reg_grapheme_cluster = true;
9887 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9888 if (!reg_grapheme_cluster) {
9889 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9890 cached_reg_grapheme_cluster = false;
9891 }
9892
9893 ptr0 = ptr = RSTRING_PTR(str);
9894 end = RSTRING_END(str);
9895
9896 while (ptr < end) {
9897 OnigPosition len = onig_match(reg_grapheme_cluster,
9898 (const OnigUChar *)ptr, (const OnigUChar *)end,
9899 (const OnigUChar *)ptr, NULL, 0);
9900 if (len <= 0) break;
9901 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9902 ptr += len;
9903 }
9904
9905 if (!cached_reg_grapheme_cluster) {
9906 onig_free(reg_grapheme_cluster);
9907 }
9908
9909 RB_GC_GUARD(str);
9910 if (ary)
9911 return ary;
9912 else
9913 return orig;
9914}
9915
9916/*
9917 * call-seq:
9918 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9919 * each_grapheme_cluster -> enumerator
9920 *
9921 * :include: doc/string/each_grapheme_cluster.rdoc
9922 *
9923 */
9924
9925static VALUE
9926rb_str_each_grapheme_cluster(VALUE str)
9927{
9928 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9929 return rb_str_enumerate_grapheme_clusters(str, 0);
9930}
9931
9932/*
9933 * call-seq:
9934 * grapheme_clusters -> array_of_grapheme_clusters
9935 *
9936 * :include: doc/string/grapheme_clusters.rdoc
9937 *
9938 */
9939
9940static VALUE
9941rb_str_grapheme_clusters(VALUE str)
9942{
9943 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9944 return rb_str_enumerate_grapheme_clusters(str, ary);
9945}
9946
9947static long
9948chopped_length(VALUE str)
9949{
9950 rb_encoding *enc = STR_ENC_GET(str);
9951 const char *p, *p2, *beg, *end;
9952
9953 beg = RSTRING_PTR(str);
9954 end = beg + RSTRING_LEN(str);
9955 if (beg >= end) return 0;
9956 p = rb_enc_prev_char(beg, end, end, enc);
9957 if (!p) return 0;
9958 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9959 p2 = rb_enc_prev_char(beg, p, end, enc);
9960 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9961 }
9962 return p - beg;
9963}
9964
9965/*
9966 * call-seq:
9967 * chop! -> self or nil
9968 *
9969 * Like String#chop, except that:
9970 *
9971 * - Removes trailing characters from +self+ (not from a copy of +self+).
9972 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9973 *
9974 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9975 */
9976
9977static VALUE
9978rb_str_chop_bang(VALUE str)
9979{
9980 str_modify_keep_cr(str);
9981 if (RSTRING_LEN(str) > 0) {
9982 long len;
9983 len = chopped_length(str);
9984 STR_SET_LEN(str, len);
9985 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9986 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9988 }
9989 return str;
9990 }
9991 return Qnil;
9992}
9993
9994
9995/*
9996 * call-seq:
9997 * chop -> new_string
9998 *
9999 * :include: doc/string/chop.rdoc
10000 *
10001 */
10002
10003static VALUE
10004rb_str_chop(VALUE str)
10005{
10006 return rb_str_subseq(str, 0, chopped_length(str));
10007}
10008
10009static long
10010smart_chomp(VALUE str, const char *e, const char *p)
10011{
10012 rb_encoding *enc = rb_enc_get(str);
10013 if (rb_enc_mbminlen(enc) > 1) {
10014 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10015 if (rb_enc_is_newline(pp, e, enc)) {
10016 e = pp;
10017 }
10018 pp = e - rb_enc_mbminlen(enc);
10019 if (pp >= p) {
10020 pp = rb_enc_left_char_head(p, pp, e, enc);
10021 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10022 e = pp;
10023 }
10024 }
10025 }
10026 else {
10027 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10028 case '\n':
10029 if (--e > p && *(e-1) == '\r') {
10030 --e;
10031 }
10032 break;
10033 case '\r':
10034 --e;
10035 break;
10036 }
10037 }
10038 return e - p;
10039}
10040
10041static long
10042chompped_length(VALUE str, VALUE rs)
10043{
10044 rb_encoding *enc;
10045 int newline;
10046 char *pp, *e, *rsptr;
10047 long rslen;
10048 char *const p = RSTRING_PTR(str);
10049 long len = RSTRING_LEN(str);
10050
10051 if (len == 0) return 0;
10052 e = p + len;
10053 if (rs == rb_default_rs) {
10054 return smart_chomp(str, e, p);
10055 }
10056
10057 enc = rb_enc_get(str);
10058 RSTRING_GETMEM(rs, rsptr, rslen);
10059 if (rslen == 0) {
10060 if (rb_enc_mbminlen(enc) > 1) {
10061 while (e > p) {
10062 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10063 if (!rb_enc_is_newline(pp, e, enc)) break;
10064 e = pp;
10065 pp -= rb_enc_mbminlen(enc);
10066 if (pp >= p) {
10067 pp = rb_enc_left_char_head(p, pp, e, enc);
10068 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10069 e = pp;
10070 }
10071 }
10072 }
10073 }
10074 else {
10075 while (e > p && *(e-1) == '\n') {
10076 --e;
10077 if (e > p && *(e-1) == '\r')
10078 --e;
10079 }
10080 }
10081 return e - p;
10082 }
10083 if (rslen > len) return len;
10084
10085 enc = rb_enc_get(rs);
10086 newline = rsptr[rslen-1];
10087 if (rslen == rb_enc_mbminlen(enc)) {
10088 if (rslen == 1) {
10089 if (newline == '\n')
10090 return smart_chomp(str, e, p);
10091 }
10092 else {
10093 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10094 return smart_chomp(str, e, p);
10095 }
10096 }
10097
10098 enc = rb_enc_check(str, rs);
10099 if (is_broken_string(rs)) {
10100 return len;
10101 }
10102 pp = e - rslen;
10103 if (p[len-1] == newline &&
10104 (rslen <= 1 ||
10105 memcmp(rsptr, pp, rslen) == 0)) {
10106 if (at_char_boundary(p, pp, e, enc))
10107 return len - rslen;
10108 RB_GC_GUARD(rs);
10109 }
10110 return len;
10111}
10112
10118static VALUE
10119chomp_rs(int argc, const VALUE *argv)
10120{
10121 rb_check_arity(argc, 0, 1);
10122 if (argc > 0) {
10123 VALUE rs = argv[0];
10124 if (!NIL_P(rs)) StringValue(rs);
10125 return rs;
10126 }
10127 else {
10128 return rb_rs;
10129 }
10130}
10131
10132VALUE
10133rb_str_chomp_string(VALUE str, VALUE rs)
10134{
10135 long olen = RSTRING_LEN(str);
10136 long len = chompped_length(str, rs);
10137 if (len >= olen) return Qnil;
10138 str_modify_keep_cr(str);
10139 STR_SET_LEN(str, len);
10140 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10141 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10143 }
10144 return str;
10145}
10146
10147/*
10148 * call-seq:
10149 * chomp!(line_sep = $/) -> self or nil
10150 *
10151 * Like String#chomp, except that:
10152 *
10153 * - Removes trailing characters from +self+ (not from a copy of +self+).
10154 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10155 *
10156 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10157 */
10158
10159static VALUE
10160rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10161{
10162 VALUE rs;
10163 str_modifiable(str);
10164 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10165 rs = chomp_rs(argc, argv);
10166 if (NIL_P(rs)) return Qnil;
10167 return rb_str_chomp_string(str, rs);
10168}
10169
10170
10171/*
10172 * call-seq:
10173 * chomp(line_sep = $/) -> new_string
10174 *
10175 * :include: doc/string/chomp.rdoc
10176 *
10177 */
10178
10179static VALUE
10180rb_str_chomp(int argc, VALUE *argv, VALUE str)
10181{
10182 VALUE rs = chomp_rs(argc, argv);
10183 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10184 return rb_str_subseq(str, 0, chompped_length(str, rs));
10185}
10186
10187static long
10188lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10189{
10190 const char *const start = s;
10191
10192 if (!s || s >= e) return 0;
10193
10194 /* remove spaces at head */
10195 if (single_byte_optimizable(str)) {
10196 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10197 }
10198 else {
10199 while (s < e) {
10200 int n;
10201 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10202
10203 if (cc && !rb_isspace(cc)) break;
10204 s += n;
10205 }
10206 }
10207 return s - start;
10208}
10209
10210/*
10211 * call-seq:
10212 * lstrip! -> self or nil
10213 *
10214 * Like String#lstrip, except that:
10215 *
10216 * - Performs stripping in +self+ (not in a copy of +self+).
10217 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10218 *
10219 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10220 */
10221
10222static VALUE
10223rb_str_lstrip_bang(VALUE str)
10224{
10225 rb_encoding *enc;
10226 char *start, *s;
10227 long olen, loffset;
10228
10229 str_modify_keep_cr(str);
10230 enc = STR_ENC_GET(str);
10231 RSTRING_GETMEM(str, start, olen);
10232 loffset = lstrip_offset(str, start, start+olen, enc);
10233 if (loffset > 0) {
10234 long len = olen-loffset;
10235 s = start + loffset;
10236 memmove(start, s, len);
10237 STR_SET_LEN(str, len);
10238 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10239 return str;
10240 }
10241 return Qnil;
10242}
10243
10244
10245/*
10246 * call-seq:
10247 * lstrip -> new_string
10248 *
10249 * Returns a copy of +self+ with leading whitespace removed;
10250 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10251 *
10252 * whitespace = "\x00\t\n\v\f\r "
10253 * s = whitespace + 'abc' + whitespace
10254 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10255 * s.lstrip
10256 * # => "abc\u0000\t\n\v\f\r "
10257 *
10258 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10259 */
10260
10261static VALUE
10262rb_str_lstrip(VALUE str)
10263{
10264 char *start;
10265 long len, loffset;
10266 RSTRING_GETMEM(str, start, len);
10267 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10268 if (loffset <= 0) return str_duplicate(rb_cString, str);
10269 return rb_str_subseq(str, loffset, len - loffset);
10270}
10271
10272static long
10273rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10274{
10275 const char *t;
10276
10277 rb_str_check_dummy_enc(enc);
10279 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10280 }
10281 if (!s || s >= e) return 0;
10282 t = e;
10283
10284 /* remove trailing spaces or '\0's */
10285 if (single_byte_optimizable(str)) {
10286 unsigned char c;
10287 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10288 }
10289 else {
10290 char *tp;
10291
10292 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10293 unsigned int c = rb_enc_codepoint(tp, e, enc);
10294 if (c && !rb_isspace(c)) break;
10295 t = tp;
10296 }
10297 }
10298 return e - t;
10299}
10300
10301/*
10302 * call-seq:
10303 * rstrip! -> self or nil
10304 *
10305 * Like String#rstrip, except that:
10306 *
10307 * - Performs stripping in +self+ (not in a copy of +self+).
10308 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10309 *
10310 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10311 */
10312
10313static VALUE
10314rb_str_rstrip_bang(VALUE str)
10315{
10316 rb_encoding *enc;
10317 char *start;
10318 long olen, roffset;
10319
10320 str_modify_keep_cr(str);
10321 enc = STR_ENC_GET(str);
10322 RSTRING_GETMEM(str, start, olen);
10323 roffset = rstrip_offset(str, start, start+olen, enc);
10324 if (roffset > 0) {
10325 long len = olen - roffset;
10326
10327 STR_SET_LEN(str, len);
10328 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10329 return str;
10330 }
10331 return Qnil;
10332}
10333
10334
10335/*
10336 * call-seq:
10337 * rstrip -> new_string
10338 *
10339 * Returns a copy of +self+ with trailing whitespace removed;
10340 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10341 *
10342 * whitespace = "\x00\t\n\v\f\r "
10343 * s = whitespace + 'abc' + whitespace
10344 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10345 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10346 *
10347 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10348 */
10349
10350static VALUE
10351rb_str_rstrip(VALUE str)
10352{
10353 rb_encoding *enc;
10354 char *start;
10355 long olen, roffset;
10356
10357 enc = STR_ENC_GET(str);
10358 RSTRING_GETMEM(str, start, olen);
10359 roffset = rstrip_offset(str, start, start+olen, enc);
10360
10361 if (roffset <= 0) return str_duplicate(rb_cString, str);
10362 return rb_str_subseq(str, 0, olen-roffset);
10363}
10364
10365
10366/*
10367 * call-seq:
10368 * strip! -> self or nil
10369 *
10370 * Like String#strip, except that:
10371 *
10372 * - Any modifications are made to +self+.
 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10374 *
10375 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10376 */
10377
10378static VALUE
10379rb_str_strip_bang(VALUE str)
10380{
10381 char *start;
10382 long olen, loffset, roffset;
10383 rb_encoding *enc;
10384
10385 str_modify_keep_cr(str);
10386 enc = STR_ENC_GET(str);
10387 RSTRING_GETMEM(str, start, olen);
10388 loffset = lstrip_offset(str, start, start+olen, enc);
10389 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10390
10391 if (loffset > 0 || roffset > 0) {
10392 long len = olen-roffset;
10393 if (loffset > 0) {
10394 len -= loffset;
10395 memmove(start, start + loffset, len);
10396 }
10397 STR_SET_LEN(str, len);
10398 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10399 return str;
10400 }
10401 return Qnil;
10402}
10403
10404
10405/*
10406 * call-seq:
10407 * strip -> new_string
10408 *
10409 * Returns a copy of +self+ with leading and trailing whitespace removed;
10410 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10411 *
10412 * whitespace = "\x00\t\n\v\f\r "
10413 * s = whitespace + 'abc' + whitespace
10414 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10415 * s.strip # => "abc"
10416 *
10417 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10418 */
10419
10420static VALUE
10421rb_str_strip(VALUE str)
10422{
10423 char *start;
10424 long olen, loffset, roffset;
10425 rb_encoding *enc = STR_ENC_GET(str);
10426
10427 RSTRING_GETMEM(str, start, olen);
10428 loffset = lstrip_offset(str, start, start+olen, enc);
10429 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10430
10431 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10432 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10433}
10434
10435static VALUE
10436scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10437{
10438 VALUE result = Qnil;
10439 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10440 if (pos >= 0) {
10441 VALUE match;
10442 struct re_registers *regs;
10443 if (BUILTIN_TYPE(pat) == T_STRING) {
10444 regs = NULL;
10445 end = pos + RSTRING_LEN(pat);
10446 }
10447 else {
10448 match = rb_backref_get();
10449 regs = RMATCH_REGS(match);
10450 pos = BEG(0);
10451 end = END(0);
10452 }
10453
10454 if (pos == end) {
10455 rb_encoding *enc = STR_ENC_GET(str);
10456 /*
10457 * Always consume at least one character of the input string
10458 */
10459 if (RSTRING_LEN(str) > end)
10460 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10461 RSTRING_END(str), enc);
10462 else
10463 *start = end + 1;
10464 }
10465 else {
10466 *start = end;
10467 }
10468
10469 if (!regs || regs->num_regs == 1) {
10470 result = rb_str_subseq(str, pos, end - pos);
10471 return result;
10472 }
10473 else {
10474 result = rb_ary_new2(regs->num_regs);
10475 for (int i = 1; i < regs->num_regs; i++) {
10476 VALUE s = Qnil;
10477 if (BEG(i) >= 0) {
10478 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10479 }
10480
10481 rb_ary_push(result, s);
10482 }
10483 }
10484
10485 RB_GC_GUARD(match);
10486 }
10487
10488 return result;
10489}
10490
10491
10492/*
10493 * call-seq:
10494 * scan(pattern) -> array_of_results
10495 * scan(pattern) {|result| ... } -> self
10496 *
10497 * :include: doc/string/scan.rdoc
10498 *
10499 */
10500
10501static VALUE
10502rb_str_scan(VALUE str, VALUE pat)
10503{
10504 VALUE result;
10505 long start = 0;
10506 long last = -1, prev = 0;
10507 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10508
10509 pat = get_pat_quoted(pat, 1);
10510 mustnot_broken(str);
10511 if (!rb_block_given_p()) {
10512 VALUE ary = rb_ary_new();
10513
10514 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10515 last = prev;
10516 prev = start;
10517 rb_ary_push(ary, result);
10518 }
10519 if (last >= 0) rb_pat_search(pat, str, last, 1);
10520 else rb_backref_set(Qnil);
10521 return ary;
10522 }
10523
10524 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10525 last = prev;
10526 prev = start;
10527 rb_yield(result);
10528 str_mod_check(str, p, len);
10529 }
10530 if (last >= 0) rb_pat_search(pat, str, last, 1);
10531 return str;
10532}
10533
10534
10535/*
10536 * call-seq:
10537 * hex -> integer
10538 *
10539 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10540 * returns its value as an integer.
10541 *
10542 * The leading substring is interpreted as hexadecimal when it begins with:
10543 *
 * - One or more characters representing hexadecimal digits
10545 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10546 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10547 *
10548 * 'f'.hex # => 15
10549 * '11'.hex # => 17
10550 * 'FFF'.hex # => 4095
10551 * 'fffg'.hex # => 4095
10552 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10553 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10554 * 'deadbeef'.hex # => 3735928559
10555 *
10556 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10557 *
10558 * '0xfff'.hex # => 4095
10559 * '0xfffg'.hex # => 4095
10560 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10562 *
10563 * '-fff'.hex # => -4095
10564 * '-0xFFF'.hex # => -4095
10565 *
10566 * For any substring not described above, returns zero:
10567 *
10568 * 'xxx'.hex # => 0
10569 * ''.hex # => 0
10570 *
10571 * Note that, unlike #oct, this method interprets only hexadecimal,
10572 * and not binary, octal, or decimal notations:
10573 *
10574 * '0b111'.hex # => 45329
10575 * '0o777'.hex # => 0
10576 * '0d999'.hex # => 55705
10577 *
10578 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10579 */
10580
10581static VALUE
10582rb_str_hex(VALUE str)
10583{
10584 return rb_str_to_inum(str, 16, FALSE);
10585}
10586
10587
10588/*
10589 * call-seq:
10590 * oct -> integer
10591 *
10592 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
 * returns its value as an integer.
10594 *
10595 * In brief:
10596 *
10597 * # Interpreted as octal.
10598 * '777'.oct # => 511
10599 * '777x'.oct # => 511
10600 * '0777'.oct # => 511
10601 * '0o777'.oct # => 511
10602 * '-777'.oct # => -511
10603 * # Not interpreted as octal.
10604 * '0b111'.oct # => 7 # Interpreted as binary.
10605 * '0d999'.oct # => 999 # Interpreted as decimal.
10606 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10607 *
10608 * The leading substring is interpreted as octal when it begins with:
10609 *
 * - One or more characters representing octal digits
10611 * (each in the range <tt>'0'..'7'</tt>);
10612 * the string to be interpreted ends at the first character that does not represent an octal digit:
10613 *
 *     '7'.oct      # => 7
10615 * '11'.oct # => 9
10616 * '777'.oct # => 511
10617 * '0777'.oct # => 511
10618 * '7778'.oct # => 511
10619 * '777x'.oct # => 511
10620 *
10621 * - <tt>'0o'</tt>, followed by one or more octal digits:
10622 *
10623 * '0o777'.oct # => 511
10624 * '0o7778'.oct # => 511
10625 *
10626 * The leading substring is _not_ interpreted as octal when it begins with:
10627 *
10628 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10629 * (each in the range <tt>'0'..'1'</tt>);
 *   the string to be interpreted ends at the first character that does not represent a binary digit;
 *   the string is interpreted as binary digits (base 2):
10632 *
10633 * '0b111'.oct # => 7
10634 * '0b1112'.oct # => 7
10635 *
10636 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10637 * (each in the range <tt>'0'..'9'</tt>);
 *   the string to be interpreted ends at the first character that does not represent a decimal digit;
 *   the string is interpreted as decimal digits (base 10):
10640 *
10641 * '0d999'.oct # => 999
10642 * '0d999x'.oct # => 999
10643 *
10644 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10645 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
 *   the string to be interpreted ends at the first character that does not represent a hexadecimal digit;
 *   the string is interpreted as hexadecimal digits (base 16):
10648 *
10649 * '0xfff'.oct # => 4095
10650 * '0xfffg'.oct # => 4095
10651 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10653 *
10654 * '-777'.oct # => -511
10655 * '-0777'.oct # => -511
10656 * '-0b111'.oct # => -7
10657 * '-0xfff'.oct # => -4095
10658 *
10659 * For any substring not described above, returns zero:
10660 *
10661 * 'foo'.oct # => 0
10662 * ''.oct # => 0
10663 *
10664 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10665 */
10666
10667static VALUE
10668rb_str_oct(VALUE str)
10669{
10670 return rb_str_to_inum(str, -8, FALSE);
10671}
10672
10673#ifndef HAVE_CRYPT_R
10674# include "ruby/thread_native.h"
10675# include "ruby/atomic.h"
10676
10677static struct {
10678 rb_nativethread_lock_t lock;
10679} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10680#endif
10681
10682/*
10683 * call-seq:
10684 * crypt(salt_str) -> new_string
10685 *
10686 * Returns the string generated by calling <code>crypt(3)</code>
10687 * standard library function with <code>str</code> and
10688 * <code>salt_str</code>, in this order, as its arguments. Please do
10689 * not use this method any longer. It is legacy; provided only for
10690 * backward compatibility with ruby scripts in earlier days. It is
10691 * bad to use in contemporary programs for several reasons:
10692 *
10693 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10694 * run. The generated string lacks data portability.
10695 *
10696 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10697 * (i.e. silently ends up in unexpected results).
10698 *
10699 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10700 * thread safe.
10701 *
10702 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10703 * very very weak. According to its manpage, Linux's traditional
10704 * <code>crypt(3)</code> output has only 2**56 variations; too
10705 * easy to brute force today. And this is the default behaviour.
10706 *
10707 * * In order to make things robust some OSes implement so-called
10708 * "modular" usage. To go through, you have to do a complex
10709 * build-up of the <code>salt_str</code> parameter, by hand.
10710 * Failure in generation of a proper salt string tends not to
10711 * yield any errors; typos in parameters are normally not
10712 * detectable.
10713 *
10714 * * For instance, in the following example, the second invocation
10715 * of String#crypt is wrong; it has a typo in "round=" (lacks
10716 * "s"). However the call does not fail and something unexpected
10717 * is generated.
10718 *
10719 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10720 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10721 *
10722 * * Even in the "modular" mode, some hash functions are considered
10723 * archaic and no longer recommended at all; for instance module
10724 * <code>$1$</code> is officially abandoned by its author: see
10725 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10726 * instance module <code>$3$</code> is considered completely
10727 * broken: see the manpage of FreeBSD.
10728 *
 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10730 * written above, <code>crypt(3)</code> on Mac OS never fails.
10731 * This means even if you build up a proper salt string it
10732 * generates a traditional DES hash anyways, and there is no way
10733 * for you to be aware of.
10734 *
10735 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10736 *
10737 * If for some reason you cannot migrate to other secure contemporary
10738 * password hashing algorithms, install the string-crypt gem and
10739 * <code>require 'string/crypt'</code> to continue using it.
10740 */
10741
10742static VALUE
10743rb_str_crypt(VALUE str, VALUE salt)
10744{
10745#ifdef HAVE_CRYPT_R
10746 VALUE databuf;
10747 struct crypt_data *data;
10748# define CRYPT_END() ALLOCV_END(databuf)
10749#else
10750 char *tmp_buf;
10751 extern char *crypt(const char *, const char *);
10752# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10753#endif
10754 VALUE result;
10755 const char *s, *saltp;
10756 char *res;
10757#ifdef BROKEN_CRYPT
10758 char salt_8bit_clean[3];
10759#endif
10760
10761 StringValue(salt);
10762 mustnot_wchar(str);
10763 mustnot_wchar(salt);
10764 s = StringValueCStr(str);
10765 saltp = RSTRING_PTR(salt);
10766 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10767 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10768 }
10769
10770#ifdef BROKEN_CRYPT
10771 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10772 salt_8bit_clean[0] = saltp[0] & 0x7f;
10773 salt_8bit_clean[1] = saltp[1] & 0x7f;
10774 salt_8bit_clean[2] = '\0';
10775 saltp = salt_8bit_clean;
10776 }
10777#endif
10778#ifdef HAVE_CRYPT_R
10779 data = ALLOCV(databuf, sizeof(struct crypt_data));
10780# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10781 data->initialized = 0;
10782# endif
10783 res = crypt_r(s, saltp, data);
10784#else
10785 rb_nativethread_lock_lock(&crypt_mutex.lock);
10786 res = crypt(s, saltp);
10787#endif
10788 if (!res) {
10789 int err = errno;
10790 CRYPT_END();
10791 rb_syserr_fail(err, "crypt");
10792 }
10793#ifdef HAVE_CRYPT_R
10794 result = rb_str_new_cstr(res);
10795 CRYPT_END();
10796#else
10797 // We need to copy this buffer because it's static and we need to unlock the mutex
10798 // before allocating a new object (the string to be returned). If we allocate while
10799 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10800 // if other ractors are waiting on this lock.
10801 size_t res_size = strlen(res)+1;
10802 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10803 memcpy(tmp_buf, res, res_size);
10804 res = tmp_buf;
10805 CRYPT_END();
10806 result = rb_str_new_cstr(res);
10807#endif
10808 return result;
10809}
10810
10811
10812/*
10813 * call-seq:
10814 * ord -> integer
10815 *
10816 * :include: doc/string/ord.rdoc
10817 *
10818 */
10819
10820static VALUE
10821rb_str_ord(VALUE s)
10822{
10823 unsigned int c;
10824
10825 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10826 return UINT2NUM(c);
10827}
10828/*
10829 * call-seq:
10830 * sum(n = 16) -> integer
10831 *
10832 * :include: doc/string/sum.rdoc
10833 *
10834 */
10835
10836static VALUE
10837rb_str_sum(int argc, VALUE *argv, VALUE str)
10838{
10839 int bits = 16;
10840 char *ptr, *p, *pend;
10841 long len;
10842 VALUE sum = INT2FIX(0);
10843 unsigned long sum0 = 0;
10844
10845 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10846 bits = 0;
10847 }
10848 ptr = p = RSTRING_PTR(str);
10849 len = RSTRING_LEN(str);
10850 pend = p + len;
10851
10852 while (p < pend) {
10853 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10854 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10855 str_mod_check(str, ptr, len);
10856 sum0 = 0;
10857 }
10858 sum0 += (unsigned char)*p;
10859 p++;
10860 }
10861
10862 if (bits == 0) {
10863 if (sum0) {
10864 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10865 }
10866 }
10867 else {
10868 if (sum == INT2FIX(0)) {
10869 if (bits < (int)sizeof(long)*CHAR_BIT) {
10870 sum0 &= (((unsigned long)1)<<bits)-1;
10871 }
10872 sum = LONG2FIX(sum0);
10873 }
10874 else {
10875 VALUE mod;
10876
10877 if (sum0) {
10878 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10879 }
10880
10881 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10882 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10883 sum = rb_funcall(sum, '&', 1, mod);
10884 }
10885 }
10886 return sum;
10887}
10888
10889static VALUE
10890rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10891{
10892 rb_encoding *enc;
10893 VALUE w;
10894 long width, len, flen = 1, fclen = 1;
10895 VALUE res;
10896 char *p;
10897 const char *f = " ";
10898 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10899 VALUE pad;
10900 int singlebyte = 1, cr;
10901 int termlen;
10902
10903 rb_scan_args(argc, argv, "11", &w, &pad);
10904 enc = STR_ENC_GET(str);
10905 termlen = rb_enc_mbminlen(enc);
10906 width = NUM2LONG(w);
10907 if (argc == 2) {
10908 StringValue(pad);
10909 enc = rb_enc_check(str, pad);
10910 f = RSTRING_PTR(pad);
10911 flen = RSTRING_LEN(pad);
10912 fclen = str_strlen(pad, enc); /* rb_enc_check */
10913 singlebyte = single_byte_optimizable(pad);
10914 if (flen == 0 || fclen == 0) {
10915 rb_raise(rb_eArgError, "zero width padding");
10916 }
10917 }
10918 len = str_strlen(str, enc); /* rb_enc_check */
10919 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10920 n = width - len;
10921 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10922 rlen = n - llen;
10923 cr = ENC_CODERANGE(str);
10924 if (flen > 1) {
10925 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10926 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10927 }
10928 size = RSTRING_LEN(str);
10929 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10930 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10931 (len += llen2 + rlen2) >= LONG_MAX - size) {
10932 rb_raise(rb_eArgError, "argument too big");
10933 }
10934 len += size;
10935 res = str_enc_new(rb_cString, 0, len, enc);
10936 p = RSTRING_PTR(res);
10937 if (flen <= 1) {
10938 memset(p, *f, llen);
10939 p += llen;
10940 }
10941 else {
10942 while (llen >= fclen) {
10943 memcpy(p,f,flen);
10944 p += flen;
10945 llen -= fclen;
10946 }
10947 if (llen > 0) {
10948 memcpy(p, f, llen2);
10949 p += llen2;
10950 }
10951 }
10952 memcpy(p, RSTRING_PTR(str), size);
10953 p += size;
10954 if (flen <= 1) {
10955 memset(p, *f, rlen);
10956 p += rlen;
10957 }
10958 else {
10959 while (rlen >= fclen) {
10960 memcpy(p,f,flen);
10961 p += flen;
10962 rlen -= fclen;
10963 }
10964 if (rlen > 0) {
10965 memcpy(p, f, rlen2);
10966 p += rlen2;
10967 }
10968 }
10969 TERM_FILL(p, termlen);
10970 STR_SET_LEN(res, p-RSTRING_PTR(res));
10971
10972 if (argc == 2)
10973 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10974 if (cr != ENC_CODERANGE_BROKEN)
10975 ENC_CODERANGE_SET(res, cr);
10976
10977 RB_GC_GUARD(pad);
10978 return res;
10979}
10980
10981
10982/*
10983 * call-seq:
10984 * ljust(width, pad_string = ' ') -> new_string
10985 *
10986 * :include: doc/string/ljust.rdoc
10987 *
10988 */
10989
10990static VALUE
10991rb_str_ljust(int argc, VALUE *argv, VALUE str)
10992{
10993 return rb_str_justify(argc, argv, str, 'l');
10994}
10995
10996/*
10997 * call-seq:
10998 * rjust(width, pad_string = ' ') -> new_string
10999 *
11000 * :include: doc/string/rjust.rdoc
11001 *
11002 */
11003
11004static VALUE
11005rb_str_rjust(int argc, VALUE *argv, VALUE str)
11006{
11007 return rb_str_justify(argc, argv, str, 'r');
11008}
11009
11010
11011/*
11012 * call-seq:
11013 * center(size, pad_string = ' ') -> new_string
11014 *
11015 * :include: doc/string/center.rdoc
11016 *
11017 */
11018
11019static VALUE
11020rb_str_center(int argc, VALUE *argv, VALUE str)
11021{
11022 return rb_str_justify(argc, argv, str, 'c');
11023}
11024
11025/*
11026 * call-seq:
11027 * partition(pattern) -> [pre_match, first_match, post_match]
11028 *
11029 * :include: doc/string/partition.rdoc
11030 *
11031 */
11032
11033static VALUE
11034rb_str_partition(VALUE str, VALUE sep)
11035{
11036 long pos;
11037
11038 sep = get_pat_quoted(sep, 0);
11039 if (RB_TYPE_P(sep, T_REGEXP)) {
11040 if (rb_reg_search(sep, str, 0, 0) < 0) {
11041 goto failed;
11042 }
11043 VALUE match = rb_backref_get();
11044 struct re_registers *regs = RMATCH_REGS(match);
11045
11046 pos = BEG(0);
11047 sep = rb_str_subseq(str, pos, END(0) - pos);
11048 }
11049 else {
11050 pos = rb_str_index(str, sep, 0);
11051 if (pos < 0) goto failed;
11052 }
11053 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11054 sep,
11055 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11056 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11057
11058 failed:
11059 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11060}
11061
11062/*
11063 * call-seq:
11064 * rpartition(pattern) -> [pre_match, last_match, post_match]
11065 *
11066 * :include: doc/string/rpartition.rdoc
11067 *
11068 */
11069
11070static VALUE
11071rb_str_rpartition(VALUE str, VALUE sep)
11072{
11073 long pos = RSTRING_LEN(str);
11074
11075 sep = get_pat_quoted(sep, 0);
11076 if (RB_TYPE_P(sep, T_REGEXP)) {
11077 if (rb_reg_search(sep, str, pos, 1) < 0) {
11078 goto failed;
11079 }
11080 VALUE match = rb_backref_get();
11081 struct re_registers *regs = RMATCH_REGS(match);
11082
11083 pos = BEG(0);
11084 sep = rb_str_subseq(str, pos, END(0) - pos);
11085 }
11086 else {
11087 pos = rb_str_sublen(str, pos);
11088 pos = rb_str_rindex(str, sep, pos);
11089 if (pos < 0) {
11090 goto failed;
11091 }
11092 }
11093
11094 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11095 sep,
11096 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11097 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11098 failed:
11099 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11100}
11101
11102/*
11103 * call-seq:
11104 * start_with?(*patterns) -> true or false
11105 *
11106 * :include: doc/string/start_with_p.rdoc
11107 *
11108 */
11109
11110static VALUE
11111rb_str_start_with(int argc, VALUE *argv, VALUE str)
11112{
11113 int i;
11114
11115 for (i=0; i<argc; i++) {
11116 VALUE tmp = argv[i];
11117 if (RB_TYPE_P(tmp, T_REGEXP)) {
11118 if (rb_reg_start_with_p(tmp, str))
11119 return Qtrue;
11120 }
11121 else {
11122 const char *p, *s, *e;
11123 long slen, tlen;
11124 rb_encoding *enc;
11125
11126 StringValue(tmp);
11127 enc = rb_enc_check(str, tmp);
11128 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11129 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11130 p = RSTRING_PTR(str);
11131 e = p + slen;
11132 s = p + tlen;
11133 if (!at_char_right_boundary(p, s, e, enc))
11134 continue;
11135 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11136 return Qtrue;
11137 }
11138 }
11139 return Qfalse;
11140}
11141
11142/*
11143 * call-seq:
11144 * end_with?(*strings) -> true or false
11145 *
11146 * :include: doc/string/end_with_p.rdoc
11147 *
11148 */
11149
11150static VALUE
11151rb_str_end_with(int argc, VALUE *argv, VALUE str)
11152{
11153 int i;
11154
11155 for (i=0; i<argc; i++) {
11156 VALUE tmp = argv[i];
11157 const char *p, *s, *e;
11158 long slen, tlen;
11159 rb_encoding *enc;
11160
11161 StringValue(tmp);
11162 enc = rb_enc_check(str, tmp);
11163 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11164 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11165 p = RSTRING_PTR(str);
11166 e = p + slen;
11167 s = e - tlen;
11168 if (!at_char_boundary(p, s, e, enc))
11169 continue;
11170 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11171 return Qtrue;
11172 }
11173 return Qfalse;
11174}
11175
11185static long
11186deleted_prefix_length(VALUE str, VALUE prefix)
11187{
11188 const char *strptr, *prefixptr;
11189 long olen, prefixlen;
11190 rb_encoding *enc = rb_enc_get(str);
11191
11192 StringValue(prefix);
11193
11194 if (!is_broken_string(prefix) ||
11195 !rb_enc_asciicompat(enc) ||
11196 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11197 enc = rb_enc_check(str, prefix);
11198 }
11199
11200 /* return 0 if not start with prefix */
11201 prefixlen = RSTRING_LEN(prefix);
11202 if (prefixlen <= 0) return 0;
11203 olen = RSTRING_LEN(str);
11204 if (olen < prefixlen) return 0;
11205 strptr = RSTRING_PTR(str);
11206 prefixptr = RSTRING_PTR(prefix);
11207 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11208 if (is_broken_string(prefix)) {
11209 if (!is_broken_string(str)) {
11210 /* prefix in a valid string cannot be broken */
11211 return 0;
11212 }
11213 const char *strend = strptr + olen;
11214 const char *after_prefix = strptr + prefixlen;
11215 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11216 /* prefix does not end at char-boundary */
11217 return 0;
11218 }
11219 }
11220 /* prefix part in `str` also should be valid. */
11221
11222 return prefixlen;
11223}
11224
11225/*
11226 * call-seq:
11227 * delete_prefix!(prefix) -> self or nil
11228 *
11229 * Like String#delete_prefix, except that +self+ is modified in place;
11230 * returns +self+ if the prefix is removed, +nil+ otherwise.
11231 *
11232 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11233 */
11234
11235static VALUE
11236rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11237{
11238 long prefixlen;
11239 str_modify_keep_cr(str);
11240
11241 prefixlen = deleted_prefix_length(str, prefix);
11242 if (prefixlen <= 0) return Qnil;
11243
11244 return rb_str_drop_bytes(str, prefixlen);
11245}
11246
11247/*
11248 * call-seq:
11249 * delete_prefix(prefix) -> new_string
11250 *
11251 * :include: doc/string/delete_prefix.rdoc
11252 *
11253 */
11254
11255static VALUE
11256rb_str_delete_prefix(VALUE str, VALUE prefix)
11257{
11258 long prefixlen;
11259
11260 prefixlen = deleted_prefix_length(str, prefix);
11261 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11262
11263 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11264}
11265
11275static long
11276deleted_suffix_length(VALUE str, VALUE suffix)
11277{
11278 const char *strptr, *suffixptr;
11279 long olen, suffixlen;
11280 rb_encoding *enc;
11281
11282 StringValue(suffix);
11283 if (is_broken_string(suffix)) return 0;
11284 enc = rb_enc_check(str, suffix);
11285
11286 /* return 0 if not start with suffix */
11287 suffixlen = RSTRING_LEN(suffix);
11288 if (suffixlen <= 0) return 0;
11289 olen = RSTRING_LEN(str);
11290 if (olen < suffixlen) return 0;
11291 strptr = RSTRING_PTR(str);
11292 suffixptr = RSTRING_PTR(suffix);
11293 const char *strend = strptr + olen;
11294 const char *before_suffix = strend - suffixlen;
11295 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11296 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11297
11298 return suffixlen;
11299}
11300
11301/*
11302 * call-seq:
11303 * delete_suffix!(suffix) -> self or nil
11304 *
11305 * Like String#delete_suffix, except that +self+ is modified in place;
11306 * returns +self+ if the suffix is removed, +nil+ otherwise.
11307 *
11308 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11309 */
11310
11311static VALUE
11312rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11313{
11314 long olen, suffixlen, len;
11315 str_modifiable(str);
11316
11317 suffixlen = deleted_suffix_length(str, suffix);
11318 if (suffixlen <= 0) return Qnil;
11319
11320 olen = RSTRING_LEN(str);
11321 str_modify_keep_cr(str);
11322 len = olen - suffixlen;
11323 STR_SET_LEN(str, len);
11324 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11325 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11327 }
11328 return str;
11329}
11330
11331/*
11332 * call-seq:
11333 * delete_suffix(suffix) -> new_string
11334 *
11335 * :include: doc/string/delete_suffix.rdoc
11336 *
11337 */
11338
11339static VALUE
11340rb_str_delete_suffix(VALUE str, VALUE suffix)
11341{
11342 long suffixlen;
11343
11344 suffixlen = deleted_suffix_length(str, suffix);
11345 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11346
11347 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11348}
11349
11350void
11351rb_str_setter(VALUE val, ID id, VALUE *var)
11352{
11353 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11354 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11355 }
11356 *var = val;
11357}
11358
11359static void
11360nil_setter_warning(ID id)
11361{
11362 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11363}
11364
11365void
11366rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11367{
11368 rb_str_setter(val, id, var);
11369 if (!NIL_P(*var)) {
11370 nil_setter_warning(id);
11371 }
11372}
11373
11374static void
11375rb_fs_setter(VALUE val, ID id, VALUE *var)
11376{
11377 val = rb_fs_check(val);
11378 if (!val) {
11379 rb_raise(rb_eTypeError,
11380 "value of %"PRIsVALUE" must be String or Regexp",
11381 rb_id2str(id));
11382 }
11383 if (!NIL_P(val)) {
11384 nil_setter_warning(id);
11385 }
11386 *var = val;
11387}
11388
11389
11390/*
11391 * call-seq:
11392 * force_encoding(encoding) -> self
11393 *
11394 * :include: doc/string/force_encoding.rdoc
11395 *
11396 */
11397
11398static VALUE
11399rb_str_force_encoding(VALUE str, VALUE enc)
11400{
11401 str_modifiable(str);
11402
11403 rb_encoding *encoding = rb_to_encoding(enc);
11404 int idx = rb_enc_to_index(encoding);
11405
11406 // If the encoding is unchanged, we do nothing.
11407 if (ENCODING_GET(str) == idx) {
11408 return str;
11409 }
11410
11411 rb_enc_associate_index(str, idx);
11412
11413 // If the coderange was 7bit and the new encoding is ASCII-compatible
11414 // we can keep the coderange.
11415 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11416 return str;
11417 }
11418
11420 return str;
11421}
11422
11423/*
11424 * call-seq:
11425 * b -> new_string
11426 *
11427 * :include: doc/string/b.rdoc
11428 *
11429 */
11430
11431static VALUE
11432rb_str_b(VALUE str)
11433{
11434 VALUE str2;
11435 if (STR_EMBED_P(str)) {
11436 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11437 }
11438 else {
11439 str2 = str_alloc_heap(rb_cString);
11440 }
11441 str_replace_shared_without_enc(str2, str);
11442
11443 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11444 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11445 // If we know the receiver's code range then we know the result's code range.
11446 int cr = ENC_CODERANGE(str);
11447 switch (cr) {
11448 case ENC_CODERANGE_7BIT:
11450 break;
11454 break;
11455 default:
11456 ENC_CODERANGE_CLEAR(str2);
11457 break;
11458 }
11459 }
11460
11461 return str2;
11462}
11463
11464/*
11465 * call-seq:
11466 * valid_encoding? -> true or false
11467 *
11468 * :include: doc/string/valid_encoding_p.rdoc
11469 *
11470 */
11471
11472static VALUE
11473rb_str_valid_encoding_p(VALUE str)
11474{
11475 int cr = rb_enc_str_coderange(str);
11476
11477 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11478}
11479
11480/*
11481 * call-seq:
11482 * ascii_only? -> true or false
11483 *
11484 * Returns whether +self+ contains only ASCII characters:
11485 *
11486 * 'abc'.ascii_only? # => true
11487 * "abc\u{6666}".ascii_only? # => false
11488 *
11489 * Related: see {Querying}[rdoc-ref:String@Querying].
11490 */
11491
11492static VALUE
11493rb_str_is_ascii_only_p(VALUE str)
11494{
11495 int cr = rb_enc_str_coderange(str);
11496
11497 return RBOOL(cr == ENC_CODERANGE_7BIT);
11498}
11499
11500VALUE
11502{
11503 static const char ellipsis[] = "...";
11504 const long ellipsislen = sizeof(ellipsis) - 1;
11505 rb_encoding *const enc = rb_enc_get(str);
11506 const long blen = RSTRING_LEN(str);
11507 const char *const p = RSTRING_PTR(str), *e = p + blen;
11508 VALUE estr, ret = 0;
11509
11510 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11511 if (len * rb_enc_mbminlen(enc) >= blen ||
11512 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11513 ret = str;
11514 }
11515 else if (len <= ellipsislen ||
11516 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11517 if (rb_enc_asciicompat(enc)) {
11518 ret = rb_str_new(ellipsis, len);
11519 rb_enc_associate(ret, enc);
11520 }
11521 else {
11522 estr = rb_usascii_str_new(ellipsis, len);
11523 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11524 }
11525 }
11526 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11527 rb_str_cat(ret, ellipsis, ellipsislen);
11528 }
11529 else {
11530 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11531 rb_enc_from_encoding(enc), 0, Qnil);
11532 rb_str_append(ret, estr);
11533 }
11534 return ret;
11535}
11536
11537static VALUE
11538str_compat_and_valid(VALUE str, rb_encoding *enc)
11539{
11540 int cr;
11541 str = StringValue(str);
11542 cr = rb_enc_str_coderange(str);
11543 if (cr == ENC_CODERANGE_BROKEN) {
11544 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11545 }
11546 else {
11547 rb_encoding *e = STR_ENC_GET(str);
11548 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11549 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11550 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11551 }
11552 }
11553 return str;
11554}
11555
11556static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11557
11558VALUE
11560{
11561 rb_encoding *enc = STR_ENC_GET(str);
11562 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11563}
11564
11565VALUE
11566rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11567{
11568 int cr = ENC_CODERANGE_UNKNOWN;
11569 if (enc == STR_ENC_GET(str)) {
11570 /* cached coderange makes sense only when enc equals the
11571 * actual encoding of str */
11572 cr = ENC_CODERANGE(str);
11573 }
11574 return enc_str_scrub(enc, str, repl, cr);
11575}
11576
11577static VALUE
11578enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11579{
11580 int encidx;
11581 VALUE buf = Qnil;
11582 const char *rep, *p, *e, *p1, *sp;
11583 long replen = -1;
11584 long slen;
11585
11586 if (rb_block_given_p()) {
11587 if (!NIL_P(repl))
11588 rb_raise(rb_eArgError, "both of block and replacement given");
11589 replen = 0;
11590 }
11591
11592 if (ENC_CODERANGE_CLEAN_P(cr))
11593 return Qnil;
11594
11595 if (!NIL_P(repl)) {
11596 repl = str_compat_and_valid(repl, enc);
11597 }
11598
11599 if (rb_enc_dummy_p(enc)) {
11600 return Qnil;
11601 }
11602 encidx = rb_enc_to_index(enc);
11603
11604#define DEFAULT_REPLACE_CHAR(str) do { \
11605 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11606 rep = replace; replen = (int)sizeof(replace); \
11607 } while (0)
11608
11609 slen = RSTRING_LEN(str);
11610 p = RSTRING_PTR(str);
11611 e = RSTRING_END(str);
11612 p1 = p;
11613 sp = p;
11614
11615 if (rb_enc_asciicompat(enc)) {
11616 int rep7bit_p;
11617 if (!replen) {
11618 rep = NULL;
11619 rep7bit_p = FALSE;
11620 }
11621 else if (!NIL_P(repl)) {
11622 rep = RSTRING_PTR(repl);
11623 replen = RSTRING_LEN(repl);
11624 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11625 }
11626 else if (encidx == rb_utf8_encindex()) {
11627 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11628 rep7bit_p = FALSE;
11629 }
11630 else {
11631 DEFAULT_REPLACE_CHAR("?");
11632 rep7bit_p = TRUE;
11633 }
11634 cr = ENC_CODERANGE_7BIT;
11635
11636 p = search_nonascii(p, e);
11637 if (!p) {
11638 p = e;
11639 }
11640 while (p < e) {
11641 int ret = rb_enc_precise_mbclen(p, e, enc);
11642 if (MBCLEN_NEEDMORE_P(ret)) {
11643 break;
11644 }
11645 else if (MBCLEN_CHARFOUND_P(ret)) {
11647 p += MBCLEN_CHARFOUND_LEN(ret);
11648 }
11649 else if (MBCLEN_INVALID_P(ret)) {
11650 /*
11651 * p1~p: valid ascii/multibyte chars
11652 * p ~e: invalid bytes + unknown bytes
11653 */
11654 long clen = rb_enc_mbmaxlen(enc);
11655 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11656 if (p > p1) {
11657 rb_str_buf_cat(buf, p1, p - p1);
11658 }
11659
11660 if (e - p < clen) clen = e - p;
11661 if (clen <= 2) {
11662 clen = 1;
11663 }
11664 else {
11665 const char *q = p;
11666 clen--;
11667 for (; clen > 1; clen--) {
11668 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11669 if (MBCLEN_NEEDMORE_P(ret)) break;
11670 if (MBCLEN_INVALID_P(ret)) continue;
11672 }
11673 }
11674 if (rep) {
11675 rb_str_buf_cat(buf, rep, replen);
11676 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11677 }
11678 else {
11679 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11680 str_mod_check(str, sp, slen);
11681 repl = str_compat_and_valid(repl, enc);
11682 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11685 }
11686 p += clen;
11687 p1 = p;
11688 p = search_nonascii(p, e);
11689 if (!p) {
11690 p = e;
11691 break;
11692 }
11693 }
11694 else {
11696 }
11697 }
11698 if (NIL_P(buf)) {
11699 if (p == e) {
11700 ENC_CODERANGE_SET(str, cr);
11701 return Qnil;
11702 }
11703 buf = rb_str_buf_new(RSTRING_LEN(str));
11704 }
11705 if (p1 < p) {
11706 rb_str_buf_cat(buf, p1, p - p1);
11707 }
11708 if (p < e) {
11709 if (rep) {
11710 rb_str_buf_cat(buf, rep, replen);
11711 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11712 }
11713 else {
11714 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11715 str_mod_check(str, sp, slen);
11716 repl = str_compat_and_valid(repl, enc);
11717 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11720 }
11721 }
11722 }
11723 else {
11724 /* ASCII incompatible */
11725 long mbminlen = rb_enc_mbminlen(enc);
11726 if (!replen) {
11727 rep = NULL;
11728 }
11729 else if (!NIL_P(repl)) {
11730 rep = RSTRING_PTR(repl);
11731 replen = RSTRING_LEN(repl);
11732 }
11733 else if (encidx == ENCINDEX_UTF_16BE) {
11734 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11735 }
11736 else if (encidx == ENCINDEX_UTF_16LE) {
11737 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11738 }
11739 else if (encidx == ENCINDEX_UTF_32BE) {
11740 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11741 }
11742 else if (encidx == ENCINDEX_UTF_32LE) {
11743 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11744 }
11745 else {
11746 DEFAULT_REPLACE_CHAR("?");
11747 }
11748
11749 while (p < e) {
11750 int ret = rb_enc_precise_mbclen(p, e, enc);
11751 if (MBCLEN_NEEDMORE_P(ret)) {
11752 break;
11753 }
11754 else if (MBCLEN_CHARFOUND_P(ret)) {
11755 p += MBCLEN_CHARFOUND_LEN(ret);
11756 }
11757 else if (MBCLEN_INVALID_P(ret)) {
11758 const char *q = p;
11759 long clen = rb_enc_mbmaxlen(enc);
11760 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11761 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11762
11763 if (e - p < clen) clen = e - p;
11764 if (clen <= mbminlen * 2) {
11765 clen = mbminlen;
11766 }
11767 else {
11768 clen -= mbminlen;
11769 for (; clen > mbminlen; clen-=mbminlen) {
11770 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11771 if (MBCLEN_NEEDMORE_P(ret)) break;
11772 if (MBCLEN_INVALID_P(ret)) continue;
11774 }
11775 }
11776 if (rep) {
11777 rb_str_buf_cat(buf, rep, replen);
11778 }
11779 else {
11780 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11781 str_mod_check(str, sp, slen);
11782 repl = str_compat_and_valid(repl, enc);
11783 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11784 }
11785 p += clen;
11786 p1 = p;
11787 }
11788 else {
11790 }
11791 }
11792 if (NIL_P(buf)) {
11793 if (p == e) {
11795 return Qnil;
11796 }
11797 buf = rb_str_buf_new(RSTRING_LEN(str));
11798 }
11799 if (p1 < p) {
11800 rb_str_buf_cat(buf, p1, p - p1);
11801 }
11802 if (p < e) {
11803 if (rep) {
11804 rb_str_buf_cat(buf, rep, replen);
11805 }
11806 else {
11807 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11808 str_mod_check(str, sp, slen);
11809 repl = str_compat_and_valid(repl, enc);
11810 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11811 }
11812 }
11814 }
11815 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11816 return buf;
11817}
11818
11819/*
11820 * call-seq:
11821 * scrub(replacement_string = default_replacement_string) -> new_string
11822 * scrub{|sequence| ... } -> new_string
11823 *
11824 * :include: doc/string/scrub.rdoc
11825 *
11826 */
11827static VALUE
11828str_scrub(int argc, VALUE *argv, VALUE str)
11829{
11830 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11831 VALUE new = rb_str_scrub(str, repl);
11832 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11833}
11834
11835/*
11836 * call-seq:
11837 * scrub!(replacement_string = default_replacement_string) -> self
11838 * scrub!{|sequence| ... } -> self
11839 *
11840 * Like String#scrub, except that:
11841 *
11842 * - Any replacements are made in +self+.
11843 * - Returns +self+.
11844 *
11845 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11846 *
11847 */
11848static VALUE
11849str_scrub_bang(int argc, VALUE *argv, VALUE str)
11850{
11851 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11852 VALUE new = rb_str_scrub(str, repl);
11853 if (!NIL_P(new)) rb_str_replace(str, new);
11854 return str;
11855}
11856
11857static ID id_normalize;
11858static ID id_normalized_p;
11859static VALUE mUnicodeNormalize;
11860
11861static VALUE
11862unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11863{
11864 static int UnicodeNormalizeRequired = 0;
11865 VALUE argv2[2];
11866
11867 if (!UnicodeNormalizeRequired) {
11868 rb_require("unicode_normalize/normalize.rb");
11869 UnicodeNormalizeRequired = 1;
11870 }
11871 argv2[0] = str;
11872 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11873 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11874}
11875
11876/*
11877 * call-seq:
11878 * unicode_normalize(form = :nfc) -> string
11879 *
11880 * :include: doc/string/unicode_normalize.rdoc
11881 *
11882 */
11883static VALUE
11884rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11885{
11886 return unicode_normalize_common(argc, argv, str, id_normalize);
11887}
11888
11889/*
11890 * call-seq:
11891 * unicode_normalize!(form = :nfc) -> self
11892 *
11893 * Like String#unicode_normalize, except that the normalization
11894 * is performed on +self+ (not on a copy of +self+).
11895 *
11896 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11897 *
11898 */
11899static VALUE
11900rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11901{
11902 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11903}
11904
11905/* call-seq:
11906 * unicode_normalized?(form = :nfc) -> true or false
11907 *
11908 * Returns whether +self+ is in the given +form+ of Unicode normalization;
11909 * see String#unicode_normalize.
11910 *
11911 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11912 *
11913 * Examples:
11914 *
11915 * "a\u0300".unicode_normalized? # => false
11916 * "a\u0300".unicode_normalized?(:nfd) # => true
11917 * "\u00E0".unicode_normalized? # => true
11918 * "\u00E0".unicode_normalized?(:nfd) # => false
11919 *
11920 *
11921 * Raises an exception if +self+ is not in a Unicode encoding:
11922 *
11923 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11924 * s.unicode_normalized? # Raises Encoding::CompatibilityError
11925 *
11926 * Related: see {Querying}[rdoc-ref:String@Querying].
11927 */
11928static VALUE
11929rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11930{
11931 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11932}
11933
11934/**********************************************************************
11935 * Document-class: Symbol
11936 *
11937 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11938 *
11939 * You can create a +Symbol+ object explicitly with:
11940 *
11941 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11942 *
11943 * The same +Symbol+ object will be
11944 * created for a given name or string for the duration of a program's
11945 * execution, regardless of the context or meaning of that name. Thus
11946 * if <code>Fred</code> is a constant in one context, a method in
11947 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11948 * will be the same object in all three contexts.
11949 *
11950 * module One
11951 * class Fred
11952 * end
11953 * $f1 = :Fred
11954 * end
11955 * module Two
11956 * Fred = 1
11957 * $f2 = :Fred
11958 * end
11959 * def Fred()
11960 * end
11961 * $f3 = :Fred
11962 * $f1.object_id #=> 2514190
11963 * $f2.object_id #=> 2514190
11964 * $f3.object_id #=> 2514190
11965 *
11966 * Constant, method, and variable names are returned as symbols:
11967 *
11968 * module One
11969 * Two = 2
11970 * def three; 3 end
11971 * @four = 4
11972 * @@five = 5
11973 * $six = 6
11974 * end
11975 * seven = 7
11976 *
11977 * One.constants
11978 * # => [:Two]
11979 * One.instance_methods(true)
11980 * # => [:three]
11981 * One.instance_variables
11982 * # => [:@four]
11983 * One.class_variables
11984 * # => [:@@five]
11985 * global_variables.grep(/six/)
11986 * # => [:$six]
11987 * local_variables
11988 * # => [:seven]
11989 *
11990 * A +Symbol+ object differs from a String object in that
11991 * a +Symbol+ object represents an identifier, while a String object
11992 * represents text or data.
11993 *
11994 * == What's Here
11995 *
11996 * First, what's elsewhere. Class +Symbol+:
11997 *
11998 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11999 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12000 *
12001 * Here, class +Symbol+ provides methods that are useful for:
12002 *
12003 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12004 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12005 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12006 *
12007 * === Methods for Querying
12008 *
12009 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12010 * - #=~: Returns the index of the first substring in symbol that matches a
12011 * given Regexp or other object; returns +nil+ if no match is found.
12012 * - #[], #slice : Returns a substring of symbol
12013 * determined by a given index, start/length, or range, or string.
12014 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12015 * - #encoding: Returns the Encoding object that represents the encoding
12016 * of symbol.
12017 * - #end_with?: Returns +true+ if symbol ends with
12018 * any of the given strings.
12019 * - #match: Returns a MatchData object if symbol
12020 * matches a given Regexp; +nil+ otherwise.
12021 * - #match?: Returns +true+ if symbol
12022 * matches a given Regexp; +false+ otherwise.
12023 * - #length, #size: Returns the number of characters in symbol.
12024 * - #start_with?: Returns +true+ if symbol starts with
12025 * any of the given strings.
12026 *
12027 * === Methods for Comparing
12028 *
12029 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12030 * or larger than symbol.
12031 * - #==, #===: Returns +true+ if a given symbol has the same content and
12032 * encoding.
12033 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12034 * symbol is smaller than, equal to, or larger than symbol.
12035 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12036 * after Unicode case folding; +false+ otherwise.
12037 *
12038 * === Methods for Converting
12039 *
12040 * - #capitalize: Returns symbol with the first character upcased
12041 * and all other characters downcased.
12042 * - #downcase: Returns symbol with all characters downcased.
12043 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12044 * - #name: Returns the frozen string corresponding to symbol.
12045 * - #succ, #next: Returns the symbol that is the successor to symbol.
12046 * - #swapcase: Returns symbol with all upcase characters downcased
12047 * and all downcase characters upcased.
12048 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12049 * - #to_s, #id2name: Returns the string corresponding to +self+.
12050 * - #to_sym, #intern: Returns +self+.
12051 * - #upcase: Returns symbol with all characters upcased.
12052 *
12053 */
12054
12055
12056/*
12057 * call-seq:
12058 * symbol == object -> true or false
12059 *
12060 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12061 */
12062
12063#define sym_equal rb_obj_equal
12064
12065static int
12066sym_printable(const char *s, const char *send, rb_encoding *enc)
12067{
12068 while (s < send) {
12069 int n;
12070 int c = rb_enc_precise_mbclen(s, send, enc);
12071
12072 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12073 n = MBCLEN_CHARFOUND_LEN(c);
12074 c = rb_enc_mbc_to_codepoint(s, send, enc);
12075 if (!rb_enc_isprint(c, enc)) return FALSE;
12076 s += n;
12077 }
12078 return TRUE;
12079}
12080
12081int
12082rb_str_symname_p(VALUE sym)
12083{
12084 rb_encoding *enc;
12085 const char *ptr;
12086 long len;
12087 rb_encoding *resenc = rb_default_internal_encoding();
12088
12089 if (resenc == NULL) resenc = rb_default_external_encoding();
12090 enc = STR_ENC_GET(sym);
12091 ptr = RSTRING_PTR(sym);
12092 len = RSTRING_LEN(sym);
12093 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12094 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12095 return FALSE;
12096 }
12097 return TRUE;
12098}
12099
12100VALUE
12101rb_str_quote_unprintable(VALUE str)
12102{
12103 rb_encoding *enc;
12104 const char *ptr;
12105 long len;
12106 rb_encoding *resenc;
12107
12108 Check_Type(str, T_STRING);
12109 resenc = rb_default_internal_encoding();
12110 if (resenc == NULL) resenc = rb_default_external_encoding();
12111 enc = STR_ENC_GET(str);
12112 ptr = RSTRING_PTR(str);
12113 len = RSTRING_LEN(str);
12114 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12115 !sym_printable(ptr, ptr + len, enc)) {
12116 return rb_str_escape(str);
12117 }
12118 return str;
12119}
12120
12121VALUE
12122rb_id_quote_unprintable(ID id)
12123{
12124 VALUE str = rb_id2str(id);
12125 if (!rb_str_symname_p(str)) {
12126 return rb_str_escape(str);
12127 }
12128 return str;
12129}
12130
12131/*
12132 * call-seq:
12133 * inspect -> string
12134 *
12135 * Returns a string representation of +self+ (including the leading colon):
12136 *
12137 * :foo.inspect # => ":foo"
12138 *
12139 * Related: Symbol#to_s, Symbol#name.
12140 *
12141 */
12142
12143static VALUE
12144sym_inspect(VALUE sym)
12145{
12146 VALUE str = rb_sym2str(sym);
12147 const char *ptr;
12148 long len;
12149 char *dest;
12150
12151 if (!rb_str_symname_p(str)) {
12152 str = rb_str_inspect(str);
12153 len = RSTRING_LEN(str);
12154 rb_str_resize(str, len + 1);
12155 dest = RSTRING_PTR(str);
12156 memmove(dest + 1, dest, len);
12157 }
12158 else {
12159 rb_encoding *enc = STR_ENC_GET(str);
12160 VALUE orig_str = str;
12161
12162 len = RSTRING_LEN(orig_str);
12163 str = rb_enc_str_new(0, len + 1, enc);
12164
12165 // Get data pointer after allocation
12166 ptr = RSTRING_PTR(orig_str);
12167 dest = RSTRING_PTR(str);
12168 memcpy(dest + 1, ptr, len);
12169
12170 RB_GC_GUARD(orig_str);
12171 }
12172 dest[0] = ':';
12173
12175
12176 return str;
12177}
12178
12179VALUE
12181{
12182 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12183 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12184 return str;
12185}
12186
12187VALUE
12188rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12189{
12190 VALUE obj;
12191
12192 if (argc < 1) {
12193 rb_raise(rb_eArgError, "no receiver given");
12194 }
12195 obj = argv[0];
12196 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12197}
12198
12199/*
12200 * call-seq:
12201 * succ
12202 *
12203 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12204 *
12205 * :foo.succ # => :fop
12206 *
12207 * Related: String#succ.
12208 */
12209
12210static VALUE
12211sym_succ(VALUE sym)
12212{
12213 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12214}
12215
12216/*
12217 * call-seq:
12218 * symbol <=> object -> -1, 0, +1, or nil
12219 *
12220 * If +object+ is a symbol,
12221 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12222 *
12223 * :bar <=> :foo # => -1
12224 * :foo <=> :foo # => 0
12225 * :foo <=> :bar # => 1
12226 *
12227 * Otherwise, returns +nil+:
12228 *
12229 * :foo <=> 'bar' # => nil
12230 *
12231 * Related: String#<=>.
12232 */
12233
12234static VALUE
12235sym_cmp(VALUE sym, VALUE other)
12236{
12237 if (!SYMBOL_P(other)) {
12238 return Qnil;
12239 }
12240 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12241}
12242
12243/*
12244 * call-seq:
12245 * casecmp(object) -> -1, 0, 1, or nil
12246 *
12247 * :include: doc/symbol/casecmp.rdoc
12248 *
12249 */
12250
12251static VALUE
12252sym_casecmp(VALUE sym, VALUE other)
12253{
12254 if (!SYMBOL_P(other)) {
12255 return Qnil;
12256 }
12257 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12258}
12259
12260/*
12261 * call-seq:
12262 * casecmp?(object) -> true, false, or nil
12263 *
12264 * :include: doc/symbol/casecmp_p.rdoc
12265 *
12266 */
12267
12268static VALUE
12269sym_casecmp_p(VALUE sym, VALUE other)
12270{
12271 if (!SYMBOL_P(other)) {
12272 return Qnil;
12273 }
12274 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12275}
12276
12277/*
12278 * call-seq:
12279 * symbol =~ object -> integer or nil
12280 *
12281 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12282 * including possible updates to global variables;
12283 * see String#=~.
12284 *
12285 */
12286
12287static VALUE
12288sym_match(VALUE sym, VALUE other)
12289{
12290 return rb_str_match(rb_sym2str(sym), other);
12291}
12292
12293/*
12294 * call-seq:
12295 * match(pattern, offset = 0) -> matchdata or nil
12296 * match(pattern, offset = 0) {|matchdata| } -> object
12297 *
12298 * Equivalent to <tt>self.to_s.match</tt>,
12299 * including possible updates to global variables;
12300 * see String#match.
12301 *
12302 */
12303
12304static VALUE
12305sym_match_m(int argc, VALUE *argv, VALUE sym)
12306{
12307 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12308}
12309
12310/*
12311 * call-seq:
12312 * match?(pattern, offset) -> true or false
12313 *
12314 * Equivalent to <tt>sym.to_s.match?</tt>;
12315 * see String#match?.
12316 *
12317 */
12318
12319static VALUE
12320sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12321{
12322 return rb_str_match_m_p(argc, argv, sym);
12323}
12324
12325/*
12326 * call-seq:
12327 * symbol[index] -> string or nil
12328 * symbol[start, length] -> string or nil
12329 * symbol[range] -> string or nil
12330 * symbol[regexp, capture = 0] -> string or nil
12331 * symbol[substring] -> string or nil
12332 *
12333 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12334 *
12335 */
12336
12337static VALUE
12338sym_aref(int argc, VALUE *argv, VALUE sym)
12339{
12340 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12341}
12342
12343/*
12344 * call-seq:
12345 * length -> integer
12346 *
12347 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12348 */
12349
12350static VALUE
12351sym_length(VALUE sym)
12352{
12353 return rb_str_length(rb_sym2str(sym));
12354}
12355
12356/*
12357 * call-seq:
12358 * empty? -> true or false
12359 *
12360 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12361 *
12362 */
12363
12364static VALUE
12365sym_empty(VALUE sym)
12366{
12367 return rb_str_empty(rb_sym2str(sym));
12368}
12369
12370/*
12371 * call-seq:
12372 * upcase(mapping) -> symbol
12373 *
12374 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12375 *
12376 * See String#upcase.
12377 *
12378 */
12379
12380static VALUE
12381sym_upcase(int argc, VALUE *argv, VALUE sym)
12382{
12383 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12384}
12385
12386/*
12387 * call-seq:
12388 * downcase(mapping) -> symbol
12389 *
12390 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12391 *
12392 * See String#downcase.
12393 *
12394 * Related: Symbol#upcase.
12395 *
12396 */
12397
12398static VALUE
12399sym_downcase(int argc, VALUE *argv, VALUE sym)
12400{
12401 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12402}
12403
12404/*
12405 * call-seq:
12406 * capitalize(mapping) -> symbol
12407 *
12408 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12409 *
12410 * See String#capitalize.
12411 *
12412 */
12413
12414static VALUE
12415sym_capitalize(int argc, VALUE *argv, VALUE sym)
12416{
12417 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12418}
12419
12420/*
12421 * call-seq:
12422 * swapcase(mapping) -> symbol
12423 *
12424 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12425 *
12426 * See String#swapcase.
12427 *
12428 */
12429
12430static VALUE
12431sym_swapcase(int argc, VALUE *argv, VALUE sym)
12432{
12433 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12434}
12435
12436/*
12437 * call-seq:
12438 * start_with?(*string_or_regexp) -> true or false
12439 *
12440 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12441 *
12442 */
12443
12444static VALUE
12445sym_start_with(int argc, VALUE *argv, VALUE sym)
12446{
12447 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12448}
12449
12450/*
12451 * call-seq:
12452 * end_with?(*strings) -> true or false
12453 *
12454 *
12455 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12456 *
12457 */
12458
12459static VALUE
12460sym_end_with(int argc, VALUE *argv, VALUE sym)
12461{
12462 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12463}
12464
12465/*
12466 * call-seq:
12467 * encoding -> encoding
12468 *
12469 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12470 *
12471 */
12472
12473static VALUE
12474sym_encoding(VALUE sym)
12475{
12476 return rb_obj_encoding(rb_sym2str(sym));
12477}
12478
12479static VALUE
12480string_for_symbol(VALUE name)
12481{
12482 if (!RB_TYPE_P(name, T_STRING)) {
12483 VALUE tmp = rb_check_string_type(name);
12484 if (NIL_P(tmp)) {
12485 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12486 name);
12487 }
12488 name = tmp;
12489 }
12490 return name;
12491}
12492
12493ID
12495{
12496 if (SYMBOL_P(name)) {
12497 return SYM2ID(name);
12498 }
12499 name = string_for_symbol(name);
12500 return rb_intern_str(name);
12501}
12502
12503VALUE
12505{
12506 if (SYMBOL_P(name)) {
12507 return name;
12508 }
12509 name = string_for_symbol(name);
12510 return rb_str_intern(name);
12511}
12512
12513/*
12514 * call-seq:
12515 * Symbol.all_symbols -> array_of_symbols
12516 *
12517 * Returns an array of all symbols currently in Ruby's symbol table:
12518 *
12519 * Symbol.all_symbols.size # => 9334
12520 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12521 *
12522 */
12523
12524static VALUE
12525sym_all_symbols(VALUE _)
12526{
12527 return rb_sym_all_symbols();
12528}
12529
12530VALUE
12531rb_str_to_interned_str(VALUE str)
12532{
12533 return rb_fstring(str);
12534}
12535
12536VALUE
12537rb_interned_str(const char *ptr, long len)
12538{
12539 struct RString fake_str = {RBASIC_INIT};
12540 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12541}
12542
12543VALUE
12545{
12546 return rb_interned_str(ptr, strlen(ptr));
12547}
12548
12549VALUE
12550rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12551{
12552 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12553 rb_enc_autoload(enc);
12554 }
12555
12556 struct RString fake_str = {RBASIC_INIT};
12557 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12558}
12559
12560VALUE
12561rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12562{
12563 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12564 rb_enc_autoload(enc);
12565 }
12566
12567 struct RString fake_str = {RBASIC_INIT};
12568 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12569 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12570 return str;
12571}
12572
12573VALUE
12575{
12576 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12577}
12578
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when the string's inlined encoding index is the
 * ASCII-8BIT one and the code is in the range 0..0xfe, a single byte
 * is appended with rb_str_buf_cat_byte().  Every other case
 * (including 0xff) is handed to rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        /* the conversion is only performed for binary strings */
        const ssize_t byte = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(0 <= byte && byte < 0xff)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    /* general path */
    rb_str_concat(str, codepoint);
}
#endif
12595
12596static int
12597fstring_set_class_i(VALUE *str, void *data)
12598{
12599 RBASIC_SET_CLASS(*str, rb_cString);
12600
12601 return ST_CONTINUE;
12602}
12603
/*
 * Init_String: creates the String and Symbol classes (both defined
 * with rb_cObject as superclass) and binds the C functions of this
 * file to their Ruby method names.
 *
 * The last argument of rb_define_method / rb_define_singleton_method
 * is the arity.  As the handlers in this file show, -1 pairs with the
 * (int argc, VALUE *argv, VALUE self) signature (e.g. sym_upcase),
 * 0 with (VALUE self) (e.g. sym_length) and 1 with
 * (VALUE self, VALUE arg) (e.g. sym_cmp).
 *
 * NOTE(review): the listing numbers embedded in this extract skip in
 * several places (e.g. 12610 -> 12612, 12618 -> 12621,
 * 12778 -> 12782), so some statements appear to be missing here --
 * confirm against upstream before relying on this list.
 */
12604void
12605Init_String(void)
12606{
12607 rb_cString = rb_define_class("String", rb_cObject);
12608
/* Run fstring_set_class_i over every entry of fstring_table_obj; that
 * callback sets the entry's class to rb_cString.
 * NOTE(review): presumably these entries were created before rb_cString
 * existed -- confirm. */
12609 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12610
/* String: allocation, construction and core instance methods */
12612 rb_define_alloc_func(rb_cString, empty_str_alloc);
12613 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12614 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12615 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12617 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12618 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12621 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12622 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12623 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12624 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12627 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12628 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12629 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12630 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12633 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12634 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12635 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12636 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12637 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
/* "succ!" and "next!" share one implementation */
12639 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12641 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12642 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12643 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12644 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12645 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12646 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12647 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12648 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12649 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12650 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12651 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12652 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12653 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12654 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12656 rb_define_method(rb_cString, "+@", str_uplus, 0);
12657 rb_define_method(rb_cString, "-@", str_uminus, 0);
12658 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
/* "dedup" is an alias of "-@" */
12659 rb_define_alias(rb_cString, "dedup", "-@");
12660
/* conversions; "to_s" and "to_str" share one implementation */
12661 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12662 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12663 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12664 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12667 rb_define_method(rb_cString, "undump", str_undump, 0);
12668
/* cache the symbols :ascii, :turkic, :lithuanian and :fold in file-level variables */
12669 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12670 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12671 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12672 sym_fold = ID2SYM(rb_intern_const("fold"));
12673
/* case mapping: non-destructive, then "!" variants */
12674 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12675 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12676 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12677 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12678
12679 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12680 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12681 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12682 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12683
12684 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12685 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12686 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12687 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12688 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12689 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12690 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12691 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12692 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12693 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12694 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12695 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12697 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12698 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
/* "intern" and "to_sym" share one implementation */
12699 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12700 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12701 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12702
12703 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12704 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12705 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12706
12707 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12708
12709 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12710 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12711 rb_define_method(rb_cString, "center", rb_str_center, -1);
12712
/* substitution / trimming: non-destructive, then "!" variants */
12713 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12714 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12715 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12716 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12717 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12718 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12719 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12720 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12721 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12722
12723 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12724 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12725 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12726 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12727 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12728 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12729 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12730 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12731 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12732
12733 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12734 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12735 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12736 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12737 rb_define_method(rb_cString, "count", rb_str_count, -1);
12738
12739 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12740 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12741 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12742 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12743
12744 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12745 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12746 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12747 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12748 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12749
12750 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12751
/* "slice" uses the same implementation as "[]" */
12752 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12753 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12754
12755 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12756 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12757
12758 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12759 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12760 rb_define_method(rb_cString, "b", rb_str_b, 0);
12761 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12762 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12763
12764 /* define UnicodeNormalize module here so that we don't have to look it up */
12765 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12766 id_normalize = rb_intern_const("normalize");
12767 id_normalized_p = rb_intern_const("normalized?");
12768
12769 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12770 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12771 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12772
/* rb_fs starts as nil; the globals "$;" and "$-F" are both hooked to it
 * with rb_fs_setter as setter, and its address is registered with the GC */
12773 rb_fs = Qnil;
12774 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12775 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12776 rb_gc_register_address(&rb_fs);
12777
/* Symbol class and its methods */
12778 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12782 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12783
/* "==" and "===" share one implementation, as do "succ" and "next" */
12784 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12785 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12786 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12787 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12788 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12789 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12790
12791 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12792 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12793 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12794 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12795
/* "[]"/"slice" and "length"/"size" are pairs sharing one implementation */
12796 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12797 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12798 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12799 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12800 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12801 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12802 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12803
12804 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12805 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12806 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12807 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12808
12809 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12810 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12811
12812 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12813}
12814
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1795
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1588
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1701
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2947
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2767
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3237
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1037
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3026
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2189
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2207
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3603
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3287
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1340
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:945
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1205
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3026
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1224
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12550
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2332
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3730
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1153
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1445
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1346
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:964
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12574
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2024
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2030
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1936
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1750
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1510
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2485
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3795
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1421
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12180
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2558
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1397
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1744
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3054
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5331
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4158
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3151
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11501
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1786
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1187
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:999
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1516
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1994
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4144
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3563
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2421
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2012
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6538
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3159
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12544
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1427
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3761
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3101
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4265
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3385
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7217
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2788
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12537
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4212
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4032
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4187
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3737
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3276
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5815
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11559
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1700
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2948
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3248
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3367
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1199
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2742
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7324
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1409
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1716
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2435
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5733
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9329
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1193
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1848
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2013
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2092
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3366
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1624
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12504
ID rb_to_id(VALUE str)
Definition string.c:12494
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries whether the passed object has previously been classified as shareable.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1439
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2925
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2807
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1433
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2820
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1777
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:461
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1472
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:208
Definition string.c:8209
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113