Ruby 3.5.0dev (2025-11-03 revision 4a3d8346a6d0e068508631541f6bc43e8b154ea1)
string.c (4a3d8346a6d0e068508631541f6bc43e8b154ea1)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
#define RUBY_MAX_CHAR_LEN 16

/* RString flag bits; the meaning of each bit is listed in the flag table above. */
#define STR_PRECOMPUTED_HASH FL_USER4  /* hash code is stored after the terminator */
#define STR_SHARED_ROOT      FL_USER5  /* other strings may point into this buffer */
#define STR_BORROWED         FL_USER6  /* unsafe to unshare in rb_str_tmp_frozen_release */
#define STR_TMPLOCK          FL_USER7  /* buffer is in use by a system call */
#define STR_NOFREE           FL_USER18 /* the GC must not free the buffer */
#define STR_FAKESTR          FL_USER19 /* header is not managed by the GC */
138
/* Flag `str` as non-embedded and clear the shared/shared-root/borrowed bits. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Flag `str` as embedded; STR_SHARED and STR_NOFREE are dropped as well. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` into the length field of `str`. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/*
 * Terminator width of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding.
 */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/*
 * Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset() is only used for terminators wider than one byte.
 */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
177
/* Resize the buffer of `str` to `capacity` bytes plus its own terminator. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents move to a freshly allocated heap buffer and the
 *   string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place.  It must not be shared.
 *
 * `capacity` and `termlen` are parenthesized at every use so that
 * expression arguments (e.g. `a ? b : c`) expand with the intended
 * precedence.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/*
 * Make `str` share the buffer owned by `shared_str`.
 *
 * Does nothing for fake strings.  Otherwise asserts that the buffer of
 * `str` lies inside the buffer of `shared_str`, records `shared_str` as the
 * shared root (through the write barrier), and sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`.  A root with no class is additionally
 * marked STR_BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
213
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/*
 * SHARABLE_SUBSTRING_P(beg, len, end): may the substring [beg, beg+len) of
 * a string of length `end` share the original buffer?  Unless
 * SHARABLE_MIDDLE_SUBSTRING is enabled, only substrings that reach the end
 * of the original qualify.
 */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/*
 * True when an embedded string of `len` bytes plus a `termlen`-byte
 * terminator has a size the GC can allocate.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(object_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/*
 * Prints a diagnostic to stderr stating that `func` is about to return
 * NULL.  Only reports; it does not abort.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
/*
 * Fake strings carry their precomputed hash in the `capa` field (a long);
 * that is only enabled when a long is as wide as a pointer.
 */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/*
 * Options that register_fstring() passes to fstring_concurrent_set_create()
 * through its `data` pointer.
 */
struct fstr_create_arg {
    bool copy;                  /* fake string: copy the bytes instead of referencing them */
    bool force_precompute_hash; /* fake string copy: request an embedded copy that stores its hash */
};
487
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_obj_exivar_p(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
616
617 RB_DEBUG_COUNTER_INC(obj_str_fstr);
618
619 FL_UNSET(obj, RSTRING_FSTR);
620}
621
622void
623rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
624{
625 if (fstring_table_obj) {
626 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
627 }
628}
629
630static VALUE
631setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
632{
633 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
634 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
635
636 if (!name) {
638 name = "";
639 }
640
641 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
642
643 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
644 fake_str->len = len;
645 fake_str->as.heap.ptr = (char *)name;
646 fake_str->as.heap.aux.capa = len;
647 return (VALUE)fake_str;
648}
649
650/*
651 * set up a fake string which refers a static string literal.
652 */
653VALUE
654rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
655{
656 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
657}
658
659/*
660 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
661 * shared string which refers a static string literal. `ptr` must
662 * point a constant string.
663 */
664VALUE
665rb_fstring_new(const char *ptr, long len)
666{
667 struct RString fake_str = {RBASIC_INIT};
668 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
669}
670
671VALUE
672rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
673{
674 struct RString fake_str = {RBASIC_INIT};
675 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
676}
677
678VALUE
679rb_fstring_cstr(const char *ptr)
680{
681 return rb_fstring_new(ptr, strlen(ptr));
682}
683
684static inline bool
685single_byte_optimizable(VALUE str)
686{
687 int encindex = ENCODING_GET(str);
688 switch (encindex) {
689 case ENCINDEX_ASCII_8BIT:
690 case ENCINDEX_US_ASCII:
691 return true;
692 case ENCINDEX_UTF_8:
693 // For UTF-8 it's worth scanning the string coderange when unknown.
695 }
696 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
697 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
698 return true;
699 }
700
701 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
702 return true;
703 }
704
705 /* Conservative. Possibly single byte.
706 * "\xa1" in Shift_JIS for example. */
707 return false;
708}
709
711
712static inline const char *
713search_nonascii(const char *p, const char *e)
714{
715 const uintptr_t *s, *t;
716
717#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
718# if SIZEOF_UINTPTR_T == 8
719# define NONASCII_MASK UINT64_C(0x8080808080808080)
720# elif SIZEOF_UINTPTR_T == 4
721# define NONASCII_MASK UINT32_C(0x80808080)
722# else
723# error "don't know what to do."
724# endif
725#else
726# if SIZEOF_UINTPTR_T == 8
727# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
728# elif SIZEOF_UINTPTR_T == 4
729# define NONASCII_MASK 0x80808080UL /* or...? */
730# else
731# error "don't know what to do."
732# endif
733#endif
734
735 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
736#if !UNALIGNED_WORD_ACCESS
737 if ((uintptr_t)p % SIZEOF_VOIDP) {
738 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
739 p += l;
740 switch (l) {
741 default: UNREACHABLE;
742#if SIZEOF_VOIDP > 4
743 case 7: if (p[-7]&0x80) return p-7;
744 case 6: if (p[-6]&0x80) return p-6;
745 case 5: if (p[-5]&0x80) return p-5;
746 case 4: if (p[-4]&0x80) return p-4;
747#endif
748 case 3: if (p[-3]&0x80) return p-3;
749 case 2: if (p[-2]&0x80) return p-2;
750 case 1: if (p[-1]&0x80) return p-1;
751 case 0: break;
752 }
753 }
754#endif
755#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
756#define aligned_ptr(value) \
757 __builtin_assume_aligned((value), sizeof(uintptr_t))
758#else
759#define aligned_ptr(value) (uintptr_t *)(value)
760#endif
761 s = aligned_ptr(p);
762 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
763#undef aligned_ptr
764 for (;s < t; s++) {
765 if (*s & NONASCII_MASK) {
766#ifdef WORDS_BIGENDIAN
767 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
768#else
769 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
770#endif
771 }
772 }
773 p = (const char *)s;
774 }
775
776 switch (e - p) {
777 default: UNREACHABLE;
778#if SIZEOF_VOIDP > 4
779 case 7: if (e[-7]&0x80) return e-7;
780 case 6: if (e[-6]&0x80) return e-6;
781 case 5: if (e[-5]&0x80) return e-5;
782 case 4: if (e[-4]&0x80) return e-4;
783#endif
784 case 3: if (e[-3]&0x80) return e-3;
785 case 2: if (e[-2]&0x80) return e-2;
786 case 1: if (e[-1]&0x80) return e-1;
787 case 0: return NULL;
788 }
789}
790
791static int
792coderange_scan(const char *p, long len, rb_encoding *enc)
793{
794 const char *e = p + len;
795
796 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
797 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
798 p = search_nonascii(p, e);
800 }
801
802 if (rb_enc_asciicompat(enc)) {
803 p = search_nonascii(p, e);
804 if (!p) return ENC_CODERANGE_7BIT;
805 for (;;) {
806 int ret = rb_enc_precise_mbclen(p, e, enc);
808 p += MBCLEN_CHARFOUND_LEN(ret);
809 if (p == e) break;
810 p = search_nonascii(p, e);
811 if (!p) break;
812 }
813 }
814 else {
815 while (p < e) {
816 int ret = rb_enc_precise_mbclen(p, e, enc);
818 p += MBCLEN_CHARFOUND_LEN(ret);
819 }
820 }
821 return ENC_CODERANGE_VALID;
822}
823
824long
825rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
826{
827 const char *p = s;
828
829 if (*cr == ENC_CODERANGE_BROKEN)
830 return e - s;
831
832 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
833 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
834 if (*cr == ENC_CODERANGE_VALID) return e - s;
835 p = search_nonascii(p, e);
837 return e - s;
838 }
839 else if (rb_enc_asciicompat(enc)) {
840 p = search_nonascii(p, e);
841 if (!p) {
842 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
843 return e - s;
844 }
845 for (;;) {
846 int ret = rb_enc_precise_mbclen(p, e, enc);
847 if (!MBCLEN_CHARFOUND_P(ret)) {
849 return p - s;
850 }
851 p += MBCLEN_CHARFOUND_LEN(ret);
852 if (p == e) break;
853 p = search_nonascii(p, e);
854 if (!p) break;
855 }
856 }
857 else {
858 while (p < e) {
859 int ret = rb_enc_precise_mbclen(p, e, enc);
860 if (!MBCLEN_CHARFOUND_P(ret)) {
862 return p - s;
863 }
864 p += MBCLEN_CHARFOUND_LEN(ret);
865 }
866 }
868 return e - s;
869}
870
871static inline void
872str_enc_copy(VALUE str1, VALUE str2)
873{
874 rb_enc_set_index(str1, ENCODING_GET(str2));
875}
876
877/* Like str_enc_copy, but does not check frozen status of str1.
878 * You should use this only if you're certain that str1 is not frozen. */
879static inline void
880str_enc_copy_direct(VALUE str1, VALUE str2)
881{
882 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
883 if (inlined_encoding == ENCODING_INLINE_MAX) {
884 rb_enc_set_index(str1, rb_enc_get_index(str2));
885 }
886 else {
887 ENCODING_SET_INLINED(str1, inlined_encoding);
888 }
889}
890
891static void
892rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
893{
894 /* this function is designed for copying encoding and coderange
895 * from src to new string "dest" which is made from the part of src.
896 */
897 str_enc_copy(dest, src);
898 if (RSTRING_LEN(dest) == 0) {
899 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
901 else
903 return;
904 }
905 switch (ENC_CODERANGE(src)) {
908 break;
910 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
911 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
913 else
915 break;
916 default:
917 break;
918 }
919}
920
921static void
922rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
923{
924 str_enc_copy(dest, src);
926}
927
928static int
929enc_coderange_scan(VALUE str, rb_encoding *enc)
930{
931 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
932}
933
934int
935rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
936{
937 return enc_coderange_scan(str, enc);
938}
939
940int
942{
943 int cr = ENC_CODERANGE(str);
944
945 if (cr == ENC_CODERANGE_UNKNOWN) {
946 cr = enc_coderange_scan(str, get_encoding(str));
947 ENC_CODERANGE_SET(str, cr);
948 }
949 return cr;
950}
951
952static inline bool
953rb_enc_str_asciicompat(VALUE str)
954{
955 int encindex = ENCODING_GET_INLINED(str);
956 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
957}
958
959int
961{
962 switch(ENC_CODERANGE(str)) {
964 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
966 return true;
967 default:
968 return false;
969 }
970}
971
972static inline void
973str_mod_check(VALUE s, const char *p, long len)
974{
975 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
976 rb_raise(rb_eRuntimeError, "string modified");
977 }
978}
979
980static size_t
981str_capacity(VALUE str, const int termlen)
982{
983 if (STR_EMBED_P(str)) {
984 return str_embed_capa(str) - termlen;
985 }
986 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
987 return RSTRING(str)->len;
988 }
989 else {
990 return RSTRING(str)->as.heap.aux.capa;
991 }
992}
993
994size_t
996{
997 return str_capacity(str, TERM_LEN(str));
998}
999
1000static inline void
1001must_not_null(const char *ptr)
1002{
1003 if (!ptr) {
1004 rb_raise(rb_eArgError, "NULL pointer given");
1005 }
1006}
1007
1008static inline VALUE
1009str_alloc_embed(VALUE klass, size_t capa)
1010{
1011 size_t size = rb_str_embed_size(capa, 0);
1012 RUBY_ASSERT(size > 0);
1013 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1014
1015 NEWOBJ_OF(str, struct RString, klass,
1017
1018 str->len = 0;
1019 str->as.embed.ary[0] = 0;
1020
1021 return (VALUE)str;
1022}
1023
1024static inline VALUE
1025str_alloc_heap(VALUE klass)
1026{
1027 NEWOBJ_OF(str, struct RString, klass,
1028 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1029
1030 str->len = 0;
1031 str->as.heap.aux.capa = 0;
1032 str->as.heap.ptr = NULL;
1033
1034 return (VALUE)str;
1035}
1036
1037static inline VALUE
1038empty_str_alloc(VALUE klass)
1039{
1040 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1041 VALUE str = str_alloc_embed(klass, 0);
1042 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1044 return str;
1045}
1046
1047static VALUE
1048str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1049{
1050 VALUE str;
1051
1052 if (len < 0) {
1053 rb_raise(rb_eArgError, "negative string size (or size too big)");
1054 }
1055
1056 if (enc == NULL) {
1057 enc = rb_ascii8bit_encoding();
1058 }
1059
1060 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1061
1062 int termlen = rb_enc_mbminlen(enc);
1063
1064 if (STR_EMBEDDABLE_P(len, termlen)) {
1065 str = str_alloc_embed(klass, len + termlen);
1066 if (len == 0) {
1067 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1068 }
1069 }
1070 else {
1071 str = str_alloc_heap(klass);
1072 RSTRING(str)->as.heap.aux.capa = len;
1073 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1074 * integer overflow. If we can STATIC_ASSERT that, the following
1075 * mul_add_mul can be reverted to a simple ALLOC_N. */
1076 RSTRING(str)->as.heap.ptr =
1077 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1078 }
1079
1080 rb_enc_raw_set(str, enc);
1081
1082 if (ptr) {
1083 memcpy(RSTRING_PTR(str), ptr, len);
1084 }
1085 else {
1086 memset(RSTRING_PTR(str), 0, len);
1087 }
1088
1089 STR_SET_LEN(str, len);
1090 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1091 return str;
1092}
1093
1094static VALUE
1095str_new(VALUE klass, const char *ptr, long len)
1096{
1097 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1098}
1099
1100VALUE
1101rb_str_new(const char *ptr, long len)
1102{
1103 return str_new(rb_cString, ptr, len);
1104}
1105
1106VALUE
1107rb_usascii_str_new(const char *ptr, long len)
1108{
1109 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1110}
1111
1112VALUE
1113rb_utf8_str_new(const char *ptr, long len)
1114{
1115 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1116}
1117
1118VALUE
1119rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1120{
1121 return str_enc_new(rb_cString, ptr, len, enc);
1122}
1123
1124VALUE
1126{
1127 must_not_null(ptr);
1128 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1129 * memory regions, and that cannot be detected by the MSAN. Just
1130 * trust the programmer that the argument passed here is a sane C
1131 * string. */
1132 __msan_unpoison_string(ptr);
1133 return rb_str_new(ptr, strlen(ptr));
1134}
1135
1136VALUE
1138{
1139 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1140}
1141
1142VALUE
1144{
1145 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1146}
1147
1148VALUE
1150{
1151 must_not_null(ptr);
1152 if (rb_enc_mbminlen(enc) != 1) {
1153 rb_raise(rb_eArgError, "wchar encoding given");
1154 }
1155 return rb_enc_str_new(ptr, strlen(ptr), enc);
1156}
1157
1158static VALUE
1159str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1160{
1161 VALUE str;
1162
1163 if (len < 0) {
1164 rb_raise(rb_eArgError, "negative string size (or size too big)");
1165 }
1166
1167 if (!ptr) {
1168 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1169 }
1170 else {
1171 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1172 str = str_alloc_heap(klass);
1173 RSTRING(str)->len = len;
1174 RSTRING(str)->as.heap.ptr = (char *)ptr;
1175 RSTRING(str)->as.heap.aux.capa = len;
1176 RBASIC(str)->flags |= STR_NOFREE;
1177 rb_enc_associate_index(str, encindex);
1178 }
1179 return str;
1180}
1181
1182VALUE
1183rb_str_new_static(const char *ptr, long len)
1184{
1185 return str_new_static(rb_cString, ptr, len, 0);
1186}
1187
1188VALUE
1190{
1191 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1192}
1193
1194VALUE
1196{
1197 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1198}
1199
1200VALUE
1202{
1203 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1204}
1205
1206static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1207 rb_encoding *from, rb_encoding *to,
1208 int ecflags, VALUE ecopts);
1209
1210static inline bool
1211is_enc_ascii_string(VALUE str, rb_encoding *enc)
1212{
1213 int encidx = rb_enc_to_index(enc);
1214 if (rb_enc_get_index(str) == encidx)
1215 return is_ascii_string(str);
1216 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1217}
1218
1219VALUE
1220rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1221{
1222 long len;
1223 const char *ptr;
1224 VALUE newstr;
1225
1226 if (!to) return str;
1227 if (!from) from = rb_enc_get(str);
1228 if (from == to) return str;
1229 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1230 rb_is_ascii8bit_enc(to)) {
1231 if (STR_ENC_GET(str) != to) {
1232 str = rb_str_dup(str);
1233 rb_enc_associate(str, to);
1234 }
1235 return str;
1236 }
1237
1238 RSTRING_GETMEM(str, ptr, len);
1239 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1240 from, to, ecflags, ecopts);
1241 if (NIL_P(newstr)) {
1242 /* some error, return original */
1243 return str;
1244 }
1245 return newstr;
1246}
1247
1248VALUE
1249rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1250 rb_encoding *from, int ecflags, VALUE ecopts)
1251{
1252 long olen;
1253
1254 olen = RSTRING_LEN(newstr);
1255 if (ofs < -olen || olen < ofs)
1256 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1257 if (ofs < 0) ofs += olen;
1258 if (!from) {
1259 STR_SET_LEN(newstr, ofs);
1260 return rb_str_cat(newstr, ptr, len);
1261 }
1262
1263 rb_str_modify(newstr);
1264 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1265 rb_enc_get(newstr),
1266 ecflags, ecopts);
1267}
1268
1269VALUE
1270rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1271{
1272 STR_SET_LEN(str, 0);
1273 rb_enc_associate(str, enc);
1274 rb_str_cat(str, ptr, len);
1275 return str;
1276}
1277
1278static VALUE
1279str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1280 rb_encoding *from, rb_encoding *to,
1281 int ecflags, VALUE ecopts)
1282{
1283 rb_econv_t *ec;
1285 long olen;
1286 VALUE econv_wrapper;
1287 const unsigned char *start, *sp;
1288 unsigned char *dest, *dp;
1289 size_t converted_output = (size_t)ofs;
1290
1291 olen = rb_str_capacity(newstr);
1292
1293 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1294 RBASIC_CLEAR_CLASS(econv_wrapper);
1295 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1296 if (!ec) return Qnil;
1297 DATA_PTR(econv_wrapper) = ec;
1298
1299 sp = (unsigned char*)ptr;
1300 start = sp;
1301 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1302 (dp = dest + converted_output),
1303 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1305 /* destination buffer short */
1306 size_t converted_input = sp - start;
1307 size_t rest = len - converted_input;
1308 converted_output = dp - dest;
1309 rb_str_set_len(newstr, converted_output);
1310 if (converted_input && converted_output &&
1311 rest < (LONG_MAX / converted_output)) {
1312 rest = (rest * converted_output) / converted_input;
1313 }
1314 else {
1315 rest = olen;
1316 }
1317 olen += rest < 2 ? 2 : rest;
1318 rb_str_resize(newstr, olen);
1319 }
1320 DATA_PTR(econv_wrapper) = 0;
1321 RB_GC_GUARD(econv_wrapper);
1322 rb_econv_close(ec);
1323 switch (ret) {
1324 case econv_finished:
1325 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1326 rb_str_set_len(newstr, len);
1327 rb_enc_associate(newstr, to);
1328 return newstr;
1329
1330 default:
1331 return Qnil;
1332 }
1333}
1334
1335VALUE
1337{
1338 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1339}
1340
1341VALUE
1343{
1344 rb_encoding *ienc;
1345 VALUE str;
1346 const int eidx = rb_enc_to_index(eenc);
1347
1348 if (!ptr) {
1349 return rb_enc_str_new(ptr, len, eenc);
1350 }
1351
1352 /* ASCII-8BIT case, no conversion */
1353 if ((eidx == rb_ascii8bit_encindex()) ||
1354 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1355 return rb_str_new(ptr, len);
1356 }
1357 /* no default_internal or same encoding, no conversion */
1358 ienc = rb_default_internal_encoding();
1359 if (!ienc || eenc == ienc) {
1360 return rb_enc_str_new(ptr, len, eenc);
1361 }
1362 /* ASCII compatible, and ASCII only string, no conversion in
1363 * default_internal */
1364 if ((eidx == rb_ascii8bit_encindex()) ||
1365 (eidx == rb_usascii_encindex()) ||
1366 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1367 return rb_enc_str_new(ptr, len, ienc);
1368 }
1369 /* convert from the given encoding to default_internal */
1370 str = rb_enc_str_new(NULL, 0, ienc);
1371 /* when the conversion failed for some reason, just ignore the
1372 * default_internal and result in the given encoding as-is. */
1373 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1374 rb_str_initialize(str, ptr, len, eenc);
1375 }
1376 return str;
1377}
1378
1379VALUE
1380rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1381{
1382 int eidx = rb_enc_to_index(eenc);
1383 if (eidx == rb_usascii_encindex() &&
1384 !is_ascii_string(str)) {
1385 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1386 return str;
1387 }
1388 rb_enc_associate_index(str, eidx);
1389 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1390}
1391
1392VALUE
1393rb_external_str_new(const char *ptr, long len)
1394{
1395 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1396}
1397
1398VALUE
1400{
1401 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1402}
1403
1404VALUE
1405rb_locale_str_new(const char *ptr, long len)
1406{
1407 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1408}
1409
1410VALUE
1412{
1413 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1414}
1415
1416VALUE
1418{
1419 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1420}
1421
1422VALUE
1424{
1425 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1426}
1427
1428VALUE
1430{
1431 return rb_str_export_to_enc(str, rb_default_external_encoding());
1432}
1433
1434VALUE
1436{
1437 return rb_str_export_to_enc(str, rb_locale_encoding());
1438}
1439
1440VALUE
1442{
1443 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1444}
1445
1446static VALUE
1447str_replace_shared_without_enc(VALUE str2, VALUE str)
1448{
1449 const int termlen = TERM_LEN(str);
1450 char *ptr;
1451 long len;
1452
1453 RSTRING_GETMEM(str, ptr, len);
1454 if (str_embed_capa(str2) >= len + termlen) {
1455 char *ptr2 = RSTRING(str2)->as.embed.ary;
1456 STR_SET_EMBED(str2);
1457 memcpy(ptr2, RSTRING_PTR(str), len);
1458 TERM_FILL(ptr2+len, termlen);
1459 }
1460 else {
1461 VALUE root;
1462 if (STR_SHARED_P(str)) {
1463 root = RSTRING(str)->as.heap.aux.shared;
1464 RSTRING_GETMEM(str, ptr, len);
1465 }
1466 else {
1467 root = rb_str_new_frozen(str);
1468 RSTRING_GETMEM(root, ptr, len);
1469 }
1470 RUBY_ASSERT(OBJ_FROZEN(root));
1471
1472 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1473 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1474 rb_fatal("about to free a possible shared root");
1475 }
1476 char *ptr2 = STR_HEAP_PTR(str2);
1477 if (ptr2 != ptr) {
1478 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1479 }
1480 }
1481 FL_SET(str2, STR_NOEMBED);
1482 RSTRING(str2)->as.heap.ptr = ptr;
1483 STR_SET_SHARED(str2, root);
1484 }
1485
1486 STR_SET_LEN(str2, len);
1487
1488 return str2;
1489}
1490
1491static VALUE
1492str_replace_shared(VALUE str2, VALUE str)
1493{
1494 str_replace_shared_without_enc(str2, str);
1495 rb_enc_cr_str_exact_copy(str2, str);
1496 return str2;
1497}
1498
1499static VALUE
1500str_new_shared(VALUE klass, VALUE str)
1501{
1502 return str_replace_shared(str_alloc_heap(klass), str);
1503}
1504
1505VALUE
1507{
1508 return str_new_shared(rb_obj_class(str), str);
1509}
1510
1511VALUE
1513{
1514 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1515 return str_new_frozen(rb_obj_class(orig), orig);
1516}
1517
1518static VALUE
1519rb_str_new_frozen_String(VALUE orig)
1520{
1521 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1522 return str_new_frozen(rb_cString, orig);
1523}
1524
1525
1526VALUE
1527rb_str_frozen_bare_string(VALUE orig)
1528{
1529 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1530 return str_new_frozen(rb_cString, orig);
1531}
1532
1533VALUE
1534rb_str_tmp_frozen_acquire(VALUE orig)
1535{
1536 if (OBJ_FROZEN_RAW(orig)) return orig;
1537 return str_new_frozen_buffer(0, orig, FALSE);
1538}
1539
1540VALUE
1541rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1542{
1543 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1544 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1545
1546 VALUE str = str_alloc_heap(0);
1547 OBJ_FREEZE(str);
1548 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1549 FL_SET(str, STR_SHARED_ROOT);
1550
1551 size_t capa = str_capacity(orig, TERM_LEN(orig));
1552
1553 /* If the string is embedded then we want to create a copy that is heap
1554 * allocated. If the string is shared then the shared root must be
1555 * embedded, so we want to create a copy. If the string is a shared root
1556 * then it must be embedded, so we want to create a copy. */
1557 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1558 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1559 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1560 }
1561 else {
1562 /* orig must be heap allocated and not shared, so we can safely transfer
1563 * the pointer to str. */
1564 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1565 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1566 RBASIC(orig)->flags &= ~STR_NOFREE;
1567 STR_SET_SHARED(orig, str);
1568 if (RB_OBJ_SHAREABLE_P(orig)) {
1569 RB_OBJ_SET_SHAREABLE(str);
1570 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1571 }
1572 }
1573
1574 RSTRING(str)->len = RSTRING(orig)->len;
1575 RSTRING(str)->as.heap.aux.capa = capa;
1576
1577 return str;
1578}
1579
1580void
1581rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1582{
1583 if (RBASIC_CLASS(tmp) != 0)
1584 return;
1585
1586 if (STR_EMBED_P(tmp)) {
1588 }
1589 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1590 !OBJ_FROZEN_RAW(orig)) {
1591 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1592
1593 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1594 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1595 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1596
1597 /* Unshare orig since the root (tmp) only has this one child. */
1598 FL_UNSET_RAW(orig, STR_SHARED);
1599 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1600 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1602
1603 /* Make tmp embedded and empty so it is safe for sweeping. */
1604 STR_SET_EMBED(tmp);
1605 STR_SET_LEN(tmp, 0);
1606 }
1607 }
1608}
1609
1610static VALUE
1611str_new_frozen(VALUE klass, VALUE orig)
1612{
1613 return str_new_frozen_buffer(klass, orig, TRUE);
1614}
1615
1616static VALUE
1617heap_str_make_shared(VALUE klass, VALUE orig)
1618{
1619 RUBY_ASSERT(!STR_EMBED_P(orig));
1620 RUBY_ASSERT(!STR_SHARED_P(orig));
1622
1623 VALUE str = str_alloc_heap(klass);
1624 STR_SET_LEN(str, RSTRING_LEN(orig));
1625 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1626 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1627 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1628 RBASIC(orig)->flags &= ~STR_NOFREE;
1629 STR_SET_SHARED(orig, str);
1630 if (klass == 0)
1631 FL_UNSET_RAW(str, STR_BORROWED);
1632 return str;
1633}
1634
1635static VALUE
1636str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1637{
1638 VALUE str;
1639
1640 long len = RSTRING_LEN(orig);
1641 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1642 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1643
1644 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1645 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1646 RUBY_ASSERT(STR_EMBED_P(str));
1647 }
1648 else {
1649 if (FL_TEST_RAW(orig, STR_SHARED)) {
1650 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1651 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1652 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1653 RUBY_ASSERT(ofs >= 0);
1654 RUBY_ASSERT(rest >= 0);
1655 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1657
1658 if ((ofs > 0) || (rest > 0) ||
1659 (klass != RBASIC(shared)->klass) ||
1660 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1661 str = str_new_shared(klass, shared);
1662 RUBY_ASSERT(!STR_EMBED_P(str));
1663 RSTRING(str)->as.heap.ptr += ofs;
1664 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1665 }
1666 else {
1667 if (RBASIC_CLASS(shared) == 0)
1668 FL_SET_RAW(shared, STR_BORROWED);
1669 return shared;
1670 }
1671 }
1672 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1673 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1674 STR_SET_EMBED(str);
1675 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1676 STR_SET_LEN(str, RSTRING_LEN(orig));
1677 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1678 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1679 }
1680 else {
1681 if (RB_OBJ_SHAREABLE_P(orig)) {
1682 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1683 }
1684 else {
1685 str = heap_str_make_shared(klass, orig);
1686 }
1687 }
1688 }
1689
1690 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1691 OBJ_FREEZE(str);
1692 return str;
1693}
1694
1695VALUE
1696rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1697{
1698 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1699}
1700
1701static VALUE
1702str_new_empty_String(VALUE str)
1703{
1704 VALUE v = rb_str_new(0, 0);
1705 rb_enc_copy(v, str);
1706 return v;
1707}
1708
1709#define STR_BUF_MIN_SIZE 63
1710
1711VALUE
1713{
1714 if (STR_EMBEDDABLE_P(capa, 1)) {
1715 return str_alloc_embed(rb_cString, capa + 1);
1716 }
1717
1718 VALUE str = str_alloc_heap(rb_cString);
1719
1720 RSTRING(str)->as.heap.aux.capa = capa;
1721 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1722 RSTRING(str)->as.heap.ptr[0] = '\0';
1723
1724 return str;
1725}
1726
1727VALUE
1729{
1730 VALUE str;
1731 long len = strlen(ptr);
1732
1733 str = rb_str_buf_new(len);
1734 rb_str_buf_cat(str, ptr, len);
1735
1736 return str;
1737}
1738
1739VALUE
1741{
1742 return str_new(0, 0, len);
1743}
1744
1745void
1747{
1748 if (STR_EMBED_P(str)) {
1749 RB_DEBUG_COUNTER_INC(obj_str_embed);
1750 }
1751 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1752 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1753 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1754 }
1755 else {
1756 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1757 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1758 }
1759}
1760
1761size_t
1762rb_str_memsize(VALUE str)
1763{
1764 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1765 return STR_HEAP_SIZE(str);
1766 }
1767 else {
1768 return 0;
1769 }
1770}
1771
1772VALUE
1774{
1775 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1776}
1777
1778static inline void str_discard(VALUE str);
1779static void str_shared_replace(VALUE str, VALUE str2);
1780
1781void
1783{
1784 if (str != str2) str_shared_replace(str, str2);
1785}
1786
1787static void
1788str_shared_replace(VALUE str, VALUE str2)
1789{
1790 rb_encoding *enc;
1791 int cr;
1792 int termlen;
1793
1794 RUBY_ASSERT(str2 != str);
1795 enc = STR_ENC_GET(str2);
1796 cr = ENC_CODERANGE(str2);
1797 str_discard(str);
1798 termlen = rb_enc_mbminlen(enc);
1799
1800 STR_SET_LEN(str, RSTRING_LEN(str2));
1801
1802 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1803 STR_SET_EMBED(str);
1804 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1805 rb_enc_associate(str, enc);
1806 ENC_CODERANGE_SET(str, cr);
1807 }
1808 else {
1809 if (STR_EMBED_P(str2)) {
1810 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1811 long len = RSTRING_LEN(str2);
1812 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1813
1814 char *new_ptr = ALLOC_N(char, len + termlen);
1815 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1816 RSTRING(str2)->as.heap.ptr = new_ptr;
1817 STR_SET_LEN(str2, len);
1818 RSTRING(str2)->as.heap.aux.capa = len;
1819 STR_SET_NOEMBED(str2);
1820 }
1821
1822 STR_SET_NOEMBED(str);
1823 FL_UNSET(str, STR_SHARED);
1824 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1825
1826 if (FL_TEST(str2, STR_SHARED)) {
1827 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1828 STR_SET_SHARED(str, shared);
1829 }
1830 else {
1831 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1832 }
1833
1834 /* abandon str2 */
1835 STR_SET_EMBED(str2);
1836 RSTRING_PTR(str2)[0] = 0;
1837 STR_SET_LEN(str2, 0);
1838 rb_enc_associate(str, enc);
1839 ENC_CODERANGE_SET(str, cr);
1840 }
1841}
1842
1843VALUE
1845{
1846 VALUE str;
1847
1848 if (RB_TYPE_P(obj, T_STRING)) {
1849 return obj;
1850 }
1851 str = rb_funcall(obj, idTo_s, 0);
1852 return rb_obj_as_string_result(str, obj);
1853}
1854
1855VALUE
1856rb_obj_as_string_result(VALUE str, VALUE obj)
1857{
1858 if (!RB_TYPE_P(str, T_STRING))
1859 return rb_any_to_s(obj);
1860 return str;
1861}
1862
1863static VALUE
1864str_replace(VALUE str, VALUE str2)
1865{
1866 long len;
1867
1868 len = RSTRING_LEN(str2);
1869 if (STR_SHARED_P(str2)) {
1870 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1872 STR_SET_NOEMBED(str);
1873 STR_SET_LEN(str, len);
1874 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1875 STR_SET_SHARED(str, shared);
1876 rb_enc_cr_str_exact_copy(str, str2);
1877 }
1878 else {
1879 str_replace_shared(str, str2);
1880 }
1881
1882 return str;
1883}
1884
1885static inline VALUE
1886ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1887{
1888 size_t size = rb_str_embed_size(capa, 0);
1889 RUBY_ASSERT(size > 0);
1890 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1891
1892 NEWOBJ_OF(str, struct RString, klass,
1894
1895 str->len = 0;
1896
1897 return (VALUE)str;
1898}
1899
1900static inline VALUE
1901ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1902{
1903 NEWOBJ_OF(str, struct RString, klass,
1904 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1905
1906 str->as.heap.aux.capa = 0;
1907 str->as.heap.ptr = NULL;
1908
1909 return (VALUE)str;
1910}
1911
1912static inline VALUE
1913str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1914{
1915 int encidx = 0;
1916 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1917 encidx = rb_enc_get_index(str);
1918 flags &= ~ENCODING_MASK;
1919 }
1920 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1921 if (encidx) rb_enc_associate_index(dup, encidx);
1922 return dup;
1923}
1924
1925static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1926
1927static inline VALUE
1928str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1929{
1930 VALUE flags = FL_TEST_RAW(str, flag_mask);
1931 long len = RSTRING_LEN(str);
1932
1933 RUBY_ASSERT(STR_EMBED_P(dup));
1934 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1935 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1936 STR_SET_LEN(dup, RSTRING_LEN(str));
1937 return str_duplicate_setup_encoding(str, dup, flags);
1938}
1939
1940static inline VALUE
1941str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1942{
1943 VALUE flags = FL_TEST_RAW(str, flag_mask);
1944 VALUE root = str;
1945 if (FL_TEST_RAW(str, STR_SHARED)) {
1946 root = RSTRING(str)->as.heap.aux.shared;
1947 }
1948 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1949 root = str = str_new_frozen(klass, str);
1950 flags = FL_TEST_RAW(str, flag_mask);
1951 }
1952 RUBY_ASSERT(!STR_SHARED_P(root));
1954
1955 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1956 FL_SET(root, STR_SHARED_ROOT);
1957 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1958 flags |= RSTRING_NOEMBED | STR_SHARED;
1959
1960 STR_SET_LEN(dup, RSTRING_LEN(str));
1961 return str_duplicate_setup_encoding(str, dup, flags);
1962}
1963
1964static inline VALUE
1965str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1966{
1967 if (STR_EMBED_P(str)) {
1968 return str_duplicate_setup_embed(klass, str, dup);
1969 }
1970 else {
1971 return str_duplicate_setup_heap(klass, str, dup);
1972 }
1973}
1974
1975static inline VALUE
1976str_duplicate(VALUE klass, VALUE str)
1977{
1978 VALUE dup;
1979 if (STR_EMBED_P(str)) {
1980 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1981 }
1982 else {
1983 dup = str_alloc_heap(klass);
1984 }
1985
1986 return str_duplicate_setup(klass, str, dup);
1987}
1988
1989VALUE
1991{
1992 return str_duplicate(rb_obj_class(str), str);
1993}
1994
1995/* :nodoc: */
1996VALUE
1997rb_str_dup_m(VALUE str)
1998{
1999 if (LIKELY(BARE_STRING_P(str))) {
2000 return str_duplicate(rb_cString, str);
2001 }
2002 else {
2003 return rb_obj_dup(str);
2004 }
2005}
2006
2007VALUE
2009{
2010 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2011 return str_duplicate(rb_cString, str);
2012}
2013
2014VALUE
2015rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2016{
2017 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2018 VALUE new_str, klass = rb_cString;
2019
2020 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2021 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2022 str_duplicate_setup_embed(klass, str, new_str);
2023 }
2024 else {
2025 new_str = ec_str_alloc_heap(ec, klass);
2026 str_duplicate_setup_heap(klass, str, new_str);
2027 }
2028 if (chilled) {
2029 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2030 }
2031 return new_str;
2032}
2033
2034VALUE
2035rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2036{
2037 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2038 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2039 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2040 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2041 return rb_str_freeze(str);
2042}
2043
2044/*
2045 * The documentation block below uses an include (instead of inline text)
2046 * because the included text has non-ASCII characters (which are not allowed in a C file).
2047 */
2048
2049/*
2050 *
2051 * call-seq:
2052 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2053 *
2054 * :include: doc/string/new.rdoc
2055 *
2056 */
2057
2058static VALUE
2059rb_str_init(int argc, VALUE *argv, VALUE str)
2060{
2061 static ID keyword_ids[2];
2062 VALUE orig, opt, venc, vcapa;
2063 VALUE kwargs[2];
2064 rb_encoding *enc = 0;
2065 int n;
2066
2067 if (!keyword_ids[0]) {
2068 keyword_ids[0] = rb_id_encoding();
2069 CONST_ID(keyword_ids[1], "capacity");
2070 }
2071
2072 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2073 if (!NIL_P(opt)) {
2074 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2075 venc = kwargs[0];
2076 vcapa = kwargs[1];
2077 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2078 enc = rb_to_encoding(venc);
2079 }
2080 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2081 long capa = NUM2LONG(vcapa);
2082 long len = 0;
2083 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2084
2085 if (capa < STR_BUF_MIN_SIZE) {
2086 capa = STR_BUF_MIN_SIZE;
2087 }
2088 if (n == 1) {
2089 StringValue(orig);
2090 len = RSTRING_LEN(orig);
2091 if (capa < len) {
2092 capa = len;
2093 }
2094 if (orig == str) n = 0;
2095 }
2096 str_modifiable(str);
2097 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2098 /* make noembed always */
2099 const size_t size = (size_t)capa + termlen;
2100 const char *const old_ptr = RSTRING_PTR(str);
2101 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2102 char *new_ptr = ALLOC_N(char, size);
2103 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2104 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2105 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2106 RSTRING(str)->as.heap.ptr = new_ptr;
2107 }
2108 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2109 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2110 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2111 }
2112 STR_SET_LEN(str, len);
2113 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2114 if (n == 1) {
2115 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2116 rb_enc_cr_str_exact_copy(str, orig);
2117 }
2118 FL_SET(str, STR_NOEMBED);
2119 RSTRING(str)->as.heap.aux.capa = capa;
2120 }
2121 else if (n == 1) {
2122 rb_str_replace(str, orig);
2123 }
2124 if (enc) {
2125 rb_enc_associate(str, enc);
2127 }
2128 }
2129 else if (n == 1) {
2130 rb_str_replace(str, orig);
2131 }
2132 return str;
2133}
2134
2135/* :nodoc: */
2136static VALUE
2137rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2138{
2139 if (klass != rb_cString) {
2140 return rb_class_new_instance_pass_kw(argc, argv, klass);
2141 }
2142
2143 static ID keyword_ids[2];
2144 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2145 VALUE kwargs[2];
2146 rb_encoding *enc = NULL;
2147
2148 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2149 if (NIL_P(opt)) {
2150 return rb_class_new_instance_pass_kw(argc, argv, klass);
2151 }
2152
2153 keyword_ids[0] = rb_id_encoding();
2154 CONST_ID(keyword_ids[1], "capacity");
2155 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2156 encoding = kwargs[0];
2157 capacity = kwargs[1];
2158
2159 if (n == 1) {
2160 orig = StringValue(orig);
2161 }
2162 else {
2163 orig = Qnil;
2164 }
2165
2166 if (UNDEF_P(encoding)) {
2167 if (!NIL_P(orig)) {
2168 encoding = rb_obj_encoding(orig);
2169 }
2170 }
2171
2172 if (!UNDEF_P(encoding)) {
2173 enc = rb_to_encoding(encoding);
2174 }
2175
2176 // If capacity is nil, we're basically just duping `orig`.
2177 if (UNDEF_P(capacity)) {
2178 if (NIL_P(orig)) {
2179 VALUE empty_str = str_new(klass, "", 0);
2180 if (enc) {
2181 rb_enc_associate(empty_str, enc);
2182 }
2183 return empty_str;
2184 }
2185 VALUE copy = str_duplicate(klass, orig);
2186 rb_enc_associate(copy, enc);
2187 ENC_CODERANGE_CLEAR(copy);
2188 return copy;
2189 }
2190
2191 long capa = 0;
2192 capa = NUM2LONG(capacity);
2193 if (capa < 0) {
2194 capa = 0;
2195 }
2196
2197 if (!NIL_P(orig)) {
2198 long orig_capa = rb_str_capacity(orig);
2199 if (orig_capa > capa) {
2200 capa = orig_capa;
2201 }
2202 }
2203
2204 VALUE str = str_enc_new(klass, NULL, capa, enc);
2205 STR_SET_LEN(str, 0);
2206 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2207
2208 if (!NIL_P(orig)) {
2209 rb_str_buf_append(str, orig);
2210 }
2211
2212 return str;
2213}
2214
2215#ifdef NONASCII_MASK
2216#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2217
2218/*
2219 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2220 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2221 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2222 *
2223 * if (!(byte & 0x80))
2224 * byte |= 0x40; // turn on bit6
2225 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2226 *
2227 * This function calculates whether a byte is leading or not for all bytes
2228 * in the argument word by concurrently using the above logic, and then
2229 * adds up the number of leading bytes in the word.
2230 */
2231static inline uintptr_t
2232count_utf8_lead_bytes_with_word(const uintptr_t *s)
2233{
2234 uintptr_t d = *s;
2235
2236 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2237 d = (d>>6) | (~d>>7);
2238 d &= NONASCII_MASK >> 7;
2239
2240 /* Gather all bytes. */
2241#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2242 /* use only if it can use POPCNT */
2243 return rb_popcount_intptr(d);
2244#else
2245 d += (d>>8);
2246 d += (d>>16);
2247# if SIZEOF_VOIDP == 8
2248 d += (d>>32);
2249# endif
2250 return (d&0xF);
2251#endif
2252}
2253#endif
2254
2255static inline long
2256enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2257{
2258 long c;
2259 const char *q;
2260
2261 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2262 long diff = (long)(e - p);
2263 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2264 }
2265#ifdef NONASCII_MASK
2266 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2267 uintptr_t len = 0;
2268 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2269 const uintptr_t *s, *t;
2270 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2271 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2272 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2273 while (p < (const char *)s) {
2274 if (is_utf8_lead_byte(*p)) len++;
2275 p++;
2276 }
2277 while (s < t) {
2278 len += count_utf8_lead_bytes_with_word(s);
2279 s++;
2280 }
2281 p = (const char *)s;
2282 }
2283 while (p < e) {
2284 if (is_utf8_lead_byte(*p)) len++;
2285 p++;
2286 }
2287 return (long)len;
2288 }
2289#endif
2290 else if (rb_enc_asciicompat(enc)) {
2291 c = 0;
2292 if (ENC_CODERANGE_CLEAN_P(cr)) {
2293 while (p < e) {
2294 if (ISASCII(*p)) {
2295 q = search_nonascii(p, e);
2296 if (!q)
2297 return c + (e - p);
2298 c += q - p;
2299 p = q;
2300 }
2301 p += rb_enc_fast_mbclen(p, e, enc);
2302 c++;
2303 }
2304 }
2305 else {
2306 while (p < e) {
2307 if (ISASCII(*p)) {
2308 q = search_nonascii(p, e);
2309 if (!q)
2310 return c + (e - p);
2311 c += q - p;
2312 p = q;
2313 }
2314 p += rb_enc_mbclen(p, e, enc);
2315 c++;
2316 }
2317 }
2318 return c;
2319 }
2320
2321 for (c=0; p<e; c++) {
2322 p += rb_enc_mbclen(p, e, enc);
2323 }
2324 return c;
2325}
2326
2327long
2328rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2329{
2330 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2331}
2332
2333/* To get strlen with cr
2334 * Note that given cr is not used.
2335 */
2336long
2337rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2338{
2339 long c;
2340 const char *q;
2341 int ret;
2342
2343 *cr = 0;
2344 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2345 long diff = (long)(e - p);
2346 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2347 }
2348 else if (rb_enc_asciicompat(enc)) {
2349 c = 0;
2350 while (p < e) {
2351 if (ISASCII(*p)) {
2352 q = search_nonascii(p, e);
2353 if (!q) {
2354 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2355 return c + (e - p);
2356 }
2357 c += q - p;
2358 p = q;
2359 }
2360 ret = rb_enc_precise_mbclen(p, e, enc);
2361 if (MBCLEN_CHARFOUND_P(ret)) {
2362 *cr |= ENC_CODERANGE_VALID;
2363 p += MBCLEN_CHARFOUND_LEN(ret);
2364 }
2365 else {
2367 p++;
2368 }
2369 c++;
2370 }
2371 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2372 return c;
2373 }
2374
2375 for (c=0; p<e; c++) {
2376 ret = rb_enc_precise_mbclen(p, e, enc);
2377 if (MBCLEN_CHARFOUND_P(ret)) {
2378 *cr |= ENC_CODERANGE_VALID;
2379 p += MBCLEN_CHARFOUND_LEN(ret);
2380 }
2381 else {
2383 if (p + rb_enc_mbminlen(enc) <= e)
2384 p += rb_enc_mbminlen(enc);
2385 else
2386 p = e;
2387 }
2388 }
2389 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2390 return c;
2391}
2392
2393/* enc must be str's enc or rb_enc_check(str, str2) */
2394static long
2395str_strlen(VALUE str, rb_encoding *enc)
2396{
2397 const char *p, *e;
2398 int cr;
2399
2400 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2401 if (!enc) enc = STR_ENC_GET(str);
2402 p = RSTRING_PTR(str);
2403 e = RSTRING_END(str);
2404 cr = ENC_CODERANGE(str);
2405
2406 if (cr == ENC_CODERANGE_UNKNOWN) {
2407 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2408 if (cr) ENC_CODERANGE_SET(str, cr);
2409 return n;
2410 }
2411 else {
2412 return enc_strlen(p, e, enc, cr);
2413 }
2414}
2415
2416long
2418{
2419 return str_strlen(str, NULL);
2420}
2421
2422/*
2423 * call-seq:
2424 * length -> integer
2425 *
2426 * :include: doc/string/length.rdoc
2427 *
2428 */
2429
2430VALUE
2432{
2433 return LONG2NUM(str_strlen(str, NULL));
2434}
2435
2436/*
2437 * call-seq:
2438 * bytesize -> integer
2439 *
2440 * :include: doc/string/bytesize.rdoc
2441 *
2442 */
2443
2444VALUE
2445rb_str_bytesize(VALUE str)
2446{
2447 return LONG2NUM(RSTRING_LEN(str));
2448}
2449
2450/*
2451 * call-seq:
2452 * empty? -> true or false
2453 *
2454 * Returns whether the length of +self+ is zero:
2455 *
2456 * 'hello'.empty? # => false
2457 * ' '.empty? # => false
2458 * ''.empty? # => true
2459 *
2460 * Related: see {Querying}[rdoc-ref:String@Querying].
2461 */
2462
2463static VALUE
2464rb_str_empty(VALUE str)
2465{
2466 return RBOOL(RSTRING_LEN(str) == 0);
2467}
2468
2469/*
2470 * call-seq:
2471 * self + other_string -> new_string
2472 *
2473 * Returns a new string containing +other_string+ concatenated to +self+:
2474 *
2475 * 'Hello from ' + self.to_s # => "Hello from main"
2476 *
2477 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2478 */
2479
2480VALUE
2482{
2483 VALUE str3;
2484 rb_encoding *enc;
2485 char *ptr1, *ptr2, *ptr3;
2486 long len1, len2;
2487 int termlen;
2488
2489 StringValue(str2);
2490 enc = rb_enc_check_str(str1, str2);
2491 RSTRING_GETMEM(str1, ptr1, len1);
2492 RSTRING_GETMEM(str2, ptr2, len2);
2493 termlen = rb_enc_mbminlen(enc);
2494 if (len1 > LONG_MAX - len2) {
2495 rb_raise(rb_eArgError, "string size too big");
2496 }
2497 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2498 ptr3 = RSTRING_PTR(str3);
2499 memcpy(ptr3, ptr1, len1);
2500 memcpy(ptr3+len1, ptr2, len2);
2501 TERM_FILL(&ptr3[len1+len2], termlen);
2502
2503 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2505 RB_GC_GUARD(str1);
2506 RB_GC_GUARD(str2);
2507 return str3;
2508}
2509
2510/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2511VALUE
2512rb_str_opt_plus(VALUE str1, VALUE str2)
2513{
2516 long len1, len2;
2517 MAYBE_UNUSED(char) *ptr1, *ptr2;
2518 RSTRING_GETMEM(str1, ptr1, len1);
2519 RSTRING_GETMEM(str2, ptr2, len2);
2520 int enc1 = rb_enc_get_index(str1);
2521 int enc2 = rb_enc_get_index(str2);
2522
2523 if (enc1 < 0) {
2524 return Qundef;
2525 }
2526 else if (enc2 < 0) {
2527 return Qundef;
2528 }
2529 else if (enc1 != enc2) {
2530 return Qundef;
2531 }
2532 else if (len1 > LONG_MAX - len2) {
2533 return Qundef;
2534 }
2535 else {
2536 return rb_str_plus(str1, str2);
2537 }
2538
2539}
2540
2541/*
2542 * call-seq:
2543 * self * n -> new_string
2544 *
2545 * Returns a new string containing +n+ copies of +self+:
2546 *
2547 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2548 * 'No!' * 0 # => ""
2549 *
2550 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2551 */
2552
2553VALUE
2555{
2556 VALUE str2;
2557 long n, len;
2558 char *ptr2;
2559 int termlen;
2560
2561 if (times == INT2FIX(1)) {
2562 return str_duplicate(rb_cString, str);
2563 }
2564 if (times == INT2FIX(0)) {
2565 str2 = str_alloc_embed(rb_cString, 0);
2566 rb_enc_copy(str2, str);
2567 return str2;
2568 }
2569 len = NUM2LONG(times);
2570 if (len < 0) {
2571 rb_raise(rb_eArgError, "negative argument");
2572 }
2573 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2574 if (STR_EMBEDDABLE_P(len, 1)) {
2575 str2 = str_alloc_embed(rb_cString, len + 1);
2576 memset(RSTRING_PTR(str2), 0, len + 1);
2577 }
2578 else {
2579 str2 = str_alloc_heap(rb_cString);
2580 RSTRING(str2)->as.heap.aux.capa = len;
2581 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2582 }
2583 STR_SET_LEN(str2, len);
2584 rb_enc_copy(str2, str);
2585 return str2;
2586 }
2587 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2588 rb_raise(rb_eArgError, "argument too big");
2589 }
2590
2591 len *= RSTRING_LEN(str);
2592 termlen = TERM_LEN(str);
2593 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2594 ptr2 = RSTRING_PTR(str2);
2595 if (len) {
2596 n = RSTRING_LEN(str);
2597 memcpy(ptr2, RSTRING_PTR(str), n);
2598 while (n <= len/2) {
2599 memcpy(ptr2 + n, ptr2, n);
2600 n *= 2;
2601 }
2602 memcpy(ptr2 + n, ptr2, len-n);
2603 }
2604 STR_SET_LEN(str2, len);
2605 TERM_FILL(&ptr2[len], termlen);
2606 rb_enc_cr_str_copy_for_substr(str2, str);
2607
2608 return str2;
2609}
2610
2611/*
2612 * call-seq:
2613 * self % object -> new_string
2614 *
2615 * Returns the result of formatting +object+ into the format specifications
2616 * contained in +self+
2617 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2618 *
2619 * '%05d' % 123 # => "00123"
2620 *
2621 * If +self+ contains multiple format specifications,
2622 * +object+ must be an array or hash containing the objects to be formatted:
2623 *
2624 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2625 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2626 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2627 *
2628 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2629 */
2630
2631static VALUE
2632rb_str_format_m(VALUE str, VALUE arg)
2633{
2634 VALUE tmp = rb_check_array_type(arg);
2635
2636 if (!NIL_P(tmp)) {
2637 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2638 }
2639 return rb_str_format(1, &arg, str);
2640}
2641
2642static inline void
2643rb_check_lockedtmp(VALUE str)
2644{
2645 if (FL_TEST(str, STR_TMPLOCK)) {
2646 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2647 }
2648}
2649
2650// If none of these flags are set, we know we have an modifiable string.
2651// If any is set, we need to do more detailed checks.
2652#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2653static inline void
2654str_modifiable(VALUE str)
2655{
2656 RUBY_ASSERT(ruby_thread_has_gvl_p());
2657
2658 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2659 if (CHILLED_STRING_P(str)) {
2660 CHILLED_STRING_MUTATED(str);
2661 }
2662 rb_check_lockedtmp(str);
2663 rb_check_frozen(str);
2664 }
2665}
2666
2667static inline int
2668str_dependent_p(VALUE str)
2669{
2670 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2671 return FALSE;
2672 }
2673 else {
2674 return TRUE;
2675 }
2676}
2677
2678// If none of these flags are set, we know we have an independent string.
2679// If any is set, we need to do more detailed checks.
2680#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2681static inline int
2682str_independent(VALUE str)
2683{
2684 RUBY_ASSERT(ruby_thread_has_gvl_p());
2685
2686 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2687 str_modifiable(str);
2688 return !str_dependent_p(str);
2689 }
2690 return TRUE;
2691}
2692
/*
 * Give str a buffer of its own holding its first len bytes, with
 * capacity for len + expand bytes plus termlen terminator bytes.
 *
 * If str is on the heap but the requested capacity fits the embedded
 * area, the content is moved into the embedded area.  Otherwise a new
 * heap buffer is allocated and the SHARED/NOFREE flags are dropped.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    RUBY_ASSERT(ruby_thread_has_gvl_p());

    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* A negative expand shrinks the string: keep at most capa bytes. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Read the heap pointer before switching representation. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only when the flags are exactly NOEMBED,
     * i.e. neither NOFREE nor SHARED is set. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2728
2729void
2730rb_str_modify(VALUE str)
2731{
2732 if (!str_independent(str))
2733 str_make_independent(str);
2735}
2736
2737void
2739{
2740 RUBY_ASSERT(ruby_thread_has_gvl_p());
2741
2742 int termlen = TERM_LEN(str);
2743 long len = RSTRING_LEN(str);
2744
2745 if (expand < 0) {
2746 rb_raise(rb_eArgError, "negative expanding string size");
2747 }
2748 if (expand >= LONG_MAX - len) {
2749 rb_raise(rb_eArgError, "string size too big");
2750 }
2751
2752 if (!str_independent(str)) {
2753 str_make_independent_expand(str, len, expand, termlen);
2754 }
2755 else if (expand > 0) {
2756 RESIZE_CAPA_TERM(str, len + expand, termlen);
2757 }
2759}
2760
2761/* As rb_str_modify(), but don't clear coderange */
2762static void
2763str_modify_keep_cr(VALUE str)
2764{
2765 if (!str_independent(str))
2766 str_make_independent(str);
2768 /* Force re-scan later */
2770}
2771
2772static inline void
2773str_discard(VALUE str)
2774{
2775 str_modifiable(str);
2776 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2777 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2778 RSTRING(str)->as.heap.ptr = 0;
2779 STR_SET_LEN(str, 0);
2780 }
2781}
2782
2783void
2785{
2786 int encindex = rb_enc_get_index(str);
2787
2788 if (RB_UNLIKELY(encindex == -1)) {
2789 rb_raise(rb_eTypeError, "not encoding capable object");
2790 }
2791
2792 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2793 return;
2794 }
2795
2796 rb_encoding *enc = rb_enc_from_index(encindex);
2797 if (!rb_enc_asciicompat(enc)) {
2798 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2799 }
2800}
2801
2802VALUE
2804{
2805 RUBY_ASSERT(ruby_thread_has_gvl_p());
2806
2807 VALUE s = *ptr;
2808 if (!RB_TYPE_P(s, T_STRING)) {
2809 s = rb_str_to_str(s);
2810 *ptr = s;
2811 }
2812 return s;
2813}
2814
2815char *
2817{
2818 VALUE str = rb_string_value(ptr);
2819 return RSTRING_PTR(str);
2820}
2821
/*
 * Returns 1 when the first n bytes at s are all zero (trivially so
 * when n <= 0), 0 otherwise.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') {
            return 0;
        }
        s++;
        n--;
    }
    return 1;
}
2830
2831static const char *
2832str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2833{
2834 const char *e = s + len;
2835
2836 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2837 if (zero_filled(s, minlen)) return s;
2838 }
2839 return 0;
2840}
2841
2842static char *
2843str_fill_term(VALUE str, char *s, long len, int termlen)
2844{
2845 /* This function assumes that (capa + termlen) bytes of memory
2846 * is allocated, like many other functions in this file.
2847 */
2848 if (str_dependent_p(str)) {
2849 if (!zero_filled(s + len, termlen))
2850 str_make_independent_expand(str, len, 0L, termlen);
2851 }
2852 else {
2853 TERM_FILL(s + len, termlen);
2854 return s;
2855 }
2856 return RSTRING_PTR(str);
2857}
2858
/*
 * Adjust str after its terminator width changed from oldtermlen to
 * termlen bytes.
 *
 * capa below is the total space after the start of the buffer,
 * including the old terminator.  When it cannot hold the content plus
 * the new terminator, the string is given a new buffer.  Otherwise a
 * dependent string is copied out only if the terminator grew, and an
 * independent one is fixed up in place.
 */
void
rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
{
    long capa = str_capacity(str, oldtermlen) + oldtermlen;
    long len = RSTRING_LEN(str);

    RUBY_ASSERT(capa >= len);
    if (capa - len < termlen) {
        /* Not enough room for the new terminator: needs a new buffer. */
        rb_check_lockedtmp(str);
        str_make_independent_expand(str, len, 0L, termlen);
    }
    else if (str_dependent_p(str)) {
        if (termlen > oldtermlen)
            str_make_independent_expand(str, len, 0L, termlen);
    }
    else {
        if (!STR_EMBED_P(str)) {
            /* modify capa instead of realloc */
            RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
            RSTRING(str)->as.heap.aux.capa = capa - termlen;
        }
        if (termlen > oldtermlen) {
            TERM_FILL(RSTRING_PTR(str) + len, termlen);
        }
    }

    return;
}
2887
2888static char *
2889str_null_check(VALUE str, int *w)
2890{
2891 char *s = RSTRING_PTR(str);
2892 long len = RSTRING_LEN(str);
2893 rb_encoding *enc = rb_enc_get(str);
2894 const int minlen = rb_enc_mbminlen(enc);
2895
2896 if (minlen > 1) {
2897 *w = 1;
2898 if (str_null_char(s, len, minlen, enc)) {
2899 return NULL;
2900 }
2901 return str_fill_term(str, s, len, minlen);
2902 }
2903 *w = 0;
2904 if (!s || memchr(s, 0, len)) {
2905 return NULL;
2906 }
2907 if (s[len]) {
2908 s = str_fill_term(str, s, len, minlen);
2909 }
2910 return s;
2911}
2912
2913char *
2914rb_str_to_cstr(VALUE str)
2915{
2916 int w;
2917 return str_null_check(str, &w);
2918}
2919
2920char *
2922{
2923 VALUE str = rb_string_value(ptr);
2924 int w;
2925 char *s = str_null_check(str, &w);
2926 if (!s) {
2927 if (w) {
2928 rb_raise(rb_eArgError, "string contains null char");
2929 }
2930 rb_raise(rb_eArgError, "string contains null byte");
2931 }
2932 return s;
2933}
2934
2935char *
2936rb_str_fill_terminator(VALUE str, const int newminlen)
2937{
2938 char *s = RSTRING_PTR(str);
2939 long len = RSTRING_LEN(str);
2940 return str_fill_term(str, s, len, newminlen);
2941}
2942
2943VALUE
2945{
2946 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2947 return str;
2948}
2949
2950/*
2951 * call-seq:
2952 * String.try_convert(object) -> object, new_string, or nil
2953 *
2954 * Attempts to convert the given +object+ to a string.
2955 *
2956 * If +object+ is already a string, returns +object+, unmodified.
2957 *
2958 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2959 * calls <tt>object.to_str</tt> and returns the result.
2960 *
2961 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2962 *
2963 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2964 */
2965static VALUE
2966rb_str_s_try_convert(VALUE dummy, VALUE str)
2967{
2968 return rb_check_string_type(str);
2969}
2970
/*
 * Advance from p by *nthp characters (per enc) without going past e,
 * and return the resulting position.  In the ASCII-compatible and the
 * general variable-width branches *nthp is updated as characters are
 * consumed; in the two fixed-width branches it is stored back
 * unchanged.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* Single-byte encoding: characters are bytes. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: plain multiplication. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* At least nth more bytes are needed for nth characters. */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Consume a run of ASCII bytes, at most nth of them. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* Clamp: the arithmetic above may have stepped past the end. */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
3020
3021char*
3022rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3023{
3024 return str_nth_len(p, e, &nth, enc);
3025}
3026
3027static char*
3028str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3029{
3030 if (singlebyte)
3031 p += nth;
3032 else {
3033 p = str_nth_len(p, e, &nth, enc);
3034 }
3035 if (!p) return 0;
3036 if (p > e) p = e;
3037 return (char *)p;
3038}
3039
3040/* char offset to byte offset */
3041static long
3042str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3043{
3044 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3045 if (!pp) return e - p;
3046 return pp - p;
3047}
3048
3049long
3050rb_str_offset(VALUE str, long pos)
3051{
3052 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3053 STR_ENC_GET(str), single_byte_optimizable(str));
3054}
3055
3056#ifdef NONASCII_MASK
/*
 * Advance through [p, e) until *nthp UTF-8 lead bytes have been passed,
 * returning the position reached and storing the remaining count in
 * *nthp.  Characters are counted by their lead bytes.
 * NOTE(review): presumably relies on the input being valid UTF-8;
 * confirm against callers.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Only worth the alignment setup for long inputs and large counts. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up to a word boundary; t: e rounded down. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Byte-wise until the first aligned word. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Word-wise while a whole word cannot overshoot the count. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Byte-wise for the rest; stop on the lead byte of the target. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
3086
/* Byte distance from p to the position str_utf8_nth() reaches for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
3093#endif
3094
3095/* byte offset to char offset */
3096long
3097rb_str_sublen(VALUE str, long pos)
3098{
3099 if (single_byte_optimizable(str) || pos < 0)
3100 return pos;
3101 else {
3102 char *p = RSTRING_PTR(str);
3103 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3104 }
3105}
3106
/*
 * New string made of the len bytes of str starting at byte offset beg.
 * The range must lie within str (asserted).
 *
 * Three strategies: a plain copy when the range is not sharable, an
 * embedded copy when the bytes plus terminator fit the embedded area,
 * and otherwise a string sharing str's buffer with its pointer and
 * length adjusted.  The encoding is not set here; callers such as
 * rb_str_subseq() do that.
 */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough to embed: copy instead of sharing. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share str's buffer, then narrow the view to the range. */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
3145
3146VALUE
3147rb_str_subseq(VALUE str, long beg, long len)
3148{
3149 VALUE str2 = str_subseq(str, beg, len);
3150 rb_enc_cr_str_copy_for_substr(str2, str);
3151 return str2;
3152}
3153
/*
 * Translate a character range of str into a byte range.
 *
 * beg is a character index (negative counts from the end) and *lenp a
 * character count.  On success the start position inside str's buffer
 * is returned and *lenp is replaced by the byte length of the range,
 * clipped to the end of the string.  Returns 0 (NULL) when the range is
 * invalid: negative length or a start outside the string.
 */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    const long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    /* Reject a beg whose negation is still negative. */
    if (beg < 0 && -beg < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* Characters are bytes: pure index arithmetic. */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        if (len > -beg) len = -beg;
        /* For a valid string and a start close to the end, walk
         * backwards from the end instead of counting the whole string. */
        if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
            (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
            beg = -beg;
            /* First step e back to the end of the range ... */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            /* ... then step p back len characters to its start. */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > blen) {
        /* More characters requested than there are bytes. */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        /* Valid UTF-8: use the lead-byte counting helpers. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: scale by the character size. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* Reached the end: valid only if beg was fully consumed. */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
3240
3241static VALUE str_substr(VALUE str, long beg, long len, int empty);
3242
3243VALUE
3244rb_str_substr(VALUE str, long beg, long len)
3245{
3246 return str_substr(str, beg, len, TRUE);
3247}
3248
3249VALUE
3250rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3251{
3252 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3253}
3254
3255static VALUE
3256str_substr(VALUE str, long beg, long len, int empty)
3257{
3258 char *p = rb_str_subpos(str, beg, &len);
3259
3260 if (!p) return Qnil;
3261 if (!len && !empty) return Qnil;
3262
3263 beg = p - RSTRING_PTR(str);
3264
3265 VALUE str2 = str_subseq(str, beg, len);
3266 rb_enc_cr_str_copy_for_substr(str2, str);
3267 return str2;
3268}
3269
3270/* :nodoc: */
3271VALUE
3273{
3274 if (CHILLED_STRING_P(str)) {
3275 FL_UNSET_RAW(str, STR_CHILLED);
3276 }
3277
3278 if (OBJ_FROZEN(str)) return str;
3279 rb_str_resize(str, RSTRING_LEN(str));
3280 return rb_obj_freeze(str);
3281}
3282
3283/*
3284 * call-seq:
3285 * +string -> new_string or self
3286 *
3287 * Returns +self+ if +self+ is not frozen and can be mutated
3288 * without warning issuance.
3289 *
3290 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3291 *
3292 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3293 */
3294static VALUE
3295str_uplus(VALUE str)
3296{
3297 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3298 return rb_str_dup(str);
3299 }
3300 else {
3301 return str;
3302 }
3303}
3304
3305/*
3306 * call-seq:
3307 * -self -> frozen_string
3308 *
3309 * Returns a frozen string equal to +self+.
3310 *
3311 * The returned string is +self+ if and only if all of the following are true:
3312 *
3313 * - +self+ is already frozen.
3314 * - +self+ is an instance of \String (rather than of a subclass of \String)
3315 * - +self+ has no instance variables set on it.
3316 *
3317 * Otherwise, the returned string is a frozen copy of +self+.
3318 *
3319 * Returning +self+, when possible, saves duplicating +self+;
3320 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3321 *
3322 * It may also save duplicating other, already-existing, strings:
3323 *
3324 * s0 = 'foo'
3325 * s1 = 'foo'
3326 * s0.object_id == s1.object_id # => false
3327 * (-s0).object_id == (-s1).object_id # => true
3328 *
3329 * Note that method #-@ is convenient for defining a constant:
3330 *
3331 * FileName = -'config/database.yml'
3332 *
3333 * While its alias #dedup is better suited for chaining:
3334 *
3335 * 'foo'.dedup.gsub!('o')
3336 *
3337 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3338 */
3339static VALUE
3340str_uminus(VALUE str)
3341{
3342 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3343 str = rb_str_dup(str);
3344 }
3345 return rb_fstring(str);
3346}
3347
/*
 * rb_str_dup_frozen is an alias of rb_str_new_frozen.  The macro below
 * makes later uses of the old name in this file refer to
 * rb_str_new_frozen directly.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3350
3351VALUE
3353{
3354 rb_check_frozen(str);
3355 if (FL_TEST(str, STR_TMPLOCK)) {
3356 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3357 }
3358 FL_SET(str, STR_TMPLOCK);
3359 return str;
3360}
3361
3362VALUE
3364{
3365 rb_check_frozen(str);
3366 if (!FL_TEST(str, STR_TMPLOCK)) {
3367 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3368 }
3369 FL_UNSET(str, STR_TMPLOCK);
3370 return str;
3371}
3372
3373VALUE
3374rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3375{
3376 rb_str_locktmp(str);
3377 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3378}
3379
3380void
3382{
3383 RUBY_ASSERT(ruby_thread_has_gvl_p());
3384
3385 long capa;
3386 const int termlen = TERM_LEN(str);
3387
3388 str_modifiable(str);
3389 if (STR_SHARED_P(str)) {
3390 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3391 }
3392 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3393 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3394 }
3395
3396 int cr = ENC_CODERANGE(str);
3397 if (len == 0) {
3398 /* Empty string does not contain non-ASCII */
3400 }
3401 else if (cr == ENC_CODERANGE_UNKNOWN) {
3402 /* Leave unknown. */
3403 }
3404 else if (len > RSTRING_LEN(str)) {
3405 if (ENC_CODERANGE_CLEAN_P(cr)) {
3406 /* Update the coderange regarding the extended part. */
3407 const char *const prev_end = RSTRING_END(str);
3408 const char *const new_end = RSTRING_PTR(str) + len;
3409 rb_encoding *enc = rb_enc_get(str);
3410 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3411 ENC_CODERANGE_SET(str, cr);
3412 }
3413 else if (cr == ENC_CODERANGE_BROKEN) {
3414 /* May be valid now, by appended part. */
3416 }
3417 }
3418 else if (len < RSTRING_LEN(str)) {
3419 if (cr != ENC_CODERANGE_7BIT) {
3420 /* ASCII-only string is keeping after truncated. Valid
3421 * and broken may be invalid or valid, leave unknown. */
3423 }
3424 }
3425
3426 STR_SET_LEN(str, len);
3427 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3428}
3429
3430VALUE
3431rb_str_resize(VALUE str, long len)
3432{
3433 if (len < 0) {
3434 rb_raise(rb_eArgError, "negative string size (or size too big)");
3435 }
3436
3437 int independent = str_independent(str);
3438 long slen = RSTRING_LEN(str);
3439 const int termlen = TERM_LEN(str);
3440
3441 if (slen > len || (termlen != 1 && slen < len)) {
3443 }
3444
3445 {
3446 long capa;
3447 if (STR_EMBED_P(str)) {
3448 if (len == slen) return str;
3449 if (str_embed_capa(str) >= len + termlen) {
3450 STR_SET_LEN(str, len);
3451 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3452 return str;
3453 }
3454 str_make_independent_expand(str, slen, len - slen, termlen);
3455 }
3456 else if (str_embed_capa(str) >= len + termlen) {
3457 char *ptr = STR_HEAP_PTR(str);
3458 STR_SET_EMBED(str);
3459 if (slen > len) slen = len;
3460 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3461 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3462 STR_SET_LEN(str, len);
3463 if (independent) ruby_xfree(ptr);
3464 return str;
3465 }
3466 else if (!independent) {
3467 if (len == slen) return str;
3468 str_make_independent_expand(str, slen, len - slen, termlen);
3469 }
3470 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3471 (capa - len) > (len < 1024 ? len : 1024)) {
3472 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3473 (size_t)len + termlen, STR_HEAP_SIZE(str));
3474 RSTRING(str)->as.heap.aux.capa = len;
3475 }
3476 else if (len == slen) return str;
3477 STR_SET_LEN(str, len);
3478 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3479 }
3480 return str;
3481}
3482
3483static void
3484str_ensure_available_capa(VALUE str, long len)
3485{
3486 str_modify_keep_cr(str);
3487
3488 const int termlen = TERM_LEN(str);
3489 long olen = RSTRING_LEN(str);
3490
3491 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3492 rb_raise(rb_eArgError, "string sizes too big");
3493 }
3494
3495 long total = olen + len;
3496 long capa = str_capacity(str, termlen);
3497
3498 if (capa < total) {
3499 if (total >= LONG_MAX / 2) {
3500 capa = total;
3501 }
3502 while (total > capa) {
3503 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3504 }
3505 RESIZE_CAPA_TERM(str, capa, termlen);
3506 }
3507}
3508
/*
 * Append len bytes at ptr to str, growing the buffer by doubling when
 * needed.  With keep_cr the code range is preserved (see
 * str_modify_keep_cr()); otherwise it is cleared by rb_str_modify().
 *
 * ptr may point into str itself: its offset is remembered before a
 * possible reallocation and re-applied afterwards.
 *
 * Returns str, except for len == 0 where 0 is returned after the
 * modifiability step.
 */
static VALUE
str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
{
    if (keep_cr) {
        str_modify_keep_cr(str);
    }
    else {
        rb_str_modify(str);
    }
    if (len == 0) return 0;

    long total, olen, off = -1;
    char *sptr;
    const int termlen = TERM_LEN(str);

    RSTRING_GETMEM(str, sptr, olen);
    /* Source inside the destination: remember where. */
    if (ptr >= sptr && ptr <= sptr + olen) {
        off = ptr - sptr;
    }

    long capa = str_capacity(str, termlen);

    if (olen > LONG_MAX - len) {
        rb_raise(rb_eArgError, "string sizes too big");
    }
    total = olen + len;
    if (capa < total) {
        if (total >= LONG_MAX / 2) {
            capa = total;
        }
        while (total > capa) {
            capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
        }
        RESIZE_CAPA_TERM(str, capa, termlen);
        /* The buffer may have moved. */
        sptr = RSTRING_PTR(str);
    }
    if (off != -1) {
        ptr = sptr + off;
    }
    memcpy(sptr + olen, ptr, len);
    STR_SET_LEN(str, total);
    TERM_FILL(sptr + total, termlen); /* sentinel */

    return str;
}
3554
/* Append without preserving the code range (keep_cr = false).
 * str_buf_cat2 takes a string literal and uses its compile-time length. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3557
3558VALUE
3559rb_str_cat(VALUE str, const char *ptr, long len)
3560{
3561 if (len == 0) return str;
3562 if (len < 0) {
3563 rb_raise(rb_eArgError, "negative string size (or size too big)");
3564 }
3565 return str_buf_cat(str, ptr, len);
3566}
3567
3568VALUE
3569rb_str_cat_cstr(VALUE str, const char *ptr)
3570{
3571 must_not_null(ptr);
3572 return rb_str_buf_cat(str, ptr, strlen(ptr));
3573}
3574
3575static void
3576rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3577{
3578 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3579
3580 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3581 if (UNLIKELY(!str_independent(str))) {
3582 str_make_independent(str);
3583 }
3584
3585 long string_length = -1;
3586 const int null_terminator_length = 1;
3587 char *sptr;
3588 RSTRING_GETMEM(str, sptr, string_length);
3589
3590 // Ensure the resulting string wouldn't be too long.
3591 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3592 rb_raise(rb_eArgError, "string sizes too big");
3593 }
3594
3595 long string_capacity = str_capacity(str, null_terminator_length);
3596
3597 // Get the code range before any modifications since those might clear the code range.
3598 int cr = ENC_CODERANGE(str);
3599
3600 // Check if the string has spare string_capacity to write the new byte.
3601 if (LIKELY(string_capacity >= string_length + 1)) {
3602 // In fast path we can write the new byte and note the string's new length.
3603 sptr[string_length] = byte;
3604 STR_SET_LEN(str, string_length + 1);
3605 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3606 }
3607 else {
3608 // If there's not enough string_capacity, make a call into the general string concatenation function.
3609 str_buf_cat(str, (char *)&byte, 1);
3610 }
3611
3612 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3613 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3614 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3615 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3616 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3617 if (ISASCII(byte)) {
3619 }
3620 else {
3622
3623 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3624 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3625 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3626 }
3627 }
3628 }
3629}
3630
/* Alternative names kept as aliases: rb_str_buf_cat for rb_str_cat;
 * rb_str_buf_cat2 and rb_str_cat2 for rb_str_cat_cstr. */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3634
3635static VALUE
3636rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3637 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3638{
3639 int str_encindex = ENCODING_GET(str);
3640 int res_encindex;
3641 int str_cr, res_cr;
3642 rb_encoding *str_enc, *ptr_enc;
3643
3644 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3645
3646 if (str_encindex == ptr_encindex) {
3647 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3648 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3649 }
3650 }
3651 else {
3652 str_enc = rb_enc_from_index(str_encindex);
3653 ptr_enc = rb_enc_from_index(ptr_encindex);
3654 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3655 if (len == 0)
3656 return str;
3657 if (RSTRING_LEN(str) == 0) {
3658 rb_str_buf_cat(str, ptr, len);
3659 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3660 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3661 return str;
3662 }
3663 goto incompatible;
3664 }
3665 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3666 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3667 }
3668 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3669 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3670 str_cr = rb_enc_str_coderange(str);
3671 }
3672 }
3673 }
3674 if (ptr_cr_ret)
3675 *ptr_cr_ret = ptr_cr;
3676
3677 if (str_encindex != ptr_encindex &&
3678 str_cr != ENC_CODERANGE_7BIT &&
3679 ptr_cr != ENC_CODERANGE_7BIT) {
3680 str_enc = rb_enc_from_index(str_encindex);
3681 ptr_enc = rb_enc_from_index(ptr_encindex);
3682 goto incompatible;
3683 }
3684
3685 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3686 res_encindex = str_encindex;
3687 res_cr = ENC_CODERANGE_UNKNOWN;
3688 }
3689 else if (str_cr == ENC_CODERANGE_7BIT) {
3690 if (ptr_cr == ENC_CODERANGE_7BIT) {
3691 res_encindex = str_encindex;
3692 res_cr = ENC_CODERANGE_7BIT;
3693 }
3694 else {
3695 res_encindex = ptr_encindex;
3696 res_cr = ptr_cr;
3697 }
3698 }
3699 else if (str_cr == ENC_CODERANGE_VALID) {
3700 res_encindex = str_encindex;
3701 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3702 res_cr = str_cr;
3703 else
3704 res_cr = ptr_cr;
3705 }
3706 else { /* str_cr == ENC_CODERANGE_BROKEN */
3707 res_encindex = str_encindex;
3708 res_cr = str_cr;
3709 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3710 }
3711
3712 if (len < 0) {
3713 rb_raise(rb_eArgError, "negative string size (or size too big)");
3714 }
3715 str_buf_cat(str, ptr, len);
3716 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3717 return str;
3718
3719 incompatible:
3720 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3721 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3723}
3724
3725VALUE
3726rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3727{
3728 return rb_enc_cr_str_buf_cat(str, ptr, len,
3729 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3730}
3731
3732VALUE
3734{
3735 /* ptr must reference NUL terminated ASCII string. */
3736 int encindex = ENCODING_GET(str);
3737 rb_encoding *enc = rb_enc_from_index(encindex);
3738 if (rb_enc_asciicompat(enc)) {
3739 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3740 encindex, ENC_CODERANGE_7BIT, 0);
3741 }
3742 else {
3743 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3744 while (*ptr) {
3745 unsigned int c = (unsigned char)*ptr;
3746 int len = rb_enc_codelen(c, enc);
3747 rb_enc_mbcput(c, buf, enc);
3748 rb_enc_cr_str_buf_cat(str, buf, len,
3749 encindex, ENC_CODERANGE_VALID, 0);
3750 ptr++;
3751 }
3752 return str;
3753 }
3754}
3755
3756VALUE
3758{
3759 int str2_cr = rb_enc_str_coderange(str2);
3760
3761 if (str_enc_fastpath(str)) {
3762 switch (str2_cr) {
3763 case ENC_CODERANGE_7BIT:
3764 // If RHS is 7bit we can do simple concatenation
3765 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3766 RB_GC_GUARD(str2);
3767 return str;
3769 // If RHS is valid, we can do simple concatenation if encodings are the same
3770 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3771 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3772 int str_cr = ENC_CODERANGE(str);
3773 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3774 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3775 }
3776 RB_GC_GUARD(str2);
3777 return str;
3778 }
3779 }
3780 }
3781
3782 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3783 ENCODING_GET(str2), str2_cr, &str2_cr);
3784
3785 ENC_CODERANGE_SET(str2, str2_cr);
3786
3787 return str;
3788}
3789
3790VALUE
3792{
3793 StringValue(str2);
3794 return rb_str_buf_append(str, str2);
3795}
3796
3797VALUE
3798rb_str_concat_literals(size_t num, const VALUE *strary)
3799{
3800 VALUE str;
3801 size_t i, s = 0;
3802 unsigned long len = 1;
3803
3804 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3805 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3806
3807 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3808 str = rb_str_buf_new(len);
3809 str_enc_copy_direct(str, strary[0]);
3810
3811 for (i = s; i < num; ++i) {
3812 const VALUE v = strary[i];
3813 int encidx = ENCODING_GET(v);
3814
3815 rb_str_buf_append(str, v);
3816 if (encidx != ENCINDEX_US_ASCII) {
3817 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3818 rb_enc_set_index(str, encidx);
3819 }
3820 }
3821 return str;
3822}
3823
3824/*
3825 * call-seq:
3826 * concat(*objects) -> string
3827 *
3828 * :include: doc/string/concat.rdoc
3829 */
3830static VALUE
3831rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3832{
3833 str_modifiable(str);
3834
3835 if (argc == 1) {
3836 return rb_str_concat(str, argv[0]);
3837 }
3838 else if (argc > 1) {
3839 int i;
3840 VALUE arg_str = rb_str_tmp_new(0);
3841 rb_enc_copy(arg_str, str);
3842 for (i = 0; i < argc; i++) {
3843 rb_str_concat(arg_str, argv[i]);
3844 }
3845 rb_str_buf_append(str, arg_str);
3846 }
3847
3848 return str;
3849}
3850
3851/*
3852 * call-seq:
3853 * append_as_bytes(*objects) -> self
3854 *
3855 * Concatenates each object in +objects+ into +self+; returns +self+;
3856 * performs no encoding validation or conversion:
3857 *
3858 * s = 'foo'
3859 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3860 * s.valid_encoding? # => false
3861 * s.append_as_bytes("\xAC 12")
3862 * s.valid_encoding? # => true
3863 *
3864 * When a given object is an integer,
3865 * the value is considered an 8-bit byte;
3866 * if the integer occupies more than one byte (i.e., is greater than 255),
3867 * appends only the low-order byte (similar to String#setbyte):
3868 *
3869 * s = ""
3870 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3871 * s.bytesize # => 2
3872 *
3873 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3874 */
3875
3876VALUE
3877rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3878{
3879 long needed_capacity = 0;
3880 volatile VALUE t0;
3881 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3882
3883 for (int index = 0; index < argc; index++) {
3884 VALUE obj = argv[index];
3885 enum ruby_value_type type = types[index] = rb_type(obj);
3886 switch (type) {
3887 case T_FIXNUM:
3888 case T_BIGNUM:
3889 needed_capacity++;
3890 break;
3891 case T_STRING:
3892 needed_capacity += RSTRING_LEN(obj);
3893 break;
3894 default:
3895 rb_raise(
3897 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3898 rb_obj_class(obj)
3899 );
3900 break;
3901 }
3902 }
3903
3904 str_ensure_available_capa(str, needed_capacity);
3905 char *sptr = RSTRING_END(str);
3906
3907 for (int index = 0; index < argc; index++) {
3908 VALUE obj = argv[index];
3909 enum ruby_value_type type = types[index];
3910 switch (type) {
3911 case T_FIXNUM:
3912 case T_BIGNUM: {
3913 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3914 char byte = (char)(NUM2INT(obj) & 0xFF);
3915 *sptr = byte;
3916 sptr++;
3917 break;
3918 }
3919 case T_STRING: {
3920 const char *ptr;
3921 long len;
3922 RSTRING_GETMEM(obj, ptr, len);
3923 memcpy(sptr, ptr, len);
3924 sptr += len;
3925 break;
3926 }
3927 default:
3928 rb_bug("append_as_bytes arguments should have been validated");
3929 }
3930 }
3931
3932 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3933 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3934
3935 int cr = ENC_CODERANGE(str);
3936 switch (cr) {
3937 case ENC_CODERANGE_7BIT: {
3938 for (int index = 0; index < argc; index++) {
3939 VALUE obj = argv[index];
3940 enum ruby_value_type type = types[index];
3941 switch (type) {
3942 case T_FIXNUM:
3943 case T_BIGNUM: {
3944 if (!ISASCII(NUM2INT(obj))) {
3945 goto clear_cr;
3946 }
3947 break;
3948 }
3949 case T_STRING: {
3950 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3951 goto clear_cr;
3952 }
3953 break;
3954 }
3955 default:
3956 rb_bug("append_as_bytes arguments should have been validated");
3957 }
3958 }
3959 break;
3960 }
3962 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3963 goto keep_cr;
3964 }
3965 else {
3966 goto clear_cr;
3967 }
3968 break;
3969 default:
3970 goto clear_cr;
3971 break;
3972 }
3973
3974 RB_GC_GUARD(t0);
3975
3976 clear_cr:
3977 // If no fast path was hit, we clear the coderange.
3978 // append_as_bytes is predominently meant to be used in
3979 // buffering situation, hence it's likely the coderange
3980 // will never be scanned, so it's not worth spending time
3981 // precomputing the coderange except for simple and common
3982 // situations.
3984 keep_cr:
3985 return str;
3986}
3987
3988/*
3989 * call-seq:
3990 * self << object -> self
3991 *
3992 * Appends a string representation of +object+ to +self+;
3993 * returns +self+.
3994 *
3995 * If +object+ is a string, appends it to +self+:
3996 *
3997 * s = 'foo'
3998 * s << 'bar' # => "foobar"
3999 * s # => "foobar"
4000 *
4001 * If +object+ is an integer,
4002 * its value is considered a codepoint;
4003 * converts the value to a character before concatenating:
4004 *
4005 * s = 'foo'
4006 * s << 33 # => "foo!"
4007 *
4008 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4009 * and the encoding of +self+ is Encoding::US_ASCII,
4010 * changes the encoding to Encoding::ASCII_8BIT:
4011 *
4012 * s = 'foo'.encode(Encoding::US_ASCII)
4013 * s.encoding # => #<Encoding:US-ASCII>
4014 * s << 0xff # => "foo\xFF"
4015 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4016 *
4017 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4018 *
4019 * s = 'foo'
4020 * s.encoding # => <Encoding:UTF-8>
4021 * s << 0x00110000 # 1114112 out of char range (RangeError)
4022 * s = 'foo'.encode(Encoding::EUC_JP)
4023 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4024 *
4025 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4026 */
4027VALUE
4029{
4030 unsigned int code;
4031 rb_encoding *enc = STR_ENC_GET(str1);
4032 int encidx;
4033
4034 if (RB_INTEGER_TYPE_P(str2)) {
4035 if (rb_num_to_uint(str2, &code) == 0) {
4036 }
4037 else if (FIXNUM_P(str2)) {
4038 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4039 }
4040 else {
4041 rb_raise(rb_eRangeError, "bignum out of char range");
4042 }
4043 }
4044 else {
4045 return rb_str_append(str1, str2);
4046 }
4047
4048 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4049
4050 if (encidx >= 0) {
4051 rb_str_buf_cat_byte(str1, (unsigned char)code);
4052 }
4053 else {
4054 long pos = RSTRING_LEN(str1);
4055 int cr = ENC_CODERANGE(str1);
4056 int len;
4057 char *buf;
4058
4059 switch (len = rb_enc_codelen(code, enc)) {
4060 case ONIGERR_INVALID_CODE_POINT_VALUE:
4061 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4062 break;
4063 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4064 case 0:
4065 rb_raise(rb_eRangeError, "%u out of char range", code);
4066 break;
4067 }
4068 buf = ALLOCA_N(char, len + 1);
4069 rb_enc_mbcput(code, buf, enc);
4070 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4071 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4072 }
4073 rb_str_resize(str1, pos+len);
4074 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4075 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4077 }
4078 else if (cr == ENC_CODERANGE_BROKEN) {
4080 }
4081 ENC_CODERANGE_SET(str1, cr);
4082 }
4083 return str1;
4084}
4085
4086int
4087rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4088{
4089 int encidx = rb_enc_to_index(enc);
4090
4091 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4092 /* US-ASCII automatically extended to ASCII-8BIT */
4093 if (code > 0xFF) {
4094 rb_raise(rb_eRangeError, "%u out of char range", code);
4095 }
4096 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4097 return ENCINDEX_ASCII_8BIT;
4098 }
4099 return encidx;
4100 }
4101 else {
4102 return -1;
4103 }
4104}
4105
4106/*
4107 * call-seq:
4108 * prepend(*other_strings) -> self
4109 *
4110 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4111 *
4112 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4113 *
4114 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4115 *
4116 */
4117
4118static VALUE
4119rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4120{
4121 str_modifiable(str);
4122
4123 if (argc == 1) {
4124 rb_str_update(str, 0L, 0L, argv[0]);
4125 }
4126 else if (argc > 1) {
4127 int i;
4128 VALUE arg_str = rb_str_tmp_new(0);
4129 rb_enc_copy(arg_str, str);
4130 for (i = 0; i < argc; i++) {
4131 rb_str_append(arg_str, argv[i]);
4132 }
4133 rb_str_update(str, 0L, 0L, arg_str);
4134 }
4135
4136 return str;
4137}
4138
4139st_index_t
4141{
4142 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4143 st_index_t precomputed_hash;
4144 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4145
4146 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4147 return precomputed_hash;
4148 }
4149
4150 return str_do_hash(str);
4151}
4152
4153int
4155{
4156 long len1, len2;
4157 const char *ptr1, *ptr2;
4158 RSTRING_GETMEM(str1, ptr1, len1);
4159 RSTRING_GETMEM(str2, ptr2, len2);
4160 return (len1 != len2 ||
4161 !rb_str_comparable(str1, str2) ||
4162 memcmp(ptr1, ptr2, len1) != 0);
4163}
4164
4165/*
4166 * call-seq:
4167 * hash -> integer
4168 *
4169 * :include: doc/string/hash.rdoc
4170 *
4171 */
4172
4173static VALUE
4174rb_str_hash_m(VALUE str)
4175{
4176 st_index_t hval = rb_str_hash(str);
4177 return ST2FIX(hval);
4178}
4179
4180#define lesser(a,b) (((a)>(b))?(b):(a))
4181
4182int
4184{
4185 int idx1, idx2;
4186 int rc1, rc2;
4187
4188 if (RSTRING_LEN(str1) == 0) return TRUE;
4189 if (RSTRING_LEN(str2) == 0) return TRUE;
4190 idx1 = ENCODING_GET(str1);
4191 idx2 = ENCODING_GET(str2);
4192 if (idx1 == idx2) return TRUE;
4193 rc1 = rb_enc_str_coderange(str1);
4194 rc2 = rb_enc_str_coderange(str2);
4195 if (rc1 == ENC_CODERANGE_7BIT) {
4196 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4197 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4198 return TRUE;
4199 }
4200 if (rc2 == ENC_CODERANGE_7BIT) {
4201 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4202 return TRUE;
4203 }
4204 return FALSE;
4205}
4206
4207int
4209{
4210 long len1, len2;
4211 const char *ptr1, *ptr2;
4212 int retval;
4213
4214 if (str1 == str2) return 0;
4215 RSTRING_GETMEM(str1, ptr1, len1);
4216 RSTRING_GETMEM(str2, ptr2, len2);
4217 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4218 if (len1 == len2) {
4219 if (!rb_str_comparable(str1, str2)) {
4220 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4221 return 1;
4222 return -1;
4223 }
4224 return 0;
4225 }
4226 if (len1 > len2) return 1;
4227 return -1;
4228 }
4229 if (retval > 0) return 1;
4230 return -1;
4231}
4232
4233/*
4234 * call-seq:
4235 * self == object -> true or false
4236 *
4237 * Returns whether +object+ is equal to +self+.
4238 *
4239 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4240 *
4241 * s = 'foo'
4242 * s == 'foo' # => true
4243 * s == 'food' # => false
4244 * s == 'FOO' # => false
4245 *
4246 * Returns +false+ if the two strings' encodings are not compatible:
4247 *
4248 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4249 *
4250 * When +object+ is not a string:
4251 *
4252 * - If +object+ responds to method <tt>to_str</tt>,
4253 * <tt>object == self</tt> is called and its return value is returned.
4254 * - If +object+ does not respond to <tt>to_str</tt>,
4255 * +false+ is returned.
4256 *
4257 * Related: {Comparing}[rdoc-ref:String@Comparing].
4258 */
4259
4260VALUE
4262{
4263 if (str1 == str2) return Qtrue;
4264 if (!RB_TYPE_P(str2, T_STRING)) {
4265 if (!rb_respond_to(str2, idTo_str)) {
4266 return Qfalse;
4267 }
4268 return rb_equal(str2, str1);
4269 }
4270 return rb_str_eql_internal(str1, str2);
4271}
4272
4273/*
4274 * call-seq:
4275 * eql?(object) -> true or false
4276 *
4277 * :include: doc/string/eql_p.rdoc
4278 *
4279 */
4280
4281VALUE
4282rb_str_eql(VALUE str1, VALUE str2)
4283{
4284 if (str1 == str2) return Qtrue;
4285 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4286 return rb_str_eql_internal(str1, str2);
4287}
4288
4289/*
4290 * call-seq:
4291 * self <=> other_string -> -1, 0, 1, or nil
4292 *
4293 * Compares +self+ and +other_string+, returning:
4294 *
4295 * - -1 if +other_string+ is larger.
4296 * - 0 if the two are equal.
4297 * - 1 if +other_string+ is smaller.
4298 * - +nil+ if the two are incomparable.
4299 *
4300 * Examples:
4301 *
4302 * 'foo' <=> 'foo' # => 0
4303 * 'foo' <=> 'food' # => -1
4304 * 'food' <=> 'foo' # => 1
4305 * 'FOO' <=> 'foo' # => -1
4306 * 'foo' <=> 'FOO' # => 1
4307 * 'foo' <=> 1 # => nil
4308 *
4309 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4310 */
4311
4312static VALUE
4313rb_str_cmp_m(VALUE str1, VALUE str2)
4314{
4315 int result;
4316 VALUE s = rb_check_string_type(str2);
4317 if (NIL_P(s)) {
4318 return rb_invcmp(str1, str2);
4319 }
4320 result = rb_str_cmp(str1, s);
4321 return INT2FIX(result);
4322}
4323
4324static VALUE str_casecmp(VALUE str1, VALUE str2);
4325static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4326
4327/*
4328 * call-seq:
4329 * casecmp(other_string) -> -1, 0, 1, or nil
4330 *
4331 * Ignoring case, compares +self+ and +other_string+; returns:
4332 *
4333 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4334 * - 0 if the two are equal.
4335 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4336 * - +nil+ if the two are incomparable.
4337 *
4338 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4339 *
4340 * Examples:
4341 *
4342 * 'foo'.casecmp('goo') # => -1
4343 * 'goo'.casecmp('foo') # => 1
4344 * 'foo'.casecmp('food') # => -1
4345 * 'food'.casecmp('foo') # => 1
4346 * 'FOO'.casecmp('foo') # => 0
4347 * 'foo'.casecmp('FOO') # => 0
4348 * 'foo'.casecmp(1) # => nil
4349 *
4350 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4351 */
4352
4353static VALUE
4354rb_str_casecmp(VALUE str1, VALUE str2)
4355{
4356 VALUE s = rb_check_string_type(str2);
4357 if (NIL_P(s)) {
4358 return Qnil;
4359 }
4360 return str_casecmp(str1, s);
4361}
4362
4363static VALUE
4364str_casecmp(VALUE str1, VALUE str2)
4365{
4366 long len;
4367 rb_encoding *enc;
4368 const char *p1, *p1end, *p2, *p2end;
4369
4370 enc = rb_enc_compatible(str1, str2);
4371 if (!enc) {
4372 return Qnil;
4373 }
4374
4375 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4376 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4377 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4378 while (p1 < p1end && p2 < p2end) {
4379 if (*p1 != *p2) {
4380 unsigned int c1 = TOLOWER(*p1 & 0xff);
4381 unsigned int c2 = TOLOWER(*p2 & 0xff);
4382 if (c1 != c2)
4383 return INT2FIX(c1 < c2 ? -1 : 1);
4384 }
4385 p1++;
4386 p2++;
4387 }
4388 }
4389 else {
4390 while (p1 < p1end && p2 < p2end) {
4391 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4392 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4393
4394 if (0 <= c1 && 0 <= c2) {
4395 c1 = TOLOWER(c1);
4396 c2 = TOLOWER(c2);
4397 if (c1 != c2)
4398 return INT2FIX(c1 < c2 ? -1 : 1);
4399 }
4400 else {
4401 int r;
4402 l1 = rb_enc_mbclen(p1, p1end, enc);
4403 l2 = rb_enc_mbclen(p2, p2end, enc);
4404 len = l1 < l2 ? l1 : l2;
4405 r = memcmp(p1, p2, len);
4406 if (r != 0)
4407 return INT2FIX(r < 0 ? -1 : 1);
4408 if (l1 != l2)
4409 return INT2FIX(l1 < l2 ? -1 : 1);
4410 }
4411 p1 += l1;
4412 p2 += l2;
4413 }
4414 }
4415 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4416 if (p1 == p1end) return INT2FIX(-1);
4417 return INT2FIX(1);
4418}
4419
4420/*
4421 * call-seq:
4422 * casecmp?(other_string) -> true, false, or nil
4423 *
4424 * Returns +true+ if +self+ and +other_string+ are equal after
4425 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4426 *
4427 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4428 *
4429 * Examples:
4430 *
4431 * 'foo'.casecmp?('goo') # => false
4432 * 'goo'.casecmp?('foo') # => false
4433 * 'foo'.casecmp?('food') # => false
4434 * 'food'.casecmp?('foo') # => false
4435 * 'FOO'.casecmp?('foo') # => true
4436 * 'foo'.casecmp?('FOO') # => true
4437 * 'foo'.casecmp?(1) # => nil
4438 *
4439 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4440 */
4441
4442static VALUE
4443rb_str_casecmp_p(VALUE str1, VALUE str2)
4444{
4445 VALUE s = rb_check_string_type(str2);
4446 if (NIL_P(s)) {
4447 return Qnil;
4448 }
4449 return str_casecmp_p(str1, s);
4450}
4451
4452static VALUE
4453str_casecmp_p(VALUE str1, VALUE str2)
4454{
4455 rb_encoding *enc;
4456 VALUE folded_str1, folded_str2;
4457 VALUE fold_opt = sym_fold;
4458
4459 enc = rb_enc_compatible(str1, str2);
4460 if (!enc) {
4461 return Qnil;
4462 }
4463
4464 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4465 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4466
4467 return rb_str_eql(folded_str1, folded_str2);
4468}
4469
4470static long
4471strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4472 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4473{
4474 const char *search_start = str_ptr;
4475 long pos, search_len = str_len - offset;
4476
4477 for (;;) {
4478 const char *t;
4479 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4480 if (pos < 0) return pos;
4481 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4482 if (t == search_start + pos) break;
4483 search_len -= t - search_start;
4484 if (search_len <= 0) return -1;
4485 offset += t - search_start;
4486 search_start = t;
4487 }
4488 return pos + offset;
4489}
4490
4491/* found index in byte */
4492#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4493#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4494
4495static long
4496rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4497{
4498 const char *str_ptr, *str_ptr_end, *sub_ptr;
4499 long str_len, sub_len;
4500 rb_encoding *enc;
4501
4502 enc = rb_enc_check(str, sub);
4503 if (is_broken_string(sub)) return -1;
4504
4505 str_ptr = RSTRING_PTR(str);
4506 str_ptr_end = RSTRING_END(str);
4507 str_len = RSTRING_LEN(str);
4508 sub_ptr = RSTRING_PTR(sub);
4509 sub_len = RSTRING_LEN(sub);
4510
4511 if (str_len < sub_len) return -1;
4512
4513 if (offset != 0) {
4514 long str_len_char, sub_len_char;
4515 int single_byte = single_byte_optimizable(str);
4516 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4517 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4518 if (offset < 0) {
4519 offset += str_len_char;
4520 if (offset < 0) return -1;
4521 }
4522 if (str_len_char - offset < sub_len_char) return -1;
4523 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4524 str_ptr += offset;
4525 }
4526 if (sub_len == 0) return offset;
4527
4528 /* need proceed one character at a time */
4529 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4530}
4531
4532
4533/*
4534 * call-seq:
4535 * index(pattern, offset = 0) -> integer or nil
4536 *
4537 * :include: doc/string/index.rdoc
4538 *
4539 */
4540
4541static VALUE
4542rb_str_index_m(int argc, VALUE *argv, VALUE str)
4543{
4544 VALUE sub;
4545 VALUE initpos;
4546 rb_encoding *enc = STR_ENC_GET(str);
4547 long pos;
4548
4549 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4550 long slen = str_strlen(str, enc); /* str's enc */
4551 pos = NUM2LONG(initpos);
4552 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4553 if (RB_TYPE_P(sub, T_REGEXP)) {
4555 }
4556 return Qnil;
4557 }
4558 }
4559 else {
4560 pos = 0;
4561 }
4562
4563 if (RB_TYPE_P(sub, T_REGEXP)) {
4564 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4565 enc, single_byte_optimizable(str));
4566
4567 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4568 VALUE match = rb_backref_get();
4569 struct re_registers *regs = RMATCH_REGS(match);
4570 pos = rb_str_sublen(str, BEG(0));
4571 return LONG2NUM(pos);
4572 }
4573 }
4574 else {
4575 StringValue(sub);
4576 pos = rb_str_index(str, sub, pos);
4577 if (pos >= 0) {
4578 pos = rb_str_sublen(str, pos);
4579 return LONG2NUM(pos);
4580 }
4581 }
4582 return Qnil;
4583}
4584
4585/* Ensure that the given pos is a valid character boundary.
4586 * Note that in this function, "character" means a code point
4587 * (Unicode scalar value), not a grapheme cluster.
4588 */
4589static void
4590str_ensure_byte_pos(VALUE str, long pos)
4591{
4592 if (!single_byte_optimizable(str)) {
4593 const char *s = RSTRING_PTR(str);
4594 const char *e = RSTRING_END(str);
4595 const char *p = s + pos;
4596 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4597 rb_raise(rb_eIndexError,
4598 "offset %ld does not land on character boundary", pos);
4599 }
4600 }
4601}
4602
4603/*
4604 * call-seq:
4605 * byteindex(object, offset = 0) -> integer or nil
4606 *
4607 * Returns the 0-based integer index of a substring of +self+
4608 * specified by +object+ (a string or Regexp) and +offset+,
4609 * or +nil+ if there is no such substring;
4610 * the returned index is the count of _bytes_ (not characters).
4611 *
4612 * When +object+ is a string,
4613 * returns the index of the first found substring equal to +object+:
4614 *
4615 * s = 'foo' # => "foo"
4616 * s.size # => 3 # Three 1-byte characters.
4617 * s.bytesize # => 3 # Three bytes.
4618 * s.byteindex('f') # => 0
4619 * s.byteindex('o') # => 1
4620 * s.byteindex('oo') # => 1
4621 * s.byteindex('ooo') # => nil
4622 *
4623 * When +object+ is a Regexp,
4624 * returns the index of the first found substring matching +object+;
4625 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4626 *
4627 * s = 'foo'
4628 * s.byteindex(/f/) # => 0
4629 * $~ # => #<MatchData "f">
4630 * s.byteindex(/o/) # => 1
4631 * s.byteindex(/oo/) # => 1
4632 * s.byteindex(/ooo/) # => nil
4633 * $~ # => nil
4634 *
4635 * \Integer argument +offset+, if given, specifies the 0-based index
4636 * of the byte where searching is to begin.
4637 *
4638 * When +offset+ is non-negative,
4639 * searching begins at byte position +offset+:
4640 *
4641 * s = 'foo'
4642 * s.byteindex('o', 1) # => 1
4643 * s.byteindex('o', 2) # => 2
4644 * s.byteindex('o', 3) # => nil
4645 *
4646 * When +offset+ is negative, counts backward from the end of +self+:
4647 *
4648 * s = 'foo'
4649 * s.byteindex('o', -1) # => 2
4650 * s.byteindex('o', -2) # => 1
4651 * s.byteindex('o', -3) # => 1
4652 * s.byteindex('o', -4) # => nil
4653 *
4654 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4655 *
4656 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4657 * s.size # => 2 # Two 3-byte characters.
4658 * s.bytesize # => 6 # Six bytes.
4659 * s.byteindex("\uFFFF") # => 0
4660 * s.byteindex("\uFFFF", 1) # Raises IndexError
4661 * s.byteindex("\uFFFF", 2) # Raises IndexError
4662 * s.byteindex("\uFFFF", 3) # => 3
4663 * s.byteindex("\uFFFF", 4) # Raises IndexError
4664 * s.byteindex("\uFFFF", 5) # Raises IndexError
4665 * s.byteindex("\uFFFF", 6) # => nil
4666 *
4667 * Related: see {Querying}[rdoc-ref:String@Querying].
4668 */
4669
4670static VALUE
4671rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4672{
4673 VALUE sub;
4674 VALUE initpos;
4675 long pos;
4676
4677 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4678 long slen = RSTRING_LEN(str);
4679 pos = NUM2LONG(initpos);
4680 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4681 if (RB_TYPE_P(sub, T_REGEXP)) {
4683 }
4684 return Qnil;
4685 }
4686 }
4687 else {
4688 pos = 0;
4689 }
4690
4691 str_ensure_byte_pos(str, pos);
4692
4693 if (RB_TYPE_P(sub, T_REGEXP)) {
4694 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4695 VALUE match = rb_backref_get();
4696 struct re_registers *regs = RMATCH_REGS(match);
4697 pos = BEG(0);
4698 return LONG2NUM(pos);
4699 }
4700 }
4701 else {
4702 StringValue(sub);
4703 pos = rb_str_byteindex(str, sub, pos);
4704 if (pos >= 0) return LONG2NUM(pos);
4705 }
4706 return Qnil;
4707}
4708
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: scans the first
 * +search_len+ bytes of +search_str+ from the end towards the start and
 * returns a pointer to the last byte equal to +chr+, or a null pointer when
 * there is none.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = search_str + (remaining - 1);
        if ((unsigned char)*candidate == chr) {
            return (void *)candidate;
        }
    }

    return ((void *)0);
}
#endif
4721
4722static long
4723str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4724{
4725 char *hit, *adjusted;
4726 int c;
4727 long slen, searchlen;
4728 char *sbeg, *e, *t;
4729
4730 sbeg = RSTRING_PTR(str);
4731 slen = RSTRING_LEN(sub);
4732 if (slen == 0) return s - sbeg;
4733 e = RSTRING_END(str);
4734 t = RSTRING_PTR(sub);
4735 c = *t & 0xff;
4736 searchlen = s - sbeg + 1;
4737
4738 if (memcmp(s, t, slen) == 0) {
4739 return s - sbeg;
4740 }
4741
4742 do {
4743 hit = memrchr(sbeg, c, searchlen);
4744 if (!hit) break;
4745 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4746 if (hit != adjusted) {
4747 searchlen = adjusted - sbeg;
4748 continue;
4749 }
4750 if (memcmp(hit, t, slen) == 0)
4751 return hit - sbeg;
4752 searchlen = adjusted - sbeg;
4753 } while (searchlen > 0);
4754
4755 return -1;
4756}
4757
4758/* found index in byte */
4759static long
4760rb_str_rindex(VALUE str, VALUE sub, long pos)
4761{
4762 long len, slen;
4763 char *sbeg, *s;
4764 rb_encoding *enc;
4765 int singlebyte;
4766
4767 enc = rb_enc_check(str, sub);
4768 if (is_broken_string(sub)) return -1;
4769 singlebyte = single_byte_optimizable(str);
4770 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4771 slen = str_strlen(sub, enc); /* rb_enc_check */
4772
4773 /* substring longer than string */
4774 if (len < slen) return -1;
4775 if (len - pos < slen) pos = len - slen;
4776 if (len == 0) return pos;
4777
4778 sbeg = RSTRING_PTR(str);
4779
4780 if (pos == 0) {
4781 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4782 return 0;
4783 else
4784 return -1;
4785 }
4786
4787 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4788 return str_rindex(str, sub, s, enc);
4789}
4790
4791/*
4792 * call-seq:
4793 * rindex(pattern, offset = self.length) -> integer or nil
4794 *
4795 * :include:doc/string/rindex.rdoc
4796 *
4797 */
4798
4799static VALUE
4800rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4801{
4802 VALUE sub;
4803 VALUE initpos;
4804 rb_encoding *enc = STR_ENC_GET(str);
4805 long pos, len = str_strlen(str, enc); /* str's enc */
4806
4807 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4808 pos = NUM2LONG(initpos);
4809 if (pos < 0 && (pos += len) < 0) {
4810 if (RB_TYPE_P(sub, T_REGEXP)) {
4812 }
4813 return Qnil;
4814 }
4815 if (pos > len) pos = len;
4816 }
4817 else {
4818 pos = len;
4819 }
4820
4821 if (RB_TYPE_P(sub, T_REGEXP)) {
4822 /* enc = rb_enc_check(str, sub); */
4823 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4824 enc, single_byte_optimizable(str));
4825
4826 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4827 VALUE match = rb_backref_get();
4828 struct re_registers *regs = RMATCH_REGS(match);
4829 pos = rb_str_sublen(str, BEG(0));
4830 return LONG2NUM(pos);
4831 }
4832 }
4833 else {
4834 StringValue(sub);
4835 pos = rb_str_rindex(str, sub, pos);
4836 if (pos >= 0) {
4837 pos = rb_str_sublen(str, pos);
4838 return LONG2NUM(pos);
4839 }
4840 }
4841 return Qnil;
4842}
4843
4844static long
4845rb_str_byterindex(VALUE str, VALUE sub, long pos)
4846{
4847 long len, slen;
4848 char *sbeg, *s;
4849 rb_encoding *enc;
4850
4851 enc = rb_enc_check(str, sub);
4852 if (is_broken_string(sub)) return -1;
4853 len = RSTRING_LEN(str);
4854 slen = RSTRING_LEN(sub);
4855
4856 /* substring longer than string */
4857 if (len < slen) return -1;
4858 if (len - pos < slen) pos = len - slen;
4859 if (len == 0) return pos;
4860
4861 sbeg = RSTRING_PTR(str);
4862
4863 if (pos == 0) {
4864 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4865 return 0;
4866 else
4867 return -1;
4868 }
4869
4870 s = sbeg + pos;
4871 return str_rindex(str, sub, s, enc);
4872}
4873
4874/*
4875 * call-seq:
4876 * byterindex(object, offset = self.bytesize) -> integer or nil
4877 *
4878 * Returns the 0-based integer index of a substring of +self+
4879 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4880 * or +nil+ if there is no such substring;
4881 * the returned index is the count of _bytes_ (not characters).
4882 *
4883 * When +object+ is a string,
4884 * returns the index of the _last_ found substring equal to +object+:
4885 *
4886 * s = 'foo' # => "foo"
4887 * s.size # => 3 # Three 1-byte characters.
4888 * s.bytesize # => 3 # Three bytes.
4889 * s.byterindex('f') # => 0
4890 * s.byterindex('o') # => 2
4891 * s.byterindex('oo') # => 1
4892 * s.byterindex('ooo') # => nil
4893 *
4894 * When +object+ is a Regexp,
4895 * returns the index of the last found substring matching +object+;
4896 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4897 *
4898 * s = 'foo'
4899 * s.byterindex(/f/) # => 0
4900 * $~ # => #<MatchData "f">
4901 * s.byterindex(/o/) # => 2
4902 * s.byterindex(/oo/) # => 1
4903 * s.byterindex(/ooo/) # => nil
4904 * $~ # => nil
4905 *
4906 * The last match means starting at the possible last position,
4907 * not the last of the longest matches:
4908 *
4909 * s = 'foo'
4910 * s.byterindex(/o+/) # => 2
4911 * $~ #=> #<MatchData "o">
4912 *
4913 * To get the last longest match, use a negative lookbehind:
4914 *
4915 * s = 'foo'
4916 * s.byterindex(/(?<!o)o+/) # => 1
4917 * $~ # => #<MatchData "oo">
4918 *
4919 * Or use method #byteindex with negative lookahead:
4920 *
4921 * s = 'foo'
4922 * s.byteindex(/o+(?!.*o)/) # => 1
4923 * $~ #=> #<MatchData "oo">
4924 *
4925 * \Integer argument +offset+, if given, specifies the 0-based index
4926 * of the byte where searching is to end.
4927 *
4928 * When +offset+ is non-negative,
4929 * searching ends at byte position +offset+:
4930 *
4931 * s = 'foo'
4932 * s.byterindex('o', 0) # => nil
4933 * s.byterindex('o', 1) # => 1
4934 * s.byterindex('o', 2) # => 2
4935 * s.byterindex('o', 3) # => 2
4936 *
4937 * When +offset+ is negative, counts backward from the end of +self+:
4938 *
4939 * s = 'foo'
4940 * s.byterindex('o', -1) # => 2
4941 * s.byterindex('o', -2) # => 1
4942 * s.byterindex('o', -3) # => nil
4943 *
4944 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4945 *
4946 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4947 * s.size # => 2 # Two 3-byte characters.
4948 * s.bytesize # => 6 # Six bytes.
4949 * s.byterindex("\uFFFF") # => 3
4950 * s.byterindex("\uFFFF", 1) # Raises IndexError
4951 * s.byterindex("\uFFFF", 2) # Raises IndexError
4952 * s.byterindex("\uFFFF", 3) # => 3
4953 * s.byterindex("\uFFFF", 4) # Raises IndexError
4954 * s.byterindex("\uFFFF", 5) # Raises IndexError
4955 * s.byterindex("\uFFFF", 6) # => nil
4956 *
4957 * Related: see {Querying}[rdoc-ref:String@Querying].
4958 */
4959
4960static VALUE
4961rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4962{
4963 VALUE sub;
4964 VALUE initpos;
4965 long pos, len = RSTRING_LEN(str);
4966
4967 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4968 pos = NUM2LONG(initpos);
4969 if (pos < 0 && (pos += len) < 0) {
4970 if (RB_TYPE_P(sub, T_REGEXP)) {
4972 }
4973 return Qnil;
4974 }
4975 if (pos > len) pos = len;
4976 }
4977 else {
4978 pos = len;
4979 }
4980
4981 str_ensure_byte_pos(str, pos);
4982
4983 if (RB_TYPE_P(sub, T_REGEXP)) {
4984 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4985 VALUE match = rb_backref_get();
4986 struct re_registers *regs = RMATCH_REGS(match);
4987 pos = BEG(0);
4988 return LONG2NUM(pos);
4989 }
4990 }
4991 else {
4992 StringValue(sub);
4993 pos = rb_str_byterindex(str, sub, pos);
4994 if (pos >= 0) return LONG2NUM(pos);
4995 }
4996 return Qnil;
4997}
4998
4999/*
5000 * call-seq:
5001 * self =~ object -> integer or nil
5002 *
5003 * When +object+ is a Regexp, returns the index of the first substring in +self+
5004 * matched by +object+,
5005 * or +nil+ if no match is found;
5006 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5007 *
5008 * 'foo' =~ /f/ # => 0
5009 * $~ # => #<MatchData "f">
5010 * 'foo' =~ /o/ # => 1
5011 * $~ # => #<MatchData "o">
5012 * 'foo' =~ /x/ # => nil
5013 * $~ # => nil
5014 *
5015 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5016 * (see Regexp#=~):
5017 *
5018 * number = nil
5019 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5020 * number # => nil # Not assigned.
5021 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5022 * number # => "9" # Assigned.
5023 *
5024 * If +object+ is not a Regexp, returns the value
5025 * returned by <tt>object =~ self</tt>.
5026 *
5027 * Related: see {Querying}[rdoc-ref:String@Querying].
5028 */
5029
5030static VALUE
5031rb_str_match(VALUE x, VALUE y)
5032{
5033 switch (OBJ_BUILTIN_TYPE(y)) {
5034 case T_STRING:
5035 rb_raise(rb_eTypeError, "type mismatch: String given");
5036
5037 case T_REGEXP:
5038 return rb_reg_match(y, x);
5039
5040 default:
5041 return rb_funcall(y, idEqTilde, 1, x);
5042 }
5043}
5044
5045
5046static VALUE get_pat(VALUE);
5047
5048
5049/*
5050 * call-seq:
5051 * match(pattern, offset = 0) -> matchdata or nil
5052 * match(pattern, offset = 0) {|matchdata| ... } -> object
5053 *
5054 * Creates a MatchData object based on +self+ and the given arguments;
5055 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5056 *
5057 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5058 *
5059 * regexp = Regexp.new(pattern)
5060 *
5061 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5062 * (see Regexp#match):
5063 *
5064 * matchdata = regexp.match(self[offset..])
5065 *
5066 * With no block given, returns the computed +matchdata+ or +nil+:
5067 *
5068 * 'foo'.match('f') # => #<MatchData "f">
5069 * 'foo'.match('o') # => #<MatchData "o">
5070 * 'foo'.match('x') # => nil
5071 * 'foo'.match('f', 1) # => nil
5072 * 'foo'.match('o', 1) # => #<MatchData "o">
5073 *
5074 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5075 * returns the block's return value:
5076 *
5077 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5078 *
5079 * With a block given and +nil+ +matchdata+, does not call the block:
5080 *
5081 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5082 *
5083 * Related: see {Querying}[rdoc-ref:String@Querying].
5084 */
5085
5086static VALUE
5087rb_str_match_m(int argc, VALUE *argv, VALUE str)
5088{
5089 VALUE re, result;
5090 if (argc < 1)
5091 rb_check_arity(argc, 1, 2);
5092 re = argv[0];
5093 argv[0] = str;
5094 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5095 if (!NIL_P(result) && rb_block_given_p()) {
5096 return rb_yield(result);
5097 }
5098 return result;
5099}
5100
5101/*
5102 * call-seq:
5103 * match?(pattern, offset = 0) -> true or false
5104 *
5105 * Returns whether a match is found for +self+ and the given arguments;
5106 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5107 *
5108 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5109 *
5110 * regexp = Regexp.new(pattern)
5111 *
5112 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5113 * +false+ otherwise:
5114 *
5115 * 'foo'.match?(/o/) # => true
5116 * 'foo'.match?('o') # => true
5117 * 'foo'.match?(/x/) # => false
5118 * 'foo'.match?('f', 1) # => false
5119 * 'foo'.match?('o', 1) # => true
5120 *
5121 * Related: see {Querying}[rdoc-ref:String@Querying].
5122 */
5123
5124static VALUE
5125rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5126{
5127 VALUE re;
5128 rb_check_arity(argc, 1, 2);
5129 re = get_pat(argv[0]);
5130 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5131}
5132
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no valid neighboring character */
    NEIGHBOR_FOUND = 1,     /* character was replaced by its neighbor */
    NEIGHBOR_WRAPPED = 2    /* stepping wrapped around; caller must carry */
};
5138
5139static enum neighbor_char
5140enc_succ_char(char *p, long len, rb_encoding *enc)
5141{
5142 long i;
5143 int l;
5144
5145 if (rb_enc_mbminlen(enc) > 1) {
5146 /* wchar, trivial case */
5147 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5148 if (!MBCLEN_CHARFOUND_P(r)) {
5149 return NEIGHBOR_NOT_CHAR;
5150 }
5151 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5152 l = rb_enc_code_to_mbclen(c, enc);
5153 if (!l) return NEIGHBOR_NOT_CHAR;
5154 if (l != len) return NEIGHBOR_WRAPPED;
5155 rb_enc_mbcput(c, p, enc);
5156 r = rb_enc_precise_mbclen(p, p + len, enc);
5157 if (!MBCLEN_CHARFOUND_P(r)) {
5158 return NEIGHBOR_NOT_CHAR;
5159 }
5160 return NEIGHBOR_FOUND;
5161 }
5162 while (1) {
5163 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5164 p[i] = '\0';
5165 if (i < 0)
5166 return NEIGHBOR_WRAPPED;
5167 ++((unsigned char*)p)[i];
5168 l = rb_enc_precise_mbclen(p, p+len, enc);
5169 if (MBCLEN_CHARFOUND_P(l)) {
5170 l = MBCLEN_CHARFOUND_LEN(l);
5171 if (l == len) {
5172 return NEIGHBOR_FOUND;
5173 }
5174 else {
5175 memset(p+l, 0xff, len-l);
5176 }
5177 }
5178 if (MBCLEN_INVALID_P(l) && i < len-1) {
5179 long len2;
5180 int l2;
5181 for (len2 = len-1; 0 < len2; len2--) {
5182 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5183 if (!MBCLEN_INVALID_P(l2))
5184 break;
5185 }
5186 memset(p+len2+1, 0xff, len-(len2+1));
5187 }
5188 }
5189}
5190
5191static enum neighbor_char
5192enc_pred_char(char *p, long len, rb_encoding *enc)
5193{
5194 long i;
5195 int l;
5196 if (rb_enc_mbminlen(enc) > 1) {
5197 /* wchar, trivial case */
5198 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5199 if (!MBCLEN_CHARFOUND_P(r)) {
5200 return NEIGHBOR_NOT_CHAR;
5201 }
5202 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5203 if (!c) return NEIGHBOR_NOT_CHAR;
5204 --c;
5205 l = rb_enc_code_to_mbclen(c, enc);
5206 if (!l) return NEIGHBOR_NOT_CHAR;
5207 if (l != len) return NEIGHBOR_WRAPPED;
5208 rb_enc_mbcput(c, p, enc);
5209 r = rb_enc_precise_mbclen(p, p + len, enc);
5210 if (!MBCLEN_CHARFOUND_P(r)) {
5211 return NEIGHBOR_NOT_CHAR;
5212 }
5213 return NEIGHBOR_FOUND;
5214 }
5215 while (1) {
5216 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5217 p[i] = '\xff';
5218 if (i < 0)
5219 return NEIGHBOR_WRAPPED;
5220 --((unsigned char*)p)[i];
5221 l = rb_enc_precise_mbclen(p, p+len, enc);
5222 if (MBCLEN_CHARFOUND_P(l)) {
5223 l = MBCLEN_CHARFOUND_LEN(l);
5224 if (l == len) {
5225 return NEIGHBOR_FOUND;
5226 }
5227 else {
5228 memset(p+l, 0, len-l);
5229 }
5230 }
5231 if (MBCLEN_INVALID_P(l) && i < len-1) {
5232 long len2;
5233 int l2;
5234 for (len2 = len-1; 0 < len2; len2--) {
5235 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5236 if (!MBCLEN_INVALID_P(l2))
5237 break;
5238 }
5239 memset(p+len2+1, 0, len-(len2+1));
5240 }
5241 }
5242}
5243
5244/*
5245 overwrite +p+ by succeeding letter in +enc+ and returns
5246 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5247 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5248 assuming each ranges are successive, and mbclen
5249 never change in each ranges.
5250 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5251 character.
5252 */
5253static enum neighbor_char
5254enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5255{
5256 enum neighbor_char ret;
5257 unsigned int c;
5258 int ctype;
5259 int range;
5260 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5261
5262 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5263 int try;
5264 const int max_gaps = 1;
5265
5266 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5267 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5268 ctype = ONIGENC_CTYPE_DIGIT;
5269 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5270 ctype = ONIGENC_CTYPE_ALPHA;
5271 else
5272 return NEIGHBOR_NOT_CHAR;
5273
5274 MEMCPY(save, p, char, len);
5275 for (try = 0; try <= max_gaps; ++try) {
5276 ret = enc_succ_char(p, len, enc);
5277 if (ret == NEIGHBOR_FOUND) {
5278 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5279 if (rb_enc_isctype(c, ctype, enc))
5280 return NEIGHBOR_FOUND;
5281 }
5282 }
5283 MEMCPY(p, save, char, len);
5284 range = 1;
5285 while (1) {
5286 MEMCPY(save, p, char, len);
5287 ret = enc_pred_char(p, len, enc);
5288 if (ret == NEIGHBOR_FOUND) {
5289 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5290 if (!rb_enc_isctype(c, ctype, enc)) {
5291 MEMCPY(p, save, char, len);
5292 break;
5293 }
5294 }
5295 else {
5296 MEMCPY(p, save, char, len);
5297 break;
5298 }
5299 range++;
5300 }
5301 if (range == 1) {
5302 return NEIGHBOR_NOT_CHAR;
5303 }
5304
5305 if (ctype != ONIGENC_CTYPE_DIGIT) {
5306 MEMCPY(carry, p, char, len);
5307 return NEIGHBOR_WRAPPED;
5308 }
5309
5310 MEMCPY(carry, p, char, len);
5311 enc_succ_char(carry, len, enc);
5312 return NEIGHBOR_WRAPPED;
5313}
5314
5315
5316static VALUE str_succ(VALUE str);
5317
5318/*
5319 * call-seq:
5320 * succ -> new_str
5321 *
5322 * :include: doc/string/succ.rdoc
5323 *
5324 */
5325
5326VALUE
5328{
5329 VALUE str;
5330 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5331 rb_enc_cr_str_copy_for_substr(str, orig);
5332 return str_succ(str);
5333}
5334
5335static VALUE
5336str_succ(VALUE str)
5337{
5338 rb_encoding *enc;
5339 char *sbeg, *s, *e, *last_alnum = 0;
5340 int found_alnum = 0;
5341 long l, slen;
5342 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5343 long carry_pos = 0, carry_len = 1;
5344 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5345
5346 slen = RSTRING_LEN(str);
5347 if (slen == 0) return str;
5348
5349 enc = STR_ENC_GET(str);
5350 sbeg = RSTRING_PTR(str);
5351 s = e = sbeg + slen;
5352
5353 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5354 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5355 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5356 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5357 break;
5358 }
5359 }
5360 l = rb_enc_precise_mbclen(s, e, enc);
5361 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5362 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5363 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5364 switch (neighbor) {
5365 case NEIGHBOR_NOT_CHAR:
5366 continue;
5367 case NEIGHBOR_FOUND:
5368 return str;
5369 case NEIGHBOR_WRAPPED:
5370 last_alnum = s;
5371 break;
5372 }
5373 found_alnum = 1;
5374 carry_pos = s - sbeg;
5375 carry_len = l;
5376 }
5377 if (!found_alnum) { /* str contains no alnum */
5378 s = e;
5379 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5380 enum neighbor_char neighbor;
5381 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5382 l = rb_enc_precise_mbclen(s, e, enc);
5383 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5384 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5385 MEMCPY(tmp, s, char, l);
5386 neighbor = enc_succ_char(tmp, l, enc);
5387 switch (neighbor) {
5388 case NEIGHBOR_FOUND:
5389 MEMCPY(s, tmp, char, l);
5390 return str;
5391 break;
5392 case NEIGHBOR_WRAPPED:
5393 MEMCPY(s, tmp, char, l);
5394 break;
5395 case NEIGHBOR_NOT_CHAR:
5396 break;
5397 }
5398 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5399 /* wrapped to \0...\0. search next valid char. */
5400 enc_succ_char(s, l, enc);
5401 }
5402 if (!rb_enc_asciicompat(enc)) {
5403 MEMCPY(carry, s, char, l);
5404 carry_len = l;
5405 }
5406 carry_pos = s - sbeg;
5407 }
5409 }
5410 RESIZE_CAPA(str, slen + carry_len);
5411 sbeg = RSTRING_PTR(str);
5412 s = sbeg + carry_pos;
5413 memmove(s + carry_len, s, slen - carry_pos);
5414 memmove(s, carry, carry_len);
5415 slen += carry_len;
5416 STR_SET_LEN(str, slen);
5417 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5419 return str;
5420}
5421
5422
5423/*
5424 * call-seq:
5425 * succ! -> self
5426 *
5427 * Like String#succ, but modifies +self+ in place; returns +self+.
5428 *
5429 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5430 */
5431
5432static VALUE
5433rb_str_succ_bang(VALUE str)
5434{
5435 rb_str_modify(str);
5436 str_succ(str);
5437 return str;
5438}
5439
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    const char *cursor = s;
    long remaining;

    for (remaining = len; remaining > 0; remaining--, cursor++) {
        if (!ISDIGIT(*cursor)) {
            return 0;
        }
    }
    return 1;
}
5449
5450static int
5451str_upto_i(VALUE str, VALUE arg)
5452{
5453 rb_yield(str);
5454 return 0;
5455}
5456
5457/*
5458 * call-seq:
5459 * upto(other_string, exclusive = false) {|string| ... } -> self
5460 * upto(other_string, exclusive = false) -> new_enumerator
5461 *
5462 * With a block given, calls the block with each +String+ value
5463 * returned by successive calls to String#succ;
5464 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5465 * the sequence terminates when value +other_string+ is reached;
5466 * returns +self+:
5467 *
5468 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5469 * Output:
5470 *
5471 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5472 *
5473 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5474 *
5475 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5476 *
5477 * Output:
5478 *
5479 * a8 a9 b0 b1 b2 b3 b4 b5
5480 *
5481 * If +other_string+ would not be reached, does not call the block:
5482 *
5483 * '25'.upto('5') {|s| fail s }
5484 * 'aa'.upto('a') {|s| fail s }
5485 *
5486 * With no block given, returns a new Enumerator:
5487 *
5488 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5489 *
5490 */
5491
5492static VALUE
5493rb_str_upto(int argc, VALUE *argv, VALUE beg)
5494{
5495 VALUE end, exclusive;
5496
5497 rb_scan_args(argc, argv, "11", &end, &exclusive);
5498 RETURN_ENUMERATOR(beg, argc, argv);
5499 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5500}
5501
5502VALUE
5503rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5504{
5505 VALUE current, after_end;
5506 ID succ;
5507 int n, ascii;
5508 rb_encoding *enc;
5509
5510 CONST_ID(succ, "succ");
5511 StringValue(end);
5512 enc = rb_enc_check(beg, end);
5513 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5514 /* single character */
5515 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5516 char c = RSTRING_PTR(beg)[0];
5517 char e = RSTRING_PTR(end)[0];
5518
5519 if (c > e || (excl && c == e)) return beg;
5520 for (;;) {
5521 VALUE str = rb_enc_str_new(&c, 1, enc);
5523 if ((*each)(str, arg)) break;
5524 if (!excl && c == e) break;
5525 c++;
5526 if (excl && c == e) break;
5527 }
5528 return beg;
5529 }
5530 /* both edges are all digits */
5531 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5532 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5533 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5534 VALUE b, e;
5535 int width;
5536
5537 width = RSTRING_LENINT(beg);
5538 b = rb_str_to_inum(beg, 10, FALSE);
5539 e = rb_str_to_inum(end, 10, FALSE);
5540 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5541 long bi = FIX2LONG(b);
5542 long ei = FIX2LONG(e);
5543 rb_encoding *usascii = rb_usascii_encoding();
5544
5545 while (bi <= ei) {
5546 if (excl && bi == ei) break;
5547 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5548 bi++;
5549 }
5550 }
5551 else {
5552 ID op = excl ? '<' : idLE;
5553 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5554
5555 args[0] = INT2FIX(width);
5556 while (rb_funcall(b, op, 1, e)) {
5557 args[1] = b;
5558 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5559 b = rb_funcallv(b, succ, 0, 0);
5560 }
5561 }
5562 return beg;
5563 }
5564 /* normal case */
5565 n = rb_str_cmp(beg, end);
5566 if (n > 0 || (excl && n == 0)) return beg;
5567
5568 after_end = rb_funcallv(end, succ, 0, 0);
5569 current = str_duplicate(rb_cString, beg);
5570 while (!rb_str_equal(current, after_end)) {
5571 VALUE next = Qnil;
5572 if (excl || !rb_str_equal(current, end))
5573 next = rb_funcallv(current, succ, 0, 0);
5574 if ((*each)(current, arg)) break;
5575 if (NIL_P(next)) break;
5576 current = next;
5577 StringValue(current);
5578 if (excl && rb_str_equal(current, end)) break;
5579 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5580 break;
5581 }
5582
5583 return beg;
5584}
5585
5586VALUE
5587rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5588{
5589 VALUE current;
5590 ID succ;
5591
5592 CONST_ID(succ, "succ");
5593 /* both edges are all digits */
5594 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5595 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5596 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5597 int width = RSTRING_LENINT(beg);
5598 b = rb_str_to_inum(beg, 10, FALSE);
5599 if (FIXNUM_P(b)) {
5600 long bi = FIX2LONG(b);
5601 rb_encoding *usascii = rb_usascii_encoding();
5602
5603 while (FIXABLE(bi)) {
5604 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5605 bi++;
5606 }
5607 b = LONG2NUM(bi);
5608 }
5609 args[0] = INT2FIX(width);
5610 while (1) {
5611 args[1] = b;
5612 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5613 b = rb_funcallv(b, succ, 0, 0);
5614 }
5615 }
5616 /* normal case */
5617 current = str_duplicate(rb_cString, beg);
5618 while (1) {
5619 VALUE next = rb_funcallv(current, succ, 0, 0);
5620 if ((*each)(current, arg)) break;
5621 current = next;
5622 StringValue(current);
5623 if (RSTRING_LEN(current) == 0)
5624 break;
5625 }
5626
5627 return beg;
5628}
5629
5630static int
5631include_range_i(VALUE str, VALUE arg)
5632{
5633 VALUE *argp = (VALUE *)arg;
5634 if (!rb_equal(str, *argp)) return 0;
5635 *argp = Qnil;
5636 return 1;
5637}
5638
5639VALUE
5640rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5641{
5642 beg = rb_str_new_frozen(beg);
5643 StringValue(end);
5644 end = rb_str_new_frozen(end);
5645 if (NIL_P(val)) return Qfalse;
5646 val = rb_check_string_type(val);
5647 if (NIL_P(val)) return Qfalse;
5648 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5649 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5650 rb_enc_asciicompat(STR_ENC_GET(val))) {
5651 const char *bp = RSTRING_PTR(beg);
5652 const char *ep = RSTRING_PTR(end);
5653 const char *vp = RSTRING_PTR(val);
5654 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5655 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5656 return Qfalse;
5657 else {
5658 char b = *bp;
5659 char e = *ep;
5660 char v = *vp;
5661
5662 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5663 if (b <= v && v < e) return Qtrue;
5664 return RBOOL(!RTEST(exclusive) && v == e);
5665 }
5666 }
5667 }
5668#if 0
5669 /* both edges are all digits */
5670 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5671 all_digits_p(bp, RSTRING_LEN(beg)) &&
5672 all_digits_p(ep, RSTRING_LEN(end))) {
5673 /* TODO */
5674 }
5675#endif
5676 }
5677 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5678
5679 return RBOOL(NIL_P(val));
5680}
5681
5682static VALUE
5683rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5684{
5685 if (rb_reg_search(re, str, 0, 0) >= 0) {
5686 VALUE match = rb_backref_get();
5687 int nth = rb_reg_backref_number(match, backref);
5688 return rb_reg_nth_match(nth, match);
5689 }
5690 return Qnil;
5691}
5692
5693static VALUE
5694rb_str_aref(VALUE str, VALUE indx)
5695{
5696 long idx;
5697
5698 if (FIXNUM_P(indx)) {
5699 idx = FIX2LONG(indx);
5700 }
5701 else if (RB_TYPE_P(indx, T_REGEXP)) {
5702 return rb_str_subpat(str, indx, INT2FIX(0));
5703 }
5704 else if (RB_TYPE_P(indx, T_STRING)) {
5705 if (rb_str_index(str, indx, 0) != -1)
5706 return str_duplicate(rb_cString, indx);
5707 return Qnil;
5708 }
5709 else {
5710 /* check if indx is Range */
5711 long beg, len = str_strlen(str, NULL);
5712 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5713 case Qfalse:
5714 break;
5715 case Qnil:
5716 return Qnil;
5717 default:
5718 return rb_str_substr(str, beg, len);
5719 }
5720 idx = NUM2LONG(indx);
5721 }
5722
5723 return str_substr(str, idx, 1, FALSE);
5724}
5725
5726
5727/*
5728 * call-seq:
5729 * self[index] -> new_string or nil
5730 * self[start, length] -> new_string or nil
5731 * self[range] -> new_string or nil
5732 * self[regexp, capture = 0] -> new_string or nil
5733 * self[substring] -> new_string or nil
5734 *
5735 * :include: doc/string/aref.rdoc
5736 *
5737 */
5738
5739static VALUE
5740rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5741{
5742 if (argc == 2) {
5743 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5744 return rb_str_subpat(str, argv[0], argv[1]);
5745 }
5746 else {
5747 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5748 }
5749 }
5750 rb_check_arity(argc, 1, 2);
5751 return rb_str_aref(str, argv[0]);
5752}
5753
5754VALUE
5756{
5757 char *ptr = RSTRING_PTR(str);
5758 long olen = RSTRING_LEN(str), nlen;
5759
5760 str_modifiable(str);
5761 if (len > olen) len = olen;
5762 nlen = olen - len;
5763 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5764 char *oldptr = ptr;
5765 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5766 STR_SET_EMBED(str);
5767 ptr = RSTRING(str)->as.embed.ary;
5768 memmove(ptr, oldptr + len, nlen);
5769 if (fl == STR_NOEMBED) xfree(oldptr);
5770 }
5771 else {
5772 if (!STR_SHARED_P(str)) {
5773 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5774 rb_enc_cr_str_exact_copy(shared, str);
5775 OBJ_FREEZE(shared);
5776 }
5777 ptr = RSTRING(str)->as.heap.ptr += len;
5778 }
5779 STR_SET_LEN(str, nlen);
5780
5781 if (!SHARABLE_MIDDLE_SUBSTRING) {
5782 TERM_FILL(ptr + nlen, TERM_LEN(str));
5783 }
5785 return str;
5786}
5787
5788static void
5789rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5790{
5791 char *sptr;
5792 long slen;
5793 int cr;
5794
5795 if (beg == 0 && vlen == 0) {
5796 rb_str_drop_bytes(str, len);
5797 return;
5798 }
5799
5800 str_modify_keep_cr(str);
5801 RSTRING_GETMEM(str, sptr, slen);
5802 if (len < vlen) {
5803 /* expand string */
5804 RESIZE_CAPA(str, slen + vlen - len);
5805 sptr = RSTRING_PTR(str);
5806 }
5807
5809 cr = rb_enc_str_coderange(val);
5810 else
5812
5813 if (vlen != len) {
5814 memmove(sptr + beg + vlen,
5815 sptr + beg + len,
5816 slen - (beg + len));
5817 }
5818 if (vlen < beg && len < 0) {
5819 MEMZERO(sptr + slen, char, -len);
5820 }
5821 if (vlen > 0) {
5822 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5823 }
5824 slen += vlen - len;
5825 STR_SET_LEN(str, slen);
5826 TERM_FILL(&sptr[slen], TERM_LEN(str));
5827 ENC_CODERANGE_SET(str, cr);
5828}
5829
5830static inline void
5831rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5832{
5833 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5834}
5835
5836void
5837rb_str_update(VALUE str, long beg, long len, VALUE val)
5838{
5839 long slen;
5840 char *p, *e;
5841 rb_encoding *enc;
5842 int singlebyte = single_byte_optimizable(str);
5843 int cr;
5844
5845 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5846
5847 StringValue(val);
5848 enc = rb_enc_check(str, val);
5849 slen = str_strlen(str, enc); /* rb_enc_check */
5850
5851 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5852 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5853 }
5854 if (beg < 0) {
5855 beg += slen;
5856 }
5857 RUBY_ASSERT(beg >= 0);
5858 RUBY_ASSERT(beg <= slen);
5859
5860 if (len > slen - beg) {
5861 len = slen - beg;
5862 }
5863 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5864 if (!p) p = RSTRING_END(str);
5865 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5866 if (!e) e = RSTRING_END(str);
5867 /* error check */
5868 beg = p - RSTRING_PTR(str); /* physical position */
5869 len = e - p; /* physical length */
5870 rb_str_update_0(str, beg, len, val);
5871 rb_enc_associate(str, enc);
5873 if (cr != ENC_CODERANGE_BROKEN)
5874 ENC_CODERANGE_SET(str, cr);
5875}
5876
5877static void
5878rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5879{
5880 int nth;
5881 VALUE match;
5882 long start, end, len;
5883 rb_encoding *enc;
5884 struct re_registers *regs;
5885
5886 if (rb_reg_search(re, str, 0, 0) < 0) {
5887 rb_raise(rb_eIndexError, "regexp not matched");
5888 }
5889 match = rb_backref_get();
5890 nth = rb_reg_backref_number(match, backref);
5891 regs = RMATCH_REGS(match);
5892 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5893 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5894 }
5895 if (nth < 0) {
5896 nth += regs->num_regs;
5897 }
5898
5899 start = BEG(nth);
5900 if (start == -1) {
5901 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5902 }
5903 end = END(nth);
5904 len = end - start;
5905 StringValue(val);
5906 enc = rb_enc_check_str(str, val);
5907 rb_str_update_0(str, start, len, val);
5908 rb_enc_associate(str, enc);
5909}
5910
5911static VALUE
5912rb_str_aset(VALUE str, VALUE indx, VALUE val)
5913{
5914 long idx, beg;
5915
5916 switch (TYPE(indx)) {
5917 case T_REGEXP:
5918 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5919 return val;
5920
5921 case T_STRING:
5922 beg = rb_str_index(str, indx, 0);
5923 if (beg < 0) {
5924 rb_raise(rb_eIndexError, "string not matched");
5925 }
5926 beg = rb_str_sublen(str, beg);
5927 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5928 return val;
5929
5930 default:
5931 /* check if indx is Range */
5932 {
5933 long beg, len;
5934 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5935 rb_str_update(str, beg, len, val);
5936 return val;
5937 }
5938 }
5939 /* FALLTHROUGH */
5940
5941 case T_FIXNUM:
5942 idx = NUM2LONG(indx);
5943 rb_str_update(str, idx, 1, val);
5944 return val;
5945 }
5946}
5947
5948/*
5949 * call-seq:
5950 * self[index] = other_string -> new_string
5951 * self[start, length] = other_string -> new_string
5952 * self[range] = other_string -> new_string
5953 * self[regexp, capture = 0] = other_string -> new_string
5954 * self[substring] = other_string -> new_string
5955 *
5956 * :include: doc/string/aset.rdoc
5957 *
5958 */
5959
5960static VALUE
5961rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5962{
5963 if (argc == 3) {
5964 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5965 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5966 }
5967 else {
5968 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5969 }
5970 return argv[2];
5971 }
5972 rb_check_arity(argc, 2, 3);
5973 return rb_str_aset(str, argv[0], argv[1]);
5974}
5975
5976/*
5977 * call-seq:
5978 * insert(offset, other_string) -> self
5979 *
5980 * :include: doc/string/insert.rdoc
5981 *
5982 */
5983
5984static VALUE
5985rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5986{
5987 long pos = NUM2LONG(idx);
5988
5989 if (pos == -1) {
5990 return rb_str_append(str, str2);
5991 }
5992 else if (pos < 0) {
5993 pos++;
5994 }
5995 rb_str_update(str, pos, 0, str2);
5996 return str;
5997}
5998
5999
6000/*
6001 * call-seq:
6002 * slice!(index) -> new_string or nil
6003 * slice!(start, length) -> new_string or nil
6004 * slice!(range) -> new_string or nil
6005 * slice!(regexp, capture = 0) -> new_string or nil
6006 * slice!(substring) -> new_string or nil
6007 *
6008 * Like String#[] (and its alias String#slice), except that:
6009 *
6010 * - Performs substitutions in +self+ (not in a copy of +self+).
6011 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6012 *
6013 * A few examples:
6014 *
6015 * s = 'hello'
6016 * s.slice!('e') # => "e"
6017 * s # => "hllo"
6018 * s.slice!('e') # => nil
6019 * s # => "hllo"
6020 *
6021 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6022 */
6023
6024static VALUE
6025rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6026{
6027 VALUE result = Qnil;
6028 VALUE indx;
6029 long beg, len = 1;
6030 char *p;
6031
6032 rb_check_arity(argc, 1, 2);
6033 str_modify_keep_cr(str);
6034 indx = argv[0];
6035 if (RB_TYPE_P(indx, T_REGEXP)) {
6036 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6037 VALUE match = rb_backref_get();
6038 struct re_registers *regs = RMATCH_REGS(match);
6039 int nth = 0;
6040 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6041 if ((nth += regs->num_regs) <= 0) return Qnil;
6042 }
6043 else if (nth >= regs->num_regs) return Qnil;
6044 beg = BEG(nth);
6045 len = END(nth) - beg;
6046 goto subseq;
6047 }
6048 else if (argc == 2) {
6049 beg = NUM2LONG(indx);
6050 len = NUM2LONG(argv[1]);
6051 goto num_index;
6052 }
6053 else if (FIXNUM_P(indx)) {
6054 beg = FIX2LONG(indx);
6055 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6056 if (!len) return Qnil;
6057 beg = p - RSTRING_PTR(str);
6058 goto subseq;
6059 }
6060 else if (RB_TYPE_P(indx, T_STRING)) {
6061 beg = rb_str_index(str, indx, 0);
6062 if (beg == -1) return Qnil;
6063 len = RSTRING_LEN(indx);
6064 result = str_duplicate(rb_cString, indx);
6065 goto squash;
6066 }
6067 else {
6068 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6069 case Qnil:
6070 return Qnil;
6071 case Qfalse:
6072 beg = NUM2LONG(indx);
6073 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6074 if (!len) return Qnil;
6075 beg = p - RSTRING_PTR(str);
6076 goto subseq;
6077 default:
6078 goto num_index;
6079 }
6080 }
6081
6082 num_index:
6083 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6084 beg = p - RSTRING_PTR(str);
6085
6086 subseq:
6087 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6088 rb_enc_cr_str_copy_for_substr(result, str);
6089
6090 squash:
6091 if (len > 0) {
6092 if (beg == 0) {
6093 rb_str_drop_bytes(str, len);
6094 }
6095 else {
6096 char *sptr = RSTRING_PTR(str);
6097 long slen = RSTRING_LEN(str);
6098 if (beg + len > slen) /* pathological check */
6099 len = slen - beg;
6100 memmove(sptr + beg,
6101 sptr + beg + len,
6102 slen - (beg + len));
6103 slen -= len;
6104 STR_SET_LEN(str, slen);
6105 TERM_FILL(&sptr[slen], TERM_LEN(str));
6106 }
6107 }
6108 return result;
6109}
6110
6111static VALUE
6112get_pat(VALUE pat)
6113{
6114 VALUE val;
6115
6116 switch (OBJ_BUILTIN_TYPE(pat)) {
6117 case T_REGEXP:
6118 return pat;
6119
6120 case T_STRING:
6121 break;
6122
6123 default:
6124 val = rb_check_string_type(pat);
6125 if (NIL_P(val)) {
6126 Check_Type(pat, T_REGEXP);
6127 }
6128 pat = val;
6129 }
6130
6131 return rb_reg_regcomp(pat);
6132}
6133
6134static VALUE
6135get_pat_quoted(VALUE pat, int check)
6136{
6137 VALUE val;
6138
6139 switch (OBJ_BUILTIN_TYPE(pat)) {
6140 case T_REGEXP:
6141 return pat;
6142
6143 case T_STRING:
6144 break;
6145
6146 default:
6147 val = rb_check_string_type(pat);
6148 if (NIL_P(val)) {
6149 Check_Type(pat, T_REGEXP);
6150 }
6151 pat = val;
6152 }
6153 if (check && is_broken_string(pat)) {
6154 rb_exc_raise(rb_reg_check_preprocess(pat));
6155 }
6156 return pat;
6157}
6158
6159static long
6160rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6161{
6162 if (BUILTIN_TYPE(pat) == T_STRING) {
6163 pos = rb_str_byteindex(str, pat, pos);
6164 if (set_backref_str) {
6165 if (pos >= 0) {
6166 str = rb_str_new_frozen_String(str);
6167 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6168 if (match) {
6169 *match = match_data;
6170 }
6171 }
6172 else {
6174 }
6175 }
6176 return pos;
6177 }
6178 else {
6179 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6180 }
6181}
6182
6183static long
6184rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6185{
6186 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6187}
6188
6189
/*
 *  call-seq:
 *    sub!(pattern, replacement) -> self or nil
 *    sub!(pattern) {|match| ... } -> self or nil
 *
 *  Like String#sub, except that:
 *
 *  - Changes are made to +self+, not to a copy of +self+.
 *  - Returns +self+ if any changes are made, +nil+ otherwise.
 *
 *  Related: see {Modifying}[rdoc-ref:String@Modifying].
 */
6202
6203static VALUE
6204rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6205{
6206 VALUE pat, repl, hash = Qnil;
6207 int iter = 0;
6208 long plen;
6209 int min_arity = rb_block_given_p() ? 1 : 2;
6210 long beg;
6211
6212 rb_check_arity(argc, min_arity, 2);
6213 if (argc == 1) {
6214 iter = 1;
6215 }
6216 else {
6217 repl = argv[1];
6218 hash = rb_check_hash_type(argv[1]);
6219 if (NIL_P(hash)) {
6220 StringValue(repl);
6221 }
6222 }
6223
6224 pat = get_pat_quoted(argv[0], 1);
6225
6226 str_modifiable(str);
6227 beg = rb_pat_search(pat, str, 0, 1);
6228 if (beg >= 0) {
6229 rb_encoding *enc;
6230 int cr = ENC_CODERANGE(str);
6231 long beg0, end0;
6232 VALUE match, match0 = Qnil;
6233 struct re_registers *regs;
6234 char *p, *rp;
6235 long len, rlen;
6236
6237 match = rb_backref_get();
6238 regs = RMATCH_REGS(match);
6239 if (RB_TYPE_P(pat, T_STRING)) {
6240 beg0 = beg;
6241 end0 = beg0 + RSTRING_LEN(pat);
6242 match0 = pat;
6243 }
6244 else {
6245 beg0 = BEG(0);
6246 end0 = END(0);
6247 if (iter) match0 = rb_reg_nth_match(0, match);
6248 }
6249
6250 if (iter || !NIL_P(hash)) {
6251 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6252
6253 if (iter) {
6254 repl = rb_obj_as_string(rb_yield(match0));
6255 }
6256 else {
6257 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6258 repl = rb_obj_as_string(repl);
6259 }
6260 str_mod_check(str, p, len);
6261 rb_check_frozen(str);
6262 }
6263 else {
6264 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6265 }
6266
6267 enc = rb_enc_compatible(str, repl);
6268 if (!enc) {
6269 rb_encoding *str_enc = STR_ENC_GET(str);
6270 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6271 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6272 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6273 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6274 rb_enc_inspect_name(str_enc),
6275 rb_enc_inspect_name(STR_ENC_GET(repl)));
6276 }
6277 enc = STR_ENC_GET(repl);
6278 }
6279 rb_str_modify(str);
6280 rb_enc_associate(str, enc);
6282 int cr2 = ENC_CODERANGE(repl);
6283 if (cr2 == ENC_CODERANGE_BROKEN ||
6284 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6286 else
6287 cr = cr2;
6288 }
6289 plen = end0 - beg0;
6290 rlen = RSTRING_LEN(repl);
6291 len = RSTRING_LEN(str);
6292 if (rlen > plen) {
6293 RESIZE_CAPA(str, len + rlen - plen);
6294 }
6295 p = RSTRING_PTR(str);
6296 if (rlen != plen) {
6297 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6298 }
6299 rp = RSTRING_PTR(repl);
6300 memmove(p + beg0, rp, rlen);
6301 len += rlen - plen;
6302 STR_SET_LEN(str, len);
6303 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6304 ENC_CODERANGE_SET(str, cr);
6305
6306 RB_GC_GUARD(match);
6307
6308 return str;
6309 }
6310 return Qnil;
6311}
6312
6313
6314/*
6315 * call-seq:
6316 * sub(pattern, replacement) -> new_string
6317 * sub(pattern) {|match| ... } -> new_string
6318 *
6319 * :include: doc/string/sub.rdoc
6320 */
6321
6322static VALUE
6323rb_str_sub(int argc, VALUE *argv, VALUE str)
6324{
6325 str = str_duplicate(rb_cString, str);
6326 rb_str_sub_bang(argc, argv, str);
6327 return str;
6328}
6329
6330static VALUE
6331str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6332{
6333 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6334 long beg, beg0, end0;
6335 long offset, blen, slen, len, last;
6336 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6337 char *sp, *cp;
6338 int need_backref_str = -1;
6339 rb_encoding *str_enc;
6340
6341 switch (argc) {
6342 case 1:
6343 RETURN_ENUMERATOR(str, argc, argv);
6344 mode = ITER;
6345 break;
6346 case 2:
6347 repl = argv[1];
6348 hash = rb_check_hash_type(argv[1]);
6349 if (NIL_P(hash)) {
6350 StringValue(repl);
6351 }
6352 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6353 mode = FAST_MAP;
6354 }
6355 else {
6356 mode = MAP;
6357 }
6358 break;
6359 default:
6360 rb_error_arity(argc, 1, 2);
6361 }
6362
6363 pat = get_pat_quoted(argv[0], 1);
6364 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6365
6366 if (beg < 0) {
6367 if (bang) return Qnil; /* no match, no substitution */
6368 return str_duplicate(rb_cString, str);
6369 }
6370
6371 offset = 0;
6372 blen = RSTRING_LEN(str) + 30; /* len + margin */
6373 dest = rb_str_buf_new(blen);
6374 sp = RSTRING_PTR(str);
6375 slen = RSTRING_LEN(str);
6376 cp = sp;
6377 str_enc = STR_ENC_GET(str);
6378 rb_enc_associate(dest, str_enc);
6379 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6380
6381 do {
6382 struct re_registers *regs = RMATCH_REGS(match);
6383 if (RB_TYPE_P(pat, T_STRING)) {
6384 beg0 = beg;
6385 end0 = beg0 + RSTRING_LEN(pat);
6386 match0 = pat;
6387 }
6388 else {
6389 beg0 = BEG(0);
6390 end0 = END(0);
6391 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6392 }
6393
6394 if (mode != STR) {
6395 if (mode == ITER) {
6396 val = rb_obj_as_string(rb_yield(match0));
6397 }
6398 else {
6399 struct RString fake_str = {RBASIC_INIT};
6400 VALUE key;
6401 if (mode == FAST_MAP) {
6402 // It is safe to use a fake_str here because we established that it won't escape,
6403 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6404 // default proc.
6405 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6406 }
6407 else {
6408 key = rb_str_subseq(str, beg0, end0 - beg0);
6409 }
6410 val = rb_hash_aref(hash, key);
6411 val = rb_obj_as_string(val);
6412 }
6413 str_mod_check(str, sp, slen);
6414 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6415 rb_raise(rb_eRuntimeError, "block should not cheat");
6416 }
6417 }
6418 else if (need_backref_str) {
6419 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6420 if (need_backref_str < 0) {
6421 need_backref_str = val != repl;
6422 }
6423 }
6424 else {
6425 val = repl;
6426 }
6427
6428 len = beg0 - offset; /* copy pre-match substr */
6429 if (len) {
6430 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6431 }
6432
6433 rb_str_buf_append(dest, val);
6434
6435 last = offset;
6436 offset = end0;
6437 if (beg0 == end0) {
6438 /*
6439 * Always consume at least one character of the input string
6440 * in order to prevent infinite loops.
6441 */
6442 if (RSTRING_LEN(str) <= end0) break;
6443 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6444 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6445 offset = end0 + len;
6446 }
6447 cp = RSTRING_PTR(str) + offset;
6448 if (offset > RSTRING_LEN(str)) break;
6449
6450 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6451 if (mode != FAST_MAP && mode != STR) {
6452 match = Qnil;
6453 }
6454 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6455
6456 RB_GC_GUARD(match);
6457 } while (beg >= 0);
6458
6459 if (RSTRING_LEN(str) > offset) {
6460 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6461 }
6462 rb_pat_search0(pat, str, last, 1, &match);
6463 if (bang) {
6464 str_shared_replace(str, dest);
6465 }
6466 else {
6467 str = dest;
6468 }
6469
6470 return str;
6471}
6472
6473
/*
 *  call-seq:
 *     gsub!(pattern, replacement)   -> self or nil
 *     gsub!(pattern) {|match| ... } -> self or nil
 *     gsub!(pattern)                -> an_enumerator
 *
 *  Like String#gsub, except that:
 *
 *  - Performs substitutions in +self+ (not in a copy of +self+).
 *  - Returns +self+ if any substitutions are made, +nil+ otherwise.
 *
 *  Related: see {Modifying}[rdoc-ref:String@Modifying].
 */
6487
6488static VALUE
6489rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6490{
6491 str_modify_keep_cr(str);
6492 return str_gsub(argc, argv, str, 1);
6493}
6494
6495
6496/*
6497 * call-seq:
6498 * gsub(pattern, replacement) -> new_string
6499 * gsub(pattern) {|match| ... } -> new_string
6500 * gsub(pattern) -> enumerator
6501 *
6502 * Returns a copy of +self+ with zero or more substrings replaced.
6503 *
6504 * Argument +pattern+ may be a string or a Regexp;
6505 * argument +replacement+ may be a string or a Hash.
6506 * Varying types for the argument values makes this method very versatile.
6507 *
6508 * Below are some simple examples;
6509 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6510 *
6511 * With arguments +pattern+ and string +replacement+ given,
6512 * replaces each matching substring with the given +replacement+ string:
6513 *
6514 * s = 'abracadabra'
6515 * s.gsub('ab', 'AB') # => "ABracadABra"
6516 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6517 *
6518 * With arguments +pattern+ and hash +replacement+ given,
6519 * replaces each matching substring with a value from the given +replacement+ hash,
6520 * or removes it:
6521 *
6522 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6523 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6524 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6525 *
6526 * With argument +pattern+ and a block given,
6527 * calls the block with each matching substring;
6528 * replaces that substring with the block's return value:
6529 *
6530 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6531 * # => "ABrACADABrA"
6532 *
6533 * With argument +pattern+ and no block given,
6534 * returns a new Enumerator.
6535 *
6536 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6537 */
6538
6539static VALUE
6540rb_str_gsub(int argc, VALUE *argv, VALUE str)
6541{
6542 return str_gsub(argc, argv, str, 0);
6543}
6544
6545
6546/*
6547 * call-seq:
6548 * replace(other_string) -> self
6549 *
6550 * Replaces the contents of +self+ with the contents of +other_string+;
6551 * returns +self+:
6552 *
6553 * s = 'foo' # => "foo"
6554 * s.replace('bar') # => "bar"
6555 *
6556 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6557 */
6558
6559VALUE
6561{
6562 str_modifiable(str);
6563 if (str == str2) return str;
6564
6565 StringValue(str2);
6566 str_discard(str);
6567 return str_replace(str, str2);
6568}
6569
6570/*
6571 * call-seq:
6572 * clear -> self
6573 *
6574 * Removes the contents of +self+:
6575 *
6576 * s = 'foo'
6577 * s.clear # => ""
6578 * s # => ""
6579 *
6580 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6581 */
6582
6583static VALUE
6584rb_str_clear(VALUE str)
6585{
6586 str_discard(str);
6587 STR_SET_EMBED(str);
6588 STR_SET_LEN(str, 0);
6589 RSTRING_PTR(str)[0] = 0;
6590 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6592 else
6594 return str;
6595}
6596
6597/*
6598 * call-seq:
6599 * chr -> string
6600 *
6601 * :include: doc/string/chr.rdoc
6602 *
6603 */
6604
6605static VALUE
6606rb_str_chr(VALUE str)
6607{
6608 return rb_str_substr(str, 0, 1);
6609}
6610
6611/*
6612 * call-seq:
6613 * getbyte(index) -> integer or nil
6614 *
6615 * :include: doc/string/getbyte.rdoc
6616 *
6617 */
6618VALUE
6619rb_str_getbyte(VALUE str, VALUE index)
6620{
6621 long pos = NUM2LONG(index);
6622
6623 if (pos < 0)
6624 pos += RSTRING_LEN(str);
6625 if (pos < 0 || RSTRING_LEN(str) <= pos)
6626 return Qnil;
6627
6628 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6629}
6630
6631/*
6632 * call-seq:
6633 * setbyte(index, integer) -> integer
6634 *
6635 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6636 * returns +integer+:
6637 *
6638 * s = 'xyzzy'
6639 * s.setbyte(2, 129) # => 129
6640 * s # => "xy\x81zy"
6641 *
6642 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6643 */
6644VALUE
6645rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6646{
6647 long pos = NUM2LONG(index);
6648 long len = RSTRING_LEN(str);
6649 char *ptr, *head, *left = 0;
6650 rb_encoding *enc;
6651 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6652
6653 if (pos < -len || len <= pos)
6654 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6655 if (pos < 0)
6656 pos += len;
6657
6658 VALUE v = rb_to_int(value);
6659 VALUE w = rb_int_and(v, INT2FIX(0xff));
6660 char byte = (char)(NUM2INT(w) & 0xFF);
6661
6662 if (!str_independent(str))
6663 str_make_independent(str);
6664 enc = STR_ENC_GET(str);
6665 head = RSTRING_PTR(str);
6666 ptr = &head[pos];
6667 if (!STR_EMBED_P(str)) {
6668 cr = ENC_CODERANGE(str);
6669 switch (cr) {
6670 case ENC_CODERANGE_7BIT:
6671 left = ptr;
6672 *ptr = byte;
6673 if (ISASCII(byte)) goto end;
6674 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6675 if (!MBCLEN_CHARFOUND_P(nlen))
6677 else
6679 goto end;
6681 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6682 width = rb_enc_precise_mbclen(left, head+len, enc);
6683 *ptr = byte;
6684 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6685 if (!MBCLEN_CHARFOUND_P(nlen))
6687 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6689 goto end;
6690 }
6691 }
6693 *ptr = byte;
6694
6695 end:
6696 return value;
6697}
6698
6699static VALUE
6700str_byte_substr(VALUE str, long beg, long len, int empty)
6701{
6702 long n = RSTRING_LEN(str);
6703
6704 if (beg > n || len < 0) return Qnil;
6705 if (beg < 0) {
6706 beg += n;
6707 if (beg < 0) return Qnil;
6708 }
6709 if (len > n - beg)
6710 len = n - beg;
6711 if (len <= 0) {
6712 if (!empty) return Qnil;
6713 len = 0;
6714 }
6715
6716 VALUE str2 = str_subseq(str, beg, len);
6717
6718 str_enc_copy_direct(str2, str);
6719
6720 if (RSTRING_LEN(str2) == 0) {
6721 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6723 else
6725 }
6726 else {
6727 switch (ENC_CODERANGE(str)) {
6728 case ENC_CODERANGE_7BIT:
6730 break;
6731 default:
6733 break;
6734 }
6735 }
6736
6737 return str2;
6738}
6739
6740VALUE
6741rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6742{
6743 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6744}
6745
6746static VALUE
6747str_byte_aref(VALUE str, VALUE indx)
6748{
6749 long idx;
6750 if (FIXNUM_P(indx)) {
6751 idx = FIX2LONG(indx);
6752 }
6753 else {
6754 /* check if indx is Range */
6755 long beg, len = RSTRING_LEN(str);
6756
6757 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6758 case Qfalse:
6759 break;
6760 case Qnil:
6761 return Qnil;
6762 default:
6763 return str_byte_substr(str, beg, len, TRUE);
6764 }
6765
6766 idx = NUM2LONG(indx);
6767 }
6768 return str_byte_substr(str, idx, 1, FALSE);
6769}
6770
6771/*
6772 * call-seq:
6773 * byteslice(offset, length = 1) -> string or nil
6774 * byteslice(range) -> string or nil
6775 *
6776 * :include: doc/string/byteslice.rdoc
6777 */
6778
6779static VALUE
6780rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6781{
6782 if (argc == 2) {
6783 long beg = NUM2LONG(argv[0]);
6784 long len = NUM2LONG(argv[1]);
6785 return str_byte_substr(str, beg, len, TRUE);
6786 }
6787 rb_check_arity(argc, 1, 2);
6788 return str_byte_aref(str, argv[0]);
6789}
6790
6791static void
6792str_check_beg_len(VALUE str, long *beg, long *len)
6793{
6794 long end, slen = RSTRING_LEN(str);
6795
6796 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6797 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6798 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6799 }
6800 if (*beg < 0) {
6801 *beg += slen;
6802 }
6803 RUBY_ASSERT(*beg >= 0);
6804 RUBY_ASSERT(*beg <= slen);
6805
6806 if (*len > slen - *beg) {
6807 *len = slen - *beg;
6808 }
6809 end = *beg + *len;
6810 str_ensure_byte_pos(str, *beg);
6811 str_ensure_byte_pos(str, end);
6812}
6813
6814/*
6815 * call-seq:
6816 * bytesplice(offset, length, str) -> self
6817 * bytesplice(offset, length, str, str_offset, str_length) -> self
6818 * bytesplice(range, str) -> self
6819 * bytesplice(range, str, str_range) -> self
6820 *
6821 * :include: doc/string/bytesplice.rdoc
6822 */
6823
6824static VALUE
6825rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6826{
6827 long beg, len, vbeg, vlen;
6828 VALUE val;
6829 int cr;
6830
6831 rb_check_arity(argc, 2, 5);
6832 if (!(argc == 2 || argc == 3 || argc == 5)) {
6833 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6834 }
6835 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6836 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6837 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6838 rb_builtin_class_name(argv[0]));
6839 }
6840 val = argv[1];
6841 StringValue(val);
6842 if (argc == 2) {
6843 /* bytesplice(range, str) */
6844 vbeg = 0;
6845 vlen = RSTRING_LEN(val);
6846 }
6847 else {
6848 /* bytesplice(range, str, str_range) */
6849 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6850 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6851 rb_builtin_class_name(argv[2]));
6852 }
6853 }
6854 }
6855 else {
6856 beg = NUM2LONG(argv[0]);
6857 len = NUM2LONG(argv[1]);
6858 val = argv[2];
6859 StringValue(val);
6860 if (argc == 3) {
6861 /* bytesplice(index, length, str) */
6862 vbeg = 0;
6863 vlen = RSTRING_LEN(val);
6864 }
6865 else {
6866 /* bytesplice(index, length, str, str_index, str_length) */
6867 vbeg = NUM2LONG(argv[3]);
6868 vlen = NUM2LONG(argv[4]);
6869 }
6870 }
6871 str_check_beg_len(str, &beg, &len);
6872 str_check_beg_len(val, &vbeg, &vlen);
6873 str_modify_keep_cr(str);
6874
6875 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6876 rb_enc_associate(str, rb_enc_check(str, val));
6877 }
6878
6879 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6881 if (cr != ENC_CODERANGE_BROKEN)
6882 ENC_CODERANGE_SET(str, cr);
6883 return str;
6884}
6885
6886/*
6887 * call-seq:
6888 * reverse -> new_string
6889 *
6890 * Returns a new string with the characters from +self+ in reverse order.
6891 *
6892 * 'drawer'.reverse # => "reward"
6893 * 'reviled'.reverse # => "deliver"
6894 * 'stressed'.reverse # => "desserts"
6895 * 'semordnilaps'.reverse # => "spalindromes"
6896 *
6897 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6898 */
6899
6900static VALUE
6901rb_str_reverse(VALUE str)
6902{
6903 rb_encoding *enc;
6904 VALUE rev;
6905 char *s, *e, *p;
6906 int cr;
6907
6908 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6909 enc = STR_ENC_GET(str);
6910 rev = rb_str_new(0, RSTRING_LEN(str));
6911 s = RSTRING_PTR(str); e = RSTRING_END(str);
6912 p = RSTRING_END(rev);
6913 cr = ENC_CODERANGE(str);
6914
6915 if (RSTRING_LEN(str) > 1) {
6916 if (single_byte_optimizable(str)) {
6917 while (s < e) {
6918 *--p = *s++;
6919 }
6920 }
6921 else if (cr == ENC_CODERANGE_VALID) {
6922 while (s < e) {
6923 int clen = rb_enc_fast_mbclen(s, e, enc);
6924
6925 p -= clen;
6926 memcpy(p, s, clen);
6927 s += clen;
6928 }
6929 }
6930 else {
6931 cr = rb_enc_asciicompat(enc) ?
6933 while (s < e) {
6934 int clen = rb_enc_mbclen(s, e, enc);
6935
6936 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6937 p -= clen;
6938 memcpy(p, s, clen);
6939 s += clen;
6940 }
6941 }
6942 }
6943 STR_SET_LEN(rev, RSTRING_LEN(str));
6944 str_enc_copy_direct(rev, str);
6945 ENC_CODERANGE_SET(rev, cr);
6946
6947 return rev;
6948}
6949
6950
6951/*
6952 * call-seq:
6953 * reverse! -> self
6954 *
6955 * Returns +self+ with its characters reversed:
6956 *
6957 * 'drawer'.reverse! # => "reward"
6958 * 'reviled'.reverse! # => "deliver"
6959 * 'stressed'.reverse! # => "desserts"
6960 * 'semordnilaps'.reverse! # => "spalindromes"
6961 *
6962 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6963 */
6964
6965static VALUE
6966rb_str_reverse_bang(VALUE str)
6967{
6968 if (RSTRING_LEN(str) > 1) {
6969 if (single_byte_optimizable(str)) {
6970 char *s, *e, c;
6971
6972 str_modify_keep_cr(str);
6973 s = RSTRING_PTR(str);
6974 e = RSTRING_END(str) - 1;
6975 while (s < e) {
6976 c = *s;
6977 *s++ = *e;
6978 *e-- = c;
6979 }
6980 }
6981 else {
6982 str_shared_replace(str, rb_str_reverse(str));
6983 }
6984 }
6985 else {
6986 str_modify_keep_cr(str);
6987 }
6988 return str;
6989}
6990
6991
6992/*
6993 * call-seq:
6994 * include?(other_string) -> true or false
6995 *
6996 * Returns whether +self+ contains +other_string+:
6997 *
6998 * s = 'bar'
6999 * s.include?('ba') # => true
7000 * s.include?('ar') # => true
7001 * s.include?('bar') # => true
7002 * s.include?('a') # => true
7003 * s.include?('') # => true
7004 * s.include?('foo') # => false
7005 *
7006 * Related: see {Querying}[rdoc-ref:String@Querying].
7007 */
7008
7009VALUE
7010rb_str_include(VALUE str, VALUE arg)
7011{
7012 long i;
7013
7014 StringValue(arg);
7015 i = rb_str_index(str, arg, 0);
7016
7017 return RBOOL(i != -1);
7018}
7019
7020
7021/*
7022 * call-seq:
7023 * to_i(base = 10) -> integer
7024 *
7025 * Returns the result of interpreting leading characters in +self+
7026 * as an integer in the given +base+ (which must be in (0, 2..36)):
7027 *
7028 * '123456'.to_i # => 123456
7029 * '123def'.to_i(16) # => 1195503
7030 *
7031 * With +base+ zero, string +object+ may contain leading characters
7032 * to specify the actual base:
7033 *
7034 * '123def'.to_i(0) # => 123
7035 * '0123def'.to_i(0) # => 83
7036 * '0b123def'.to_i(0) # => 1
7037 * '0o123def'.to_i(0) # => 83
7038 * '0d123def'.to_i(0) # => 123
7039 * '0x123def'.to_i(0) # => 1195503
7040 *
7041 * Characters past a leading valid number (in the given +base+) are ignored:
7042 *
7043 * '12.345'.to_i # => 12
7044 * '12345'.to_i(2) # => 1
7045 *
7046 * Returns zero if there is no leading valid number:
7047 *
7048 * 'abcdef'.to_i # => 0
7049 * '2'.to_i(2) # => 0
7050 *
7051 */
7052
7053static VALUE
7054rb_str_to_i(int argc, VALUE *argv, VALUE str)
7055{
7056 int base = 10;
7057
7058 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7059 rb_raise(rb_eArgError, "invalid radix %d", base);
7060 }
7061 return rb_str_to_inum(str, base, FALSE);
7062}
7063
7064
7065/*
7066 * call-seq:
7067 * to_f -> float
7068 *
7069 * Returns the result of interpreting leading characters in +self+ as a Float:
7070 *
7071 * '3.14159'.to_f # => 3.14159
7072 * '1.234e-2'.to_f # => 0.01234
7073 *
7074 * Characters past a leading valid number are ignored:
7075 *
7076 * '3.14 (pi to two places)'.to_f # => 3.14
7077 *
7078 * Returns zero if there is no leading valid number:
7079 *
7080 * 'abcdef'.to_f # => 0.0
7081 *
7082 * See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7083 */
7084
7085static VALUE
7086rb_str_to_f(VALUE str)
7087{
7088 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7089}
7090
7091
7092/*
7093 * call-seq:
7094 * to_s -> self or string
7095 *
7096 * Returns +self+ if +self+ is a +String+,
7097 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7098 */
7099
7100static VALUE
7101rb_str_to_s(VALUE str)
7102{
7103 if (rb_obj_class(str) != rb_cString) {
7104 return str_duplicate(rb_cString, str);
7105 }
7106 return str;
7107}
7108
#if 0
/* Disabled helper: encodes code point c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7120
#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7122
7123int
7124rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7125{
7126 char buf[CHAR_ESC_LEN + 1];
7127 int l;
7128
7129#if SIZEOF_INT > 4
7130 c &= 0xffffffff;
7131#endif
7132 if (unicode_p) {
7133 if (c < 0x7F && ISPRINT(c)) {
7134 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7135 }
7136 else if (c < 0x10000) {
7137 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7138 }
7139 else {
7140 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7141 }
7142 }
7143 else {
7144 if (c < 0x100) {
7145 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7146 }
7147 else {
7148 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7149 }
7150 }
7151 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7152 rb_str_buf_cat(result, buf, l);
7153 return l;
7154}
7155
/*
 * Returns the backslash escape sequence for a control character that has a
 * dedicated short form (NUL, newline, carriage return, tab, form feed,
 * vertical tab, backspace, bell, escape, DEL), or NULL when `c` has none.
 * The returned strings are static literals.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
7173
7174VALUE
7175rb_str_escape(VALUE str)
7176{
7177 int encidx = ENCODING_GET(str);
7178 rb_encoding *enc = rb_enc_from_index(encidx);
7179 const char *p = RSTRING_PTR(str);
7180 const char *pend = RSTRING_END(str);
7181 const char *prev = p;
7182 char buf[CHAR_ESC_LEN + 1];
7183 VALUE result = rb_str_buf_new(0);
7184 int unicode_p = rb_enc_unicode_p(enc);
7185 int asciicompat = rb_enc_asciicompat(enc);
7186
7187 while (p < pend) {
7188 unsigned int c;
7189 const char *cc;
7190 int n = rb_enc_precise_mbclen(p, pend, enc);
7191 if (!MBCLEN_CHARFOUND_P(n)) {
7192 if (p > prev) str_buf_cat(result, prev, p - prev);
7193 n = rb_enc_mbminlen(enc);
7194 if (pend < p + n)
7195 n = (int)(pend - p);
7196 while (n--) {
7197 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7198 str_buf_cat(result, buf, strlen(buf));
7199 prev = ++p;
7200 }
7201 continue;
7202 }
7203 n = MBCLEN_CHARFOUND_LEN(n);
7204 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7205 p += n;
7206 cc = ruby_escaped_char(c);
7207 if (cc) {
7208 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7209 str_buf_cat(result, cc, strlen(cc));
7210 prev = p;
7211 }
7212 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7213 }
7214 else {
7215 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7216 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7217 prev = p;
7218 }
7219 }
7220 if (p > prev) str_buf_cat(result, prev, p - prev);
7221 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7222
7223 return result;
7224}
7225
7226/*
7227 * call-seq:
7228 * inspect -> string
7229 *
7230 * :include: doc/string/inspect.rdoc
7231 *
7232 */
7233
7234VALUE
7236{
7237 int encidx = ENCODING_GET(str);
7238 rb_encoding *enc = rb_enc_from_index(encidx);
7239 const char *p, *pend, *prev;
7240 char buf[CHAR_ESC_LEN + 1];
7241 VALUE result = rb_str_buf_new(0);
7242 rb_encoding *resenc = rb_default_internal_encoding();
7243 int unicode_p = rb_enc_unicode_p(enc);
7244 int asciicompat = rb_enc_asciicompat(enc);
7245
7246 if (resenc == NULL) resenc = rb_default_external_encoding();
7247 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7248 rb_enc_associate(result, resenc);
7249 str_buf_cat2(result, "\"");
7250
7251 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7252 prev = p;
7253 while (p < pend) {
7254 unsigned int c, cc;
7255 int n;
7256
7257 n = rb_enc_precise_mbclen(p, pend, enc);
7258 if (!MBCLEN_CHARFOUND_P(n)) {
7259 if (p > prev) str_buf_cat(result, prev, p - prev);
7260 n = rb_enc_mbminlen(enc);
7261 if (pend < p + n)
7262 n = (int)(pend - p);
7263 while (n--) {
7264 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7265 str_buf_cat(result, buf, strlen(buf));
7266 prev = ++p;
7267 }
7268 continue;
7269 }
7270 n = MBCLEN_CHARFOUND_LEN(n);
7271 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7272 p += n;
7273 if ((asciicompat || unicode_p) &&
7274 (c == '"'|| c == '\\' ||
7275 (c == '#' &&
7276 p < pend &&
7277 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7278 (cc = rb_enc_codepoint(p,pend,enc),
7279 (cc == '$' || cc == '@' || cc == '{'))))) {
7280 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7281 str_buf_cat2(result, "\\");
7282 if (asciicompat || enc == resenc) {
7283 prev = p - n;
7284 continue;
7285 }
7286 }
7287 switch (c) {
7288 case '\n': cc = 'n'; break;
7289 case '\r': cc = 'r'; break;
7290 case '\t': cc = 't'; break;
7291 case '\f': cc = 'f'; break;
7292 case '\013': cc = 'v'; break;
7293 case '\010': cc = 'b'; break;
7294 case '\007': cc = 'a'; break;
7295 case 033: cc = 'e'; break;
7296 default: cc = 0; break;
7297 }
7298 if (cc) {
7299 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7300 buf[0] = '\\';
7301 buf[1] = (char)cc;
7302 str_buf_cat(result, buf, 2);
7303 prev = p;
7304 continue;
7305 }
7306 /* The special casing of 0x85 (NEXT_LINE) here is because
7307 * Oniguruma historically treats it as printable, but it
7308 * doesn't match the print POSIX bracket class or character
7309 * property in regexps.
7310 *
7311 * See Ruby Bug #16842 for details:
7312 * https://bugs.ruby-lang.org/issues/16842
7313 */
7314 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7315 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7316 continue;
7317 }
7318 else {
7319 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7320 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7321 prev = p;
7322 continue;
7323 }
7324 }
7325 if (p > prev) str_buf_cat(result, prev, p - prev);
7326 str_buf_cat2(result, "\"");
7327
7328 return result;
7329}
7330
/* True when P lies before E and points at '$', '@' or '{'.  The dump code
 * uses it to decide whether a preceding '#' has to be escaped.
 * Note that P is evaluated more than once. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7332
7333/*
7334 * call-seq:
7335 * dump -> new_string
7336 *
7337 * :include: doc/string/dump.rdoc
7338 *
7339 */
7340
7341VALUE
7343{
7344 int encidx = rb_enc_get_index(str);
7345 rb_encoding *enc = rb_enc_from_index(encidx);
7346 long len;
7347 const char *p, *pend;
7348 char *q, *qend;
7349 VALUE result;
7350 int u8 = (encidx == rb_utf8_encindex());
7351 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7352
7353 len = 2; /* "" */
7354 if (!rb_enc_asciicompat(enc)) {
7355 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7356 len += strlen(enc->name);
7357 }
7358
7359 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7360 while (p < pend) {
7361 int clen;
7362 unsigned char c = *p++;
7363
7364 switch (c) {
7365 case '"': case '\\':
7366 case '\n': case '\r':
7367 case '\t': case '\f':
7368 case '\013': case '\010': case '\007': case '\033':
7369 clen = 2;
7370 break;
7371
7372 case '#':
7373 clen = IS_EVSTR(p, pend) ? 2 : 1;
7374 break;
7375
7376 default:
7377 if (ISPRINT(c)) {
7378 clen = 1;
7379 }
7380 else {
7381 if (u8 && c > 0x7F) { /* \u notation */
7382 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7383 if (MBCLEN_CHARFOUND_P(n)) {
7384 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7385 if (cc <= 0xFFFF)
7386 clen = 6; /* \uXXXX */
7387 else if (cc <= 0xFFFFF)
7388 clen = 9; /* \u{XXXXX} */
7389 else
7390 clen = 10; /* \u{XXXXXX} */
7391 p += MBCLEN_CHARFOUND_LEN(n)-1;
7392 break;
7393 }
7394 }
7395 clen = 4; /* \xNN */
7396 }
7397 break;
7398 }
7399
7400 if (clen > LONG_MAX - len) {
7401 rb_raise(rb_eRuntimeError, "string size too big");
7402 }
7403 len += clen;
7404 }
7405
7406 result = rb_str_new(0, len);
7407 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7408 q = RSTRING_PTR(result); qend = q + len + 1;
7409
7410 *q++ = '"';
7411 while (p < pend) {
7412 unsigned char c = *p++;
7413
7414 if (c == '"' || c == '\\') {
7415 *q++ = '\\';
7416 *q++ = c;
7417 }
7418 else if (c == '#') {
7419 if (IS_EVSTR(p, pend)) *q++ = '\\';
7420 *q++ = '#';
7421 }
7422 else if (c == '\n') {
7423 *q++ = '\\';
7424 *q++ = 'n';
7425 }
7426 else if (c == '\r') {
7427 *q++ = '\\';
7428 *q++ = 'r';
7429 }
7430 else if (c == '\t') {
7431 *q++ = '\\';
7432 *q++ = 't';
7433 }
7434 else if (c == '\f') {
7435 *q++ = '\\';
7436 *q++ = 'f';
7437 }
7438 else if (c == '\013') {
7439 *q++ = '\\';
7440 *q++ = 'v';
7441 }
7442 else if (c == '\010') {
7443 *q++ = '\\';
7444 *q++ = 'b';
7445 }
7446 else if (c == '\007') {
7447 *q++ = '\\';
7448 *q++ = 'a';
7449 }
7450 else if (c == '\033') {
7451 *q++ = '\\';
7452 *q++ = 'e';
7453 }
7454 else if (ISPRINT(c)) {
7455 *q++ = c;
7456 }
7457 else {
7458 *q++ = '\\';
7459 if (u8) {
7460 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7461 if (MBCLEN_CHARFOUND_P(n)) {
7462 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7463 p += n;
7464 if (cc <= 0xFFFF)
7465 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7466 else
7467 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7468 q += strlen(q);
7469 continue;
7470 }
7471 }
7472 snprintf(q, qend-q, "x%02X", c);
7473 q += 3;
7474 }
7475 }
7476 *q++ = '"';
7477 *q = '\0';
7478 if (!rb_enc_asciicompat(enc)) {
7479 snprintf(q, qend-q, nonascii_suffix, enc->name);
7480 encidx = rb_ascii8bit_encindex();
7481 }
7482 /* result from dump is ASCII */
7483 rb_enc_associate_index(result, encidx);
7485 return result;
7486}
7487
/*
 * Maps the letter of a single-character backslash escape (the `n` of `\n`
 * and so on) to the byte it stands for.  Returns -1 for any other character.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* Without an explicit value here the function would fall off its end
     * for an unrecognised letter, leaving the result undefined. */
    return -1;
}
7511
7512static void
7513undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7514{
7515 const char *s = *ss;
7516 unsigned int c;
7517 int codelen;
7518 size_t hexlen;
7519 unsigned char buf[6];
7520 static rb_encoding *enc_utf8 = NULL;
7521
7522 switch (*s) {
7523 case '\\':
7524 case '"':
7525 case '#':
7526 rb_str_cat(undumped, s, 1); /* cat itself */
7527 s++;
7528 break;
7529 case 'n':
7530 case 'r':
7531 case 't':
7532 case 'f':
7533 case 'v':
7534 case 'b':
7535 case 'a':
7536 case 'e':
7537 *buf = unescape_ascii(*s);
7538 rb_str_cat(undumped, (char *)buf, 1);
7539 s++;
7540 break;
7541 case 'u':
7542 if (*binary) {
7543 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7544 }
7545 *utf8 = true;
7546 if (++s >= s_end) {
7547 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7548 }
7549 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7550 if (*penc != enc_utf8) {
7551 *penc = enc_utf8;
7552 rb_enc_associate(undumped, enc_utf8);
7553 }
7554 if (*s == '{') { /* handle \u{...} form */
7555 s++;
7556 for (;;) {
7557 if (s >= s_end) {
7558 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7559 }
7560 if (*s == '}') {
7561 s++;
7562 break;
7563 }
7564 if (ISSPACE(*s)) {
7565 s++;
7566 continue;
7567 }
7568 c = scan_hex(s, s_end-s, &hexlen);
7569 if (hexlen == 0 || hexlen > 6) {
7570 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7571 }
7572 if (c > 0x10ffff) {
7573 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7574 }
7575 if (0xd800 <= c && c <= 0xdfff) {
7576 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7577 }
7578 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7579 rb_str_cat(undumped, (char *)buf, codelen);
7580 s += hexlen;
7581 }
7582 }
7583 else { /* handle \uXXXX form */
7584 c = scan_hex(s, 4, &hexlen);
7585 if (hexlen != 4) {
7586 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7587 }
7588 if (0xd800 <= c && c <= 0xdfff) {
7589 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7590 }
7591 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7592 rb_str_cat(undumped, (char *)buf, codelen);
7593 s += hexlen;
7594 }
7595 break;
7596 case 'x':
7597 if (*utf8) {
7598 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7599 }
7600 *binary = true;
7601 if (++s >= s_end) {
7602 rb_raise(rb_eRuntimeError, "invalid hex escape");
7603 }
7604 *buf = scan_hex(s, 2, &hexlen);
7605 if (hexlen != 2) {
7606 rb_raise(rb_eRuntimeError, "invalid hex escape");
7607 }
7608 rb_str_cat(undumped, (char *)buf, 1);
7609 s += hexlen;
7610 break;
7611 default:
7612 rb_str_cat(undumped, s-1, 2);
7613 s++;
7614 }
7615
7616 *ss = s;
7617}
7618
7619static VALUE rb_str_is_ascii_only_p(VALUE str);
7620
7621/*
7622 * call-seq:
7623 * undump -> string
7624 *
7625 * Returns an unescaped version of +self+:
7626 *
7627 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7628 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7629 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7630 * s_undumped == s_orig # => true
7631 *
7632 * Related: String#dump (inverse of String#undump).
7633 *
7634 */
7635
7636static VALUE
7637str_undump(VALUE str)
7638{
7639 const char *s = RSTRING_PTR(str);
7640 const char *s_end = RSTRING_END(str);
7641 rb_encoding *enc = rb_enc_get(str);
7642 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7643 bool utf8 = false;
7644 bool binary = false;
7645 int w;
7646
7648 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7649 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7650 }
7651 if (!str_null_check(str, &w)) {
7652 rb_raise(rb_eRuntimeError, "string contains null byte");
7653 }
7654 if (RSTRING_LEN(str) < 2) goto invalid_format;
7655 if (*s != '"') goto invalid_format;
7656
7657 /* strip '"' at the start */
7658 s++;
7659
7660 for (;;) {
7661 if (s >= s_end) {
7662 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7663 }
7664
7665 if (*s == '"') {
7666 /* epilogue */
7667 s++;
7668 if (s == s_end) {
7669 /* ascii compatible dumped string */
7670 break;
7671 }
7672 else {
7673 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7674 static const char dup_suffix[] = ".dup";
7675 const char *encname;
7676 int encidx;
7677 ptrdiff_t size;
7678
7679 /* check separately for strings dumped by older versions */
7680 size = sizeof(dup_suffix) - 1;
7681 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7682
7683 size = sizeof(force_encoding_suffix) - 1;
7684 if (s_end - s <= size) goto invalid_format;
7685 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7686 s += size;
7687
7688 if (utf8) {
7689 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7690 }
7691
7692 encname = s;
7693 s = memchr(s, '"', s_end-s);
7694 size = s - encname;
7695 if (!s) goto invalid_format;
7696 if (s_end - s != 2) goto invalid_format;
7697 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7698
7699 encidx = rb_enc_find_index2(encname, (long)size);
7700 if (encidx < 0) {
7701 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7702 }
7703 rb_enc_associate_index(undumped, encidx);
7704 }
7705 break;
7706 }
7707
7708 if (*s == '\\') {
7709 s++;
7710 if (s >= s_end) {
7711 rb_raise(rb_eRuntimeError, "invalid escape");
7712 }
7713 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7714 }
7715 else {
7716 rb_str_cat(undumped, s++, 1);
7717 }
7718 }
7719
7720 RB_GC_GUARD(str);
7721
7722 return undumped;
7723invalid_format:
7724 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7725}
7726
7727static void
7728rb_str_check_dummy_enc(rb_encoding *enc)
7729{
7730 if (rb_enc_dummy_p(enc)) {
7731 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7732 rb_enc_name(enc));
7733 }
7734}
7735
7736static rb_encoding *
7737str_true_enc(VALUE str)
7738{
7739 rb_encoding *enc = STR_ENC_GET(str);
7740 rb_str_check_dummy_enc(enc);
7741 return enc;
7742}
7743
7744static OnigCaseFoldType
7745check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7746{
7747 if (argc==0)
7748 return flags;
7749 if (argc>2)
7750 rb_raise(rb_eArgError, "too many options");
7751 if (argv[0]==sym_turkic) {
7752 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7753 if (argc==2) {
7754 if (argv[1]==sym_lithuanian)
7755 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7756 else
7757 rb_raise(rb_eArgError, "invalid second option");
7758 }
7759 }
7760 else if (argv[0]==sym_lithuanian) {
7761 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7762 if (argc==2) {
7763 if (argv[1]==sym_turkic)
7764 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7765 else
7766 rb_raise(rb_eArgError, "invalid second option");
7767 }
7768 }
7769 else if (argc>1)
7770 rb_raise(rb_eArgError, "too many options");
7771 else if (argv[0]==sym_ascii)
7772 flags |= ONIGENC_CASE_ASCII_ONLY;
7773 else if (argv[0]==sym_fold) {
7774 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7775 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7776 else
7777 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7778 }
7779 else
7780 rb_raise(rb_eArgError, "invalid option");
7781 return flags;
7782}
7783
7784static inline bool
7785case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7786{
7787 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7788 return true;
7789 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7790}
7791
/* 16 should be long enough to absorb any kind of single character length increase;
 * the value defined below is a little larger than that */
7793#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7794#ifndef CASEMAP_DEBUG
7795# define CASEMAP_DEBUG 0
7796#endif
7797
7798struct mapping_buffer;
7799typedef struct mapping_buffer {
7800 size_t capa;
7801 size_t used;
7802 struct mapping_buffer *next;
7803 OnigUChar space[FLEX_ARY_LEN];
7805
7806static void
7807mapping_buffer_free(void *p)
7808{
7809 mapping_buffer *previous_buffer;
7810 mapping_buffer *current_buffer = p;
7811 while (current_buffer) {
7812 previous_buffer = current_buffer;
7813 current_buffer = current_buffer->next;
7814 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7815 }
7816}
7817
7818static const rb_data_type_t mapping_buffer_type = {
7819 "mapping_buffer",
7820 {0, mapping_buffer_free,},
7821 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7822};
7823
7824static VALUE
7825rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7826{
7827 VALUE target;
7828
7829 const OnigUChar *source_current, *source_end;
7830 int target_length = 0;
7831 VALUE buffer_anchor;
7832 mapping_buffer *current_buffer = 0;
7833 mapping_buffer **pre_buffer;
7834 size_t buffer_count = 0;
7835 int buffer_length_or_invalid;
7836
7837 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7838
7839 source_current = (OnigUChar*)RSTRING_PTR(source);
7840 source_end = (OnigUChar*)RSTRING_END(source);
7841
7842 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7843 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7844 while (source_current < source_end) {
7845 /* increase multiplier using buffer count to converge quickly */
7846 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7847 if (CASEMAP_DEBUG) {
7848 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7849 }
7850 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7851 *pre_buffer = current_buffer;
7852 pre_buffer = &current_buffer->next;
7853 current_buffer->next = NULL;
7854 current_buffer->capa = capa;
7855 buffer_length_or_invalid = enc->case_map(flags,
7856 &source_current, source_end,
7857 current_buffer->space,
7858 current_buffer->space+current_buffer->capa,
7859 enc);
7860 if (buffer_length_or_invalid < 0) {
7861 current_buffer = DATA_PTR(buffer_anchor);
7862 DATA_PTR(buffer_anchor) = 0;
7863 mapping_buffer_free(current_buffer);
7864 rb_raise(rb_eArgError, "input string invalid");
7865 }
7866 target_length += current_buffer->used = buffer_length_or_invalid;
7867 }
7868 if (CASEMAP_DEBUG) {
7869 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7870 }
7871
7872 if (buffer_count==1) {
7873 target = rb_str_new((const char*)current_buffer->space, target_length);
7874 }
7875 else {
7876 char *target_current;
7877
7878 target = rb_str_new(0, target_length);
7879 target_current = RSTRING_PTR(target);
7880 current_buffer = DATA_PTR(buffer_anchor);
7881 while (current_buffer) {
7882 memcpy(target_current, current_buffer->space, current_buffer->used);
7883 target_current += current_buffer->used;
7884 current_buffer = current_buffer->next;
7885 }
7886 }
7887 current_buffer = DATA_PTR(buffer_anchor);
7888 DATA_PTR(buffer_anchor) = 0;
7889 mapping_buffer_free(current_buffer);
7890
7891 RB_GC_GUARD(buffer_anchor);
7892
7893 /* TODO: check about string terminator character */
7894 str_enc_copy_direct(target, source);
7895 /*ENC_CODERANGE_SET(mapped, cr);*/
7896
7897 return target;
7898}
7899
7900static VALUE
7901rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7902{
7903 const OnigUChar *source_current, *source_end;
7904 OnigUChar *target_current, *target_end;
7905 long old_length = RSTRING_LEN(source);
7906 int length_or_invalid;
7907
7908 if (old_length == 0) return Qnil;
7909
7910 source_current = (OnigUChar*)RSTRING_PTR(source);
7911 source_end = (OnigUChar*)RSTRING_END(source);
7912 if (source == target) {
7913 target_current = (OnigUChar*)source_current;
7914 target_end = (OnigUChar*)source_end;
7915 }
7916 else {
7917 target_current = (OnigUChar*)RSTRING_PTR(target);
7918 target_end = (OnigUChar*)RSTRING_END(target);
7919 }
7920
7921 length_or_invalid = onigenc_ascii_only_case_map(flags,
7922 &source_current, source_end,
7923 target_current, target_end, enc);
7924 if (length_or_invalid < 0)
7925 rb_raise(rb_eArgError, "input string invalid");
7926 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7927 fprintf(stderr, "problem with rb_str_ascii_casemap"
7928 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7929 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7930 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7931 }
7932
7933 str_enc_copy(target, source);
7934
7935 return target;
7936}
7937
7938static bool
7939upcase_single(VALUE str)
7940{
7941 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7942 bool modified = false;
7943
7944 while (s < send) {
7945 unsigned int c = *(unsigned char*)s;
7946
7947 if ('a' <= c && c <= 'z') {
7948 *s = 'A' + (c - 'a');
7949 modified = true;
7950 }
7951 s++;
7952 }
7953 return modified;
7954}
7955
7956/*
7957 * call-seq:
7958 * upcase!(mapping) -> self or nil
7959 *
7960 * Upcases the characters in +self+;
7961 * returns +self+ if any changes were made, +nil+ otherwise:
7962 *
7963 * s = 'Hello World!' # => "Hello World!"
7964 * s.upcase! # => "HELLO WORLD!"
7965 * s # => "HELLO WORLD!"
7966 * s.upcase! # => nil
7967 *
7968 * The casing may be affected by the given +mapping+;
7969 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7970 *
7971 * Related: String#upcase, String#downcase, String#downcase!.
7972 *
7973 */
7974
7975static VALUE
7976rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7977{
7978 rb_encoding *enc;
7979 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7980
7981 flags = check_case_options(argc, argv, flags);
7982 str_modify_keep_cr(str);
7983 enc = str_true_enc(str);
7984 if (case_option_single_p(flags, enc, str)) {
7985 if (upcase_single(str))
7986 flags |= ONIGENC_CASE_MODIFIED;
7987 }
7988 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7989 rb_str_ascii_casemap(str, str, &flags, enc);
7990 else
7991 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7992
7993 if (ONIGENC_CASE_MODIFIED&flags) return str;
7994 return Qnil;
7995}
7996
7997
7998/*
7999 * call-seq:
8000 * upcase(mapping) -> string
8001 *
8002 * Returns a string containing the upcased characters in +self+:
8003 *
8004 * s = 'Hello World!' # => "Hello World!"
8005 * s.upcase # => "HELLO WORLD!"
8006 *
8007 * The casing may be affected by the given +mapping+;
8008 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8009 *
8010 * Related: String#upcase!, String#downcase, String#downcase!.
8011 *
8012 */
8013
8014static VALUE
8015rb_str_upcase(int argc, VALUE *argv, VALUE str)
8016{
8017 rb_encoding *enc;
8018 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8019 VALUE ret;
8020
8021 flags = check_case_options(argc, argv, flags);
8022 enc = str_true_enc(str);
8023 if (case_option_single_p(flags, enc, str)) {
8024 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8025 str_enc_copy_direct(ret, str);
8026 upcase_single(ret);
8027 }
8028 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8029 ret = rb_str_new(0, RSTRING_LEN(str));
8030 rb_str_ascii_casemap(str, ret, &flags, enc);
8031 }
8032 else {
8033 ret = rb_str_casemap(str, &flags, enc);
8034 }
8035
8036 return ret;
8037}
8038
8039static bool
8040downcase_single(VALUE str)
8041{
8042 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8043 bool modified = false;
8044
8045 while (s < send) {
8046 unsigned int c = *(unsigned char*)s;
8047
8048 if ('A' <= c && c <= 'Z') {
8049 *s = 'a' + (c - 'A');
8050 modified = true;
8051 }
8052 s++;
8053 }
8054
8055 return modified;
8056}
8057
8058/*
8059 * call-seq:
8060 * downcase!(mapping) -> self or nil
8061 *
8062 * Like String#downcase, except that:
8063 *
8064 * - Changes character casings in +self+ (not in a copy of +self+).
8065 * - Returns +self+ if any changes are made, +nil+ otherwise.
8066 *
8067 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8068 */
8069
8070static VALUE
8071rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8072{
8073 rb_encoding *enc;
8074 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8075
8076 flags = check_case_options(argc, argv, flags);
8077 str_modify_keep_cr(str);
8078 enc = str_true_enc(str);
8079 if (case_option_single_p(flags, enc, str)) {
8080 if (downcase_single(str))
8081 flags |= ONIGENC_CASE_MODIFIED;
8082 }
8083 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8084 rb_str_ascii_casemap(str, str, &flags, enc);
8085 else
8086 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8087
8088 if (ONIGENC_CASE_MODIFIED&flags) return str;
8089 return Qnil;
8090}
8091
8092
8093/*
8094 * call-seq:
8095 * downcase(mapping) -> string
8096 *
8097 * :include: doc/string/downcase.rdoc
8098 *
8099 */
8100
8101static VALUE
8102rb_str_downcase(int argc, VALUE *argv, VALUE str)
8103{
8104 rb_encoding *enc;
8105 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8106 VALUE ret;
8107
8108 flags = check_case_options(argc, argv, flags);
8109 enc = str_true_enc(str);
8110 if (case_option_single_p(flags, enc, str)) {
8111 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8112 str_enc_copy_direct(ret, str);
8113 downcase_single(ret);
8114 }
8115 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8116 ret = rb_str_new(0, RSTRING_LEN(str));
8117 rb_str_ascii_casemap(str, ret, &flags, enc);
8118 }
8119 else {
8120 ret = rb_str_casemap(str, &flags, enc);
8121 }
8122
8123 return ret;
8124}
8125
8126
8127/*
8128 * call-seq:
8129 * capitalize!(mapping = :ascii) -> self or nil
8130 *
8131 * Like String#capitalize, except that:
8132 *
8133 * - Changes character casings in +self+ (not in a copy of +self+).
8134 * - Returns +self+ if any changes are made, +nil+ otherwise.
8135 *
8136 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8137 */
8138
8139static VALUE
8140rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8141{
8142 rb_encoding *enc;
8143 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8144
8145 flags = check_case_options(argc, argv, flags);
8146 str_modify_keep_cr(str);
8147 enc = str_true_enc(str);
8148 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8149 if (flags&ONIGENC_CASE_ASCII_ONLY)
8150 rb_str_ascii_casemap(str, str, &flags, enc);
8151 else
8152 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8153
8154 if (ONIGENC_CASE_MODIFIED&flags) return str;
8155 return Qnil;
8156}
8157
8158
8159/*
8160 * call-seq:
8161 * capitalize(mapping = :ascii) -> string
8162 *
8163 * Returns a string containing the characters in +self+,
8164 * each with possibly changed case:
8165 *
8166 * - The first character is upcased.
8167 * - All other characters are downcased.
8168 *
8169 * Examples:
8170 *
8171 * 'hello world'.capitalize # => "Hello world"
8172 * 'HELLO WORLD'.capitalize # => "Hello world"
8173 *
8174 * Some characters do not have upcase and downcase, and so are not changed;
8175 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8176 *
8177 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8178 *
8179 * The casing is affected by the given +mapping+,
8180 * which may be +:ascii+, +:fold+, or +:turkic+;
8181 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8182 *
8183 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8184 */
8185
8186static VALUE
8187rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8188{
8189 rb_encoding *enc;
8190 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8191 VALUE ret;
8192
8193 flags = check_case_options(argc, argv, flags);
8194 enc = str_true_enc(str);
8195 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8196 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8197 ret = rb_str_new(0, RSTRING_LEN(str));
8198 rb_str_ascii_casemap(str, ret, &flags, enc);
8199 }
8200 else {
8201 ret = rb_str_casemap(str, &flags, enc);
8202 }
8203 return ret;
8204}
8205
8206
8207/*
8208 * call-seq:
8209 * swapcase!(mapping) -> self or nil
8210 *
8211 * Like String#swapcase, except that:
8212 *
8213 * - Changes are made to +self+, not to copy of +self+.
8214 * - Returns +self+ if any changes are made, +nil+ otherwise.
8215 *
8216 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8217 */
8218
8219static VALUE
8220rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8221{
8222 rb_encoding *enc;
8223 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8224
8225 flags = check_case_options(argc, argv, flags);
8226 str_modify_keep_cr(str);
8227 enc = str_true_enc(str);
8228 if (flags&ONIGENC_CASE_ASCII_ONLY)
8229 rb_str_ascii_casemap(str, str, &flags, enc);
8230 else
8231 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8232
8233 if (ONIGENC_CASE_MODIFIED&flags) return str;
8234 return Qnil;
8235}
8236
8237
8238/*
8239 * call-seq:
8240 * swapcase(mapping) -> new_string
8241 *
8242 * :include: doc/string/swapcase.rdoc
8243 *
8244 */
8245
8246static VALUE
8247rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8248{
8249 rb_encoding *enc;
8250 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8251 VALUE ret;
8252
8253 flags = check_case_options(argc, argv, flags);
8254 enc = str_true_enc(str);
8255 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8256 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8257 ret = rb_str_new(0, RSTRING_LEN(str));
8258 rb_str_ascii_casemap(str, ret, &flags, enc);
8259 }
8260 else {
8261 ret = rb_str_casemap(str, &flags, enc);
8262 }
8263 return ret;
8264}
8265
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while the characters of a range are being produced */
    unsigned int now;   /* most recently produced code point */
    unsigned int max;   /* last code point of the range being produced */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8273
8274static unsigned int
8275trnext(struct tr *t, rb_encoding *enc)
8276{
8277 int n;
8278
8279 for (;;) {
8280 nextpart:
8281 if (!t->gen) {
8282 if (t->p == t->pend) return -1;
8283 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8284 t->p += n;
8285 }
8286 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8287 t->p += n;
8288 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8289 t->p += n;
8290 if (t->p < t->pend) {
8291 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8292 t->p += n;
8293 if (t->now > c) {
8294 if (t->now < 0x80 && c < 0x80) {
8295 rb_raise(rb_eArgError,
8296 "invalid range \"%c-%c\" in string transliteration",
8297 t->now, c);
8298 }
8299 else {
8300 rb_raise(rb_eArgError, "invalid range in string transliteration");
8301 }
8302 continue; /* not reached */
8303 }
8304 else if (t->now < c) {
8305 t->gen = 1;
8306 t->max = c;
8307 }
8308 }
8309 }
8310 return t->now;
8311 }
8312 else {
8313 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8314 if (t->now == t->max) {
8315 t->gen = 0;
8316 goto nextpart;
8317 }
8318 }
8319 if (t->now < t->max) {
8320 return t->now;
8321 }
8322 else {
8323 t->gen = 0;
8324 return t->max;
8325 }
8326 }
8327 }
8328}
8329
8330static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8331
8332static VALUE
8333tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8334{
8335 const unsigned int errc = -1;
8336 unsigned int trans[256];
8337 rb_encoding *enc, *e1, *e2;
8338 struct tr trsrc, trrepl;
8339 int cflag = 0;
8340 unsigned int c, c0, last = 0;
8341 int modify = 0, i, l;
8342 unsigned char *s, *send;
8343 VALUE hash = 0;
8344 int singlebyte = single_byte_optimizable(str);
8345 int termlen;
8346 int cr;
8347
8348#define CHECK_IF_ASCII(c) \
8349 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8350 (cr = ENC_CODERANGE_VALID) : 0)
8351
8352 StringValue(src);
8353 StringValue(repl);
8354 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8355 if (RSTRING_LEN(repl) == 0) {
8356 return rb_str_delete_bang(1, &src, str);
8357 }
8358
8359 cr = ENC_CODERANGE(str);
8360 e1 = rb_enc_check(str, src);
8361 e2 = rb_enc_check(str, repl);
8362 if (e1 == e2) {
8363 enc = e1;
8364 }
8365 else {
8366 enc = rb_enc_check(src, repl);
8367 }
8368 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8369 if (RSTRING_LEN(src) > 1 &&
8370 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8371 trsrc.p + l < trsrc.pend) {
8372 cflag = 1;
8373 trsrc.p += l;
8374 }
8375 trrepl.p = RSTRING_PTR(repl);
8376 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8377 trsrc.gen = trrepl.gen = 0;
8378 trsrc.now = trrepl.now = 0;
8379 trsrc.max = trrepl.max = 0;
8380
8381 if (cflag) {
8382 for (i=0; i<256; i++) {
8383 trans[i] = 1;
8384 }
8385 while ((c = trnext(&trsrc, enc)) != errc) {
8386 if (c < 256) {
8387 trans[c] = errc;
8388 }
8389 else {
8390 if (!hash) hash = rb_hash_new();
8391 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8392 }
8393 }
8394 while ((c = trnext(&trrepl, enc)) != errc)
8395 /* retrieve last replacer */;
8396 last = trrepl.now;
8397 for (i=0; i<256; i++) {
8398 if (trans[i] != errc) {
8399 trans[i] = last;
8400 }
8401 }
8402 }
8403 else {
8404 unsigned int r;
8405
8406 for (i=0; i<256; i++) {
8407 trans[i] = errc;
8408 }
8409 while ((c = trnext(&trsrc, enc)) != errc) {
8410 r = trnext(&trrepl, enc);
8411 if (r == errc) r = trrepl.now;
8412 if (c < 256) {
8413 trans[c] = r;
8414 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8415 }
8416 else {
8417 if (!hash) hash = rb_hash_new();
8418 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8419 }
8420 }
8421 }
8422
8423 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8424 cr = ENC_CODERANGE_7BIT;
8425 str_modify_keep_cr(str);
8426 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8427 termlen = rb_enc_mbminlen(enc);
8428 if (sflag) {
8429 int clen, tlen;
8430 long offset, max = RSTRING_LEN(str);
8431 unsigned int save = -1;
8432 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8433
8434 while (s < send) {
8435 int may_modify = 0;
8436
8437 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8438 if (!MBCLEN_CHARFOUND_P(r)) {
8439 xfree(buf);
8440 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8441 }
8442 clen = MBCLEN_CHARFOUND_LEN(r);
8443 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8444
8445 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8446
8447 s += clen;
8448 if (c < 256) {
8449 c = trans[c];
8450 }
8451 else if (hash) {
8452 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8453 if (NIL_P(tmp)) {
8454 if (cflag) c = last;
8455 else c = errc;
8456 }
8457 else if (cflag) c = errc;
8458 else c = NUM2INT(tmp);
8459 }
8460 else {
8461 c = errc;
8462 }
8463 if (c != (unsigned int)-1) {
8464 if (save == c) {
8465 CHECK_IF_ASCII(c);
8466 continue;
8467 }
8468 save = c;
8469 tlen = rb_enc_codelen(c, enc);
8470 modify = 1;
8471 }
8472 else {
8473 save = -1;
8474 c = c0;
8475 if (enc != e1) may_modify = 1;
8476 }
8477 if ((offset = t - buf) + tlen > max) {
8478 size_t MAYBE_UNUSED(old) = max + termlen;
8479 max = offset + tlen + (send - s);
8480 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8481 t = buf + offset;
8482 }
8483 rb_enc_mbcput(c, t, enc);
8484 if (may_modify && memcmp(s, t, tlen) != 0) {
8485 modify = 1;
8486 }
8487 CHECK_IF_ASCII(c);
8488 t += tlen;
8489 }
8490 if (!STR_EMBED_P(str)) {
8491 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8492 }
8493 TERM_FILL((char *)t, termlen);
8494 RSTRING(str)->as.heap.ptr = (char *)buf;
8495 STR_SET_LEN(str, t - buf);
8496 STR_SET_NOEMBED(str);
8497 RSTRING(str)->as.heap.aux.capa = max;
8498 }
8499 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8500 while (s < send) {
8501 c = (unsigned char)*s;
8502 if (trans[c] != errc) {
8503 if (!cflag) {
8504 c = trans[c];
8505 *s = c;
8506 modify = 1;
8507 }
8508 else {
8509 *s = last;
8510 modify = 1;
8511 }
8512 }
8513 CHECK_IF_ASCII(c);
8514 s++;
8515 }
8516 }
8517 else {
8518 int clen, tlen;
8519 long offset, max = (long)((send - s) * 1.2);
8520 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8521
8522 while (s < send) {
8523 int may_modify = 0;
8524
8525 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8526 if (!MBCLEN_CHARFOUND_P(r)) {
8527 xfree(buf);
8528 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8529 }
8530 clen = MBCLEN_CHARFOUND_LEN(r);
8531 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8532
8533 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8534
8535 if (c < 256) {
8536 c = trans[c];
8537 }
8538 else if (hash) {
8539 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8540 if (NIL_P(tmp)) {
8541 if (cflag) c = last;
8542 else c = errc;
8543 }
8544 else if (cflag) c = errc;
8545 else c = NUM2INT(tmp);
8546 }
8547 else {
8548 c = cflag ? last : errc;
8549 }
8550 if (c != errc) {
8551 tlen = rb_enc_codelen(c, enc);
8552 modify = 1;
8553 }
8554 else {
8555 c = c0;
8556 if (enc != e1) may_modify = 1;
8557 }
8558 if ((offset = t - buf) + tlen > max) {
8559 size_t MAYBE_UNUSED(old) = max + termlen;
8560 max = offset + tlen + (long)((send - s) * 1.2);
8561 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8562 t = buf + offset;
8563 }
8564 if (s != t) {
8565 rb_enc_mbcput(c, t, enc);
8566 if (may_modify && memcmp(s, t, tlen) != 0) {
8567 modify = 1;
8568 }
8569 }
8570 CHECK_IF_ASCII(c);
8571 s += clen;
8572 t += tlen;
8573 }
8574 if (!STR_EMBED_P(str)) {
8575 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8576 }
8577 TERM_FILL((char *)t, termlen);
8578 RSTRING(str)->as.heap.ptr = (char *)buf;
8579 STR_SET_LEN(str, t - buf);
8580 STR_SET_NOEMBED(str);
8581 RSTRING(str)->as.heap.aux.capa = max;
8582 }
8583
8584 if (modify) {
8585 if (cr != ENC_CODERANGE_BROKEN)
8586 ENC_CODERANGE_SET(str, cr);
8587 rb_enc_associate(str, enc);
8588 return str;
8589 }
8590 return Qnil;
8591}
8592
8593
8594/*
8595 * call-seq:
8596 * tr!(selector, replacements) -> self or nil
8597 *
8598 * Like String#tr, but modifies +self+ in place.
8599 * Returns +self+ if any changes were made, +nil+ otherwise.
8600 *
8601 */
8602
8603static VALUE
8604rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8605{
8606 return tr_trans(str, src, repl, 0);
8607}
8608
8609
8610/*
8611 * call-seq:
8612 * tr(selector, replacements) -> new_string
8613 *
8614 * Returns a copy of +self+ with each character specified by string +selector+
8615 * translated to the corresponding character in string +replacements+.
8616 * The correspondence is _positional_:
8617 *
8618 * - Each occurrence of the first character specified by +selector+
8619 * is translated to the first character in +replacements+.
8620 * - Each occurrence of the second character specified by +selector+
8621 * is translated to the second character in +replacements+.
8622 * - And so on.
8623 *
8624 * Example:
8625 *
8626 * 'hello'.tr('el', 'ip') #=> "hippo"
8627 *
8628 * If +replacements+ is shorter than +selector+,
8629 * it is implicitly padded with its own last character:
8630 *
8631 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8632 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8633 *
8634 * Arguments +selector+ and +replacements+ must be valid character selectors
8635 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8636 * and may use any of its valid forms, including negation, ranges, and escaping:
8637 *
8638 * # Negation.
8639 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8640 * # Ranges.
8641 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8642 * # Escapes.
8643 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8644 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8645 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8646 *
8647 */
8648
8649static VALUE
8650rb_str_tr(VALUE str, VALUE src, VALUE repl)
8651{
8652 str = str_duplicate(rb_cString, str);
8653 tr_trans(str, src, repl, 0);
8654 return str;
8655}
8656
8657#define TR_TABLE_MAX (UCHAR_MAX+1)
8658#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8659static void
8660tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8661 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8662{
8663 const unsigned int errc = -1;
8664 char buf[TR_TABLE_MAX];
8665 struct tr tr;
8666 unsigned int c;
8667 VALUE table = 0, ptable = 0;
8668 int i, l, cflag = 0;
8669
8670 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8671 tr.gen = tr.now = tr.max = 0;
8672
8673 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8674 cflag = 1;
8675 tr.p += l;
8676 }
8677 if (first) {
8678 for (i=0; i<TR_TABLE_MAX; i++) {
8679 stable[i] = 1;
8680 }
8681 stable[TR_TABLE_MAX] = cflag;
8682 }
8683 else if (stable[TR_TABLE_MAX] && !cflag) {
8684 stable[TR_TABLE_MAX] = 0;
8685 }
8686 for (i=0; i<TR_TABLE_MAX; i++) {
8687 buf[i] = cflag;
8688 }
8689
8690 while ((c = trnext(&tr, enc)) != errc) {
8691 if (c < TR_TABLE_MAX) {
8692 buf[(unsigned char)c] = !cflag;
8693 }
8694 else {
8695 VALUE key = UINT2NUM(c);
8696
8697 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8698 if (cflag) {
8699 ptable = *ctablep;
8700 table = ptable ? ptable : rb_hash_new();
8701 *ctablep = table;
8702 }
8703 else {
8704 table = rb_hash_new();
8705 ptable = *tablep;
8706 *tablep = table;
8707 }
8708 }
8709 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8710 rb_hash_aset(table, key, Qtrue);
8711 }
8712 }
8713 }
8714 for (i=0; i<TR_TABLE_MAX; i++) {
8715 stable[i] = stable[i] && buf[i];
8716 }
8717 if (!table && !cflag) {
8718 *tablep = 0;
8719 }
8720}
8721
8722
8723static int
8724tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8725{
8726 if (c < TR_TABLE_MAX) {
8727 return table[c] != 0;
8728 }
8729 else {
8730 VALUE v = UINT2NUM(c);
8731
8732 if (del) {
8733 if (!NIL_P(rb_hash_lookup(del, v)) &&
8734 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8735 return TRUE;
8736 }
8737 }
8738 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8739 return FALSE;
8740 }
8741 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8742 }
8743}
8744
8745/*
8746 * call-seq:
8747 * delete!(*selectors) -> self or nil
8748 *
8749 * Like String#delete, but modifies +self+ in place;
8750 * returns +self+ if any characters were deleted, +nil+ otherwise.
8751 *
8752 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8753 */
8754
8755static VALUE
8756rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8757{
8758 char squeez[TR_TABLE_SIZE];
8759 rb_encoding *enc = 0;
8760 char *s, *send, *t;
8761 VALUE del = 0, nodel = 0;
8762 int modify = 0;
8763 int i, ascompat, cr;
8764
8765 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8767 for (i=0; i<argc; i++) {
8768 VALUE s = argv[i];
8769
8770 StringValue(s);
8771 enc = rb_enc_check(str, s);
8772 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8773 }
8774
8775 str_modify_keep_cr(str);
8776 ascompat = rb_enc_asciicompat(enc);
8777 s = t = RSTRING_PTR(str);
8778 send = RSTRING_END(str);
8779 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8780 while (s < send) {
8781 unsigned int c;
8782 int clen;
8783
8784 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8785 if (squeez[c]) {
8786 modify = 1;
8787 }
8788 else {
8789 if (t != s) *t = c;
8790 t++;
8791 }
8792 s++;
8793 }
8794 else {
8795 c = rb_enc_codepoint_len(s, send, &clen, enc);
8796
8797 if (tr_find(c, squeez, del, nodel)) {
8798 modify = 1;
8799 }
8800 else {
8801 if (t != s) rb_enc_mbcput(c, t, enc);
8802 t += clen;
8804 }
8805 s += clen;
8806 }
8807 }
8808 TERM_FILL(t, TERM_LEN(str));
8809 STR_SET_LEN(str, t - RSTRING_PTR(str));
8810 ENC_CODERANGE_SET(str, cr);
8811
8812 if (modify) return str;
8813 return Qnil;
8814}
8815
8816
8817/*
8818 * call-seq:
8819 * delete(*selectors) -> new_string
8820 *
8821 * :include: doc/string/delete.rdoc
8822 *
8823 */
8824
8825static VALUE
8826rb_str_delete(int argc, VALUE *argv, VALUE str)
8827{
8828 str = str_duplicate(rb_cString, str);
8829 rb_str_delete_bang(argc, argv, str);
8830 return str;
8831}
8832
8833
8834/*
8835 * call-seq:
8836 * squeeze!(*selectors) -> self or nil
8837 *
8838 * Like String#squeeze, except that:
8839 *
8840 * - Characters are squeezed in +self+ (not in a copy of +self+).
8841 * - Returns +self+ if any changes are made, +nil+ otherwise.
8842 *
8843 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8844 */
8845
8846static VALUE
8847rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8848{
8849 char squeez[TR_TABLE_SIZE];
8850 rb_encoding *enc = 0;
8851 VALUE del = 0, nodel = 0;
8852 unsigned char *s, *send, *t;
8853 int i, modify = 0;
8854 int ascompat, singlebyte = single_byte_optimizable(str);
8855 unsigned int save;
8856
8857 if (argc == 0) {
8858 enc = STR_ENC_GET(str);
8859 }
8860 else {
8861 for (i=0; i<argc; i++) {
8862 VALUE s = argv[i];
8863
8864 StringValue(s);
8865 enc = rb_enc_check(str, s);
8866 if (singlebyte && !single_byte_optimizable(s))
8867 singlebyte = 0;
8868 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8869 }
8870 }
8871
8872 str_modify_keep_cr(str);
8873 s = t = (unsigned char *)RSTRING_PTR(str);
8874 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8875 send = (unsigned char *)RSTRING_END(str);
8876 save = -1;
8877 ascompat = rb_enc_asciicompat(enc);
8878
8879 if (singlebyte) {
8880 while (s < send) {
8881 unsigned int c = *s++;
8882 if (c != save || (argc > 0 && !squeez[c])) {
8883 *t++ = save = c;
8884 }
8885 }
8886 }
8887 else {
8888 while (s < send) {
8889 unsigned int c;
8890 int clen;
8891
8892 if (ascompat && (c = *s) < 0x80) {
8893 if (c != save || (argc > 0 && !squeez[c])) {
8894 *t++ = save = c;
8895 }
8896 s++;
8897 }
8898 else {
8899 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8900
8901 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8902 if (t != s) rb_enc_mbcput(c, t, enc);
8903 save = c;
8904 t += clen;
8905 }
8906 s += clen;
8907 }
8908 }
8909 }
8910
8911 TERM_FILL((char *)t, TERM_LEN(str));
8912 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8913 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8914 modify = 1;
8915 }
8916
8917 if (modify) return str;
8918 return Qnil;
8919}
8920
8921
8922/*
8923 * call-seq:
8924 * squeeze(*selectors) -> new_string
8925 *
8926 * :include: doc/string/squeeze.rdoc
8927 *
8928 */
8929
8930static VALUE
8931rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8932{
8933 str = str_duplicate(rb_cString, str);
8934 rb_str_squeeze_bang(argc, argv, str);
8935 return str;
8936}
8937
8938
8939/*
8940 * call-seq:
8941 * tr_s!(selector, replacements) -> self or nil
8942 *
8943 * Like String#tr_s, but modifies +self+ in place.
8944 * Returns +self+ if any changes were made, +nil+ otherwise.
8945 *
8946 * Related: String#squeeze!.
8947 */
8948
8949static VALUE
8950rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8951{
8952 return tr_trans(str, src, repl, 1);
8953}
8954
8955
8956/*
8957 * call-seq:
8958 * tr_s(selector, replacements) -> string
8959 *
8960 * Like String#tr, but also squeezes the modified portions of the translated string;
8961 * returns a new string (translated and squeezed).
8962 *
8963 * 'hello'.tr_s('l', 'r') #=> "hero"
8964 * 'hello'.tr_s('el', '-') #=> "h-o"
8965 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8966 *
8967 * Related: String#squeeze.
8968 *
8969 */
8970
8971static VALUE
8972rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8973{
8974 str = str_duplicate(rb_cString, str);
8975 tr_trans(str, src, repl, 1);
8976 return str;
8977}
8978
8979
8980/*
8981 * call-seq:
8982 * count(*selectors) -> integer
8983 *
8984 * :include: doc/string/count.rdoc
8985 */
8986
8987static VALUE
8988rb_str_count(int argc, VALUE *argv, VALUE str)
8989{
8990 char table[TR_TABLE_SIZE];
8991 rb_encoding *enc = 0;
8992 VALUE del = 0, nodel = 0, tstr;
8993 char *s, *send;
8994 int i;
8995 int ascompat;
8996 size_t n = 0;
8997
8999
9000 tstr = argv[0];
9001 StringValue(tstr);
9002 enc = rb_enc_check(str, tstr);
9003 if (argc == 1) {
9004 const char *ptstr;
9005 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9006 (ptstr = RSTRING_PTR(tstr),
9007 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9008 !is_broken_string(str)) {
9009 int clen;
9010 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9011
9012 s = RSTRING_PTR(str);
9013 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9014 send = RSTRING_END(str);
9015 while (s < send) {
9016 if (*(unsigned char*)s++ == c) n++;
9017 }
9018 return SIZET2NUM(n);
9019 }
9020 }
9021
9022 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9023 for (i=1; i<argc; i++) {
9024 tstr = argv[i];
9025 StringValue(tstr);
9026 enc = rb_enc_check(str, tstr);
9027 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9028 }
9029
9030 s = RSTRING_PTR(str);
9031 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9032 send = RSTRING_END(str);
9033 ascompat = rb_enc_asciicompat(enc);
9034 while (s < send) {
9035 unsigned int c;
9036
9037 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9038 if (table[c]) {
9039 n++;
9040 }
9041 s++;
9042 }
9043 else {
9044 int clen;
9045 c = rb_enc_codepoint_len(s, send, &clen, enc);
9046 if (tr_find(c, table, del, nodel)) {
9047 n++;
9048 }
9049 s += clen;
9050 }
9051 }
9052
9053 return SIZET2NUM(n);
9054}
9055
9056static VALUE
9057rb_fs_check(VALUE val)
9058{
9059 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9060 val = rb_check_string_type(val);
9061 if (NIL_P(val)) return 0;
9062 }
9063 return val;
9064}
9065
/*
 * Byte-indexed whitespace lookup: 1 for the six ASCII whitespace bytes,
 * 0 (by default initialisation) for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9086
9087static long
9088split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9089{
9090 if (empty_count >= 0 && len == 0) {
9091 return empty_count + 1;
9092 }
9093 if (empty_count > 0) {
9094 /* make different substrings */
9095 if (result) {
9096 do {
9097 rb_ary_push(result, str_new_empty_String(str));
9098 } while (--empty_count > 0);
9099 }
9100 else {
9101 do {
9102 rb_yield(str_new_empty_String(str));
9103 } while (--empty_count > 0);
9104 }
9105 }
9106 str = rb_str_subseq(str, beg, len);
9107 if (result) {
9108 rb_ary_push(result, str);
9109 }
9110 else {
9111 rb_yield(str);
9112 }
9113 return empty_count;
9114}
9115
/* Strategy chosen by rb_str_split_m for locating field boundaries. */
typedef enum {
    SPLIT_TYPE_AWK,    /* split on runs of whitespace */
    SPLIT_TYPE_STRING, /* split on a literal string separator */
    SPLIT_TYPE_REGEXP, /* split on regexp matches */
    SPLIT_TYPE_CHARS   /* empty separator: split into characters */
} split_type_t;
9119
9120static split_type_t
9121literal_split_pattern(VALUE spat, split_type_t default_type)
9122{
9123 rb_encoding *enc = STR_ENC_GET(spat);
9124 const char *ptr;
9125 long len;
9126 RSTRING_GETMEM(spat, ptr, len);
9127 if (len == 0) {
9128 /* Special case - split into chars */
9129 return SPLIT_TYPE_CHARS;
9130 }
9131 else if (rb_enc_asciicompat(enc)) {
9132 if (len == 1 && ptr[0] == ' ') {
9133 return SPLIT_TYPE_AWK;
9134 }
9135 }
9136 else {
9137 int l;
9138 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9139 return SPLIT_TYPE_AWK;
9140 }
9141 }
9142 return default_type;
9143}
9144
9145/*
9146 * call-seq:
9147 * split(field_sep = $;, limit = 0) -> array_of_substrings
9148 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9149 *
9150 * :include: doc/string/split.rdoc
9151 *
9152 */
9153
9154static VALUE
9155rb_str_split_m(int argc, VALUE *argv, VALUE str)
9156{
9157 rb_encoding *enc;
9158 VALUE spat;
9159 VALUE limit;
9160 split_type_t split_type;
9161 long beg, end, i = 0, empty_count = -1;
9162 int lim = 0;
9163 VALUE result, tmp;
9164
9165 result = rb_block_given_p() ? Qfalse : Qnil;
9166 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9167 lim = NUM2INT(limit);
9168 if (lim <= 0) limit = Qnil;
9169 else if (lim == 1) {
9170 if (RSTRING_LEN(str) == 0)
9171 return result ? rb_ary_new2(0) : str;
9172 tmp = str_duplicate(rb_cString, str);
9173 if (!result) {
9174 rb_yield(tmp);
9175 return str;
9176 }
9177 return rb_ary_new3(1, tmp);
9178 }
9179 i = 1;
9180 }
9181 if (NIL_P(limit) && !lim) empty_count = 0;
9182
9183 enc = STR_ENC_GET(str);
9184 split_type = SPLIT_TYPE_REGEXP;
9185 if (!NIL_P(spat)) {
9186 spat = get_pat_quoted(spat, 0);
9187 }
9188 else if (NIL_P(spat = rb_fs)) {
9189 split_type = SPLIT_TYPE_AWK;
9190 }
9191 else if (!(spat = rb_fs_check(spat))) {
9192 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9193 }
9194 else {
9195 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9196 }
9197 if (split_type != SPLIT_TYPE_AWK) {
9198 switch (BUILTIN_TYPE(spat)) {
9199 case T_REGEXP:
9200 rb_reg_options(spat); /* check if uninitialized */
9201 tmp = RREGEXP_SRC(spat);
9202 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9203 if (split_type == SPLIT_TYPE_AWK) {
9204 spat = tmp;
9205 split_type = SPLIT_TYPE_STRING;
9206 }
9207 break;
9208
9209 case T_STRING:
9210 mustnot_broken(spat);
9211 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9212 break;
9213
9214 default:
9216 }
9217 }
9218
9219#define SPLIT_STR(beg, len) ( \
9220 empty_count = split_string(result, str, beg, len, empty_count), \
9221 str_mod_check(str, str_start, str_len))
9222
9223 beg = 0;
9224 char *ptr = RSTRING_PTR(str);
9225 char *const str_start = ptr;
9226 const long str_len = RSTRING_LEN(str);
9227 char *const eptr = str_start + str_len;
9228 if (split_type == SPLIT_TYPE_AWK) {
9229 char *bptr = ptr;
9230 int skip = 1;
9231 unsigned int c;
9232
9233 if (result) result = rb_ary_new();
9234 end = beg;
9235 if (is_ascii_string(str)) {
9236 while (ptr < eptr) {
9237 c = (unsigned char)*ptr++;
9238 if (skip) {
9239 if (ascii_isspace(c)) {
9240 beg = ptr - bptr;
9241 }
9242 else {
9243 end = ptr - bptr;
9244 skip = 0;
9245 if (!NIL_P(limit) && lim <= i) break;
9246 }
9247 }
9248 else if (ascii_isspace(c)) {
9249 SPLIT_STR(beg, end-beg);
9250 skip = 1;
9251 beg = ptr - bptr;
9252 if (!NIL_P(limit)) ++i;
9253 }
9254 else {
9255 end = ptr - bptr;
9256 }
9257 }
9258 }
9259 else {
9260 while (ptr < eptr) {
9261 int n;
9262
9263 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9264 ptr += n;
9265 if (skip) {
9266 if (rb_isspace(c)) {
9267 beg = ptr - bptr;
9268 }
9269 else {
9270 end = ptr - bptr;
9271 skip = 0;
9272 if (!NIL_P(limit) && lim <= i) break;
9273 }
9274 }
9275 else if (rb_isspace(c)) {
9276 SPLIT_STR(beg, end-beg);
9277 skip = 1;
9278 beg = ptr - bptr;
9279 if (!NIL_P(limit)) ++i;
9280 }
9281 else {
9282 end = ptr - bptr;
9283 }
9284 }
9285 }
9286 }
9287 else if (split_type == SPLIT_TYPE_STRING) {
9288 char *substr_start = ptr;
9289 char *sptr = RSTRING_PTR(spat);
9290 long slen = RSTRING_LEN(spat);
9291
9292 if (result) result = rb_ary_new();
9293 mustnot_broken(str);
9294 enc = rb_enc_check(str, spat);
9295 while (ptr < eptr &&
9296 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9297 /* Check we are at the start of a char */
9298 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9299 if (t != ptr + end) {
9300 ptr = t;
9301 continue;
9302 }
9303 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9304 str_mod_check(spat, sptr, slen);
9305 ptr += end + slen;
9306 substr_start = ptr;
9307 if (!NIL_P(limit) && lim <= ++i) break;
9308 }
9309 beg = ptr - str_start;
9310 }
9311 else if (split_type == SPLIT_TYPE_CHARS) {
9312 int n;
9313
9314 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9315 mustnot_broken(str);
9316 enc = rb_enc_get(str);
9317 while (ptr < eptr &&
9318 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9319 SPLIT_STR(ptr - str_start, n);
9320 ptr += n;
9321 if (!NIL_P(limit) && lim <= ++i) break;
9322 }
9323 beg = ptr - str_start;
9324 }
9325 else {
9326 if (result) result = rb_ary_new();
9327 long len = RSTRING_LEN(str);
9328 long start = beg;
9329 long idx;
9330 int last_null = 0;
9331 struct re_registers *regs;
9332 VALUE match = 0;
9333
9334 for (; rb_reg_search(spat, str, start, 0) >= 0;
9335 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9336 match = rb_backref_get();
9337 if (!result) rb_match_busy(match);
9338 regs = RMATCH_REGS(match);
9339 end = BEG(0);
9340 if (start == end && BEG(0) == END(0)) {
9341 if (!ptr) {
9342 SPLIT_STR(0, 0);
9343 break;
9344 }
9345 else if (last_null == 1) {
9346 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9347 beg = start;
9348 }
9349 else {
9350 if (start == len)
9351 start++;
9352 else
9353 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9354 last_null = 1;
9355 continue;
9356 }
9357 }
9358 else {
9359 SPLIT_STR(beg, end-beg);
9360 beg = start = END(0);
9361 }
9362 last_null = 0;
9363
9364 for (idx=1; idx < regs->num_regs; idx++) {
9365 if (BEG(idx) == -1) continue;
9366 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9367 }
9368 if (!NIL_P(limit) && lim <= ++i) break;
9369 }
9370 if (match) rb_match_unbusy(match);
9371 }
9372 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9373 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9374 }
9375
9376 return result ? result : str;
9377}
9378
9379VALUE
9380rb_str_split(VALUE str, const char *sep0)
9381{
9382 VALUE sep;
9383
9384 StringValue(str);
9385 sep = rb_str_new_cstr(sep0);
9386 return rb_str_split_m(1, &sep, str);
9387}
9388
/* Evaluates to a new array with capacity +size+ when no block is given and to
 * 0 otherwise; the first argument +m+ is not used by the expansion. */
#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9390
9391static inline int
9392enumerator_element(VALUE ary, VALUE e)
9393{
9394 if (ary) {
9395 rb_ary_push(ary, e);
9396 return 0;
9397 }
9398 else {
9399 rb_yield(e);
9400 return 1;
9401 }
9402}
9403
9404#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9405
9406static const char *
9407chomp_newline(const char *p, const char *e, rb_encoding *enc)
9408{
9409 const char *prev = rb_enc_prev_char(p, e, e, enc);
9410 if (rb_enc_is_newline(prev, e, enc)) {
9411 e = prev;
9412 prev = rb_enc_prev_char(p, e, e, enc);
9413 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9414 e = prev;
9415 }
9416 return e;
9417}
9418
9419static VALUE
9420get_rs(void)
9421{
9422 VALUE rs = rb_rs;
9423 if (!NIL_P(rs) &&
9424 (!RB_TYPE_P(rs, T_STRING) ||
9425 RSTRING_LEN(rs) != 1 ||
9426 RSTRING_PTR(rs)[0] != '\n')) {
9427 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9428 }
9429 return rs;
9430}
9431
9432#define rb_rs get_rs()
9433
/*
 * Shared implementation of each_line and lines: splits +str+ into records
 * terminated by the separator and delivers each through ENUM_ELEM, i.e.
 * pushed onto +ary+ when it is non-zero and yielded otherwise.
 *
 * argc/argv carry the optional record separator and the optional keyword
 * +chomp+.  Returns +ary+ in array mode and the original receiver otherwise.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* no explicit separator: fall back to the record separator variable */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* from here on chomp is a plain C boolean */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the single record */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over a frozen string obtained from rb_str_new_frozen;
     * presumably this shields the scan from mutation by the block -- confirm */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of a leading '\r', or 0 when there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a second newline right after the previous one ends the
                     * paragraph; rslen is left non-zero in that case */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* first non-newline character starts a new paragraph */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            /* rslen == 0 means the loop ran to the end of the string rather
             * than stopping on a blank line */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* the separator is exactly one minimum-width newline character */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* the default separator must be re-encoded to be searchable in a string
     * whose encoding is not ASCII-compatible */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* reject a byte-level hit that is not on a character boundary */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the record end */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing record that is not followed by a separator hit */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9573
9574/*
9575 * call-seq:
9576 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9577 * each_line(record_separator = $/, chomp: false) -> enumerator
9578 *
9579 * :include: doc/string/each_line.rdoc
9580 *
9581 */
9582
9583static VALUE
9584rb_str_each_line(int argc, VALUE *argv, VALUE str)
9585{
9586 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9587 return rb_str_enumerate_lines(argc, argv, str, 0);
9588}
9589
9590/*
9591 * call-seq:
9592 * lines(record_separator = $/, chomp: false) -> array_of_strings
9593 *
9594 * Returns substrings ("lines") of +self+
9595 * according to the given arguments:
9596 *
9597 * s = <<~EOT
9598 * This is the first line.
9599 * This is line two.
9600 *
9601 * This is line four.
9602 * This is line five.
9603 * EOT
9604 *
9605 * With the default argument values:
9606 *
9607 * $/ # => "\n"
9608 * s.lines
9609 * # =>
9610 * ["This is the first line.\n",
9611 * "This is line two.\n",
9612 * "\n",
9613 * "This is line four.\n",
9614 * "This is line five.\n"]
9615 *
9616 * With a different +record_separator+:
9617 *
9618 * record_separator = ' is '
9619 * s.lines(record_separator)
9620 * # =>
9621 * ["This is ",
9622 * "the first line.\nThis is ",
9623 * "line two.\n\nThis is ",
9624 * "line four.\nThis is ",
9625 * "line five.\n"]
9626 *
9627 * With keyword argument +chomp+ as +true+,
9628 * removes the trailing newline from each line:
9629 *
9630 * s.lines(chomp: true)
9631 * # =>
9632 * ["This is the first line.",
9633 * "This is line two.",
9634 * "",
9635 * "This is line four.",
9636 * "This is line five."]
9637 *
9638 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9639 */
9640
9641static VALUE
9642rb_str_lines(int argc, VALUE *argv, VALUE str)
9643{
9644 VALUE ary = WANTARRAY("lines", 0);
9645 return rb_str_enumerate_lines(argc, argv, str, ary);
9646}
9647
9648static VALUE
9649rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9650{
9651 return LONG2FIX(RSTRING_LEN(str));
9652}
9653
9654static VALUE
9655rb_str_enumerate_bytes(VALUE str, VALUE ary)
9656{
9657 long i;
9658
9659 for (i=0; i<RSTRING_LEN(str); i++) {
9660 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9661 }
9662 if (ary)
9663 return ary;
9664 else
9665 return str;
9666}
9667
9668/*
9669 * call-seq:
9670 * each_byte {|byte| ... } -> self
9671 * each_byte -> enumerator
9672 *
9673 * :include: doc/string/each_byte.rdoc
9674 *
9675 */
9676
9677static VALUE
9678rb_str_each_byte(VALUE str)
9679{
9680 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9681 return rb_str_enumerate_bytes(str, 0);
9682}
9683
9684/*
9685 * call-seq:
9686 * bytes -> array_of_bytes
9687 *
9688 * :include: doc/string/bytes.rdoc
9689 *
9690 */
9691
9692static VALUE
9693rb_str_bytes(VALUE str)
9694{
9695 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9696 return rb_str_enumerate_bytes(str, ary);
9697}
9698
9699static VALUE
9700rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9701{
9702 return rb_str_length(str);
9703}
9704
9705static VALUE
9706rb_str_enumerate_chars(VALUE str, VALUE ary)
9707{
9708 VALUE orig = str;
9709 long i, len, n;
9710 const char *ptr;
9711 rb_encoding *enc;
9712
9713 str = rb_str_new_frozen(str);
9714 ptr = RSTRING_PTR(str);
9715 len = RSTRING_LEN(str);
9716 enc = rb_enc_get(str);
9717
9719 for (i = 0; i < len; i += n) {
9720 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9721 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9722 }
9723 }
9724 else {
9725 for (i = 0; i < len; i += n) {
9726 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9727 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9728 }
9729 }
9730 RB_GC_GUARD(str);
9731 if (ary)
9732 return ary;
9733 else
9734 return orig;
9735}
9736
9737/*
9738 * call-seq:
9739 * each_char {|char| ... } -> self
9740 * each_char -> enumerator
9741 *
9742 * :include: doc/string/each_char.rdoc
9743 *
9744 */
9745
9746static VALUE
9747rb_str_each_char(VALUE str)
9748{
9749 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9750 return rb_str_enumerate_chars(str, 0);
9751}
9752
9753/*
9754 * call-seq:
9755 * chars -> array_of_characters
9756 *
9757 * :include: doc/string/chars.rdoc
9758 *
9759 */
9760
9761static VALUE
9762rb_str_chars(VALUE str)
9763{
9764 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9765 return rb_str_enumerate_chars(str, ary);
9766}
9767
9768static VALUE
9769rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9770{
9771 VALUE orig = str;
9772 int n;
9773 unsigned int c;
9774 const char *ptr, *end;
9775 rb_encoding *enc;
9776
9777 if (single_byte_optimizable(str))
9778 return rb_str_enumerate_bytes(str, ary);
9779
9780 str = rb_str_new_frozen(str);
9781 ptr = RSTRING_PTR(str);
9782 end = RSTRING_END(str);
9783 enc = STR_ENC_GET(str);
9784
9785 while (ptr < end) {
9786 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9787 ENUM_ELEM(ary, UINT2NUM(c));
9788 ptr += n;
9789 }
9790 RB_GC_GUARD(str);
9791 if (ary)
9792 return ary;
9793 else
9794 return orig;
9795}
9796
9797/*
9798 * call-seq:
9799 * each_codepoint {|codepoint| ... } -> self
9800 * each_codepoint -> enumerator
9801 *
9802 * :include: doc/string/each_codepoint.rdoc
9803 *
9804 */
9805
9806static VALUE
9807rb_str_each_codepoint(VALUE str)
9808{
9809 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9810 return rb_str_enumerate_codepoints(str, 0);
9811}
9812
9813/*
9814 * call-seq:
9815 * codepoints -> array_of_integers
9816 *
9817 * :include: doc/string/codepoints.rdoc
9818 *
9819 */
9820
9821static VALUE
9822rb_str_codepoints(VALUE str)
9823{
9824 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9825 return rb_str_enumerate_codepoints(str, ary);
9826}
9827
9828static regex_t *
9829get_reg_grapheme_cluster(rb_encoding *enc)
9830{
9831 int encidx = rb_enc_to_index(enc);
9832
9833 const OnigUChar source_ascii[] = "\\X";
9834 const OnigUChar *source = source_ascii;
9835 size_t source_len = sizeof(source_ascii) - 1;
9836
9837 switch (encidx) {
9838#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9839#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9840#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9841#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9842#define CASE_UTF(e) \
9843 case ENCINDEX_UTF_##e: { \
9844 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9845 source = source_UTF_##e; \
9846 source_len = sizeof(source_UTF_##e); \
9847 break; \
9848 }
9849 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9850#undef CASE_UTF
9851#undef CHARS_16BE
9852#undef CHARS_16LE
9853#undef CHARS_32BE
9854#undef CHARS_32LE
9855 }
9856
9857 regex_t *reg_grapheme_cluster;
9858 OnigErrorInfo einfo;
9859 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9860 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9861 if (r) {
9862 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9863 onig_error_code_to_str(message, r, &einfo);
9864 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9865 }
9866
9867 return reg_grapheme_cluster;
9868}
9869
9870static regex_t *
9871get_cached_reg_grapheme_cluster(rb_encoding *enc)
9872{
9873 int encidx = rb_enc_to_index(enc);
9874 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9875
9876 if (encidx == rb_utf8_encindex()) {
9877 if (!reg_grapheme_cluster_utf8) {
9878 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9879 }
9880
9881 return reg_grapheme_cluster_utf8;
9882 }
9883
9884 return NULL;
9885}
9886
9887static VALUE
9888rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9889{
9890 size_t grapheme_cluster_count = 0;
9891 rb_encoding *enc = get_encoding(str);
9892 const char *ptr, *end;
9893
9894 if (!rb_enc_unicode_p(enc)) {
9895 return rb_str_length(str);
9896 }
9897
9898 bool cached_reg_grapheme_cluster = true;
9899 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9900 if (!reg_grapheme_cluster) {
9901 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9902 cached_reg_grapheme_cluster = false;
9903 }
9904
9905 ptr = RSTRING_PTR(str);
9906 end = RSTRING_END(str);
9907
9908 while (ptr < end) {
9909 OnigPosition len = onig_match(reg_grapheme_cluster,
9910 (const OnigUChar *)ptr, (const OnigUChar *)end,
9911 (const OnigUChar *)ptr, NULL, 0);
9912 if (len <= 0) break;
9913 grapheme_cluster_count++;
9914 ptr += len;
9915 }
9916
9917 if (!cached_reg_grapheme_cluster) {
9918 onig_free(reg_grapheme_cluster);
9919 }
9920
9921 return SIZET2NUM(grapheme_cluster_count);
9922}
9923
9924static VALUE
9925rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9926{
9927 VALUE orig = str;
9928 rb_encoding *enc = get_encoding(str);
9929 const char *ptr0, *ptr, *end;
9930
9931 if (!rb_enc_unicode_p(enc)) {
9932 return rb_str_enumerate_chars(str, ary);
9933 }
9934
9935 if (!ary) str = rb_str_new_frozen(str);
9936
9937 bool cached_reg_grapheme_cluster = true;
9938 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9939 if (!reg_grapheme_cluster) {
9940 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9941 cached_reg_grapheme_cluster = false;
9942 }
9943
9944 ptr0 = ptr = RSTRING_PTR(str);
9945 end = RSTRING_END(str);
9946
9947 while (ptr < end) {
9948 OnigPosition len = onig_match(reg_grapheme_cluster,
9949 (const OnigUChar *)ptr, (const OnigUChar *)end,
9950 (const OnigUChar *)ptr, NULL, 0);
9951 if (len <= 0) break;
9952 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9953 ptr += len;
9954 }
9955
9956 if (!cached_reg_grapheme_cluster) {
9957 onig_free(reg_grapheme_cluster);
9958 }
9959
9960 RB_GC_GUARD(str);
9961 if (ary)
9962 return ary;
9963 else
9964 return orig;
9965}
9966
9967/*
9968 * call-seq:
9969 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9970 * each_grapheme_cluster -> enumerator
9971 *
9972 * :include: doc/string/each_grapheme_cluster.rdoc
9973 *
9974 */
9975
9976static VALUE
9977rb_str_each_grapheme_cluster(VALUE str)
9978{
9979 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9980 return rb_str_enumerate_grapheme_clusters(str, 0);
9981}
9982
9983/*
9984 * call-seq:
9985 * grapheme_clusters -> array_of_grapheme_clusters
9986 *
9987 * :include: doc/string/grapheme_clusters.rdoc
9988 *
9989 */
9990
9991static VALUE
9992rb_str_grapheme_clusters(VALUE str)
9993{
9994 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9995 return rb_str_enumerate_grapheme_clusters(str, ary);
9996}
9997
9998static long
9999chopped_length(VALUE str)
10000{
10001 rb_encoding *enc = STR_ENC_GET(str);
10002 const char *p, *p2, *beg, *end;
10003
10004 beg = RSTRING_PTR(str);
10005 end = beg + RSTRING_LEN(str);
10006 if (beg >= end) return 0;
10007 p = rb_enc_prev_char(beg, end, end, enc);
10008 if (!p) return 0;
10009 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10010 p2 = rb_enc_prev_char(beg, p, end, enc);
10011 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10012 }
10013 return p - beg;
10014}
10015
10016/*
10017 * call-seq:
10018 * chop! -> self or nil
10019 *
10020 * Like String#chop, except that:
10021 *
10022 * - Removes trailing characters from +self+ (not from a copy of +self+).
10023 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10024 *
10025 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10026 */
10027
10028static VALUE
10029rb_str_chop_bang(VALUE str)
10030{
10031 str_modify_keep_cr(str);
10032 if (RSTRING_LEN(str) > 0) {
10033 long len;
10034 len = chopped_length(str);
10035 STR_SET_LEN(str, len);
10036 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10037 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10039 }
10040 return str;
10041 }
10042 return Qnil;
10043}
10044
10045
10046/*
10047 * call-seq:
10048 * chop -> new_string
10049 *
10050 * :include: doc/string/chop.rdoc
10051 *
10052 */
10053
10054static VALUE
10055rb_str_chop(VALUE str)
10056{
10057 return rb_str_subseq(str, 0, chopped_length(str));
10058}
10059
10060static long
10061smart_chomp(VALUE str, const char *e, const char *p)
10062{
10063 rb_encoding *enc = rb_enc_get(str);
10064 if (rb_enc_mbminlen(enc) > 1) {
10065 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10066 if (rb_enc_is_newline(pp, e, enc)) {
10067 e = pp;
10068 }
10069 pp = e - rb_enc_mbminlen(enc);
10070 if (pp >= p) {
10071 pp = rb_enc_left_char_head(p, pp, e, enc);
10072 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10073 e = pp;
10074 }
10075 }
10076 }
10077 else {
10078 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10079 case '\n':
10080 if (--e > p && *(e-1) == '\r') {
10081 --e;
10082 }
10083 break;
10084 case '\r':
10085 --e;
10086 break;
10087 }
10088 }
10089 return e - p;
10090}
10091
10092static long
10093chompped_length(VALUE str, VALUE rs)
10094{
10095 rb_encoding *enc;
10096 int newline;
10097 char *pp, *e, *rsptr;
10098 long rslen;
10099 char *const p = RSTRING_PTR(str);
10100 long len = RSTRING_LEN(str);
10101
10102 if (len == 0) return 0;
10103 e = p + len;
10104 if (rs == rb_default_rs) {
10105 return smart_chomp(str, e, p);
10106 }
10107
10108 enc = rb_enc_get(str);
10109 RSTRING_GETMEM(rs, rsptr, rslen);
10110 if (rslen == 0) {
10111 if (rb_enc_mbminlen(enc) > 1) {
10112 while (e > p) {
10113 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10114 if (!rb_enc_is_newline(pp, e, enc)) break;
10115 e = pp;
10116 pp -= rb_enc_mbminlen(enc);
10117 if (pp >= p) {
10118 pp = rb_enc_left_char_head(p, pp, e, enc);
10119 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10120 e = pp;
10121 }
10122 }
10123 }
10124 }
10125 else {
10126 while (e > p && *(e-1) == '\n') {
10127 --e;
10128 if (e > p && *(e-1) == '\r')
10129 --e;
10130 }
10131 }
10132 return e - p;
10133 }
10134 if (rslen > len) return len;
10135
10136 enc = rb_enc_get(rs);
10137 newline = rsptr[rslen-1];
10138 if (rslen == rb_enc_mbminlen(enc)) {
10139 if (rslen == 1) {
10140 if (newline == '\n')
10141 return smart_chomp(str, e, p);
10142 }
10143 else {
10144 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10145 return smart_chomp(str, e, p);
10146 }
10147 }
10148
10149 enc = rb_enc_check(str, rs);
10150 if (is_broken_string(rs)) {
10151 return len;
10152 }
10153 pp = e - rslen;
10154 if (p[len-1] == newline &&
10155 (rslen <= 1 ||
10156 memcmp(rsptr, pp, rslen) == 0)) {
10157 if (at_char_boundary(p, pp, e, enc))
10158 return len - rslen;
10159 RB_GC_GUARD(rs);
10160 }
10161 return len;
10162}
10163
10169static VALUE
10170chomp_rs(int argc, const VALUE *argv)
10171{
10172 rb_check_arity(argc, 0, 1);
10173 if (argc > 0) {
10174 VALUE rs = argv[0];
10175 if (!NIL_P(rs)) StringValue(rs);
10176 return rs;
10177 }
10178 else {
10179 return rb_rs;
10180 }
10181}
10182
10183VALUE
10184rb_str_chomp_string(VALUE str, VALUE rs)
10185{
10186 long olen = RSTRING_LEN(str);
10187 long len = chompped_length(str, rs);
10188 if (len >= olen) return Qnil;
10189 str_modify_keep_cr(str);
10190 STR_SET_LEN(str, len);
10191 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10192 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10194 }
10195 return str;
10196}
10197
10198/*
10199 * call-seq:
10200 * chomp!(line_sep = $/) -> self or nil
10201 *
10202 * Like String#chomp, except that:
10203 *
10204 * - Removes trailing characters from +self+ (not from a copy of +self+).
10205 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10206 *
10207 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10208 */
10209
10210static VALUE
10211rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10212{
10213 VALUE rs;
10214 str_modifiable(str);
10215 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10216 rs = chomp_rs(argc, argv);
10217 if (NIL_P(rs)) return Qnil;
10218 return rb_str_chomp_string(str, rs);
10219}
10220
10221
10222/*
10223 * call-seq:
10224 * chomp(line_sep = $/) -> new_string
10225 *
10226 * :include: doc/string/chomp.rdoc
10227 *
10228 */
10229
10230static VALUE
10231rb_str_chomp(int argc, VALUE *argv, VALUE str)
10232{
10233 VALUE rs = chomp_rs(argc, argv);
10234 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10235 return rb_str_subseq(str, 0, chompped_length(str, rs));
10236}
10237
10238static long
10239lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10240{
10241 const char *const start = s;
10242
10243 if (!s || s >= e) return 0;
10244
10245 /* remove spaces at head */
10246 if (single_byte_optimizable(str)) {
10247 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10248 }
10249 else {
10250 while (s < e) {
10251 int n;
10252 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10253
10254 if (cc && !rb_isspace(cc)) break;
10255 s += n;
10256 }
10257 }
10258 return s - start;
10259}
10260
10261/*
10262 * call-seq:
10263 * lstrip! -> self or nil
10264 *
10265 * Like String#lstrip, except that:
10266 *
10267 * - Performs stripping in +self+ (not in a copy of +self+).
10268 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10269 *
10270 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10271 */
10272
10273static VALUE
10274rb_str_lstrip_bang(VALUE str)
10275{
10276 rb_encoding *enc;
10277 char *start, *s;
10278 long olen, loffset;
10279
10280 str_modify_keep_cr(str);
10281 enc = STR_ENC_GET(str);
10282 RSTRING_GETMEM(str, start, olen);
10283 loffset = lstrip_offset(str, start, start+olen, enc);
10284 if (loffset > 0) {
10285 long len = olen-loffset;
10286 s = start + loffset;
10287 memmove(start, s, len);
10288 STR_SET_LEN(str, len);
10289 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10290 return str;
10291 }
10292 return Qnil;
10293}
10294
10295
10296/*
10297 * call-seq:
10298 * lstrip -> new_string
10299 *
10300 * Returns a copy of +self+ with leading whitespace removed;
10301 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10302 *
10303 * whitespace = "\x00\t\n\v\f\r "
10304 * s = whitespace + 'abc' + whitespace
10305 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10306 * s.lstrip
10307 * # => "abc\u0000\t\n\v\f\r "
10308 *
10309 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10310 */
10311
10312static VALUE
10313rb_str_lstrip(VALUE str)
10314{
10315 char *start;
10316 long len, loffset;
10317 RSTRING_GETMEM(str, start, len);
10318 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10319 if (loffset <= 0) return str_duplicate(rb_cString, str);
10320 return rb_str_subseq(str, loffset, len - loffset);
10321}
10322
10323static long
10324rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10325{
10326 const char *t;
10327
10328 rb_str_check_dummy_enc(enc);
10330 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10331 }
10332 if (!s || s >= e) return 0;
10333 t = e;
10334
10335 /* remove trailing spaces or '\0's */
10336 if (single_byte_optimizable(str)) {
10337 unsigned char c;
10338 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10339 }
10340 else {
10341 char *tp;
10342
10343 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10344 unsigned int c = rb_enc_codepoint(tp, e, enc);
10345 if (c && !rb_isspace(c)) break;
10346 t = tp;
10347 }
10348 }
10349 return e - t;
10350}
10351
10352/*
10353 * call-seq:
10354 * rstrip! -> self or nil
10355 *
10356 * Like String#rstrip, except that:
10357 *
10358 * - Performs stripping in +self+ (not in a copy of +self+).
10359 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10360 *
10361 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10362 */
10363
10364static VALUE
10365rb_str_rstrip_bang(VALUE str)
10366{
10367 rb_encoding *enc;
10368 char *start;
10369 long olen, roffset;
10370
10371 str_modify_keep_cr(str);
10372 enc = STR_ENC_GET(str);
10373 RSTRING_GETMEM(str, start, olen);
10374 roffset = rstrip_offset(str, start, start+olen, enc);
10375 if (roffset > 0) {
10376 long len = olen - roffset;
10377
10378 STR_SET_LEN(str, len);
10379 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10380 return str;
10381 }
10382 return Qnil;
10383}
10384
10385
10386/*
10387 * call-seq:
10388 * rstrip -> new_string
10389 *
10390 * Returns a copy of +self+ with trailing whitespace removed;
10391 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10392 *
10393 * whitespace = "\x00\t\n\v\f\r "
10394 * s = whitespace + 'abc' + whitespace
10395 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10396 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10397 *
10398 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10399 */
10400
10401static VALUE
10402rb_str_rstrip(VALUE str)
10403{
10404 rb_encoding *enc;
10405 char *start;
10406 long olen, roffset;
10407
10408 enc = STR_ENC_GET(str);
10409 RSTRING_GETMEM(str, start, olen);
10410 roffset = rstrip_offset(str, start, start+olen, enc);
10411
10412 if (roffset <= 0) return str_duplicate(rb_cString, str);
10413 return rb_str_subseq(str, 0, olen-roffset);
10414}
10415
10416
10417/*
10418 * call-seq:
10419 * strip! -> self or nil
10420 *
10421 * Like String#strip, except that:
10422 *
10423 * - Any modifications are made to +self+.
10424 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10425 *
10426 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10427 */
10428
10429static VALUE
10430rb_str_strip_bang(VALUE str)
10431{
10432 char *start;
10433 long olen, loffset, roffset;
10434 rb_encoding *enc;
10435
10436 str_modify_keep_cr(str);
10437 enc = STR_ENC_GET(str);
10438 RSTRING_GETMEM(str, start, olen);
10439 loffset = lstrip_offset(str, start, start+olen, enc);
10440 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10441
10442 if (loffset > 0 || roffset > 0) {
10443 long len = olen-roffset;
10444 if (loffset > 0) {
10445 len -= loffset;
10446 memmove(start, start + loffset, len);
10447 }
10448 STR_SET_LEN(str, len);
10449 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10450 return str;
10451 }
10452 return Qnil;
10453}
10454
10455
10456/*
10457 * call-seq:
10458 * strip -> new_string
10459 *
10460 * Returns a copy of +self+ with leading and trailing whitespace removed;
10461 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10462 *
10463 * whitespace = "\x00\t\n\v\f\r "
10464 * s = whitespace + 'abc' + whitespace
10465 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10466 * s.strip # => "abc"
10467 *
10468 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10469 */
10470
10471static VALUE
10472rb_str_strip(VALUE str)
10473{
10474 char *start;
10475 long olen, loffset, roffset;
10476 rb_encoding *enc = STR_ENC_GET(str);
10477
10478 RSTRING_GETMEM(str, start, olen);
10479 loffset = lstrip_offset(str, start, start+olen, enc);
10480 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10481
10482 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10483 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10484}
10485
10486static VALUE
10487scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10488{
10489 VALUE result = Qnil;
10490 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10491 if (pos >= 0) {
10492 VALUE match;
10493 struct re_registers *regs;
10494 if (BUILTIN_TYPE(pat) == T_STRING) {
10495 regs = NULL;
10496 end = pos + RSTRING_LEN(pat);
10497 }
10498 else {
10499 match = rb_backref_get();
10500 regs = RMATCH_REGS(match);
10501 pos = BEG(0);
10502 end = END(0);
10503 }
10504
10505 if (pos == end) {
10506 rb_encoding *enc = STR_ENC_GET(str);
10507 /*
10508 * Always consume at least one character of the input string
10509 */
10510 if (RSTRING_LEN(str) > end)
10511 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10512 RSTRING_END(str), enc);
10513 else
10514 *start = end + 1;
10515 }
10516 else {
10517 *start = end;
10518 }
10519
10520 if (!regs || regs->num_regs == 1) {
10521 result = rb_str_subseq(str, pos, end - pos);
10522 return result;
10523 }
10524 else {
10525 result = rb_ary_new2(regs->num_regs);
10526 for (int i = 1; i < regs->num_regs; i++) {
10527 VALUE s = Qnil;
10528 if (BEG(i) >= 0) {
10529 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10530 }
10531
10532 rb_ary_push(result, s);
10533 }
10534 }
10535
10536 RB_GC_GUARD(match);
10537 }
10538
10539 return result;
10540}
10541
10542
10543/*
10544 * call-seq:
10545 * scan(pattern) -> array_of_results
10546 * scan(pattern) {|result| ... } -> self
10547 *
10548 * :include: doc/string/scan.rdoc
10549 *
10550 */
10551
10552static VALUE
10553rb_str_scan(VALUE str, VALUE pat)
10554{
10555 VALUE result;
10556 long start = 0;
10557 long last = -1, prev = 0;
10558 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10559
10560 pat = get_pat_quoted(pat, 1);
10561 mustnot_broken(str);
10562 if (!rb_block_given_p()) {
10563 VALUE ary = rb_ary_new();
10564
10565 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10566 last = prev;
10567 prev = start;
10568 rb_ary_push(ary, result);
10569 }
10570 if (last >= 0) rb_pat_search(pat, str, last, 1);
10571 else rb_backref_set(Qnil);
10572 return ary;
10573 }
10574
10575 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10576 last = prev;
10577 prev = start;
10578 rb_yield(result);
10579 str_mod_check(str, p, len);
10580 }
10581 if (last >= 0) rb_pat_search(pat, str, last, 1);
10582 return str;
10583}
10584
10585
10586/*
10587 * call-seq:
10588 * hex -> integer
10589 *
10590 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10591 * returns its value as an integer.
10592 *
10593 * The leading substring is interpreted as hexadecimal when it begins with:
10594 *
10595 * - One or more characters representing hexadecimal digits
10596 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10597 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10598 *
10599 * 'f'.hex # => 15
10600 * '11'.hex # => 17
10601 * 'FFF'.hex # => 4095
10602 * 'fffg'.hex # => 4095
10603 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10604 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10605 * 'deadbeef'.hex # => 3735928559
10606 *
10607 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10608 *
10609 * '0xfff'.hex # => 4095
10610 * '0xfffg'.hex # => 4095
10611 *
10612 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10613 *
10614 * '-fff'.hex # => -4095
10615 * '-0xFFF'.hex # => -4095
10616 *
10617 * For any substring not described above, returns zero:
10618 *
10619 * 'xxx'.hex # => 0
10620 * ''.hex # => 0
10621 *
10622 * Note that, unlike #oct, this method interprets only hexadecimal,
10623 * and not binary, octal, or decimal notations:
10624 *
10625 * '0b111'.hex # => 45329
10626 * '0o777'.hex # => 0
10627 * '0d999'.hex # => 55705
10628 *
10629 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10630 */
10631
10632static VALUE
10633rb_str_hex(VALUE str)
10634{
10635 return rb_str_to_inum(str, 16, FALSE);
10636}
10637
10638
10639/*
10640 * call-seq:
10641 * oct -> integer
10642 *
10643 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10644 * returns their value as an integer.
10645 *
10646 * In brief:
10647 *
10648 * # Interpreted as octal.
10649 * '777'.oct # => 511
10650 * '777x'.oct # => 511
10651 * '0777'.oct # => 511
10652 * '0o777'.oct # => 511
10653 * '-777'.oct # => -511
10654 * # Not interpreted as octal.
10655 * '0b111'.oct # => 7 # Interpreted as binary.
10656 * '0d999'.oct # => 999 # Interpreted as decimal.
10657 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10658 *
10659 * The leading substring is interpreted as octal when it begins with:
10660 *
10661 * - One or more characters representing octal digits
10662 * (each in the range <tt>'0'..'7'</tt>);
10663 * the string to be interpreted ends at the first character that does not represent an octal digit:
10664 *
10665 * '7'.oct # => 7
10666 * '11'.oct # => 9
10667 * '777'.oct # => 511
10668 * '0777'.oct # => 511
10669 * '7778'.oct # => 511
10670 * '777x'.oct # => 511
10671 *
10672 * - <tt>'0o'</tt>, followed by one or more octal digits:
10673 *
10674 * '0o777'.oct # => 511
10675 * '0o7778'.oct # => 511
10676 *
10677 * The leading substring is _not_ interpreted as octal when it begins with:
10678 *
10679 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10680 * (each in the range <tt>'0'..'1'</tt>);
10681 * the string to be interpreted ends at the first character that does not represent a binary digit.
10682 * the string is interpreted as binary digits (base 2):
10683 *
10684 * '0b111'.oct # => 7
10685 * '0b1112'.oct # => 7
10686 *
10687 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10688 * (each in the range <tt>'0'..'9'</tt>);
10689 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10690 * the string is interpreted as decimal digits (base 10):
10691 *
10692 * '0d999'.oct # => 999
10693 * '0d999x'.oct # => 999
10694 *
10695 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10696 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10697 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10698 * the string is interpreted as hexadecimal digits (base 16):
10699 *
10700 * '0xfff'.oct # => 4095
10701 * '0xfffg'.oct # => 4095
10702 *
10703 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10704 *
10705 * '-777'.oct # => -511
10706 * '-0777'.oct # => -511
10707 * '-0b111'.oct # => -7
10708 * '-0xfff'.oct # => -4095
10709 *
10710 * For any substring not described above, returns zero:
10711 *
10712 * 'foo'.oct # => 0
10713 * ''.oct # => 0
10714 *
10715 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10716 */
10717
10718static VALUE
10719rb_str_oct(VALUE str)
10720{
10721 return rb_str_to_inum(str, -8, FALSE);
10722}
10723
10724#ifndef HAVE_CRYPT_R
10725# include "ruby/thread_native.h"
10726# include "ruby/atomic.h"
10727
10728static struct {
10729 rb_nativethread_lock_t lock;
10730} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10731#endif
10732
10733/*
10734 * call-seq:
10735 * crypt(salt_str) -> new_string
10736 *
10737 * Returns the string generated by calling <code>crypt(3)</code>
10738 * standard library function with <code>str</code> and
10739 * <code>salt_str</code>, in this order, as its arguments. Please do
10740 * not use this method any longer. It is legacy; provided only for
10741 * backward compatibility with ruby scripts in earlier days. It is
10742 * bad to use in contemporary programs for several reasons:
10743 *
10744 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10745 * run. The generated string lacks data portability.
10746 *
10747 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10748 * (i.e. silently ends up in unexpected results).
10749 *
10750 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10751 * thread safe.
10752 *
10753 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10754 * very very weak. According to its manpage, Linux's traditional
10755 * <code>crypt(3)</code> output has only 2**56 variations; too
10756 * easy to brute force today. And this is the default behaviour.
10757 *
10758 * * In order to make things robust some OSes implement so-called
10759 * "modular" usage. To go through, you have to do a complex
10760 * build-up of the <code>salt_str</code> parameter, by hand.
10761 * Failure in generation of a proper salt string tends not to
10762 * yield any errors; typos in parameters are normally not
10763 * detectable.
10764 *
10765 * * For instance, in the following example, the second invocation
10766 * of String#crypt is wrong; it has a typo in "round=" (lacks
10767 * "s"). However the call does not fail and something unexpected
10768 * is generated.
10769 *
10770 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10771 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10772 *
10773 * * Even in the "modular" mode, some hash functions are considered
10774 * archaic and no longer recommended at all; for instance module
10775 * <code>$1$</code> is officially abandoned by its author: see
10776 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10777 * instance module <code>$3$</code> is considered completely
10778 * broken: see the manpage of FreeBSD.
10779 *
10780 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10781 * written above, <code>crypt(3)</code> on Mac OS never fails.
10782 * This means even if you build up a proper salt string it
10783 * generates a traditional DES hash anyways, and there is no way
10784 * for you to be aware of.
10785 *
10786 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10787 *
10788 * If for some reason you cannot migrate to other secure contemporary
10789 * password hashing algorithms, install the string-crypt gem and
10790 * <code>require 'string/crypt'</code> to continue using it.
10791 */
10792
10793static VALUE
10794rb_str_crypt(VALUE str, VALUE salt)
10795{
10796#ifdef HAVE_CRYPT_R
10797 VALUE databuf;
10798 struct crypt_data *data;
10799# define CRYPT_END() ALLOCV_END(databuf)
10800#else
10801 char *tmp_buf;
10802 extern char *crypt(const char *, const char *);
10803# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10804#endif
10805 VALUE result;
10806 const char *s, *saltp;
10807 char *res;
10808#ifdef BROKEN_CRYPT
10809 char salt_8bit_clean[3];
10810#endif
10811
10812 StringValue(salt);
10813 mustnot_wchar(str);
10814 mustnot_wchar(salt);
10815 s = StringValueCStr(str);
10816 saltp = RSTRING_PTR(salt);
10817 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10818 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10819 }
10820
10821#ifdef BROKEN_CRYPT
10822 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10823 salt_8bit_clean[0] = saltp[0] & 0x7f;
10824 salt_8bit_clean[1] = saltp[1] & 0x7f;
10825 salt_8bit_clean[2] = '\0';
10826 saltp = salt_8bit_clean;
10827 }
10828#endif
10829#ifdef HAVE_CRYPT_R
10830 data = ALLOCV(databuf, sizeof(struct crypt_data));
10831# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10832 data->initialized = 0;
10833# endif
10834 res = crypt_r(s, saltp, data);
10835#else
10836 rb_nativethread_lock_lock(&crypt_mutex.lock);
10837 res = crypt(s, saltp);
10838#endif
10839 if (!res) {
10840 int err = errno;
10841 CRYPT_END();
10842 rb_syserr_fail(err, "crypt");
10843 }
10844#ifdef HAVE_CRYPT_R
10845 result = rb_str_new_cstr(res);
10846 CRYPT_END();
10847#else
10848 // We need to copy this buffer because it's static and we need to unlock the mutex
10849 // before allocating a new object (the string to be returned). If we allocate while
10850 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10851 // if other ractors are waiting on this lock.
10852 size_t res_size = strlen(res)+1;
10853 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10854 memcpy(tmp_buf, res, res_size);
10855 res = tmp_buf;
10856 CRYPT_END();
10857 result = rb_str_new_cstr(res);
10858#endif
10859 return result;
10860}
10861
10862
10863/*
10864 * call-seq:
10865 * ord -> integer
10866 *
10867 * :include: doc/string/ord.rdoc
10868 *
10869 */
10870
10871static VALUE
10872rb_str_ord(VALUE s)
10873{
10874 unsigned int c;
10875
10876 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10877 return UINT2NUM(c);
10878}
10879/*
10880 * call-seq:
10881 * sum(n = 16) -> integer
10882 *
10883 * :include: doc/string/sum.rdoc
10884 *
10885 */
10886
10887static VALUE
10888rb_str_sum(int argc, VALUE *argv, VALUE str)
10889{
10890 int bits = 16;
10891 char *ptr, *p, *pend;
10892 long len;
10893 VALUE sum = INT2FIX(0);
10894 unsigned long sum0 = 0;
10895
10896 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10897 bits = 0;
10898 }
10899 ptr = p = RSTRING_PTR(str);
10900 len = RSTRING_LEN(str);
10901 pend = p + len;
10902
10903 while (p < pend) {
10904 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10905 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10906 str_mod_check(str, ptr, len);
10907 sum0 = 0;
10908 }
10909 sum0 += (unsigned char)*p;
10910 p++;
10911 }
10912
10913 if (bits == 0) {
10914 if (sum0) {
10915 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10916 }
10917 }
10918 else {
10919 if (sum == INT2FIX(0)) {
10920 if (bits < (int)sizeof(long)*CHAR_BIT) {
10921 sum0 &= (((unsigned long)1)<<bits)-1;
10922 }
10923 sum = LONG2FIX(sum0);
10924 }
10925 else {
10926 VALUE mod;
10927
10928 if (sum0) {
10929 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10930 }
10931
10932 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10933 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10934 sum = rb_funcall(sum, '&', 1, mod);
10935 }
10936 }
10937 return sum;
10938}
10939
10940static VALUE
10941rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10942{
10943 rb_encoding *enc;
10944 VALUE w;
10945 long width, len, flen = 1, fclen = 1;
10946 VALUE res;
10947 char *p;
10948 const char *f = " ";
10949 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10950 VALUE pad;
10951 int singlebyte = 1, cr;
10952 int termlen;
10953
10954 rb_scan_args(argc, argv, "11", &w, &pad);
10955 enc = STR_ENC_GET(str);
10956 termlen = rb_enc_mbminlen(enc);
10957 width = NUM2LONG(w);
10958 if (argc == 2) {
10959 StringValue(pad);
10960 enc = rb_enc_check(str, pad);
10961 f = RSTRING_PTR(pad);
10962 flen = RSTRING_LEN(pad);
10963 fclen = str_strlen(pad, enc); /* rb_enc_check */
10964 singlebyte = single_byte_optimizable(pad);
10965 if (flen == 0 || fclen == 0) {
10966 rb_raise(rb_eArgError, "zero width padding");
10967 }
10968 }
10969 len = str_strlen(str, enc); /* rb_enc_check */
10970 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10971 n = width - len;
10972 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10973 rlen = n - llen;
10974 cr = ENC_CODERANGE(str);
10975 if (flen > 1) {
10976 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10977 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10978 }
10979 size = RSTRING_LEN(str);
10980 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10981 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10982 (len += llen2 + rlen2) >= LONG_MAX - size) {
10983 rb_raise(rb_eArgError, "argument too big");
10984 }
10985 len += size;
10986 res = str_enc_new(rb_cString, 0, len, enc);
10987 p = RSTRING_PTR(res);
10988 if (flen <= 1) {
10989 memset(p, *f, llen);
10990 p += llen;
10991 }
10992 else {
10993 while (llen >= fclen) {
10994 memcpy(p,f,flen);
10995 p += flen;
10996 llen -= fclen;
10997 }
10998 if (llen > 0) {
10999 memcpy(p, f, llen2);
11000 p += llen2;
11001 }
11002 }
11003 memcpy(p, RSTRING_PTR(str), size);
11004 p += size;
11005 if (flen <= 1) {
11006 memset(p, *f, rlen);
11007 p += rlen;
11008 }
11009 else {
11010 while (rlen >= fclen) {
11011 memcpy(p,f,flen);
11012 p += flen;
11013 rlen -= fclen;
11014 }
11015 if (rlen > 0) {
11016 memcpy(p, f, rlen2);
11017 p += rlen2;
11018 }
11019 }
11020 TERM_FILL(p, termlen);
11021 STR_SET_LEN(res, p-RSTRING_PTR(res));
11022
11023 if (argc == 2)
11024 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11025 if (cr != ENC_CODERANGE_BROKEN)
11026 ENC_CODERANGE_SET(res, cr);
11027
11028 RB_GC_GUARD(pad);
11029 return res;
11030}
11031
11032
11033/*
11034 * call-seq:
11035 * ljust(width, pad_string = ' ') -> new_string
11036 *
11037 * :include: doc/string/ljust.rdoc
11038 *
11039 */
11040
11041static VALUE
11042rb_str_ljust(int argc, VALUE *argv, VALUE str)
11043{
11044 return rb_str_justify(argc, argv, str, 'l');
11045}
11046
11047/*
11048 * call-seq:
11049 * rjust(width, pad_string = ' ') -> new_string
11050 *
11051 * :include: doc/string/rjust.rdoc
11052 *
11053 */
11054
11055static VALUE
11056rb_str_rjust(int argc, VALUE *argv, VALUE str)
11057{
11058 return rb_str_justify(argc, argv, str, 'r');
11059}
11060
11061
11062/*
11063 * call-seq:
11064 * center(size, pad_string = ' ') -> new_string
11065 *
11066 * :include: doc/string/center.rdoc
11067 *
11068 */
11069
11070static VALUE
11071rb_str_center(int argc, VALUE *argv, VALUE str)
11072{
11073 return rb_str_justify(argc, argv, str, 'c');
11074}
11075
11076/*
11077 * call-seq:
11078 * partition(pattern) -> [pre_match, first_match, post_match]
11079 *
11080 * :include: doc/string/partition.rdoc
11081 *
11082 */
11083
11084static VALUE
11085rb_str_partition(VALUE str, VALUE sep)
11086{
11087 long pos;
11088
11089 sep = get_pat_quoted(sep, 0);
11090 if (RB_TYPE_P(sep, T_REGEXP)) {
11091 if (rb_reg_search(sep, str, 0, 0) < 0) {
11092 goto failed;
11093 }
11094 VALUE match = rb_backref_get();
11095 struct re_registers *regs = RMATCH_REGS(match);
11096
11097 pos = BEG(0);
11098 sep = rb_str_subseq(str, pos, END(0) - pos);
11099 }
11100 else {
11101 pos = rb_str_index(str, sep, 0);
11102 if (pos < 0) goto failed;
11103 }
11104 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11105 sep,
11106 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11107 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11108
11109 failed:
11110 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11111}
11112
11113/*
11114 * call-seq:
11115 * rpartition(pattern) -> [pre_match, last_match, post_match]
11116 *
11117 * :include: doc/string/rpartition.rdoc
11118 *
11119 */
11120
11121static VALUE
11122rb_str_rpartition(VALUE str, VALUE sep)
11123{
11124 long pos = RSTRING_LEN(str);
11125
11126 sep = get_pat_quoted(sep, 0);
11127 if (RB_TYPE_P(sep, T_REGEXP)) {
11128 if (rb_reg_search(sep, str, pos, 1) < 0) {
11129 goto failed;
11130 }
11131 VALUE match = rb_backref_get();
11132 struct re_registers *regs = RMATCH_REGS(match);
11133
11134 pos = BEG(0);
11135 sep = rb_str_subseq(str, pos, END(0) - pos);
11136 }
11137 else {
11138 pos = rb_str_sublen(str, pos);
11139 pos = rb_str_rindex(str, sep, pos);
11140 if (pos < 0) {
11141 goto failed;
11142 }
11143 }
11144
11145 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11146 sep,
11147 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11148 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11149 failed:
11150 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11151}
11152
11153/*
11154 * call-seq:
11155 * start_with?(*patterns) -> true or false
11156 *
11157 * :include: doc/string/start_with_p.rdoc
11158 *
11159 */
11160
11161static VALUE
11162rb_str_start_with(int argc, VALUE *argv, VALUE str)
11163{
11164 int i;
11165
11166 for (i=0; i<argc; i++) {
11167 VALUE tmp = argv[i];
11168 if (RB_TYPE_P(tmp, T_REGEXP)) {
11169 if (rb_reg_start_with_p(tmp, str))
11170 return Qtrue;
11171 }
11172 else {
11173 const char *p, *s, *e;
11174 long slen, tlen;
11175 rb_encoding *enc;
11176
11177 StringValue(tmp);
11178 enc = rb_enc_check(str, tmp);
11179 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11180 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11181 p = RSTRING_PTR(str);
11182 e = p + slen;
11183 s = p + tlen;
11184 if (!at_char_right_boundary(p, s, e, enc))
11185 continue;
11186 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11187 return Qtrue;
11188 }
11189 }
11190 return Qfalse;
11191}
11192
11193/*
11194 * call-seq:
11195 * end_with?(*strings) -> true or false
11196 *
11197 * :include: doc/string/end_with_p.rdoc
11198 *
11199 */
11200
11201static VALUE
11202rb_str_end_with(int argc, VALUE *argv, VALUE str)
11203{
11204 int i;
11205
11206 for (i=0; i<argc; i++) {
11207 VALUE tmp = argv[i];
11208 const char *p, *s, *e;
11209 long slen, tlen;
11210 rb_encoding *enc;
11211
11212 StringValue(tmp);
11213 enc = rb_enc_check(str, tmp);
11214 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11215 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11216 p = RSTRING_PTR(str);
11217 e = p + slen;
11218 s = e - tlen;
11219 if (!at_char_boundary(p, s, e, enc))
11220 continue;
11221 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11222 return Qtrue;
11223 }
11224 return Qfalse;
11225}
11226
11236static long
11237deleted_prefix_length(VALUE str, VALUE prefix)
11238{
11239 const char *strptr, *prefixptr;
11240 long olen, prefixlen;
11241 rb_encoding *enc = rb_enc_get(str);
11242
11243 StringValue(prefix);
11244
11245 if (!is_broken_string(prefix) ||
11246 !rb_enc_asciicompat(enc) ||
11247 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11248 enc = rb_enc_check(str, prefix);
11249 }
11250
11251 /* return 0 if not start with prefix */
11252 prefixlen = RSTRING_LEN(prefix);
11253 if (prefixlen <= 0) return 0;
11254 olen = RSTRING_LEN(str);
11255 if (olen < prefixlen) return 0;
11256 strptr = RSTRING_PTR(str);
11257 prefixptr = RSTRING_PTR(prefix);
11258 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11259 if (is_broken_string(prefix)) {
11260 if (!is_broken_string(str)) {
11261 /* prefix in a valid string cannot be broken */
11262 return 0;
11263 }
11264 const char *strend = strptr + olen;
11265 const char *after_prefix = strptr + prefixlen;
11266 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11267 /* prefix does not end at char-boundary */
11268 return 0;
11269 }
11270 }
11271 /* prefix part in `str` also should be valid. */
11272
11273 return prefixlen;
11274}
11275
11276/*
11277 * call-seq:
11278 * delete_prefix!(prefix) -> self or nil
11279 *
11280 * Like String#delete_prefix, except that +self+ is modified in place;
11281 * returns +self+ if the prefix is removed, +nil+ otherwise.
11282 *
11283 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11284 */
11285
11286static VALUE
11287rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11288{
11289 long prefixlen;
11290 str_modify_keep_cr(str);
11291
11292 prefixlen = deleted_prefix_length(str, prefix);
11293 if (prefixlen <= 0) return Qnil;
11294
11295 return rb_str_drop_bytes(str, prefixlen);
11296}
11297
11298/*
11299 * call-seq:
11300 * delete_prefix(prefix) -> new_string
11301 *
11302 * :include: doc/string/delete_prefix.rdoc
11303 *
11304 */
11305
11306static VALUE
11307rb_str_delete_prefix(VALUE str, VALUE prefix)
11308{
11309 long prefixlen;
11310
11311 prefixlen = deleted_prefix_length(str, prefix);
11312 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11313
11314 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11315}
11316
11326static long
11327deleted_suffix_length(VALUE str, VALUE suffix)
11328{
11329 const char *strptr, *suffixptr;
11330 long olen, suffixlen;
11331 rb_encoding *enc;
11332
11333 StringValue(suffix);
11334 if (is_broken_string(suffix)) return 0;
11335 enc = rb_enc_check(str, suffix);
11336
11337 /* return 0 if not start with suffix */
11338 suffixlen = RSTRING_LEN(suffix);
11339 if (suffixlen <= 0) return 0;
11340 olen = RSTRING_LEN(str);
11341 if (olen < suffixlen) return 0;
11342 strptr = RSTRING_PTR(str);
11343 suffixptr = RSTRING_PTR(suffix);
11344 const char *strend = strptr + olen;
11345 const char *before_suffix = strend - suffixlen;
11346 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11347 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11348
11349 return suffixlen;
11350}
11351
11352/*
11353 * call-seq:
11354 * delete_suffix!(suffix) -> self or nil
11355 *
11356 * Like String#delete_suffix, except that +self+ is modified in place;
11357 * returns +self+ if the suffix is removed, +nil+ otherwise.
11358 *
11359 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11360 */
11361
11362static VALUE
11363rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11364{
11365 long olen, suffixlen, len;
11366 str_modifiable(str);
11367
11368 suffixlen = deleted_suffix_length(str, suffix);
11369 if (suffixlen <= 0) return Qnil;
11370
11371 olen = RSTRING_LEN(str);
11372 str_modify_keep_cr(str);
11373 len = olen - suffixlen;
11374 STR_SET_LEN(str, len);
11375 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11376 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11378 }
11379 return str;
11380}
11381
11382/*
11383 * call-seq:
11384 * delete_suffix(suffix) -> new_string
11385 *
11386 * :include: doc/string/delete_suffix.rdoc
11387 *
11388 */
11389
11390static VALUE
11391rb_str_delete_suffix(VALUE str, VALUE suffix)
11392{
11393 long suffixlen;
11394
11395 suffixlen = deleted_suffix_length(str, suffix);
11396 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11397
11398 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11399}
11400
11401void
11402rb_str_setter(VALUE val, ID id, VALUE *var)
11403{
11404 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11405 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11406 }
11407 *var = val;
11408}
11409
11410static void
11411rb_fs_setter(VALUE val, ID id, VALUE *var)
11412{
11413 val = rb_fs_check(val);
11414 if (!val) {
11415 rb_raise(rb_eTypeError,
11416 "value of %"PRIsVALUE" must be String or Regexp",
11417 rb_id2str(id));
11418 }
11419 if (!NIL_P(val)) {
11420 rb_warn_deprecated("'$;'", NULL);
11421 }
11422 *var = val;
11423}
11424
11425
11426/*
11427 * call-seq:
11428 * force_encoding(encoding) -> self
11429 *
11430 * :include: doc/string/force_encoding.rdoc
11431 *
11432 */
11433
11434static VALUE
11435rb_str_force_encoding(VALUE str, VALUE enc)
11436{
11437 str_modifiable(str);
11438
11439 rb_encoding *encoding = rb_to_encoding(enc);
11440 int idx = rb_enc_to_index(encoding);
11441
11442 // If the encoding is unchanged, we do nothing.
11443 if (ENCODING_GET(str) == idx) {
11444 return str;
11445 }
11446
11447 rb_enc_associate_index(str, idx);
11448
11449 // If the coderange was 7bit and the new encoding is ASCII-compatible
11450 // we can keep the coderange.
11451 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11452 return str;
11453 }
11454
11456 return str;
11457}
11458
11459/*
11460 * call-seq:
11461 * b -> new_string
11462 *
11463 * :include: doc/string/b.rdoc
11464 *
11465 */
11466
11467static VALUE
11468rb_str_b(VALUE str)
11469{
11470 VALUE str2;
11471 if (STR_EMBED_P(str)) {
11472 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11473 }
11474 else {
11475 str2 = str_alloc_heap(rb_cString);
11476 }
11477 str_replace_shared_without_enc(str2, str);
11478
11479 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11480 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11481 // If we know the receiver's code range then we know the result's code range.
11482 int cr = ENC_CODERANGE(str);
11483 switch (cr) {
11484 case ENC_CODERANGE_7BIT:
11486 break;
11490 break;
11491 default:
11492 ENC_CODERANGE_CLEAR(str2);
11493 break;
11494 }
11495 }
11496
11497 return str2;
11498}
11499
11500/*
11501 * call-seq:
11502 * valid_encoding? -> true or false
11503 *
11504 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11505 *
11506 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11507 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11508 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11509 */
11510
11511static VALUE
11512rb_str_valid_encoding_p(VALUE str)
11513{
11514 int cr = rb_enc_str_coderange(str);
11515
11516 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11517}
11518
11519/*
11520 * call-seq:
11521 * ascii_only? -> true or false
11522 *
11523 * Returns whether +self+ contains only ASCII characters:
11524 *
11525 * 'abc'.ascii_only? # => true
11526 * "abc\u{6666}".ascii_only? # => false
11527 *
11528 * Related: see {Querying}[rdoc-ref:String@Querying].
11529 */
11530
11531static VALUE
11532rb_str_is_ascii_only_p(VALUE str)
11533{
11534 int cr = rb_enc_str_coderange(str);
11535
11536 return RBOOL(cr == ENC_CODERANGE_7BIT);
11537}
11538
11539VALUE
11541{
11542 static const char ellipsis[] = "...";
11543 const long ellipsislen = sizeof(ellipsis) - 1;
11544 rb_encoding *const enc = rb_enc_get(str);
11545 const long blen = RSTRING_LEN(str);
11546 const char *const p = RSTRING_PTR(str), *e = p + blen;
11547 VALUE estr, ret = 0;
11548
11549 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11550 if (len * rb_enc_mbminlen(enc) >= blen ||
11551 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11552 ret = str;
11553 }
11554 else if (len <= ellipsislen ||
11555 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11556 if (rb_enc_asciicompat(enc)) {
11557 ret = rb_str_new(ellipsis, len);
11558 rb_enc_associate(ret, enc);
11559 }
11560 else {
11561 estr = rb_usascii_str_new(ellipsis, len);
11562 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11563 }
11564 }
11565 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11566 rb_str_cat(ret, ellipsis, ellipsislen);
11567 }
11568 else {
11569 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11570 rb_enc_from_encoding(enc), 0, Qnil);
11571 rb_str_append(ret, estr);
11572 }
11573 return ret;
11574}
11575
11576static VALUE
11577str_compat_and_valid(VALUE str, rb_encoding *enc)
11578{
11579 int cr;
11580 str = StringValue(str);
11581 cr = rb_enc_str_coderange(str);
11582 if (cr == ENC_CODERANGE_BROKEN) {
11583 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11584 }
11585 else {
11586 rb_encoding *e = STR_ENC_GET(str);
11587 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11588 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11589 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11590 }
11591 }
11592 return str;
11593}
11594
11595static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11596
11597VALUE
11599{
11600 rb_encoding *enc = STR_ENC_GET(str);
11601 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11602}
11603
11604VALUE
11605rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11606{
11607 int cr = ENC_CODERANGE_UNKNOWN;
11608 if (enc == STR_ENC_GET(str)) {
11609 /* cached coderange makes sense only when enc equals the
11610 * actual encoding of str */
11611 cr = ENC_CODERANGE(str);
11612 }
11613 return enc_str_scrub(enc, str, repl, cr);
11614}
11615
11616static VALUE
11617enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11618{
11619 int encidx;
11620 VALUE buf = Qnil;
11621 const char *rep, *p, *e, *p1, *sp;
11622 long replen = -1;
11623 long slen;
11624
11625 if (rb_block_given_p()) {
11626 if (!NIL_P(repl))
11627 rb_raise(rb_eArgError, "both of block and replacement given");
11628 replen = 0;
11629 }
11630
11631 if (ENC_CODERANGE_CLEAN_P(cr))
11632 return Qnil;
11633
11634 if (!NIL_P(repl)) {
11635 repl = str_compat_and_valid(repl, enc);
11636 }
11637
11638 if (rb_enc_dummy_p(enc)) {
11639 return Qnil;
11640 }
11641 encidx = rb_enc_to_index(enc);
11642
11643#define DEFAULT_REPLACE_CHAR(str) do { \
11644 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11645 rep = replace; replen = (int)sizeof(replace); \
11646 } while (0)
11647
11648 slen = RSTRING_LEN(str);
11649 p = RSTRING_PTR(str);
11650 e = RSTRING_END(str);
11651 p1 = p;
11652 sp = p;
11653
11654 if (rb_enc_asciicompat(enc)) {
11655 int rep7bit_p;
11656 if (!replen) {
11657 rep = NULL;
11658 rep7bit_p = FALSE;
11659 }
11660 else if (!NIL_P(repl)) {
11661 rep = RSTRING_PTR(repl);
11662 replen = RSTRING_LEN(repl);
11663 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11664 }
11665 else if (encidx == rb_utf8_encindex()) {
11666 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11667 rep7bit_p = FALSE;
11668 }
11669 else {
11670 DEFAULT_REPLACE_CHAR("?");
11671 rep7bit_p = TRUE;
11672 }
11673 cr = ENC_CODERANGE_7BIT;
11674
11675 p = search_nonascii(p, e);
11676 if (!p) {
11677 p = e;
11678 }
11679 while (p < e) {
11680 int ret = rb_enc_precise_mbclen(p, e, enc);
11681 if (MBCLEN_NEEDMORE_P(ret)) {
11682 break;
11683 }
11684 else if (MBCLEN_CHARFOUND_P(ret)) {
11686 p += MBCLEN_CHARFOUND_LEN(ret);
11687 }
11688 else if (MBCLEN_INVALID_P(ret)) {
11689 /*
11690 * p1~p: valid ascii/multibyte chars
11691 * p ~e: invalid bytes + unknown bytes
11692 */
11693 long clen = rb_enc_mbmaxlen(enc);
11694 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11695 if (p > p1) {
11696 rb_str_buf_cat(buf, p1, p - p1);
11697 }
11698
11699 if (e - p < clen) clen = e - p;
11700 if (clen <= 2) {
11701 clen = 1;
11702 }
11703 else {
11704 const char *q = p;
11705 clen--;
11706 for (; clen > 1; clen--) {
11707 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11708 if (MBCLEN_NEEDMORE_P(ret)) break;
11709 if (MBCLEN_INVALID_P(ret)) continue;
11711 }
11712 }
11713 if (rep) {
11714 rb_str_buf_cat(buf, rep, replen);
11715 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11716 }
11717 else {
11718 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11719 str_mod_check(str, sp, slen);
11720 repl = str_compat_and_valid(repl, enc);
11721 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11724 }
11725 p += clen;
11726 p1 = p;
11727 p = search_nonascii(p, e);
11728 if (!p) {
11729 p = e;
11730 break;
11731 }
11732 }
11733 else {
11735 }
11736 }
11737 if (NIL_P(buf)) {
11738 if (p == e) {
11739 ENC_CODERANGE_SET(str, cr);
11740 return Qnil;
11741 }
11742 buf = rb_str_buf_new(RSTRING_LEN(str));
11743 }
11744 if (p1 < p) {
11745 rb_str_buf_cat(buf, p1, p - p1);
11746 }
11747 if (p < e) {
11748 if (rep) {
11749 rb_str_buf_cat(buf, rep, replen);
11750 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11751 }
11752 else {
11753 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11754 str_mod_check(str, sp, slen);
11755 repl = str_compat_and_valid(repl, enc);
11756 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11759 }
11760 }
11761 }
11762 else {
11763 /* ASCII incompatible */
11764 long mbminlen = rb_enc_mbminlen(enc);
11765 if (!replen) {
11766 rep = NULL;
11767 }
11768 else if (!NIL_P(repl)) {
11769 rep = RSTRING_PTR(repl);
11770 replen = RSTRING_LEN(repl);
11771 }
11772 else if (encidx == ENCINDEX_UTF_16BE) {
11773 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11774 }
11775 else if (encidx == ENCINDEX_UTF_16LE) {
11776 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11777 }
11778 else if (encidx == ENCINDEX_UTF_32BE) {
11779 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11780 }
11781 else if (encidx == ENCINDEX_UTF_32LE) {
11782 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11783 }
11784 else {
11785 DEFAULT_REPLACE_CHAR("?");
11786 }
11787
11788 while (p < e) {
11789 int ret = rb_enc_precise_mbclen(p, e, enc);
11790 if (MBCLEN_NEEDMORE_P(ret)) {
11791 break;
11792 }
11793 else if (MBCLEN_CHARFOUND_P(ret)) {
11794 p += MBCLEN_CHARFOUND_LEN(ret);
11795 }
11796 else if (MBCLEN_INVALID_P(ret)) {
11797 const char *q = p;
11798 long clen = rb_enc_mbmaxlen(enc);
11799 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11800 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11801
11802 if (e - p < clen) clen = e - p;
11803 if (clen <= mbminlen * 2) {
11804 clen = mbminlen;
11805 }
11806 else {
11807 clen -= mbminlen;
11808 for (; clen > mbminlen; clen-=mbminlen) {
11809 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11810 if (MBCLEN_NEEDMORE_P(ret)) break;
11811 if (MBCLEN_INVALID_P(ret)) continue;
11813 }
11814 }
11815 if (rep) {
11816 rb_str_buf_cat(buf, rep, replen);
11817 }
11818 else {
11819 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11820 str_mod_check(str, sp, slen);
11821 repl = str_compat_and_valid(repl, enc);
11822 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11823 }
11824 p += clen;
11825 p1 = p;
11826 }
11827 else {
11829 }
11830 }
11831 if (NIL_P(buf)) {
11832 if (p == e) {
11834 return Qnil;
11835 }
11836 buf = rb_str_buf_new(RSTRING_LEN(str));
11837 }
11838 if (p1 < p) {
11839 rb_str_buf_cat(buf, p1, p - p1);
11840 }
11841 if (p < e) {
11842 if (rep) {
11843 rb_str_buf_cat(buf, rep, replen);
11844 }
11845 else {
11846 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11847 str_mod_check(str, sp, slen);
11848 repl = str_compat_and_valid(repl, enc);
11849 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11850 }
11851 }
11853 }
11854 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11855 return buf;
11856}
11857
11858/*
11859 * call-seq:
11860 * scrub(replacement_string = default_replacement_string) -> new_string
11861 * scrub{|sequence| ... } -> new_string
11862 *
11863 * :include: doc/string/scrub.rdoc
11864 *
11865 */
11866static VALUE
11867str_scrub(int argc, VALUE *argv, VALUE str)
11868{
11869 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11870 VALUE new = rb_str_scrub(str, repl);
11871 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11872}
11873
11874/*
11875 * call-seq:
11876 * scrub!(replacement_string = default_replacement_string) -> self
11877 * scrub!{|sequence| ... } -> self
11878 *
11879 * Like String#scrub, except that:
11880 *
11881 * - Any replacements are made in +self+.
11882 * - Returns +self+.
11883 *
11884 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11885 *
11886 */
11887static VALUE
11888str_scrub_bang(int argc, VALUE *argv, VALUE str)
11889{
11890 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11891 VALUE new = rb_str_scrub(str, repl);
11892 if (!NIL_P(new)) rb_str_replace(str, new);
11893 return str;
11894}
11895
11896static ID id_normalize;
11897static ID id_normalized_p;
11898static VALUE mUnicodeNormalize;
11899
11900static VALUE
11901unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11902{
11903 static int UnicodeNormalizeRequired = 0;
11904 VALUE argv2[2];
11905
11906 if (!UnicodeNormalizeRequired) {
11907 rb_require("unicode_normalize/normalize.rb");
11908 UnicodeNormalizeRequired = 1;
11909 }
11910 argv2[0] = str;
11911 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11912 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11913}
11914
11915/*
11916 * call-seq:
11917 * unicode_normalize(form = :nfc) -> string
11918 *
11919 * Returns a copy of +self+ with
11920 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11921 *
11922 * Argument +form+ must be one of the following symbols
11923 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11924 *
11925 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11926 * - +:nfd+: Canonical decomposition.
11927 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11928 * - +:nfkd+: Compatibility decomposition.
11929 *
11930 * The encoding of +self+ must be one of:
11931 *
11932 * - Encoding::UTF_8
11933 * - Encoding::UTF_16BE
11934 * - Encoding::UTF_16LE
11935 * - Encoding::UTF_32BE
11936 * - Encoding::UTF_32LE
11937 * - Encoding::GB18030
11938 * - Encoding::UCS_2BE
11939 * - Encoding::UCS_4BE
11940 *
11941 * Examples:
11942 *
 *    "a\u0300".unicode_normalize        # => "à" (single codepoint U+00E0)
 *    "\u00E0".unicode_normalize(:nfd)   # => "à" (U+0061 followed by U+0300)
11945 *
11946 * Related: String#unicode_normalize!, String#unicode_normalized?.
11947 */
11948static VALUE
11949rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11950{
11951 return unicode_normalize_common(argc, argv, str, id_normalize);
11952}
11953
11954/*
11955 * call-seq:
11956 * unicode_normalize!(form = :nfc) -> self
11957 *
11958 * Like String#unicode_normalize, except that the normalization
11959 * is performed on +self+.
11960 *
 *  Related: String#unicode_normalized?.
11962 *
11963 */
11964static VALUE
11965rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11966{
11967 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11968}
11969
11970/* call-seq:
11971 * unicode_normalized?(form = :nfc) -> true or false
11972 *
11973 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11974 * +false+ otherwise.
11975 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11976 *
11977 * Examples:
11978 *
11979 * "a\u0300".unicode_normalized? # => false
11980 * "a\u0300".unicode_normalized?(:nfd) # => true
11981 * "\u00E0".unicode_normalized? # => true
11982 * "\u00E0".unicode_normalized?(:nfd) # => false
11983 *
11984 *
11985 * Raises an exception if +self+ is not in a Unicode encoding:
11986 *
11987 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11988 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11989 *
11990 * Related: String#unicode_normalize, String#unicode_normalize!.
11991 *
11992 */
11993static VALUE
11994rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11995{
11996 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11997}
11998
11999/**********************************************************************
12000 * Document-class: Symbol
12001 *
12002 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12003 *
12004 * You can create a +Symbol+ object explicitly with:
12005 *
12006 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12007 *
12008 * The same +Symbol+ object will be
12009 * created for a given name or string for the duration of a program's
12010 * execution, regardless of the context or meaning of that name. Thus
12011 * if <code>Fred</code> is a constant in one context, a method in
12012 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12013 * will be the same object in all three contexts.
12014 *
12015 * module One
12016 * class Fred
12017 * end
12018 * $f1 = :Fred
12019 * end
12020 * module Two
12021 * Fred = 1
12022 * $f2 = :Fred
12023 * end
12024 * def Fred()
12025 * end
12026 * $f3 = :Fred
12027 * $f1.object_id #=> 2514190
12028 * $f2.object_id #=> 2514190
12029 * $f3.object_id #=> 2514190
12030 *
12031 * Constant, method, and variable names are returned as symbols:
12032 *
12033 * module One
12034 * Two = 2
12035 * def three; 3 end
12036 * @four = 4
12037 * @@five = 5
12038 * $six = 6
12039 * end
12040 * seven = 7
12041 *
12042 * One.constants
12043 * # => [:Two]
12044 * One.instance_methods(true)
12045 * # => [:three]
12046 * One.instance_variables
12047 * # => [:@four]
12048 * One.class_variables
12049 * # => [:@@five]
12050 * global_variables.grep(/six/)
12051 * # => [:$six]
12052 * local_variables
12053 * # => [:seven]
12054 *
12055 * A +Symbol+ object differs from a String object in that
12056 * a +Symbol+ object represents an identifier, while a String object
12057 * represents text or data.
12058 *
12059 * == What's Here
12060 *
12061 * First, what's elsewhere. Class +Symbol+:
12062 *
12063 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12064 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12065 *
12066 * Here, class +Symbol+ provides methods that are useful for:
12067 *
12068 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12069 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12070 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12071 *
12072 * === Methods for Querying
12073 *
12074 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12075 * - #=~: Returns the index of the first substring in symbol that matches a
12076 * given Regexp or other object; returns +nil+ if no match is found.
12077 * - #[], #slice : Returns a substring of symbol
12078 * determined by a given index, start/length, or range, or string.
12079 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12080 * - #encoding: Returns the Encoding object that represents the encoding
12081 * of symbol.
12082 * - #end_with?: Returns +true+ if symbol ends with
12083 * any of the given strings.
12084 * - #match: Returns a MatchData object if symbol
12085 * matches a given Regexp; +nil+ otherwise.
12086 * - #match?: Returns +true+ if symbol
12087 * matches a given Regexp; +false+ otherwise.
12088 * - #length, #size: Returns the number of characters in symbol.
12089 * - #start_with?: Returns +true+ if symbol starts with
12090 * any of the given strings.
12091 *
12092 * === Methods for Comparing
12093 *
12094 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12095 * or larger than symbol.
12096 * - #==, #===: Returns +true+ if a given symbol has the same content and
12097 * encoding.
12098 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12099 * symbol is smaller than, equal to, or larger than symbol.
12100 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12101 * after Unicode case folding; +false+ otherwise.
12102 *
12103 * === Methods for Converting
12104 *
12105 * - #capitalize: Returns symbol with the first character upcased
12106 * and all other characters downcased.
12107 * - #downcase: Returns symbol with all characters downcased.
12108 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12109 * - #name: Returns the frozen string corresponding to symbol.
12110 * - #succ, #next: Returns the symbol that is the successor to symbol.
12111 * - #swapcase: Returns symbol with all upcase characters downcased
12112 * and all downcase characters upcased.
12113 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12114 * - #to_s, #id2name: Returns the string corresponding to +self+.
12115 * - #to_sym, #intern: Returns +self+.
12116 * - #upcase: Returns symbol with all characters upcased.
12117 *
12118 */
12119
12120
12121/*
12122 * call-seq:
12123 * symbol == object -> true or false
12124 *
12125 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12126 */
12127
12128#define sym_equal rb_obj_equal
12129
12130static int
12131sym_printable(const char *s, const char *send, rb_encoding *enc)
12132{
12133 while (s < send) {
12134 int n;
12135 int c = rb_enc_precise_mbclen(s, send, enc);
12136
12137 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12138 n = MBCLEN_CHARFOUND_LEN(c);
12139 c = rb_enc_mbc_to_codepoint(s, send, enc);
12140 if (!rb_enc_isprint(c, enc)) return FALSE;
12141 s += n;
12142 }
12143 return TRUE;
12144}
12145
12146int
12147rb_str_symname_p(VALUE sym)
12148{
12149 rb_encoding *enc;
12150 const char *ptr;
12151 long len;
12152 rb_encoding *resenc = rb_default_internal_encoding();
12153
12154 if (resenc == NULL) resenc = rb_default_external_encoding();
12155 enc = STR_ENC_GET(sym);
12156 ptr = RSTRING_PTR(sym);
12157 len = RSTRING_LEN(sym);
12158 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12159 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12160 return FALSE;
12161 }
12162 return TRUE;
12163}
12164
12165VALUE
12166rb_str_quote_unprintable(VALUE str)
12167{
12168 rb_encoding *enc;
12169 const char *ptr;
12170 long len;
12171 rb_encoding *resenc;
12172
12173 Check_Type(str, T_STRING);
12174 resenc = rb_default_internal_encoding();
12175 if (resenc == NULL) resenc = rb_default_external_encoding();
12176 enc = STR_ENC_GET(str);
12177 ptr = RSTRING_PTR(str);
12178 len = RSTRING_LEN(str);
12179 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12180 !sym_printable(ptr, ptr + len, enc)) {
12181 return rb_str_escape(str);
12182 }
12183 return str;
12184}
12185
12186VALUE
12187rb_id_quote_unprintable(ID id)
12188{
12189 VALUE str = rb_id2str(id);
12190 if (!rb_str_symname_p(str)) {
12191 return rb_str_escape(str);
12192 }
12193 return str;
12194}
12195
12196/*
12197 * call-seq:
12198 * inspect -> string
12199 *
12200 * Returns a string representation of +self+ (including the leading colon):
12201 *
12202 * :foo.inspect # => ":foo"
12203 *
12204 * Related: Symbol#to_s, Symbol#name.
12205 *
12206 */
12207
12208static VALUE
12209sym_inspect(VALUE sym)
12210{
12211 VALUE str = rb_sym2str(sym);
12212 const char *ptr;
12213 long len;
12214 char *dest;
12215
12216 if (!rb_str_symname_p(str)) {
12217 str = rb_str_inspect(str);
12218 len = RSTRING_LEN(str);
12219 rb_str_resize(str, len + 1);
12220 dest = RSTRING_PTR(str);
12221 memmove(dest + 1, dest, len);
12222 }
12223 else {
12224 rb_encoding *enc = STR_ENC_GET(str);
12225 VALUE orig_str = str;
12226
12227 len = RSTRING_LEN(orig_str);
12228 str = rb_enc_str_new(0, len + 1, enc);
12229
12230 // Get data pointer after allocation
12231 ptr = RSTRING_PTR(orig_str);
12232 dest = RSTRING_PTR(str);
12233 memcpy(dest + 1, ptr, len);
12234
12235 RB_GC_GUARD(orig_str);
12236 }
12237 dest[0] = ':';
12238
12240
12241 return str;
12242}
12243
12244VALUE
12246{
12247 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12248 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12249 return str;
12250}
12251
12252VALUE
12253rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12254{
12255 VALUE obj;
12256
12257 if (argc < 1) {
12258 rb_raise(rb_eArgError, "no receiver given");
12259 }
12260 obj = argv[0];
12261 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12262}
12263
12264/*
12265 * call-seq:
12266 * succ
12267 *
12268 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12269 *
12270 * :foo.succ # => :fop
12271 *
12272 * Related: String#succ.
12273 */
12274
12275static VALUE
12276sym_succ(VALUE sym)
12277{
12278 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12279}
12280
12281/*
12282 * call-seq:
12283 * symbol <=> object -> -1, 0, +1, or nil
12284 *
12285 * If +object+ is a symbol,
12286 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12287 *
12288 * :bar <=> :foo # => -1
12289 * :foo <=> :foo # => 0
12290 * :foo <=> :bar # => 1
12291 *
12292 * Otherwise, returns +nil+:
12293 *
12294 * :foo <=> 'bar' # => nil
12295 *
12296 * Related: String#<=>.
12297 */
12298
12299static VALUE
12300sym_cmp(VALUE sym, VALUE other)
12301{
12302 if (!SYMBOL_P(other)) {
12303 return Qnil;
12304 }
12305 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12306}
12307
12308/*
12309 * call-seq:
12310 * casecmp(object) -> -1, 0, 1, or nil
12311 *
12312 * :include: doc/symbol/casecmp.rdoc
12313 *
12314 */
12315
12316static VALUE
12317sym_casecmp(VALUE sym, VALUE other)
12318{
12319 if (!SYMBOL_P(other)) {
12320 return Qnil;
12321 }
12322 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12323}
12324
12325/*
12326 * call-seq:
12327 * casecmp?(object) -> true, false, or nil
12328 *
12329 * :include: doc/symbol/casecmp_p.rdoc
12330 *
12331 */
12332
12333static VALUE
12334sym_casecmp_p(VALUE sym, VALUE other)
12335{
12336 if (!SYMBOL_P(other)) {
12337 return Qnil;
12338 }
12339 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12340}
12341
12342/*
12343 * call-seq:
12344 * symbol =~ object -> integer or nil
12345 *
12346 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12347 * including possible updates to global variables;
12348 * see String#=~.
12349 *
12350 */
12351
12352static VALUE
12353sym_match(VALUE sym, VALUE other)
12354{
12355 return rb_str_match(rb_sym2str(sym), other);
12356}
12357
12358/*
12359 * call-seq:
12360 * match(pattern, offset = 0) -> matchdata or nil
12361 * match(pattern, offset = 0) {|matchdata| } -> object
12362 *
12363 * Equivalent to <tt>self.to_s.match</tt>,
12364 * including possible updates to global variables;
12365 * see String#match.
12366 *
12367 */
12368
12369static VALUE
12370sym_match_m(int argc, VALUE *argv, VALUE sym)
12371{
12372 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12373}
12374
/*
 *  call-seq:
 *    match?(pattern, offset = 0) -> true or false
 *
 *  Equivalent to <tt>self.to_s.match?</tt>;
 *  see String#match?.
 *
 */
12383
12384static VALUE
12385sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12386{
12387 return rb_str_match_m_p(argc, argv, sym);
12388}
12389
12390/*
12391 * call-seq:
12392 * symbol[index] -> string or nil
12393 * symbol[start, length] -> string or nil
12394 * symbol[range] -> string or nil
12395 * symbol[regexp, capture = 0] -> string or nil
12396 * symbol[substring] -> string or nil
12397 *
12398 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12399 *
12400 */
12401
12402static VALUE
12403sym_aref(int argc, VALUE *argv, VALUE sym)
12404{
12405 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12406}
12407
12408/*
12409 * call-seq:
12410 * length -> integer
12411 *
12412 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12413 */
12414
12415static VALUE
12416sym_length(VALUE sym)
12417{
12418 return rb_str_length(rb_sym2str(sym));
12419}
12420
12421/*
12422 * call-seq:
12423 * empty? -> true or false
12424 *
12425 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12426 *
12427 */
12428
12429static VALUE
12430sym_empty(VALUE sym)
12431{
12432 return rb_str_empty(rb_sym2str(sym));
12433}
12434
12435/*
12436 * call-seq:
12437 * upcase(mapping) -> symbol
12438 *
12439 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12440 *
12441 * See String#upcase.
12442 *
12443 */
12444
12445static VALUE
12446sym_upcase(int argc, VALUE *argv, VALUE sym)
12447{
12448 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12449}
12450
12451/*
12452 * call-seq:
12453 * downcase(mapping) -> symbol
12454 *
12455 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12456 *
12457 * See String#downcase.
12458 *
12459 * Related: Symbol#upcase.
12460 *
12461 */
12462
12463static VALUE
12464sym_downcase(int argc, VALUE *argv, VALUE sym)
12465{
12466 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12467}
12468
12469/*
12470 * call-seq:
12471 * capitalize(mapping) -> symbol
12472 *
12473 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12474 *
12475 * See String#capitalize.
12476 *
12477 */
12478
12479static VALUE
12480sym_capitalize(int argc, VALUE *argv, VALUE sym)
12481{
12482 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12483}
12484
12485/*
12486 * call-seq:
12487 * swapcase(mapping) -> symbol
12488 *
12489 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12490 *
12491 * See String#swapcase.
12492 *
12493 */
12494
12495static VALUE
12496sym_swapcase(int argc, VALUE *argv, VALUE sym)
12497{
12498 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12499}
12500
12501/*
12502 * call-seq:
12503 * start_with?(*string_or_regexp) -> true or false
12504 *
12505 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12506 *
12507 */
12508
12509static VALUE
12510sym_start_with(int argc, VALUE *argv, VALUE sym)
12511{
12512 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12513}
12514
12515/*
12516 * call-seq:
12517 * end_with?(*strings) -> true or false
12518 *
12519 *
12520 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12521 *
12522 */
12523
12524static VALUE
12525sym_end_with(int argc, VALUE *argv, VALUE sym)
12526{
12527 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12528}
12529
12530/*
12531 * call-seq:
12532 * encoding -> encoding
12533 *
12534 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12535 *
12536 */
12537
12538static VALUE
12539sym_encoding(VALUE sym)
12540{
12541 return rb_obj_encoding(rb_sym2str(sym));
12542}
12543
12544static VALUE
12545string_for_symbol(VALUE name)
12546{
12547 if (!RB_TYPE_P(name, T_STRING)) {
12548 VALUE tmp = rb_check_string_type(name);
12549 if (NIL_P(tmp)) {
12550 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12551 name);
12552 }
12553 name = tmp;
12554 }
12555 return name;
12556}
12557
12558ID
12560{
12561 if (SYMBOL_P(name)) {
12562 return SYM2ID(name);
12563 }
12564 name = string_for_symbol(name);
12565 return rb_intern_str(name);
12566}
12567
12568VALUE
12570{
12571 if (SYMBOL_P(name)) {
12572 return name;
12573 }
12574 name = string_for_symbol(name);
12575 return rb_str_intern(name);
12576}
12577
12578/*
12579 * call-seq:
12580 * Symbol.all_symbols -> array_of_symbols
12581 *
12582 * Returns an array of all symbols currently in Ruby's symbol table:
12583 *
12584 * Symbol.all_symbols.size # => 9334
12585 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12586 *
12587 */
12588
12589static VALUE
12590sym_all_symbols(VALUE _)
12591{
12592 return rb_sym_all_symbols();
12593}
12594
12595VALUE
12596rb_str_to_interned_str(VALUE str)
12597{
12598 return rb_fstring(str);
12599}
12600
12601VALUE
12602rb_interned_str(const char *ptr, long len)
12603{
12604 struct RString fake_str = {RBASIC_INIT};
12605 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12606}
12607
12608VALUE
12610{
12611 return rb_interned_str(ptr, strlen(ptr));
12612}
12613
12614VALUE
12615rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12616{
12617 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12618 rb_enc_autoload(enc);
12619 }
12620
12621 struct RString fake_str = {RBASIC_INIT};
12622 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12623}
12624
12625VALUE
12626rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12627{
12628 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12629 rb_enc_autoload(enc);
12630 }
12631
12632 struct RString fake_str = {RBASIC_INIT};
12633 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12634 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12635 return str;
12636}
12637
12638VALUE
12640{
12641 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12642}
12643
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is in the ASCII-8BIT
 * encoding and the code is in the range 0...0xff, the byte is appended
 * directly; every other case goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t byte = RB_NUM2SSIZE(codepoint);
        /* The upper bound is exclusive: 0xff itself takes the general path. */
        const int fast = (byte >= 0 && byte < 0xff);

        if (RB_LIKELY(fast)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12660
12661static int
12662fstring_set_class_i(VALUE *str, void *data)
12663{
12664 RBASIC_SET_CLASS(*str, rb_cString);
12665
12666 return ST_CONTINUE;
12667}
12668
12669void
12670Init_String(void)
12671{
12672 rb_cString = rb_define_class("String", rb_cObject);
12673
12674 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12675
12677 rb_define_alloc_func(rb_cString, empty_str_alloc);
12678 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12679 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12680 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12682 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12683 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12686 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12687 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12688 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12689 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12692 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12693 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12694 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12695 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12698 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12699 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12700 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12701 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12702 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12704 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12706 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12707 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12708 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12709 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12710 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12711 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12712 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12713 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12714 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12715 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12716 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12717 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12718 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12719 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12721 rb_define_method(rb_cString, "+@", str_uplus, 0);
12722 rb_define_method(rb_cString, "-@", str_uminus, 0);
12723 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12724 rb_define_alias(rb_cString, "dedup", "-@");
12725
12726 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12727 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12728 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12729 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12732 rb_define_method(rb_cString, "undump", str_undump, 0);
12733
12734 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12735 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12736 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12737 sym_fold = ID2SYM(rb_intern_const("fold"));
12738
12739 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12740 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12741 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12742 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12743
12744 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12745 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12746 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12747 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12748
12749 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12750 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12751 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12752 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12753 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12754 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12755 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12756 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12757 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12758 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12759 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12760 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12762 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12763 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12764 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12765 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12766 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12767
12768 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12769 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12770 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12771
12772 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12773
12774 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12775 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12776 rb_define_method(rb_cString, "center", rb_str_center, -1);
12777
12778 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12779 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12780 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12781 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12782 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12783 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12784 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12785 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12786 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12787
12788 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12789 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12790 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12791 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12792 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12793 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12794 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12795 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12796 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12797
12798 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12799 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12800 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12801 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12802 rb_define_method(rb_cString, "count", rb_str_count, -1);
12803
12804 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12805 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12806 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12807 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12808
12809 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12810 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12811 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12812 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12813 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12814
12815 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12816
12817 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12818 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12819
12820 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12821 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12822
12823 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12824 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12825 rb_define_method(rb_cString, "b", rb_str_b, 0);
12826 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12827 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12828
12829 /* define UnicodeNormalize module here so that we don't have to look it up */
12830 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12831 id_normalize = rb_intern_const("normalize");
12832 id_normalized_p = rb_intern_const("normalized?");
12833
12834 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12835 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12836 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12837
12838 rb_fs = Qnil;
12839 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12840 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12841 rb_gc_register_address(&rb_fs);
12842
12843 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12847 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12848
12849 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12850 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12851 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12852 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12853 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12854 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12855
12856 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12857 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12858 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12859 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12860
12861 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12862 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12863 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12864 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12865 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12866 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12867 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12868
12869 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12870 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12871 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12872 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12873
12874 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12875 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12876
12877 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12878}
12879
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1810
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1603
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1716
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2962
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2782
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3252
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1037
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3041
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:676
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2164
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2182
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1341
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3578
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:583
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:177
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1329
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1336
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:941
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1201
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3022
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1220
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12615
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2328
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3726
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1149
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1441
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1342
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:960
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12639
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:825
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:710
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2005
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2011
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1927
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1746
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1506
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2481
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3791
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1417
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12245
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2554
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1393
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1740
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3050
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5327
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4154
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3147
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11540
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1782
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1183
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:995
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1512
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1990
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4140
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3559
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2417
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2008
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6560
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3155
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12609
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1423
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3757
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3097
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4261
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3381
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7235
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2784
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12602
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4208
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4028
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4183
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3733
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3272
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5837
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11598
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1696
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2944
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3244
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3363
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1195
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2738
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7342
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1405
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1712
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2431
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5755
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9380
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1189
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1844
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1986
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2065
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3361
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1624
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12569
ID rb_to_id(VALUE str)
Definition string.c:12559
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1435
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2921
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2803
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1429
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2816
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1773
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:455
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1479
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition string.c:8268
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113