Ruby 4.0.0dev (2025-12-23 revision 515119541095bcb84cb8d85db644d836eeeeef33)
string.c (515119541095bcb84cb8d85db644d836eeeeef33)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthand for entry `no` of regs->beg / regs->end. Both macros expand to an
 * access through a variable named `regs`, which must be in scope at the point
 * of use. */
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* File-local aliases for the RString flag bits described in the table above.
 * RUBY_MAX_CHAR_LEN is not a flag.
 * NOTE(review): presumably an upper bound on the byte length of one character
 * -- confirm against its users. */
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* STR_SET_NOEMBED: set STR_NOEMBED and clear STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED on `str`. Evaluates `str` twice.
 * STR_SET_EMBED: clear STR_NOEMBED, STR_SHARED and STR_NOFREE on `str`. Only
 * the flags are touched; the caller is responsible for the string contents.
 * STR_SET_LEN: store `n` into the len field of `str`; no terminator is
 * written here. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* TERM_LEN(str): terminator width in bytes for `str`: 1 for the fast-path
 * encodings, otherwise rb_enc_mbminlen() of the string's encoding.
 * TERM_FILL(ptr, termlen): write the terminator at `ptr`. The first byte is
 * always zeroed; the memset only runs for terminators wider than one byte.
 * Each argument is evaluated exactly once. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* RESIZE_CAPA(str, capacity): resize the buffer of `str` to hold `capacity`
 * bytes plus the terminator of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,(capacity),termlen);\
} while (0)
/* RESIZE_CAPA_TERM(str, capacity, termlen): as above with an explicit
 * terminator width.
 *  - Embedded string: nothing happens while `capacity + termlen` still fits
 *    in the slot; otherwise the bytes are copied to a fresh heap buffer and
 *    the string becomes non-embedded.
 *  - Heap string: the buffer is reallocated; the string must not be shared.
 * The arguments are evaluated more than once, so pass side-effect-free
 * expressions. Every use of a macro parameter in arithmetic is parenthesized
 * so that compound argument expressions keep their meaning. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* STR_SET_SHARED(str, shared_str): record `shared_str` as the shared root of
 * `str`. Does nothing when `str` carries STR_FAKESTR. Otherwise it asserts
 * that the pointer of `str` lies within the buffer of `shared_str`, stores
 * the reference with RB_OBJ_WRITE, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`, and additionally marks `shared_str` as
 * STR_BORROWED when its class is 0. Both arguments are evaluated several
 * times. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
211 } \
212} while (0)
213
/* STR_HEAP_PTR: the heap buffer pointer field of `str`.
 * STR_HEAP_SIZE: the capa field plus the terminator width, i.e. the size that
 * RESIZE_CAPA_TERM passes to SIZED_REALLOC_N as the old size. */
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
216/* TODO: include the terminator size in capa. */
217
/* Encoding object of `str`; see get_encoding(). */
218#define STR_ENC_GET(str) get_encoding(str)
219
/* SHARABLE_SUBSTRING_P(beg, len, end): unless SHARABLE_MIDDLE_SUBSTRING is
 * enabled (it defaults to 0), true only when the range starting at `beg` with
 * length `len` stops exactly at `end`. With SHARABLE_MIDDLE_SUBSTRING enabled
 * it is always true. */
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
222#endif
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#else
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
227#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/* True when an embedded string of `len` bytes plus a `termlen`-byte terminator
 * has a size that rb_gc_size_allocatable_p() accepts. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);

    return rb_gc_size_allocatable_p(object_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/* Print a diagnostic to stderr stating that `func` is about to return NULL,
 * with a hint on how to investigate. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
367#if SIZEOF_LONG == SIZEOF_VOIDP
368#define PRECOMPUTED_FAKESTR_HASH 1
369#else
370#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
/* Return the fstring (deduplicated, frozen string) for `str`.
 *
 * `str` must be a T_STRING (Check_Type). A string already flagged
 * RSTRING_FSTR is returned unchanged.
 *
 * For a non-bare string (see BARE_STRING_P):
 *  - an embedded one is frozen and returned as is;
 *  - one that is a shared root but not itself shared is returned as is;
 *  - otherwise, after registration, its contents are replaced by a share of
 *    the registered fstring, it is frozen, and `str` itself is returned.
 * For a bare string the registered fstring is returned.
 *
 * NOTE(review): the embedded listing numbers skip 427, so one line of the
 * shared-root branch is not visible here. */
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
/* Resize to the current length unless the string is frozen, NOFREE or chilled.
 * NOTE(review): presumably this trims unused capacity -- confirm against
 * rb_str_resize. */
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
/* Comparison callback of the fstring table. Despite the "cmp" name it returns
 * an equality result: true when `a` and `b` have the same byte length, the
 * same encoding index and identical bytes.
 *
 * NOTE(review): the embedded listing numbers skip 473-474, so up to two lines
 * before the RSTRING_GETMEM calls are not visible here. */
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options passed from register_fstring to fstring_concurrent_set_create.
 * NOTE(review): the line opening this struct (listing line 483) is not visible
 * in this view; the tag `fstr_create_arg` is taken from its users. */
/* copy: for a fake string, copy the bytes into a new string instead of
 * pointing a static string at them. */
484 bool copy;
/* force_precompute_hash: when copying a fake string that fits an embedded
 * slot together with a hash, store the precomputed hash in the new string. */
485 bool force_precompute_hash;
486};
487
/* Creation callback of the fstring table: build the string object that will
 * be stored for the key `str`. `data` points to a struct fstr_create_arg.
 *
 * Fake string key (STR_FAKESTR):
 *  - arg->copy set: a new string owning a copy of the bytes is made. When
 *    arg->force_precompute_hash is set and the bytes plus a hash fit an
 *    embedded slot, the string is allocated embedded and its hash is stored
 *    behind the terminator. Otherwise str_new() is used, and (with
 *    PRECOMPUTED_FAKESTR_HASH) the hash cached in the fake string's capa field
 *    is stored only if the new string has room for it.
 *  - arg->copy clear: a static string pointing at the same bytes is made.
 *  The result is frozen.
 * Real string key: an unfrozen or chilled string is replaced by a frozen copy;
 * a shared string is made independent; a non-bare string is replaced by a
 * frozen rb_cString copy.
 *
 * Finally the coderange captured on entry is reapplied, RSTRING_FSTR is set,
 * and the string is made shareable if it is not already.
 *
 * NOTE(review): the embedded listing numbers skip 536, 549-550 and 553, so a
 * few lines (probably assertions) are not visible here. */
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
/* capa reserves room for the hash stored after the terminator. */
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
/* Callback table handed to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and creation are the three functions defined above; no
 * free callback is installed. */
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
/* Find `str` in the fstring table, or insert it, and return the table entry.
 *
 * `copy` and `force_precompute_hash` are forwarded to
 * fstring_concurrent_set_create through a struct fstr_create_arg; see that
 * function for their effect on fake strings.
 *
 * When pointers and longs have the same size, the hash of a fake string is
 * computed once up front and cached in its capa field, where
 * fstring_concurrent_set_hash and fstring_concurrent_set_create read it back.
 *
 * NOTE(review): the embedded listing numbers skip 592, 594 and 597, so some
 * of the trailing assertions are not visible here. */
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
/* Remove the fstring `obj` from the fstring table by identity, bump the
 * obj_str_fstr debug counter, and clear RSTRING_FSTR on it.
 *
 * Asserts that the VM lock is held with a barrier, that `obj` is flagged
 * RSTRING_FSTR, and that it is not shared.
 *
 * NOTE(review): the embedded listing numbers skip 616, so one line between
 * the two flag assertions is not visible here. */
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
617 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
618
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
620
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
622
623 FL_UNSET(obj, RSTRING_FSTR);
624}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
/* Initialize the caller-provided `fake_str` header as a non-embedded, NOFREE,
 * FAKESTR string of class rb_cString that points at `name` (`len` bytes) with
 * encoding index `encidx`, and return it as a VALUE.
 *
 * No bytes are copied, and capa is set to `len`. A NULL `name` is replaced by
 * the empty string literal.
 *
 * NOTE(review): the embedded listing numbers skip 641, so one line inside the
 * NULL-name branch is not visible here. */
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
/* Tell whether `str` can be treated as one byte per character.
 *
 * ASCII-8BIT and US-ASCII strings always qualify. Other encodings qualify
 * when the cached coderange is 7bit or when the encoding's maximum character
 * length is 1. The answer is conservative: false may be returned for a string
 * that does in fact contain only single-byte characters.
 *
 * NOTE(review): the embedded listing numbers skip 698, so the statement of
 * the UTF-8 case is not visible here. */
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
/* Return a pointer to the first byte in [p, e) whose high bit is set, or NULL
 * when every byte is 7-bit.
 *
 * The scan has three phases:
 *  1. when unaligned word access is not allowed, the leading bytes up to the
 *     next word boundary are checked one by one;
 *  2. whole words are tested against NONASCII_MASK, and the first offending
 *     byte inside a matching word is located by counting leading or trailing
 *     zero bits, depending on endianness;
 *  3. the remaining tail of fewer than SIZEOF_VOIDP bytes is checked one by
 *     one.
 * Both byte-wise switches rely on case fallthrough, ordered so that the
 * lowest address is tested first. */
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const char *s, *t;
720
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
/* Phase 1: advance p to the next word boundary, checking the skipped bytes. */
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (value)
764#endif
765 s = aligned_ptr(p);
/* t bounds the word loop so that a full word is always available at s. */
766 t = (e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
/* Phase 2: one word per iteration; memcpy keeps the load alias-safe. */
768 for (;s < t; s += sizeof(uintptr_t)) {
769 uintptr_t word;
770 memcpy(&word, s, sizeof(word));
771 if (word & NONASCII_MASK) {
772#ifdef WORDS_BIGENDIAN
773 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
774#else
775 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
776#endif
777 }
778 }
779 p = (const char *)s;
780 }
781
/* Phase 3: the remaining bytes, addressed backwards from e. */
782 switch (e - p) {
783 default: UNREACHABLE;
784#if SIZEOF_VOIDP > 4
785 case 7: if (e[-7]&0x80) return e-7;
786 case 6: if (e[-6]&0x80) return e-6;
787 case 5: if (e[-5]&0x80) return e-5;
788 case 4: if (e[-4]&0x80) return e-4;
789#endif
790 case 3: if (e[-3]&0x80) return e-3;
791 case 2: if (e[-2]&0x80) return e-2;
792 case 1: if (e[-1]&0x80) return e-1;
793 case 0: return NULL;
794 }
795}
796
/* Compute the coderange of the `len` bytes at `p` under encoding `enc`.
 *
 * For an ASCII-compatible encoding, a buffer without any non-ASCII byte
 * yields ENC_CODERANGE_7BIT. Otherwise each character starting at a non-ASCII
 * byte is measured with rb_enc_precise_mbclen(), and runs of ASCII bytes
 * between them are skipped with search_nonascii(). For other encodings every
 * character is measured in turn. Reaching the end yields
 * ENC_CODERANGE_VALID.
 *
 * NOTE(review): the embedded listing numbers skip 805, 813 and 823. The
 * return of the ASCII-8BIT branch and the checks on the result of
 * rb_enc_precise_mbclen() (presumably the BROKEN exits) are not visible
 * here. */
797static int
798coderange_scan(const char *p, long len, rb_encoding *enc)
799{
800 const char *e = p + len;
801
802 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
803 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
804 p = search_nonascii(p, e);
806 }
807
808 if (rb_enc_asciicompat(enc)) {
809 p = search_nonascii(p, e);
810 if (!p) return ENC_CODERANGE_7BIT;
811 for (;;) {
812 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p += MBCLEN_CHARFOUND_LEN(ret);
815 if (p == e) break;
816 p = search_nonascii(p, e);
817 if (!p) break;
818 }
819 }
820 else {
821 while (p < e) {
822 int ret = rb_enc_precise_mbclen(p, e, enc);
824 p += MBCLEN_CHARFOUND_LEN(ret);
825 }
826 }
827 return ENC_CODERANGE_VALID;
828}
829
/* Scan the bytes in [s, e) under `enc`, refining the coderange in `*cr`, and
 * return how many bytes were scanned.
 *
 * `*cr` carries the coderange accumulated so far. If it is already
 * ENC_CODERANGE_BROKEN, nothing is scanned and the full length is returned.
 * When a character cannot be recognized by rb_enc_precise_mbclen(), the
 * offset of that character from `s` is returned. Otherwise the full length
 * is returned.
 *
 * NOTE(review): the embedded listing numbers skip 842, 854, 867 and 873. The
 * assignments to `*cr` in the ASCII-8BIT branch, on the two early exits and
 * before the final return are not visible here. */
830long
831rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
832{
833 const char *p = s;
834
835 if (*cr == ENC_CODERANGE_BROKEN)
836 return e - s;
837
838 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
839 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
840 if (*cr == ENC_CODERANGE_VALID) return e - s;
841 p = search_nonascii(p, e);
843 return e - s;
844 }
845 else if (rb_enc_asciicompat(enc)) {
846 p = search_nonascii(p, e);
847 if (!p) {
/* All ASCII: keep an earlier VALID verdict, otherwise the chunk is 7bit. */
848 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
849 return e - s;
850 }
851 for (;;) {
852 int ret = rb_enc_precise_mbclen(p, e, enc);
853 if (!MBCLEN_CHARFOUND_P(ret)) {
855 return p - s;
856 }
857 p += MBCLEN_CHARFOUND_LEN(ret);
858 if (p == e) break;
859 p = search_nonascii(p, e);
860 if (!p) break;
861 }
862 }
863 else {
864 while (p < e) {
865 int ret = rb_enc_precise_mbclen(p, e, enc);
866 if (!MBCLEN_CHARFOUND_P(ret)) {
868 return p - s;
869 }
870 p += MBCLEN_CHARFOUND_LEN(ret);
871 }
872 }
874 return e - s;
875}
876
877static inline void
878str_enc_copy(VALUE str1, VALUE str2)
879{
880 rb_enc_set_index(str1, ENCODING_GET(str2));
881}
882
883/* Like str_enc_copy, but does not check frozen status of str1.
884 * You should use this only if you're certain that str1 is not frozen. */
885static inline void
886str_enc_copy_direct(VALUE str1, VALUE str2)
887{
888 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
889 if (inlined_encoding == ENCODING_INLINE_MAX) {
890 rb_enc_set_index(str1, rb_enc_get_index(str2));
891 }
892 else {
893 ENCODING_SET_INLINED(str1, inlined_encoding);
894 }
895}
896
/* Copy the encoding of `src` to `dest`, where `dest` was made from a part of
 * `src`, and derive the coderange of `dest`.
 *
 * NOTE(review): the embedded listing numbers skip 906, 908, 912-913, 915, 918
 * and 920. The statements that set the coderange and the case labels of the
 * switch are not visible here. What is visible: an empty `dest` is handled
 * separately depending on whether the encoding is ASCII-compatible, and one
 * switch case re-scans `dest` with search_nonascii() for ASCII-compatible
 * encodings. */
897static void
898rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
899{
900 /* this function is designed for copying encoding and coderange
901 * from src to new string "dest" which is made from the part of src.
902 */
903 str_enc_copy(dest, src);
904 if (RSTRING_LEN(dest) == 0) {
905 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
907 else
909 return;
910 }
911 switch (ENC_CODERANGE(src)) {
914 break;
916 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
917 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
919 else
921 break;
922 default:
923 break;
924 }
925}
926
/* Copy the encoding of `src` to `dest`.
 * NOTE(review): the embedded listing numbers skip 931. Judging by the
 * function name, the missing line copies the coderange as well -- confirm
 * against the full source. */
927static void
928rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
929{
930 str_enc_copy(dest, src);
932}
933
934static int
935enc_coderange_scan(VALUE str, rb_encoding *enc)
936{
937 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
938}
939
940int
941rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
942{
943 return enc_coderange_scan(str, enc);
944}
945
/* Return the coderange of `str`. When the cached value is
 * ENC_CODERANGE_UNKNOWN the string is scanned under its own encoding and the
 * result is stored back on the string.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 947) is not visible in this view. */
946int
948{
949 int cr = ENC_CODERANGE(str);
950
951 if (cr == ENC_CODERANGE_UNKNOWN) {
952 cr = enc_coderange_scan(str, get_encoding(str));
953 ENC_CODERANGE_SET(str, cr);
954 }
955 return cr;
956}
957
958static inline bool
959rb_enc_str_asciicompat(VALUE str)
960{
961 int encindex = ENCODING_GET_INLINED(str);
962 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
963}
964
/* Predicate on the coderange of `str`: one case checks that the string is
 * ASCII-compatible and is_ascii_string(), another returns true outright, and
 * every other coderange yields false.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 966) and both case labels (969, 971) are not visible in this
 * view, so which coderange maps to which branch cannot be confirmed here. */
965int
967{
968 switch(ENC_CODERANGE(str)) {
970 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
972 return true;
973 default:
974 return false;
975 }
976}
977
978static inline void
979str_mod_check(VALUE s, const char *p, long len)
980{
981 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
982 rb_raise(rb_eRuntimeError, "string modified");
983 }
984}
985
986static size_t
987str_capacity(VALUE str, const int termlen)
988{
989 if (STR_EMBED_P(str)) {
990 return str_embed_capa(str) - termlen;
991 }
992 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
993 return RSTRING(str)->len;
994 }
995 else {
996 return RSTRING(str)->as.heap.aux.capa;
997 }
998}
999
/* Return str_capacity() of `str` using the terminator width of its encoding.
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1001) is not visible in this view. */
1000size_t
1002{
1003 return str_capacity(str, TERM_LEN(str));
1004}
1005
1006static inline void
1007must_not_null(const char *ptr)
1008{
1009 if (!ptr) {
1010 rb_raise(rb_eArgError, "NULL pointer given");
1011 }
1012}
1013
/* Allocate an embedded string object of class `klass` sized for `capa` bytes.
 * `capa` is passed to rb_str_embed_size() with a terminator length of 0, so
 * callers that need room for a terminator must include it in `capa`. The new
 * string has length 0 and a zero first byte.
 *
 * NOTE(review): the embedded listing numbers skip 1022, so the rest of the
 * NEWOBJ_OF argument list (flags and size) is not visible here. */
1014static inline VALUE
1015str_alloc_embed(VALUE klass, size_t capa)
1016{
1017 size_t size = rb_str_embed_size(capa, 0);
1018 RUBY_ASSERT(size > 0);
1019 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1020
1021 NEWOBJ_OF(str, struct RString, klass,
1023
1024 str->len = 0;
1025 str->as.embed.ary[0] = 0;
1026
1027 return (VALUE)str;
1028}
1029
1030static inline VALUE
1031str_alloc_heap(VALUE klass)
1032{
1033 NEWOBJ_OF(str, struct RString, klass,
1034 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1035
1036 str->len = 0;
1037 str->as.heap.aux.capa = 0;
1038 str->as.heap.ptr = NULL;
1039
1040 return (VALUE)str;
1041}
1042
/* Allocate an empty embedded string of class `klass`, fire the string
 * creation DTrace hook with size 0, and zero the whole embedded area.
 *
 * NOTE(review): the embedded listing numbers skip 1049, so one line before
 * the return is not visible here. */
1043static inline VALUE
1044empty_str_alloc(VALUE klass)
1045{
1046 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1047 VALUE str = str_alloc_embed(klass, 0);
1048 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1050 return str;
1051}
1052
1053static VALUE
1054str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1055{
1056 VALUE str;
1057
1058 if (len < 0) {
1059 rb_raise(rb_eArgError, "negative string size (or size too big)");
1060 }
1061
1062 if (enc == NULL) {
1063 enc = rb_ascii8bit_encoding();
1064 }
1065
1066 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1067
1068 int termlen = rb_enc_mbminlen(enc);
1069
1070 if (STR_EMBEDDABLE_P(len, termlen)) {
1071 str = str_alloc_embed(klass, len + termlen);
1072 if (len == 0) {
1073 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1074 }
1075 }
1076 else {
1077 str = str_alloc_heap(klass);
1078 RSTRING(str)->as.heap.aux.capa = len;
1079 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1080 * integer overflow. If we can STATIC_ASSERT that, the following
1081 * mul_add_mul can be reverted to a simple ALLOC_N. */
1082 RSTRING(str)->as.heap.ptr =
1083 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1084 }
1085
1086 rb_enc_raw_set(str, enc);
1087
1088 if (ptr) {
1089 memcpy(RSTRING_PTR(str), ptr, len);
1090 }
1091 else {
1092 memset(RSTRING_PTR(str), 0, len);
1093 }
1094
1095 STR_SET_LEN(str, len);
1096 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1097 return str;
1098}
1099
1100static VALUE
1101str_new(VALUE klass, const char *ptr, long len)
1102{
1103 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1104}
1105
1106VALUE
1107rb_str_new(const char *ptr, long len)
1108{
1109 return str_new(rb_cString, ptr, len);
1110}
1111
1112VALUE
1113rb_usascii_str_new(const char *ptr, long len)
1114{
1115 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1116}
1117
1118VALUE
1119rb_utf8_str_new(const char *ptr, long len)
1120{
1121 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1122}
1123
1124VALUE
1125rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1126{
1127 return str_enc_new(rb_cString, ptr, len, enc);
1128}
1129
/* Create a String from the NUL-terminated C string `ptr`. A NULL `ptr` is
 * rejected by must_not_null(); the length comes from strlen().
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1131) is not visible in this view. */
1130VALUE
1132{
1133 must_not_null(ptr);
1134 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1135 * memory regions, and that cannot be detected by the MSAN. Just
1136 * trust the programmer that the argument passed here is a sane C
1137 * string. */
1138 __msan_unpoison_string(ptr);
1139 return rb_str_new(ptr, strlen(ptr));
1140}
1141
/* Create a US-ASCII String from a C string by delegating to
 * rb_enc_str_new_cstr().
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1143) is not visible in this view. */
1142VALUE
1144{
1145 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1146}
1147
/* Create a UTF-8 String from a C string by delegating to
 * rb_enc_str_new_cstr().
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1149) is not visible in this view. */
1148VALUE
1150{
1151 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1152}
1153
/* Create a String in encoding `enc` from the NUL-terminated C string `ptr`.
 * Raises ArgumentError for a NULL `ptr` and for encodings whose minimum
 * character length is not 1.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1155) is not visible in this view. */
1154VALUE
1156{
1157 must_not_null(ptr);
1158 if (rb_enc_mbminlen(enc) != 1) {
1159 rb_raise(rb_eArgError, "wchar encoding given");
1160 }
1161 return rb_enc_str_new(ptr, strlen(ptr), enc);
1162}
1163
1164static VALUE
1165str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1166{
1167 VALUE str;
1168
1169 if (len < 0) {
1170 rb_raise(rb_eArgError, "negative string size (or size too big)");
1171 }
1172
1173 if (!ptr) {
1174 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1175 }
1176 else {
1177 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1178 str = str_alloc_heap(klass);
1179 RSTRING(str)->len = len;
1180 RSTRING(str)->as.heap.ptr = (char *)ptr;
1181 RSTRING(str)->as.heap.aux.capa = len;
1182 RBASIC(str)->flags |= STR_NOFREE;
1183 rb_enc_associate_index(str, encindex);
1184 }
1185 return str;
1186}
1187
1188VALUE
1189rb_str_new_static(const char *ptr, long len)
1190{
1191 return str_new_static(rb_cString, ptr, len, 0);
1192}
1193
/* Create a US-ASCII String that points at a static buffer; see
 * str_new_static().
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1195) is not visible in this view. */
1194VALUE
1196{
1197 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1198}
1199
/* Create a UTF-8 String that points at a static buffer; see
 * str_new_static().
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1201) is not visible in this view. */
1200VALUE
1202{
1203 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1204}
1205
/* Create a String in encoding `enc` that points at a static buffer; see
 * str_new_static().
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1207) is not visible in this view. */
1206VALUE
1208{
1209 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1210}
1211
1212static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1213 rb_encoding *from, rb_encoding *to,
1214 int ecflags, VALUE ecopts);
1215
1216static inline bool
1217is_enc_ascii_string(VALUE str, rb_encoding *enc)
1218{
1219 int encidx = rb_enc_to_index(enc);
1220 if (rb_enc_get_index(str) == encidx)
1221 return is_ascii_string(str);
1222 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1223}
1224
1225VALUE
1226rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1227{
1228 long len;
1229 const char *ptr;
1230 VALUE newstr;
1231
1232 if (!to) return str;
1233 if (!from) from = rb_enc_get(str);
1234 if (from == to) return str;
1235 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1236 rb_is_ascii8bit_enc(to)) {
1237 if (STR_ENC_GET(str) != to) {
1238 str = rb_str_dup(str);
1239 rb_enc_associate(str, to);
1240 }
1241 return str;
1242 }
1243
1244 RSTRING_GETMEM(str, ptr, len);
1245 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1246 from, to, ecflags, ecopts);
1247 if (NIL_P(newstr)) {
1248 /* some error, return original */
1249 return str;
1250 }
1251 return newstr;
1252}
1253
1254VALUE
1255rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1256 rb_encoding *from, int ecflags, VALUE ecopts)
1257{
1258 long olen;
1259
1260 olen = RSTRING_LEN(newstr);
1261 if (ofs < -olen || olen < ofs)
1262 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1263 if (ofs < 0) ofs += olen;
1264 if (!from) {
1265 STR_SET_LEN(newstr, ofs);
1266 return rb_str_cat(newstr, ptr, len);
1267 }
1268
1269 rb_str_modify(newstr);
1270 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1271 rb_enc_get(newstr),
1272 ecflags, ecopts);
1273}
1274
1275VALUE
1276rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1277{
1278 STR_SET_LEN(str, 0);
1279 rb_enc_associate(str, enc);
1280 rb_str_cat(str, ptr, len);
1281 return str;
1282}
1283
1284static VALUE
1285str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1286 rb_encoding *from, rb_encoding *to,
1287 int ecflags, VALUE ecopts)
1288{
1289 rb_econv_t *ec;
1291 long olen;
1292 VALUE econv_wrapper;
1293 const unsigned char *start, *sp;
1294 unsigned char *dest, *dp;
1295 size_t converted_output = (size_t)ofs;
1296
1297 olen = rb_str_capacity(newstr);
1298
1299 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1300 RBASIC_CLEAR_CLASS(econv_wrapper);
1301 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1302 if (!ec) return Qnil;
1303 DATA_PTR(econv_wrapper) = ec;
1304
1305 sp = (unsigned char*)ptr;
1306 start = sp;
1307 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1308 (dp = dest + converted_output),
1309 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1311 /* destination buffer short */
1312 size_t converted_input = sp - start;
1313 size_t rest = len - converted_input;
1314 converted_output = dp - dest;
1315 rb_str_set_len(newstr, converted_output);
1316 if (converted_input && converted_output &&
1317 rest < (LONG_MAX / converted_output)) {
1318 rest = (rest * converted_output) / converted_input;
1319 }
1320 else {
1321 rest = olen;
1322 }
1323 olen += rest < 2 ? 2 : rest;
1324 rb_str_resize(newstr, olen);
1325 }
1326 DATA_PTR(econv_wrapper) = 0;
1327 RB_GC_GUARD(econv_wrapper);
1328 rb_econv_close(ec);
1329 switch (ret) {
1330 case econv_finished:
1331 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1332 rb_str_set_len(newstr, len);
1333 rb_enc_associate(newstr, to);
1334 return newstr;
1335
1336 default:
1337 return Qnil;
1338 }
1339}
1340
1341VALUE
1343{
1344 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1345}
1346
1347VALUE
1349{
1350 rb_encoding *ienc;
1351 VALUE str;
1352 const int eidx = rb_enc_to_index(eenc);
1353
1354 if (!ptr) {
1355 return rb_enc_str_new(ptr, len, eenc);
1356 }
1357
1358 /* ASCII-8BIT case, no conversion */
1359 if ((eidx == rb_ascii8bit_encindex()) ||
1360 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1361 return rb_str_new(ptr, len);
1362 }
1363 /* no default_internal or same encoding, no conversion */
1364 ienc = rb_default_internal_encoding();
1365 if (!ienc || eenc == ienc) {
1366 return rb_enc_str_new(ptr, len, eenc);
1367 }
1368 /* ASCII compatible, and ASCII only string, no conversion in
1369 * default_internal */
1370 if ((eidx == rb_ascii8bit_encindex()) ||
1371 (eidx == rb_usascii_encindex()) ||
1372 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1373 return rb_enc_str_new(ptr, len, ienc);
1374 }
1375 /* convert from the given encoding to default_internal */
1376 str = rb_enc_str_new(NULL, 0, ienc);
1377 /* when the conversion failed for some reason, just ignore the
1378 * default_internal and result in the given encoding as-is. */
1379 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1380 rb_str_initialize(str, ptr, len, eenc);
1381 }
1382 return str;
1383}
1384
1385VALUE
1386rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1387{
1388 int eidx = rb_enc_to_index(eenc);
1389 if (eidx == rb_usascii_encindex() &&
1390 !is_ascii_string(str)) {
1391 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1392 return str;
1393 }
1394 rb_enc_associate_index(str, eidx);
1395 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1396}
1397
1398VALUE
1399rb_external_str_new(const char *ptr, long len)
1400{
1401 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1402}
1403
1404VALUE
1406{
1407 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1408}
1409
1410VALUE
1411rb_locale_str_new(const char *ptr, long len)
1412{
1413 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1414}
1415
1416VALUE
1418{
1419 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1420}
1421
1422VALUE
1424{
1425 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1426}
1427
1428VALUE
1430{
1431 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1432}
1433
1434VALUE
1436{
1437 return rb_str_export_to_enc(str, rb_default_external_encoding());
1438}
1439
1440VALUE
1442{
1443 return rb_str_export_to_enc(str, rb_locale_encoding());
1444}
1445
1446VALUE
1448{
1449 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1450}
1451
1452static VALUE
1453str_replace_shared_without_enc(VALUE str2, VALUE str)
1454{
1455 const int termlen = TERM_LEN(str);
1456 char *ptr;
1457 long len;
1458
1459 RSTRING_GETMEM(str, ptr, len);
1460 if (str_embed_capa(str2) >= len + termlen) {
1461 char *ptr2 = RSTRING(str2)->as.embed.ary;
1462 STR_SET_EMBED(str2);
1463 memcpy(ptr2, RSTRING_PTR(str), len);
1464 TERM_FILL(ptr2+len, termlen);
1465 }
1466 else {
1467 VALUE root;
1468 if (STR_SHARED_P(str)) {
1469 root = RSTRING(str)->as.heap.aux.shared;
1470 RSTRING_GETMEM(str, ptr, len);
1471 }
1472 else {
1473 root = rb_str_new_frozen(str);
1474 RSTRING_GETMEM(root, ptr, len);
1475 }
1476 RUBY_ASSERT(OBJ_FROZEN(root));
1477
1478 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1479 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1480 rb_fatal("about to free a possible shared root");
1481 }
1482 char *ptr2 = STR_HEAP_PTR(str2);
1483 if (ptr2 != ptr) {
1484 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1485 }
1486 }
1487 FL_SET(str2, STR_NOEMBED);
1488 RSTRING(str2)->as.heap.ptr = ptr;
1489 STR_SET_SHARED(str2, root);
1490 }
1491
1492 STR_SET_LEN(str2, len);
1493
1494 return str2;
1495}
1496
1497static VALUE
1498str_replace_shared(VALUE str2, VALUE str)
1499{
1500 str_replace_shared_without_enc(str2, str);
1501 rb_enc_cr_str_exact_copy(str2, str);
1502 return str2;
1503}
1504
1505static VALUE
1506str_new_shared(VALUE klass, VALUE str)
1507{
1508 return str_replace_shared(str_alloc_heap(klass), str);
1509}
1510
1511VALUE
1513{
1514 return str_new_shared(rb_obj_class(str), str);
1515}
1516
1517VALUE
1519{
1520 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1521 return str_new_frozen(rb_obj_class(orig), orig);
1522}
1523
1524static VALUE
1525rb_str_new_frozen_String(VALUE orig)
1526{
1527 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1528 return str_new_frozen(rb_cString, orig);
1529}
1530
1531
1532VALUE
1533rb_str_frozen_bare_string(VALUE orig)
1534{
1535 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1536 return str_new_frozen(rb_cString, orig);
1537}
1538
1539VALUE
1540rb_str_tmp_frozen_acquire(VALUE orig)
1541{
1542 if (OBJ_FROZEN_RAW(orig)) return orig;
1543 return str_new_frozen_buffer(0, orig, FALSE);
1544}
1545
1546VALUE
1547rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1548{
1549 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1550 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1551
1552 VALUE str = str_alloc_heap(0);
1553 OBJ_FREEZE(str);
1554 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1555 FL_SET(str, STR_SHARED_ROOT);
1556
1557 size_t capa = str_capacity(orig, TERM_LEN(orig));
1558
1559 /* If the string is embedded then we want to create a copy that is heap
1560 * allocated. If the string is shared then the shared root must be
1561 * embedded, so we want to create a copy. If the string is a shared root
1562 * then it must be embedded, so we want to create a copy. */
1563 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1564 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1565 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1566 }
1567 else {
1568 /* orig must be heap allocated and not shared, so we can safely transfer
1569 * the pointer to str. */
1570 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1571 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1572 RBASIC(orig)->flags &= ~STR_NOFREE;
1573 STR_SET_SHARED(orig, str);
1574 if (RB_OBJ_SHAREABLE_P(orig)) {
1575 RB_OBJ_SET_SHAREABLE(str);
1576 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1577 }
1578 }
1579
1580 RSTRING(str)->len = RSTRING(orig)->len;
1581 RSTRING(str)->as.heap.aux.capa = capa;
1582
1583 return str;
1584}
1585
1586void
1587rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1588{
1589 if (RBASIC_CLASS(tmp) != 0)
1590 return;
1591
1592 if (STR_EMBED_P(tmp)) {
1594 }
1595 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1596 !OBJ_FROZEN_RAW(orig)) {
1597 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1598
1599 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1600 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1601 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1602
1603 /* Unshare orig since the root (tmp) only has this one child. */
1604 FL_UNSET_RAW(orig, STR_SHARED);
1605 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1606 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1608
1609 /* Make tmp embedded and empty so it is safe for sweeping. */
1610 STR_SET_EMBED(tmp);
1611 STR_SET_LEN(tmp, 0);
1612 }
1613 }
1614}
1615
1616static VALUE
1617str_new_frozen(VALUE klass, VALUE orig)
1618{
1619 return str_new_frozen_buffer(klass, orig, TRUE);
1620}
1621
1622static VALUE
1623heap_str_make_shared(VALUE klass, VALUE orig)
1624{
1625 RUBY_ASSERT(!STR_EMBED_P(orig));
1626 RUBY_ASSERT(!STR_SHARED_P(orig));
1628
1629 VALUE str = str_alloc_heap(klass);
1630 STR_SET_LEN(str, RSTRING_LEN(orig));
1631 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1632 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1633 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1634 RBASIC(orig)->flags &= ~STR_NOFREE;
1635 STR_SET_SHARED(orig, str);
1636 if (klass == 0)
1637 FL_UNSET_RAW(str, STR_BORROWED);
1638 return str;
1639}
1640
1641static VALUE
1642str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1643{
1644 VALUE str;
1645
1646 long len = RSTRING_LEN(orig);
1647 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1648 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1649
1650 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1651 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1652 RUBY_ASSERT(STR_EMBED_P(str));
1653 }
1654 else {
1655 if (FL_TEST_RAW(orig, STR_SHARED)) {
1656 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1657 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1658 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1659 RUBY_ASSERT(ofs >= 0);
1660 RUBY_ASSERT(rest >= 0);
1661 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1663
1664 if ((ofs > 0) || (rest > 0) ||
1665 (klass != RBASIC(shared)->klass) ||
1666 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1667 str = str_new_shared(klass, shared);
1668 RUBY_ASSERT(!STR_EMBED_P(str));
1669 RSTRING(str)->as.heap.ptr += ofs;
1670 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1671 }
1672 else {
1673 if (RBASIC_CLASS(shared) == 0)
1674 FL_SET_RAW(shared, STR_BORROWED);
1675 return shared;
1676 }
1677 }
1678 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1679 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1680 STR_SET_EMBED(str);
1681 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1682 STR_SET_LEN(str, RSTRING_LEN(orig));
1683 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1684 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1685 }
1686 else {
1687 if (RB_OBJ_SHAREABLE_P(orig)) {
1688 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1689 }
1690 else {
1691 str = heap_str_make_shared(klass, orig);
1692 }
1693 }
1694 }
1695
1696 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1697 OBJ_FREEZE(str);
1698 return str;
1699}
1700
1701VALUE
1702rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1703{
1704 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1705}
1706
1707static VALUE
1708str_new_empty_String(VALUE str)
1709{
1710 VALUE v = rb_str_new(0, 0);
1711 rb_enc_copy(v, str);
1712 return v;
1713}
1714
/* Lower bound that rb_str_init() applies to an explicitly requested capacity. */
#define STR_BUF_MIN_SIZE 63
1716
1717VALUE
1719{
1720 if (STR_EMBEDDABLE_P(capa, 1)) {
1721 return str_alloc_embed(rb_cString, capa + 1);
1722 }
1723
1724 VALUE str = str_alloc_heap(rb_cString);
1725
1726 RSTRING(str)->as.heap.aux.capa = capa;
1727 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1728 RSTRING(str)->as.heap.ptr[0] = '\0';
1729
1730 return str;
1731}
1732
1733VALUE
1735{
1736 VALUE str;
1737 long len = strlen(ptr);
1738
1739 str = rb_str_buf_new(len);
1740 rb_str_buf_cat(str, ptr, len);
1741
1742 return str;
1743}
1744
1745VALUE
1747{
1748 return str_new(0, 0, len);
1749}
1750
1751void
1753{
1754 if (STR_EMBED_P(str)) {
1755 RB_DEBUG_COUNTER_INC(obj_str_embed);
1756 }
1757 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1758 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1760 }
1761 else {
1762 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1763 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1764 }
1765}
1766
1767size_t
1768rb_str_memsize(VALUE str)
1769{
1770 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1771 return STR_HEAP_SIZE(str);
1772 }
1773 else {
1774 return 0;
1775 }
1776}
1777
1778VALUE
1780{
1781 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1782}
1783
1784static inline void str_discard(VALUE str);
1785static void str_shared_replace(VALUE str, VALUE str2);
1786
1787void
1789{
1790 if (str != str2) str_shared_replace(str, str2);
1791}
1792
1793static void
1794str_shared_replace(VALUE str, VALUE str2)
1795{
1796 rb_encoding *enc;
1797 int cr;
1798 int termlen;
1799
1800 RUBY_ASSERT(str2 != str);
1801 enc = STR_ENC_GET(str2);
1802 cr = ENC_CODERANGE(str2);
1803 str_discard(str);
1804 termlen = rb_enc_mbminlen(enc);
1805
1806 STR_SET_LEN(str, RSTRING_LEN(str2));
1807
1808 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1809 STR_SET_EMBED(str);
1810 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1811 rb_enc_associate(str, enc);
1812 ENC_CODERANGE_SET(str, cr);
1813 }
1814 else {
1815 if (STR_EMBED_P(str2)) {
1816 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1817 long len = RSTRING_LEN(str2);
1818 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1819
1820 char *new_ptr = ALLOC_N(char, len + termlen);
1821 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1822 RSTRING(str2)->as.heap.ptr = new_ptr;
1823 STR_SET_LEN(str2, len);
1824 RSTRING(str2)->as.heap.aux.capa = len;
1825 STR_SET_NOEMBED(str2);
1826 }
1827
1828 STR_SET_NOEMBED(str);
1829 FL_UNSET(str, STR_SHARED);
1830 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1831
1832 if (FL_TEST(str2, STR_SHARED)) {
1833 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1834 STR_SET_SHARED(str, shared);
1835 }
1836 else {
1837 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1838 }
1839
1840 /* abandon str2 */
1841 STR_SET_EMBED(str2);
1842 RSTRING_PTR(str2)[0] = 0;
1843 STR_SET_LEN(str2, 0);
1844 rb_enc_associate(str, enc);
1845 ENC_CODERANGE_SET(str, cr);
1846 }
1847}
1848
1849VALUE
1851{
1852 VALUE str;
1853
1854 if (RB_TYPE_P(obj, T_STRING)) {
1855 return obj;
1856 }
1857 str = rb_funcall(obj, idTo_s, 0);
1858 return rb_obj_as_string_result(str, obj);
1859}
1860
1861VALUE
1862rb_obj_as_string_result(VALUE str, VALUE obj)
1863{
1864 if (!RB_TYPE_P(str, T_STRING))
1865 return rb_any_to_s(obj);
1866 return str;
1867}
1868
1869static VALUE
1870str_replace(VALUE str, VALUE str2)
1871{
1872 long len;
1873
1874 len = RSTRING_LEN(str2);
1875 if (STR_SHARED_P(str2)) {
1876 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1878 STR_SET_NOEMBED(str);
1879 STR_SET_LEN(str, len);
1880 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1881 STR_SET_SHARED(str, shared);
1882 rb_enc_cr_str_exact_copy(str, str2);
1883 }
1884 else {
1885 str_replace_shared(str, str2);
1886 }
1887
1888 return str;
1889}
1890
1891static inline VALUE
1892ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1893{
1894 size_t size = rb_str_embed_size(capa, 0);
1895 RUBY_ASSERT(size > 0);
1896 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1897
1898 NEWOBJ_OF(str, struct RString, klass,
1900
1901 str->len = 0;
1902
1903 return (VALUE)str;
1904}
1905
1906static inline VALUE
1907ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1908{
1909 NEWOBJ_OF(str, struct RString, klass,
1910 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1911
1912 str->as.heap.aux.capa = 0;
1913 str->as.heap.ptr = NULL;
1914
1915 return (VALUE)str;
1916}
1917
1918static inline VALUE
1919str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1920{
1921 int encidx = 0;
1922 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1923 encidx = rb_enc_get_index(str);
1924 flags &= ~ENCODING_MASK;
1925 }
1926 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1927 if (encidx) rb_enc_associate_index(dup, encidx);
1928 return dup;
1929}
1930
1931static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1932
1933static inline VALUE
1934str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1935{
1936 VALUE flags = FL_TEST_RAW(str, flag_mask);
1937 long len = RSTRING_LEN(str);
1938
1939 RUBY_ASSERT(STR_EMBED_P(dup));
1940 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1941 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1942 STR_SET_LEN(dup, RSTRING_LEN(str));
1943 return str_duplicate_setup_encoding(str, dup, flags);
1944}
1945
1946static inline VALUE
1947str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1948{
1949 VALUE flags = FL_TEST_RAW(str, flag_mask);
1950 VALUE root = str;
1951 if (FL_TEST_RAW(str, STR_SHARED)) {
1952 root = RSTRING(str)->as.heap.aux.shared;
1953 }
1954 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1955 root = str = str_new_frozen(klass, str);
1956 flags = FL_TEST_RAW(str, flag_mask);
1957 }
1958 RUBY_ASSERT(!STR_SHARED_P(root));
1960
1961 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1962 FL_SET(root, STR_SHARED_ROOT);
1963 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1964 flags |= RSTRING_NOEMBED | STR_SHARED;
1965
1966 STR_SET_LEN(dup, RSTRING_LEN(str));
1967 return str_duplicate_setup_encoding(str, dup, flags);
1968}
1969
1970static inline VALUE
1971str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1972{
1973 if (STR_EMBED_P(str)) {
1974 return str_duplicate_setup_embed(klass, str, dup);
1975 }
1976 else {
1977 return str_duplicate_setup_heap(klass, str, dup);
1978 }
1979}
1980
1981static inline VALUE
1982str_duplicate(VALUE klass, VALUE str)
1983{
1984 VALUE dup;
1985 if (STR_EMBED_P(str)) {
1986 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 }
1988 else {
1989 dup = str_alloc_heap(klass);
1990 }
1991
1992 return str_duplicate_setup(klass, str, dup);
1993}
1994
1995VALUE
1997{
1998 return str_duplicate(rb_obj_class(str), str);
1999}
2000
2001/* :nodoc: */
2002VALUE
2003rb_str_dup_m(VALUE str)
2004{
2005 if (LIKELY(BARE_STRING_P(str))) {
2006 return str_duplicate(rb_cString, str);
2007 }
2008 else {
2009 return rb_obj_dup(str);
2010 }
2011}
2012
2013VALUE
2015{
2016 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2017 return str_duplicate(rb_cString, str);
2018}
2019
2020VALUE
2021rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2022{
2023 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2024 VALUE new_str, klass = rb_cString;
2025
2026 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2027 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2028 str_duplicate_setup_embed(klass, str, new_str);
2029 }
2030 else {
2031 new_str = ec_str_alloc_heap(ec, klass);
2032 str_duplicate_setup_heap(klass, str, new_str);
2033 }
2034 if (chilled) {
2035 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2036 }
2037 return new_str;
2038}
2039
2040VALUE
2041rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2042{
2043 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2044 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2045 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2046 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2047 return rb_str_freeze(str);
2048}
2049
2050/*
2051 * The documentation block below uses an include (instead of inline text)
2052 * because the included text has non-ASCII characters (which are not allowed in a C file).
2053 */
2054
2055/*
2056 *
2057 * call-seq:
2058 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2059 *
2060 * :include: doc/string/new.rdoc
2061 *
2062 */
2063
2064static VALUE
2065rb_str_init(int argc, VALUE *argv, VALUE str)
2066{
2067 static ID keyword_ids[2];
2068 VALUE orig, opt, venc, vcapa;
2069 VALUE kwargs[2];
2070 rb_encoding *enc = 0;
2071 int n;
2072
2073 if (!keyword_ids[0]) {
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1], "capacity");
2076 }
2077
2078 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2079 if (!NIL_P(opt)) {
2080 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2081 venc = kwargs[0];
2082 vcapa = kwargs[1];
2083 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2084 enc = rb_to_encoding(venc);
2085 }
2086 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2087 long capa = NUM2LONG(vcapa);
2088 long len = 0;
2089 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2090
2091 if (capa < STR_BUF_MIN_SIZE) {
2092 capa = STR_BUF_MIN_SIZE;
2093 }
2094 if (n == 1) {
2095 StringValue(orig);
2096 len = RSTRING_LEN(orig);
2097 if (capa < len) {
2098 capa = len;
2099 }
2100 if (orig == str) n = 0;
2101 }
2102 str_modifiable(str);
2103 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2104 /* make noembed always */
2105 const size_t size = (size_t)capa + termlen;
2106 const char *const old_ptr = RSTRING_PTR(str);
2107 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2108 char *new_ptr = ALLOC_N(char, size);
2109 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2110 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2111 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2112 RSTRING(str)->as.heap.ptr = new_ptr;
2113 }
2114 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2115 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2116 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2117 }
2118 STR_SET_LEN(str, len);
2119 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2120 if (n == 1) {
2121 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2122 rb_enc_cr_str_exact_copy(str, orig);
2123 }
2124 FL_SET(str, STR_NOEMBED);
2125 RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127 else if (n == 1) {
2128 rb_str_replace(str, orig);
2129 }
2130 if (enc) {
2131 rb_enc_associate(str, enc);
2133 }
2134 }
2135 else if (n == 1) {
2136 rb_str_replace(str, orig);
2137 }
2138 return str;
2139}
2140
2141/* :nodoc: */
2142static VALUE
2143rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2144{
2145 if (klass != rb_cString) {
2146 return rb_class_new_instance_pass_kw(argc, argv, klass);
2147 }
2148
2149 static ID keyword_ids[2];
2150 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2151 VALUE kwargs[2];
2152 rb_encoding *enc = NULL;
2153
2154 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2155 if (NIL_P(opt)) {
2156 return rb_class_new_instance_pass_kw(argc, argv, klass);
2157 }
2158
2159 keyword_ids[0] = rb_id_encoding();
2160 CONST_ID(keyword_ids[1], "capacity");
2161 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2162 encoding = kwargs[0];
2163 capacity = kwargs[1];
2164
2165 if (n == 1) {
2166 orig = StringValue(orig);
2167 }
2168 else {
2169 orig = Qnil;
2170 }
2171
2172 if (UNDEF_P(encoding)) {
2173 if (!NIL_P(orig)) {
2174 encoding = rb_obj_encoding(orig);
2175 }
2176 }
2177
2178 if (!UNDEF_P(encoding)) {
2179 enc = rb_to_encoding(encoding);
2180 }
2181
2182 // If capacity is nil, we're basically just duping `orig`.
2183 if (UNDEF_P(capacity)) {
2184 if (NIL_P(orig)) {
2185 VALUE empty_str = str_new(klass, "", 0);
2186 if (enc) {
2187 rb_enc_associate(empty_str, enc);
2188 }
2189 return empty_str;
2190 }
2191 VALUE copy = str_duplicate(klass, orig);
2192 rb_enc_associate(copy, enc);
2193 ENC_CODERANGE_CLEAR(copy);
2194 return copy;
2195 }
2196
2197 long capa = 0;
2198 capa = NUM2LONG(capacity);
2199 if (capa < 0) {
2200 capa = 0;
2201 }
2202
2203 if (!NIL_P(orig)) {
2204 long orig_capa = rb_str_capacity(orig);
2205 if (orig_capa > capa) {
2206 capa = orig_capa;
2207 }
2208 }
2209
2210 VALUE str = str_enc_new(klass, NULL, capa, enc);
2211 STR_SET_LEN(str, 0);
2212 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2213
2214 if (!NIL_P(orig)) {
2215 rb_str_buf_append(str, orig);
2216 }
2217
2218 return str;
2219}
2220
#ifdef NONASCII_MASK
/* True when byte c is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t bits = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    bits = (bits>>6) | (~bits>>7);
    bits &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(bits);
#else
    /* Fold the per-byte flags down into the lowest byte. */
    bits += (bits>>8);
    bits += (bits>>16);
# if SIZEOF_VOIDP == 8
    bits += (bits>>32);
# endif
    return (bits&0xF);
#endif
}
#endif
2260
2261static inline long
2262enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2263{
2264 long c;
2265 const char *q;
2266
2267 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2268 long diff = (long)(e - p);
2269 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270 }
2271#ifdef NONASCII_MASK
2272 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2273 uintptr_t len = 0;
2274 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2275 const uintptr_t *s, *t;
2276 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2277 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2278 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2279 while (p < (const char *)s) {
2280 if (is_utf8_lead_byte(*p)) len++;
2281 p++;
2282 }
2283 while (s < t) {
2284 len += count_utf8_lead_bytes_with_word(s);
2285 s++;
2286 }
2287 p = (const char *)s;
2288 }
2289 while (p < e) {
2290 if (is_utf8_lead_byte(*p)) len++;
2291 p++;
2292 }
2293 return (long)len;
2294 }
2295#endif
2296 else if (rb_enc_asciicompat(enc)) {
2297 c = 0;
2298 if (ENC_CODERANGE_CLEAN_P(cr)) {
2299 while (p < e) {
2300 if (ISASCII(*p)) {
2301 q = search_nonascii(p, e);
2302 if (!q)
2303 return c + (e - p);
2304 c += q - p;
2305 p = q;
2306 }
2307 p += rb_enc_fast_mbclen(p, e, enc);
2308 c++;
2309 }
2310 }
2311 else {
2312 while (p < e) {
2313 if (ISASCII(*p)) {
2314 q = search_nonascii(p, e);
2315 if (!q)
2316 return c + (e - p);
2317 c += q - p;
2318 p = q;
2319 }
2320 p += rb_enc_mbclen(p, e, enc);
2321 c++;
2322 }
2323 }
2324 return c;
2325 }
2326
2327 for (c=0; p<e; c++) {
2328 p += rb_enc_mbclen(p, e, enc);
2329 }
2330 return c;
2331}
2332
2333long
2334rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2335{
2336 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2337}
2338
2339/* To get strlen with cr
2340 * Note that given cr is not used.
2341 */
2342long
2343rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2344{
2345 long c;
2346 const char *q;
2347 int ret;
2348
2349 *cr = 0;
2350 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2351 long diff = (long)(e - p);
2352 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2353 }
2354 else if (rb_enc_asciicompat(enc)) {
2355 c = 0;
2356 while (p < e) {
2357 if (ISASCII(*p)) {
2358 q = search_nonascii(p, e);
2359 if (!q) {
2360 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2361 return c + (e - p);
2362 }
2363 c += q - p;
2364 p = q;
2365 }
2366 ret = rb_enc_precise_mbclen(p, e, enc);
2367 if (MBCLEN_CHARFOUND_P(ret)) {
2368 *cr |= ENC_CODERANGE_VALID;
2369 p += MBCLEN_CHARFOUND_LEN(ret);
2370 }
2371 else {
2373 p++;
2374 }
2375 c++;
2376 }
2377 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2378 return c;
2379 }
2380
2381 for (c=0; p<e; c++) {
2382 ret = rb_enc_precise_mbclen(p, e, enc);
2383 if (MBCLEN_CHARFOUND_P(ret)) {
2384 *cr |= ENC_CODERANGE_VALID;
2385 p += MBCLEN_CHARFOUND_LEN(ret);
2386 }
2387 else {
2389 if (p + rb_enc_mbminlen(enc) <= e)
2390 p += rb_enc_mbminlen(enc);
2391 else
2392 p = e;
2393 }
2394 }
2395 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2396 return c;
2397}
2398
2399/* enc must be str's enc or rb_enc_check(str, str2) */
2400static long
2401str_strlen(VALUE str, rb_encoding *enc)
2402{
2403 const char *p, *e;
2404 int cr;
2405
2406 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2407 if (!enc) enc = STR_ENC_GET(str);
2408 p = RSTRING_PTR(str);
2409 e = RSTRING_END(str);
2410 cr = ENC_CODERANGE(str);
2411
2412 if (cr == ENC_CODERANGE_UNKNOWN) {
2413 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2414 if (cr) ENC_CODERANGE_SET(str, cr);
2415 return n;
2416 }
2417 else {
2418 return enc_strlen(p, e, enc, cr);
2419 }
2420}
2421
2422long
2424{
2425 return str_strlen(str, NULL);
2426}
2427
2428/*
2429 * call-seq:
2430 * length -> integer
2431 *
2432 * :include: doc/string/length.rdoc
2433 *
2434 */
2435
2436VALUE
2438{
2439 return LONG2NUM(str_strlen(str, NULL));
2440}
2441
2442/*
2443 * call-seq:
2444 * bytesize -> integer
2445 *
2446 * :include: doc/string/bytesize.rdoc
2447 *
2448 */
2449
2450VALUE
2451rb_str_bytesize(VALUE str)
2452{
2453 return LONG2NUM(RSTRING_LEN(str));
2454}
2455
2456/*
2457 * call-seq:
2458 * empty? -> true or false
2459 *
2460 * Returns whether the length of +self+ is zero:
2461 *
2462 * 'hello'.empty? # => false
2463 * ' '.empty? # => false
2464 * ''.empty? # => true
2465 *
2466 * Related: see {Querying}[rdoc-ref:String@Querying].
2467 */
2468
2469static VALUE
2470rb_str_empty(VALUE str)
2471{
2472 return RBOOL(RSTRING_LEN(str) == 0);
2473}
2474
2475/*
2476 * call-seq:
2477 * self + other_string -> new_string
2478 *
2479 * Returns a new string containing +other_string+ concatenated to +self+:
2480 *
2481 * 'Hello from ' + self.to_s # => "Hello from main"
2482 *
2483 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2484 */
2485
2486VALUE
2488{
2489 VALUE str3;
2490 rb_encoding *enc;
2491 char *ptr1, *ptr2, *ptr3;
2492 long len1, len2;
2493 int termlen;
2494
2495 StringValue(str2);
2496 enc = rb_enc_check_str(str1, str2);
2497 RSTRING_GETMEM(str1, ptr1, len1);
2498 RSTRING_GETMEM(str2, ptr2, len2);
2499 termlen = rb_enc_mbminlen(enc);
2500 if (len1 > LONG_MAX - len2) {
2501 rb_raise(rb_eArgError, "string size too big");
2502 }
2503 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2504 ptr3 = RSTRING_PTR(str3);
2505 memcpy(ptr3, ptr1, len1);
2506 memcpy(ptr3+len1, ptr2, len2);
2507 TERM_FILL(&ptr3[len1+len2], termlen);
2508
2509 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2511 RB_GC_GUARD(str1);
2512 RB_GC_GUARD(str2);
2513 return str3;
2514}
2515
2516/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2517VALUE
2518rb_str_opt_plus(VALUE str1, VALUE str2)
2519{
2522 long len1, len2;
2523 MAYBE_UNUSED(char) *ptr1, *ptr2;
2524 RSTRING_GETMEM(str1, ptr1, len1);
2525 RSTRING_GETMEM(str2, ptr2, len2);
2526 int enc1 = rb_enc_get_index(str1);
2527 int enc2 = rb_enc_get_index(str2);
2528
2529 if (enc1 < 0) {
2530 return Qundef;
2531 }
2532 else if (enc2 < 0) {
2533 return Qundef;
2534 }
2535 else if (enc1 != enc2) {
2536 return Qundef;
2537 }
2538 else if (len1 > LONG_MAX - len2) {
2539 return Qundef;
2540 }
2541 else {
2542 return rb_str_plus(str1, str2);
2543 }
2544
2545}
2546
2547/*
2548 * call-seq:
2549 * self * n -> new_string
2550 *
2551 * Returns a new string containing +n+ copies of +self+:
2552 *
2553 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2554 * 'No!' * 0 # => ""
2555 *
2556 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2557 */
2558
2559VALUE
2561{
2562 VALUE str2;
2563 long n, len;
2564 char *ptr2;
2565 int termlen;
2566
2567 if (times == INT2FIX(1)) {
2568 return str_duplicate(rb_cString, str);
2569 }
2570 if (times == INT2FIX(0)) {
2571 str2 = str_alloc_embed(rb_cString, 0);
2572 rb_enc_copy(str2, str);
2573 return str2;
2574 }
2575 len = NUM2LONG(times);
2576 if (len < 0) {
2577 rb_raise(rb_eArgError, "negative argument");
2578 }
2579 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2580 if (STR_EMBEDDABLE_P(len, 1)) {
2581 str2 = str_alloc_embed(rb_cString, len + 1);
2582 memset(RSTRING_PTR(str2), 0, len + 1);
2583 }
2584 else {
2585 str2 = str_alloc_heap(rb_cString);
2586 RSTRING(str2)->as.heap.aux.capa = len;
2587 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2588 }
2589 STR_SET_LEN(str2, len);
2590 rb_enc_copy(str2, str);
2591 return str2;
2592 }
2593 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2594 rb_raise(rb_eArgError, "argument too big");
2595 }
2596
2597 len *= RSTRING_LEN(str);
2598 termlen = TERM_LEN(str);
2599 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2600 ptr2 = RSTRING_PTR(str2);
2601 if (len) {
2602 n = RSTRING_LEN(str);
2603 memcpy(ptr2, RSTRING_PTR(str), n);
2604 while (n <= len/2) {
2605 memcpy(ptr2 + n, ptr2, n);
2606 n *= 2;
2607 }
2608 memcpy(ptr2 + n, ptr2, len-n);
2609 }
2610 STR_SET_LEN(str2, len);
2611 TERM_FILL(&ptr2[len], termlen);
2612 rb_enc_cr_str_copy_for_substr(str2, str);
2613
2614 return str2;
2615}
2616
2617/*
2618 * call-seq:
2619 * self % object -> new_string
2620 *
2621 * Returns the result of formatting +object+ into the format specifications
2622 * contained in +self+
2623 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2624 *
2625 * '%05d' % 123 # => "00123"
2626 *
2627 * If +self+ contains multiple format specifications,
2628 * +object+ must be an array or hash containing the objects to be formatted:
2629 *
2630 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2631 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2632 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2633 *
2634 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2635 */
2636
2637static VALUE
2638rb_str_format_m(VALUE str, VALUE arg)
2639{
2640 VALUE tmp = rb_check_array_type(arg);
2641
2642 if (!NIL_P(tmp)) {
2643 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2644 }
2645 return rb_str_format(1, &arg, str);
2646}
2647
2648static inline void
2649rb_check_lockedtmp(VALUE str)
2650{
2651 if (FL_TEST(str, STR_TMPLOCK)) {
2652 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2653 }
2654}
2655
2656// If none of these flags are set, we know we have an modifiable string.
2657// If any is set, we need to do more detailed checks.
2658#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2659static inline void
2660str_modifiable(VALUE str)
2661{
2662 RUBY_ASSERT(ruby_thread_has_gvl_p());
2663
2664 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2665 if (CHILLED_STRING_P(str)) {
2666 CHILLED_STRING_MUTATED(str);
2667 }
2668 rb_check_lockedtmp(str);
2669 rb_check_frozen(str);
2670 }
2671}
2672
2673static inline int
2674str_dependent_p(VALUE str)
2675{
2676 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2677 return FALSE;
2678 }
2679 else {
2680 return TRUE;
2681 }
2682}
2683
2684// If none of these flags are set, we know we have an independent string.
2685// If any is set, we need to do more detailed checks.
2686#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2687static inline int
2688str_independent(VALUE str)
2689{
2690 RUBY_ASSERT(ruby_thread_has_gvl_p());
2691
2692 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2693 str_modifiable(str);
2694 return !str_dependent_p(str);
2695 }
2696 return TRUE;
2697}
2698
/*
 * Give str a buffer of its own with room for len + expand bytes plus
 * termlen terminator bytes, carrying over len bytes of the current
 * contents (fewer when expand is negative).  On return str is either
 * embedded, or heap-backed with STR_SHARED and STR_NOFREE cleared.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    RUBY_ASSERT(ruby_thread_has_gvl_p());

    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* A negative expand shrinks the string: clamp the copied length. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* The result fits in the embedded area.  The heap pointer is read
         * before STR_SET_EMBED() and the bytes are then copied from it.
         * NOTE(review): the old heap buffer is not released on this path
         * -- presumably it is owned elsewhere; confirm against callers. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only when the flags are exactly STR_NOEMBED,
     * i.e. neither STR_NOFREE nor STR_SHARED is set. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2734
2735void
2736rb_str_modify(VALUE str)
2737{
2738 if (!str_independent(str))
2739 str_make_independent(str);
2741}
2742
2743void
2745{
2746 RUBY_ASSERT(ruby_thread_has_gvl_p());
2747
2748 int termlen = TERM_LEN(str);
2749 long len = RSTRING_LEN(str);
2750
2751 if (expand < 0) {
2752 rb_raise(rb_eArgError, "negative expanding string size");
2753 }
2754 if (expand >= LONG_MAX - len) {
2755 rb_raise(rb_eArgError, "string size too big");
2756 }
2757
2758 if (!str_independent(str)) {
2759 str_make_independent_expand(str, len, expand, termlen);
2760 }
2761 else if (expand > 0) {
2762 RESIZE_CAPA_TERM(str, len + expand, termlen);
2763 }
2765}
2766
2767/* As rb_str_modify(), but don't clear coderange */
2768static void
2769str_modify_keep_cr(VALUE str)
2770{
2771 if (!str_independent(str))
2772 str_make_independent(str);
2774 /* Force re-scan later */
2776}
2777
2778static inline void
2779str_discard(VALUE str)
2780{
2781 str_modifiable(str);
2782 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2783 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2784 RSTRING(str)->as.heap.ptr = 0;
2785 STR_SET_LEN(str, 0);
2786 }
2787}
2788
2789void
2791{
2792 int encindex = rb_enc_get_index(str);
2793
2794 if (RB_UNLIKELY(encindex == -1)) {
2795 rb_raise(rb_eTypeError, "not encoding capable object");
2796 }
2797
2798 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2799 return;
2800 }
2801
2802 rb_encoding *enc = rb_enc_from_index(encindex);
2803 if (!rb_enc_asciicompat(enc)) {
2804 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2805 }
2806}
2807
2808VALUE
2810{
2811 RUBY_ASSERT(ruby_thread_has_gvl_p());
2812
2813 VALUE s = *ptr;
2814 if (!RB_TYPE_P(s, T_STRING)) {
2815 s = rb_str_to_str(s);
2816 *ptr = s;
2817 }
2818 return s;
2819}
2820
2821char *
2823{
2824 VALUE str = rb_string_value(ptr);
2825 return RSTRING_PTR(str);
2826}
2827
/* Return 1 when the first n bytes at s are all zero (vacuously so for
 * n <= 0), otherwise 0. */
static int
zero_filled(const char *s, int n)
{
    int i = 0;

    while (i < n) {
        if (s[i] != 0) return 0;
        i++;
    }
    return 1;
}
2836
2837static const char *
2838str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2839{
2840 const char *e = s + len;
2841
2842 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2843 if (zero_filled(s, minlen)) return s;
2844 }
2845 return 0;
2846}
2847
2848static char *
2849str_fill_term(VALUE str, char *s, long len, int termlen)
2850{
2851 /* This function assumes that (capa + termlen) bytes of memory
2852 * is allocated, like many other functions in this file.
2853 */
2854 if (str_dependent_p(str)) {
2855 if (!zero_filled(s + len, termlen))
2856 str_make_independent_expand(str, len, 0L, termlen);
2857 }
2858 else {
2859 TERM_FILL(s + len, termlen);
2860 return s;
2861 }
2862 return RSTRING_PTR(str);
2863}
2864
2865void
2866rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2867{
2868 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2869 long len = RSTRING_LEN(str);
2870
2871 RUBY_ASSERT(capa >= len);
2872 if (capa - len < termlen) {
2873 rb_check_lockedtmp(str);
2874 str_make_independent_expand(str, len, 0L, termlen);
2875 }
2876 else if (str_dependent_p(str)) {
2877 if (termlen > oldtermlen)
2878 str_make_independent_expand(str, len, 0L, termlen);
2879 }
2880 else {
2881 if (!STR_EMBED_P(str)) {
2882 /* modify capa instead of realloc */
2883 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2884 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2885 }
2886 if (termlen > oldtermlen) {
2887 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2888 }
2889 }
2890
2891 return;
2892}
2893
2894static char *
2895str_null_check(VALUE str, int *w)
2896{
2897 char *s = RSTRING_PTR(str);
2898 long len = RSTRING_LEN(str);
2899 rb_encoding *enc = rb_enc_get(str);
2900 const int minlen = rb_enc_mbminlen(enc);
2901
2902 if (minlen > 1) {
2903 *w = 1;
2904 if (str_null_char(s, len, minlen, enc)) {
2905 return NULL;
2906 }
2907 return str_fill_term(str, s, len, minlen);
2908 }
2909 *w = 0;
2910 if (!s || memchr(s, 0, len)) {
2911 return NULL;
2912 }
2913 if (s[len]) {
2914 s = str_fill_term(str, s, len, minlen);
2915 }
2916 return s;
2917}
2918
2919char *
2920rb_str_to_cstr(VALUE str)
2921{
2922 int w;
2923 return str_null_check(str, &w);
2924}
2925
2926char *
2928{
2929 VALUE str = rb_string_value(ptr);
2930 int w;
2931 char *s = str_null_check(str, &w);
2932 if (!s) {
2933 if (w) {
2934 rb_raise(rb_eArgError, "string contains null char");
2935 }
2936 rb_raise(rb_eArgError, "string contains null byte");
2937 }
2938 return s;
2939}
2940
2941char *
2942rb_str_fill_terminator(VALUE str, const int newminlen)
2943{
2944 char *s = RSTRING_PTR(str);
2945 long len = RSTRING_LEN(str);
2946 return str_fill_term(str, s, len, newminlen);
2947}
2948
2949VALUE
2951{
2952 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2953 return str;
2954}
2955
2956/*
2957 * call-seq:
2958 * String.try_convert(object) -> object, new_string, or nil
2959 *
2960 * Attempts to convert the given +object+ to a string.
2961 *
2962 * If +object+ is already a string, returns +object+, unmodified.
2963 *
2964 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2965 * calls <tt>object.to_str</tt> and returns the result.
2966 *
2967 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2968 *
2969 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2970 */
2971static VALUE
2972rb_str_s_try_convert(VALUE dummy, VALUE str)
2973{
2974 return rb_check_string_type(str);
2975}
2976
/*
 * Move forward from p by *nthp characters of enc, never returning a
 * position past e.  Returns the position reached.
 *
 * What is written back to *nthp depends on the path taken:
 *  - single-byte and fixed-width encodings: the value is stored back
 *    unchanged;
 *  - ASCII-compatible encodings: the number of characters that could not
 *    be skipped (0 when the walk completed);
 *  - other encodings: the counter as left by the `nth--` loop.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* nth characters need at least nth bytes: if fewer remain,
             * the target lies beyond the end. */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Skip an ASCII run, looking no further than e2. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    /* All bytes up to e2 are ASCII: e2 is the answer. */
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            /* Step over one character starting at p. */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* Clamp the position computed by the non-returning branches. */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
3026
3027char*
3028rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3029{
3030 return str_nth_len(p, e, &nth, enc);
3031}
3032
3033static char*
3034str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3035{
3036 if (singlebyte)
3037 p += nth;
3038 else {
3039 p = str_nth_len(p, e, &nth, enc);
3040 }
3041 if (!p) return 0;
3042 if (p > e) p = e;
3043 return (char *)p;
3044}
3045
3046/* char offset to byte offset */
3047static long
3048str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3049{
3050 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3051 if (!pp) return e - p;
3052 return pp - p;
3053}
3054
3055long
3056rb_str_offset(VALUE str, long pos)
3057{
3058 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3059 STR_ENC_GET(str), single_byte_optimizable(str));
3060}
3061
3062#ifdef NONASCII_MASK
/*
 * Walk [p, e) counting bytes for which is_utf8_lead_byte() holds.  Returns
 * the position of the lead byte found once *nthp lead bytes have been
 * passed, or e when the range ends first.  The count still outstanding is
 * stored back in *nthp.
 * NOTE(review): rb_str_subpos() only calls this for UTF-8 strings whose
 * code range is ENC_CODERANGE_VALID -- presumably a precondition.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Use the word-at-a-time pass only when both the range and the count
     * are larger than two pointer-sized words. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up to a word boundary; t: e rounded down. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Count the unaligned bytes in front of s one by one. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Whole words; stops while at least a word's worth of count is
         * left, so the byte loop below finishes the job. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Byte-wise remainder: stop on the lead byte seen with nth == 0. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
3092
/* Byte distance from p to the position str_utf8_nth() returns for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
3099#endif
3100
3101/* byte offset to char offset */
3102long
3103rb_str_sublen(VALUE str, long pos)
3104{
3105 if (single_byte_optimizable(str) || pos < 0)
3106 return pos;
3107 else {
3108 char *p = RSTRING_PTR(str);
3109 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3110 }
3111}
3112
/*
 * Build a new String from the len bytes of str that start at byte offset
 * beg.  The range must lie inside str (see the assertions).
 * rb_enc_cr_str_copy_for_substr() is not called here; the callers
 * rb_str_subseq() and str_substr() call it on the result.
 */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    /* Ranges that do not qualify for sharing are simply copied. */
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough for the embedded area of the new object: copy the
         * bytes there instead of sharing. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share str's buffer, then narrow the view to [beg, beg+len). */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
3151
3152VALUE
3153rb_str_subseq(VALUE str, long beg, long len)
3154{
3155 VALUE str2 = str_subseq(str, beg, len);
3156 rb_enc_cr_str_copy_for_substr(str2, str);
3157 return str2;
3158}
3159
/*
 * Resolve a character-based request (beg, *lenp) against str.
 *
 * Returns a pointer into str's bytes for the start of the selection and
 * stores the selection's length in bytes in *lenp.  Returns 0 (leaving
 * *lenp untouched) when the request is rejected: negative length, or a
 * start outside the string.  A negative beg counts from the end.
 */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    const long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    /* Reject a negative beg whose negation is still negative. */
    if (beg < 0 && -beg < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* Characters are bytes here: plain index arithmetic. */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        /* At most -beg characters exist between the start and the end. */
        if (len > -beg) len = -beg;
        if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
            (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
            /* The requested tail is small relative to the string: walk
             * backwards from the end.  The first loop moves e back to the
             * end of the selection, the second moves p back over the
             * selected characters. */
            beg = -beg;
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* Otherwise turn beg into a non-negative character index and
             * fall through to the forward search below. */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > blen) {
        /* More characters than bytes is impossible. */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        /* UTF-8 with a valid code range: lead-byte counting. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: scale indexes by the character size and
         * clip the length to what is left. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* The start resolved to the end of the string: an empty selection
         * unless characters were still left to skip. */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
3246
3247static VALUE str_substr(VALUE str, long beg, long len, int empty);
3248
3249VALUE
3250rb_str_substr(VALUE str, long beg, long len)
3251{
3252 return str_substr(str, beg, len, TRUE);
3253}
3254
3255VALUE
3256rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3257{
3258 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3259}
3260
3261static VALUE
3262str_substr(VALUE str, long beg, long len, int empty)
3263{
3264 char *p = rb_str_subpos(str, beg, &len);
3265
3266 if (!p) return Qnil;
3267 if (!len && !empty) return Qnil;
3268
3269 beg = p - RSTRING_PTR(str);
3270
3271 VALUE str2 = str_subseq(str, beg, len);
3272 rb_enc_cr_str_copy_for_substr(str2, str);
3273 return str2;
3274}
3275
3276/* :nodoc: */
3277VALUE
3279{
3280 if (CHILLED_STRING_P(str)) {
3281 FL_UNSET_RAW(str, STR_CHILLED);
3282 }
3283
3284 if (OBJ_FROZEN(str)) return str;
3285 rb_str_resize(str, RSTRING_LEN(str));
3286 return rb_obj_freeze(str);
3287}
3288
3289/*
3290 * call-seq:
3291 * +string -> new_string or self
3292 *
3293 * Returns +self+ if +self+ is not frozen and can be mutated
3294 * without warning issuance.
3295 *
3296 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3297 *
3298 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3299 */
3300static VALUE
3301str_uplus(VALUE str)
3302{
3303 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3304 return rb_str_dup(str);
3305 }
3306 else {
3307 return str;
3308 }
3309}
3310
3311/*
3312 * call-seq:
3313 * -self -> frozen_string
3314 *
3315 * Returns a frozen string equal to +self+.
3316 *
3317 * The returned string is +self+ if and only if all of the following are true:
3318 *
3319 * - +self+ is already frozen.
3320 * - +self+ is an instance of \String (rather than of a subclass of \String)
3321 * - +self+ has no instance variables set on it.
3322 *
3323 * Otherwise, the returned string is a frozen copy of +self+.
3324 *
3325 * Returning +self+, when possible, saves duplicating +self+;
3326 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3327 *
3328 * It may also save duplicating other, already-existing, strings:
3329 *
3330 * s0 = 'foo'
3331 * s1 = 'foo'
3332 * s0.object_id == s1.object_id # => false
3333 * (-s0).object_id == (-s1).object_id # => true
3334 *
3335 * Note that method #-@ is convenient for defining a constant:
3336 *
3337 * FileName = -'config/database.yml'
3338 *
3339 * While its alias #dedup is better suited for chaining:
3340 *
3341 * 'foo'.dedup.gsub!('o')
3342 *
3343 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3344 */
3345static VALUE
3346str_uminus(VALUE str)
3347{
3348 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3349 str = rb_str_dup(str);
3350 }
3351 return rb_fstring(str);
3352}
3353
/* NOTE(review): RUBY_ALIAS_FUNCTION presumably emits rb_str_dup_frozen()
 * as an alias of / forwarder to rb_str_new_frozen(str) -- confirm against
 * the macro's definition.  The #define below makes later uses of the name
 * in this file expand to rb_str_new_frozen directly. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3356
3357VALUE
3359{
3360 rb_check_frozen(str);
3361 if (FL_TEST(str, STR_TMPLOCK)) {
3362 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3363 }
3364 FL_SET(str, STR_TMPLOCK);
3365 return str;
3366}
3367
3368VALUE
3370{
3371 rb_check_frozen(str);
3372 if (!FL_TEST(str, STR_TMPLOCK)) {
3373 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3374 }
3375 FL_UNSET(str, STR_TMPLOCK);
3376 return str;
3377}
3378
3379VALUE
3380rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3381{
3382 rb_str_locktmp(str);
3383 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3384}
3385
3386void
3388{
3389 RUBY_ASSERT(ruby_thread_has_gvl_p());
3390
3391 long capa;
3392 const int termlen = TERM_LEN(str);
3393
3394 str_modifiable(str);
3395 if (STR_SHARED_P(str)) {
3396 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3397 }
3398 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3399 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3400 }
3401
3402 int cr = ENC_CODERANGE(str);
3403 if (len == 0) {
3404 /* Empty string does not contain non-ASCII */
3406 }
3407 else if (cr == ENC_CODERANGE_UNKNOWN) {
3408 /* Leave unknown. */
3409 }
3410 else if (len > RSTRING_LEN(str)) {
3411 if (ENC_CODERANGE_CLEAN_P(cr)) {
3412 /* Update the coderange regarding the extended part. */
3413 const char *const prev_end = RSTRING_END(str);
3414 const char *const new_end = RSTRING_PTR(str) + len;
3415 rb_encoding *enc = rb_enc_get(str);
3416 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3417 ENC_CODERANGE_SET(str, cr);
3418 }
3419 else if (cr == ENC_CODERANGE_BROKEN) {
3420 /* May be valid now, by appended part. */
3422 }
3423 }
3424 else if (len < RSTRING_LEN(str)) {
3425 if (cr != ENC_CODERANGE_7BIT) {
3426 /* ASCII-only string is keeping after truncated. Valid
3427 * and broken may be invalid or valid, leave unknown. */
3429 }
3430 }
3431
3432 STR_SET_LEN(str, len);
3433 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3434}
3435
3436VALUE
3437rb_str_resize(VALUE str, long len)
3438{
3439 if (len < 0) {
3440 rb_raise(rb_eArgError, "negative string size (or size too big)");
3441 }
3442
3443 int independent = str_independent(str);
3444 long slen = RSTRING_LEN(str);
3445 const int termlen = TERM_LEN(str);
3446
3447 if (slen > len || (termlen != 1 && slen < len)) {
3449 }
3450
3451 {
3452 long capa;
3453 if (STR_EMBED_P(str)) {
3454 if (len == slen) return str;
3455 if (str_embed_capa(str) >= len + termlen) {
3456 STR_SET_LEN(str, len);
3457 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3458 return str;
3459 }
3460 str_make_independent_expand(str, slen, len - slen, termlen);
3461 }
3462 else if (str_embed_capa(str) >= len + termlen) {
3463 char *ptr = STR_HEAP_PTR(str);
3464 STR_SET_EMBED(str);
3465 if (slen > len) slen = len;
3466 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3467 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3468 STR_SET_LEN(str, len);
3469 if (independent) ruby_xfree(ptr);
3470 return str;
3471 }
3472 else if (!independent) {
3473 if (len == slen) return str;
3474 str_make_independent_expand(str, slen, len - slen, termlen);
3475 }
3476 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3477 (capa - len) > (len < 1024 ? len : 1024)) {
3478 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3479 (size_t)len + termlen, STR_HEAP_SIZE(str));
3480 RSTRING(str)->as.heap.aux.capa = len;
3481 }
3482 else if (len == slen) return str;
3483 STR_SET_LEN(str, len);
3484 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3485 }
3486 return str;
3487}
3488
3489static void
3490str_ensure_available_capa(VALUE str, long len)
3491{
3492 str_modify_keep_cr(str);
3493
3494 const int termlen = TERM_LEN(str);
3495 long olen = RSTRING_LEN(str);
3496
3497 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3498 rb_raise(rb_eArgError, "string sizes too big");
3499 }
3500
3501 long total = olen + len;
3502 long capa = str_capacity(str, termlen);
3503
3504 if (capa < total) {
3505 if (total >= LONG_MAX / 2) {
3506 capa = total;
3507 }
3508 while (total > capa) {
3509 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3510 }
3511 RESIZE_CAPA_TERM(str, capa, termlen);
3512 }
3513}
3514
3515static VALUE
3516str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3517{
3518 if (keep_cr) {
3519 str_modify_keep_cr(str);
3520 }
3521 else {
3522 rb_str_modify(str);
3523 }
3524 if (len == 0) return 0;
3525
3526 long total, olen, off = -1;
3527 char *sptr;
3528 const int termlen = TERM_LEN(str);
3529
3530 RSTRING_GETMEM(str, sptr, olen);
3531 if (ptr >= sptr && ptr <= sptr + olen) {
3532 off = ptr - sptr;
3533 }
3534
3535 long capa = str_capacity(str, termlen);
3536
3537 if (olen > LONG_MAX - len) {
3538 rb_raise(rb_eArgError, "string sizes too big");
3539 }
3540 total = olen + len;
3541 if (capa < total) {
3542 if (total >= LONG_MAX / 2) {
3543 capa = total;
3544 }
3545 while (total > capa) {
3546 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3547 }
3548 RESIZE_CAPA_TERM(str, capa, termlen);
3549 sptr = RSTRING_PTR(str);
3550 }
3551 if (off != -1) {
3552 ptr = sptr + off;
3553 }
3554 memcpy(sptr + olen, ptr, len);
3555 STR_SET_LEN(str, total);
3556 TERM_FILL(sptr + total, termlen); /* sentinel */
3557
3558 return str;
3559}
3560
/* Shorthands for str_buf_cat4() with keep_cr == false, i.e. str is made
 * writable with rb_str_modify() rather than str_modify_keep_cr().
 * str_buf_cat2 takes the length from rb_strlen_lit(ptr), so ptr is
 * presumably expected to be a string literal -- confirm. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3563
3564VALUE
3565rb_str_cat(VALUE str, const char *ptr, long len)
3566{
3567 if (len == 0) return str;
3568 if (len < 0) {
3569 rb_raise(rb_eArgError, "negative string size (or size too big)");
3570 }
3571 return str_buf_cat(str, ptr, len);
3572}
3573
3574VALUE
3575rb_str_cat_cstr(VALUE str, const char *ptr)
3576{
3577 must_not_null(ptr);
3578 return rb_str_buf_cat(str, ptr, strlen(ptr));
3579}
3580
3581static void
3582rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3583{
3584 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3585
3586 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3587 if (UNLIKELY(!str_independent(str))) {
3588 str_make_independent(str);
3589 }
3590
3591 long string_length = -1;
3592 const int null_terminator_length = 1;
3593 char *sptr;
3594 RSTRING_GETMEM(str, sptr, string_length);
3595
3596 // Ensure the resulting string wouldn't be too long.
3597 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3598 rb_raise(rb_eArgError, "string sizes too big");
3599 }
3600
3601 long string_capacity = str_capacity(str, null_terminator_length);
3602
3603 // Get the code range before any modifications since those might clear the code range.
3604 int cr = ENC_CODERANGE(str);
3605
3606 // Check if the string has spare string_capacity to write the new byte.
3607 if (LIKELY(string_capacity >= string_length + 1)) {
3608 // In fast path we can write the new byte and note the string's new length.
3609 sptr[string_length] = byte;
3610 STR_SET_LEN(str, string_length + 1);
3611 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3612 }
3613 else {
3614 // If there's not enough string_capacity, make a call into the general string concatenation function.
3615 str_buf_cat(str, (char *)&byte, 1);
3616 }
3617
3618 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3619 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3620 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3621 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3622 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3623 if (ISASCII(byte)) {
3625 }
3626 else {
3628
3629 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3630 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3631 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3632 }
3633 }
3634 }
3635}
3636
/* NOTE(review): RUBY_ALIAS_FUNCTION presumably emits each first-argument
 * prototype as an alias of / forwarder to the named function with the
 * given arguments -- confirm against the macro's definition.  As written:
 * rb_str_buf_cat -> rb_str_cat, and both rb_str_buf_cat2 and rb_str_cat2
 * -> rb_str_cat_cstr. */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3640
3641static VALUE
3642rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3643 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3644{
3645 int str_encindex = ENCODING_GET(str);
3646 int res_encindex;
3647 int str_cr, res_cr;
3648 rb_encoding *str_enc, *ptr_enc;
3649
3650 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3651
3652 if (str_encindex == ptr_encindex) {
3653 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3654 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3655 }
3656 }
3657 else {
3658 str_enc = rb_enc_from_index(str_encindex);
3659 ptr_enc = rb_enc_from_index(ptr_encindex);
3660 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3661 if (len == 0)
3662 return str;
3663 if (RSTRING_LEN(str) == 0) {
3664 rb_str_buf_cat(str, ptr, len);
3665 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3666 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3667 return str;
3668 }
3669 goto incompatible;
3670 }
3671 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3672 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3673 }
3674 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3675 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3676 str_cr = rb_enc_str_coderange(str);
3677 }
3678 }
3679 }
3680 if (ptr_cr_ret)
3681 *ptr_cr_ret = ptr_cr;
3682
3683 if (str_encindex != ptr_encindex &&
3684 str_cr != ENC_CODERANGE_7BIT &&
3685 ptr_cr != ENC_CODERANGE_7BIT) {
3686 str_enc = rb_enc_from_index(str_encindex);
3687 ptr_enc = rb_enc_from_index(ptr_encindex);
3688 goto incompatible;
3689 }
3690
3691 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3692 res_encindex = str_encindex;
3693 res_cr = ENC_CODERANGE_UNKNOWN;
3694 }
3695 else if (str_cr == ENC_CODERANGE_7BIT) {
3696 if (ptr_cr == ENC_CODERANGE_7BIT) {
3697 res_encindex = str_encindex;
3698 res_cr = ENC_CODERANGE_7BIT;
3699 }
3700 else {
3701 res_encindex = ptr_encindex;
3702 res_cr = ptr_cr;
3703 }
3704 }
3705 else if (str_cr == ENC_CODERANGE_VALID) {
3706 res_encindex = str_encindex;
3707 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3708 res_cr = str_cr;
3709 else
3710 res_cr = ptr_cr;
3711 }
3712 else { /* str_cr == ENC_CODERANGE_BROKEN */
3713 res_encindex = str_encindex;
3714 res_cr = str_cr;
3715 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3716 }
3717
3718 if (len < 0) {
3719 rb_raise(rb_eArgError, "negative string size (or size too big)");
3720 }
3721 str_buf_cat(str, ptr, len);
3722 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3723 return str;
3724
3725 incompatible:
3726 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3727 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3729}
3730
3731VALUE
3732rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3733{
3734 return rb_enc_cr_str_buf_cat(str, ptr, len,
3735 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3736}
3737
3738VALUE
3740{
3741 /* ptr must reference NUL terminated ASCII string. */
3742 int encindex = ENCODING_GET(str);
3743 rb_encoding *enc = rb_enc_from_index(encindex);
3744 if (rb_enc_asciicompat(enc)) {
3745 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3746 encindex, ENC_CODERANGE_7BIT, 0);
3747 }
3748 else {
3749 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3750 while (*ptr) {
3751 unsigned int c = (unsigned char)*ptr;
3752 int len = rb_enc_codelen(c, enc);
3753 rb_enc_mbcput(c, buf, enc);
3754 rb_enc_cr_str_buf_cat(str, buf, len,
3755 encindex, ENC_CODERANGE_VALID, 0);
3756 ptr++;
3757 }
3758 return str;
3759 }
3760}
3761
3762VALUE
3764{
3765 int str2_cr = rb_enc_str_coderange(str2);
3766
3767 if (str_enc_fastpath(str)) {
3768 switch (str2_cr) {
3769 case ENC_CODERANGE_7BIT:
3770 // If RHS is 7bit we can do simple concatenation
3771 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3772 RB_GC_GUARD(str2);
3773 return str;
3775 // If RHS is valid, we can do simple concatenation if encodings are the same
3776 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3777 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3778 int str_cr = ENC_CODERANGE(str);
3779 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3780 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3781 }
3782 RB_GC_GUARD(str2);
3783 return str;
3784 }
3785 }
3786 }
3787
3788 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3789 ENCODING_GET(str2), str2_cr, &str2_cr);
3790
3791 ENC_CODERANGE_SET(str2, str2_cr);
3792
3793 return str;
3794}
3795
3796VALUE
3798{
3799 StringValue(str2);
3800 return rb_str_buf_append(str, str2);
3801}
3802
3803VALUE
3804rb_str_concat_literals(size_t num, const VALUE *strary)
3805{
3806 VALUE str;
3807 size_t i, s = 0;
3808 unsigned long len = 1;
3809
3810 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3811 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3812
3813 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3814 str = rb_str_buf_new(len);
3815 str_enc_copy_direct(str, strary[0]);
3816
3817 for (i = s; i < num; ++i) {
3818 const VALUE v = strary[i];
3819 int encidx = ENCODING_GET(v);
3820
3821 rb_str_buf_append(str, v);
3822 if (encidx != ENCINDEX_US_ASCII) {
3823 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3824 rb_enc_set_index(str, encidx);
3825 }
3826 }
3827 return str;
3828}
3829
3830/*
3831 * call-seq:
3832 * concat(*objects) -> string
3833 *
3834 * :include: doc/string/concat.rdoc
3835 */
3836static VALUE
3837rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3838{
3839 str_modifiable(str);
3840
3841 if (argc == 1) {
3842 return rb_str_concat(str, argv[0]);
3843 }
3844 else if (argc > 1) {
3845 int i;
3846 VALUE arg_str = rb_str_tmp_new(0);
3847 rb_enc_copy(arg_str, str);
3848 for (i = 0; i < argc; i++) {
3849 rb_str_concat(arg_str, argv[i]);
3850 }
3851 rb_str_buf_append(str, arg_str);
3852 }
3853
3854 return str;
3855}
3856
3857/*
3858 * call-seq:
3859 * append_as_bytes(*objects) -> self
3860 *
3861 * Concatenates each object in +objects+ into +self+; returns +self+;
3862 * performs no encoding validation or conversion:
3863 *
3864 * s = 'foo'
3865 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3866 * s.valid_encoding? # => false
3867 * s.append_as_bytes("\xAC 12")
3868 * s.valid_encoding? # => true
3869 *
3870 * When a given object is an integer,
3871 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3873 * appends only the low-order byte (similar to String#setbyte):
3874 *
3875 * s = ""
3876 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3877 * s.bytesize # => 2
3878 *
3879 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3880 */
3881
3882VALUE
3883rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3884{
3885 long needed_capacity = 0;
3886 volatile VALUE t0;
3887 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3888
3889 for (int index = 0; index < argc; index++) {
3890 VALUE obj = argv[index];
3891 enum ruby_value_type type = types[index] = rb_type(obj);
3892 switch (type) {
3893 case T_FIXNUM:
3894 case T_BIGNUM:
3895 needed_capacity++;
3896 break;
3897 case T_STRING:
3898 needed_capacity += RSTRING_LEN(obj);
3899 break;
3900 default:
3901 rb_raise(
3903 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3904 rb_obj_class(obj)
3905 );
3906 break;
3907 }
3908 }
3909
3910 str_ensure_available_capa(str, needed_capacity);
3911 char *sptr = RSTRING_END(str);
3912
3913 for (int index = 0; index < argc; index++) {
3914 VALUE obj = argv[index];
3915 enum ruby_value_type type = types[index];
3916 switch (type) {
3917 case T_FIXNUM:
3918 case T_BIGNUM: {
3919 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3920 char byte = (char)(NUM2INT(obj) & 0xFF);
3921 *sptr = byte;
3922 sptr++;
3923 break;
3924 }
3925 case T_STRING: {
3926 const char *ptr;
3927 long len;
3928 RSTRING_GETMEM(obj, ptr, len);
3929 memcpy(sptr, ptr, len);
3930 sptr += len;
3931 break;
3932 }
3933 default:
3934 rb_bug("append_as_bytes arguments should have been validated");
3935 }
3936 }
3937
3938 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3939 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3940
3941 int cr = ENC_CODERANGE(str);
3942 switch (cr) {
3943 case ENC_CODERANGE_7BIT: {
3944 for (int index = 0; index < argc; index++) {
3945 VALUE obj = argv[index];
3946 enum ruby_value_type type = types[index];
3947 switch (type) {
3948 case T_FIXNUM:
3949 case T_BIGNUM: {
3950 if (!ISASCII(NUM2INT(obj))) {
3951 goto clear_cr;
3952 }
3953 break;
3954 }
3955 case T_STRING: {
3956 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3957 goto clear_cr;
3958 }
3959 break;
3960 }
3961 default:
3962 rb_bug("append_as_bytes arguments should have been validated");
3963 }
3964 }
3965 break;
3966 }
3968 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3969 goto keep_cr;
3970 }
3971 else {
3972 goto clear_cr;
3973 }
3974 break;
3975 default:
3976 goto clear_cr;
3977 break;
3978 }
3979
3980 RB_GC_GUARD(t0);
3981
3982 clear_cr:
3983 // If no fast path was hit, we clear the coderange.
3984 // append_as_bytes is predominantly meant to be used in
3985 // buffering situation, hence it's likely the coderange
3986 // will never be scanned, so it's not worth spending time
3987 // precomputing the coderange except for simple and common
3988 // situations.
3990 keep_cr:
3991 return str;
3992}
3993
3994/*
3995 * call-seq:
3996 * self << object -> self
3997 *
3998 * Appends a string representation of +object+ to +self+;
3999 * returns +self+.
4000 *
4001 * If +object+ is a string, appends it to +self+:
4002 *
4003 * s = 'foo'
4004 * s << 'bar' # => "foobar"
4005 * s # => "foobar"
4006 *
4007 * If +object+ is an integer,
4008 * its value is considered a codepoint;
4009 * converts the value to a character before concatenating:
4010 *
4011 * s = 'foo'
4012 * s << 33 # => "foo!"
4013 *
 *  Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
 *  and the encoding of +self+ is Encoding::US_ASCII,
 *  changes the encoding to Encoding::ASCII_8BIT:
4017 *
4018 * s = 'foo'.encode(Encoding::US_ASCII)
4019 * s.encoding # => #<Encoding:US-ASCII>
4020 * s << 0xff # => "foo\xFF"
4021 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4022 *
4023 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4024 *
4025 * s = 'foo'
 *    s.encoding         # => #<Encoding:UTF-8>
4027 * s << 0x00110000 # 1114112 out of char range (RangeError)
4028 * s = 'foo'.encode(Encoding::EUC_JP)
4029 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4030 *
4031 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4032 */
4033VALUE
4035{
4036 unsigned int code;
4037 rb_encoding *enc = STR_ENC_GET(str1);
4038 int encidx;
4039
4040 if (RB_INTEGER_TYPE_P(str2)) {
4041 if (rb_num_to_uint(str2, &code) == 0) {
4042 }
4043 else if (FIXNUM_P(str2)) {
4044 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4045 }
4046 else {
4047 rb_raise(rb_eRangeError, "bignum out of char range");
4048 }
4049 }
4050 else {
4051 return rb_str_append(str1, str2);
4052 }
4053
4054 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4055
4056 if (encidx >= 0) {
4057 rb_str_buf_cat_byte(str1, (unsigned char)code);
4058 }
4059 else {
4060 long pos = RSTRING_LEN(str1);
4061 int cr = ENC_CODERANGE(str1);
4062 int len;
4063 char *buf;
4064
4065 switch (len = rb_enc_codelen(code, enc)) {
4066 case ONIGERR_INVALID_CODE_POINT_VALUE:
4067 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4068 break;
4069 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4070 case 0:
4071 rb_raise(rb_eRangeError, "%u out of char range", code);
4072 break;
4073 }
4074 buf = ALLOCA_N(char, len + 1);
4075 rb_enc_mbcput(code, buf, enc);
4076 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4077 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4078 }
4079 rb_str_resize(str1, pos+len);
4080 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4081 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4083 }
4084 else if (cr == ENC_CODERANGE_BROKEN) {
4086 }
4087 ENC_CODERANGE_SET(str1, cr);
4088 }
4089 return str1;
4090}
4091
4092int
4093rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4094{
4095 int encidx = rb_enc_to_index(enc);
4096
4097 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4098 /* US-ASCII automatically extended to ASCII-8BIT */
4099 if (code > 0xFF) {
4100 rb_raise(rb_eRangeError, "%u out of char range", code);
4101 }
4102 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4103 return ENCINDEX_ASCII_8BIT;
4104 }
4105 return encidx;
4106 }
4107 else {
4108 return -1;
4109 }
4110}
4111
4112/*
4113 * call-seq:
 *    prepend(*other_strings) -> self
4115 *
4116 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4117 *
4118 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4119 *
4120 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4121 *
4122 */
4123
4124static VALUE
4125rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4126{
4127 str_modifiable(str);
4128
4129 if (argc == 1) {
4130 rb_str_update(str, 0L, 0L, argv[0]);
4131 }
4132 else if (argc > 1) {
4133 int i;
4134 VALUE arg_str = rb_str_tmp_new(0);
4135 rb_enc_copy(arg_str, str);
4136 for (i = 0; i < argc; i++) {
4137 rb_str_append(arg_str, argv[i]);
4138 }
4139 rb_str_update(str, 0L, 0L, arg_str);
4140 }
4141
4142 return str;
4143}
4144
4145st_index_t
4147{
4148 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4149 st_index_t precomputed_hash;
4150 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4151
4152 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4153 return precomputed_hash;
4154 }
4155
4156 return str_do_hash(str);
4157}
4158
4159int
4161{
4162 long len1, len2;
4163 const char *ptr1, *ptr2;
4164 RSTRING_GETMEM(str1, ptr1, len1);
4165 RSTRING_GETMEM(str2, ptr2, len2);
4166 return (len1 != len2 ||
4167 !rb_str_comparable(str1, str2) ||
4168 memcmp(ptr1, ptr2, len1) != 0);
4169}
4170
4171/*
4172 * call-seq:
4173 * hash -> integer
4174 *
4175 * :include: doc/string/hash.rdoc
4176 *
4177 */
4178
4179static VALUE
4180rb_str_hash_m(VALUE str)
4181{
4182 st_index_t hval = rb_str_hash(str);
4183 return ST2FIX(hval);
4184}
4185
/* Yields the smaller of a and b (a when they are equal); arguments may be evaluated twice. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
4187
4188int
4190{
4191 int idx1, idx2;
4192 int rc1, rc2;
4193
4194 if (RSTRING_LEN(str1) == 0) return TRUE;
4195 if (RSTRING_LEN(str2) == 0) return TRUE;
4196 idx1 = ENCODING_GET(str1);
4197 idx2 = ENCODING_GET(str2);
4198 if (idx1 == idx2) return TRUE;
4199 rc1 = rb_enc_str_coderange(str1);
4200 rc2 = rb_enc_str_coderange(str2);
4201 if (rc1 == ENC_CODERANGE_7BIT) {
4202 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4203 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4204 return TRUE;
4205 }
4206 if (rc2 == ENC_CODERANGE_7BIT) {
4207 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4208 return TRUE;
4209 }
4210 return FALSE;
4211}
4212
4213int
4215{
4216 long len1, len2;
4217 const char *ptr1, *ptr2;
4218 int retval;
4219
4220 if (str1 == str2) return 0;
4221 RSTRING_GETMEM(str1, ptr1, len1);
4222 RSTRING_GETMEM(str2, ptr2, len2);
4223 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4224 if (len1 == len2) {
4225 if (!rb_str_comparable(str1, str2)) {
4226 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4227 return 1;
4228 return -1;
4229 }
4230 return 0;
4231 }
4232 if (len1 > len2) return 1;
4233 return -1;
4234 }
4235 if (retval > 0) return 1;
4236 return -1;
4237}
4238
4239/*
4240 * call-seq:
4241 * self == object -> true or false
4242 *
4243 * Returns whether +object+ is equal to +self+.
4244 *
4245 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4246 *
4247 * s = 'foo'
4248 * s == 'foo' # => true
4249 * s == 'food' # => false
4250 * s == 'FOO' # => false
4251 *
4252 * Returns +false+ if the two strings' encodings are not compatible:
4253 *
4254 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4255 *
4256 * When +object+ is not a string:
4257 *
4258 * - If +object+ responds to method <tt>to_str</tt>,
4259 * <tt>object == self</tt> is called and its return value is returned.
4260 * - If +object+ does not respond to <tt>to_str</tt>,
4261 * +false+ is returned.
4262 *
4263 * Related: {Comparing}[rdoc-ref:String@Comparing].
4264 */
4265
4266VALUE
4268{
4269 if (str1 == str2) return Qtrue;
4270 if (!RB_TYPE_P(str2, T_STRING)) {
4271 if (!rb_respond_to(str2, idTo_str)) {
4272 return Qfalse;
4273 }
4274 return rb_equal(str2, str1);
4275 }
4276 return rb_str_eql_internal(str1, str2);
4277}
4278
4279/*
4280 * call-seq:
4281 * eql?(object) -> true or false
4282 *
4283 * :include: doc/string/eql_p.rdoc
4284 *
4285 */
4286
4287VALUE
4288rb_str_eql(VALUE str1, VALUE str2)
4289{
4290 if (str1 == str2) return Qtrue;
4291 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4292 return rb_str_eql_internal(str1, str2);
4293}
4294
4295/*
4296 * call-seq:
4297 * self <=> other -> -1, 0, 1, or nil
4298 *
4299 * Compares +self+ and +other+,
4300 * evaluating their _contents_, not their _lengths_.
4301 *
4302 * Returns:
4303 *
4304 * - +-1+, if +self+ is smaller.
4305 * - +0+, if the two are equal.
4306 * - +1+, if +self+ is larger.
4307 * - +nil+, if the two are incomparable.
4308 *
4309 * Examples:
4310 *
4311 * 'a' <=> 'b' # => -1
4312 * 'a' <=> 'ab' # => -1
4313 * 'a' <=> 'a' # => 0
4314 * 'b' <=> 'a' # => 1
4315 * 'ab' <=> 'a' # => 1
4316 * 'a' <=> :a # => nil
4317 *
4318 * \Class \String includes module Comparable,
4319 * each of whose methods uses String#<=> for comparison.
4320 *
4321 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4322 */
4323
4324static VALUE
4325rb_str_cmp_m(VALUE str1, VALUE str2)
4326{
4327 int result;
4328 VALUE s = rb_check_string_type(str2);
4329 if (NIL_P(s)) {
4330 return rb_invcmp(str1, str2);
4331 }
4332 result = rb_str_cmp(str1, s);
4333 return INT2FIX(result);
4334}
4335
4336static VALUE str_casecmp(VALUE str1, VALUE str2);
4337static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4338
4339/*
4340 * call-seq:
4341 * casecmp(other_string) -> -1, 0, 1, or nil
4342 *
4343 * Ignoring case, compares +self+ and +other_string+; returns:
4344 *
4345 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4346 * - 0 if the two are equal.
4347 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4348 * - +nil+ if the two are incomparable.
4349 *
4350 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4351 *
4352 * Examples:
4353 *
4354 * 'foo'.casecmp('goo') # => -1
4355 * 'goo'.casecmp('foo') # => 1
4356 * 'foo'.casecmp('food') # => -1
4357 * 'food'.casecmp('foo') # => 1
4358 * 'FOO'.casecmp('foo') # => 0
4359 * 'foo'.casecmp('FOO') # => 0
4360 * 'foo'.casecmp(1) # => nil
4361 *
4362 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4363 */
4364
4365static VALUE
4366rb_str_casecmp(VALUE str1, VALUE str2)
4367{
4368 VALUE s = rb_check_string_type(str2);
4369 if (NIL_P(s)) {
4370 return Qnil;
4371 }
4372 return str_casecmp(str1, s);
4373}
4374
4375static VALUE
4376str_casecmp(VALUE str1, VALUE str2)
4377{
4378 long len;
4379 rb_encoding *enc;
4380 const char *p1, *p1end, *p2, *p2end;
4381
4382 enc = rb_enc_compatible(str1, str2);
4383 if (!enc) {
4384 return Qnil;
4385 }
4386
4387 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4388 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4389 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4390 while (p1 < p1end && p2 < p2end) {
4391 if (*p1 != *p2) {
4392 unsigned int c1 = TOLOWER(*p1 & 0xff);
4393 unsigned int c2 = TOLOWER(*p2 & 0xff);
4394 if (c1 != c2)
4395 return INT2FIX(c1 < c2 ? -1 : 1);
4396 }
4397 p1++;
4398 p2++;
4399 }
4400 }
4401 else {
4402 while (p1 < p1end && p2 < p2end) {
4403 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4404 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4405
4406 if (0 <= c1 && 0 <= c2) {
4407 c1 = TOLOWER(c1);
4408 c2 = TOLOWER(c2);
4409 if (c1 != c2)
4410 return INT2FIX(c1 < c2 ? -1 : 1);
4411 }
4412 else {
4413 int r;
4414 l1 = rb_enc_mbclen(p1, p1end, enc);
4415 l2 = rb_enc_mbclen(p2, p2end, enc);
4416 len = l1 < l2 ? l1 : l2;
4417 r = memcmp(p1, p2, len);
4418 if (r != 0)
4419 return INT2FIX(r < 0 ? -1 : 1);
4420 if (l1 != l2)
4421 return INT2FIX(l1 < l2 ? -1 : 1);
4422 }
4423 p1 += l1;
4424 p2 += l2;
4425 }
4426 }
4427 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4428 if (p1 == p1end) return INT2FIX(-1);
4429 return INT2FIX(1);
4430}
4431
4432/*
4433 * call-seq:
4434 * casecmp?(other_string) -> true, false, or nil
4435 *
4436 * Returns +true+ if +self+ and +other_string+ are equal after
4437 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4438 *
4439 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4440 *
4441 * Examples:
4442 *
4443 * 'foo'.casecmp?('goo') # => false
4444 * 'goo'.casecmp?('foo') # => false
4445 * 'foo'.casecmp?('food') # => false
4446 * 'food'.casecmp?('foo') # => false
4447 * 'FOO'.casecmp?('foo') # => true
4448 * 'foo'.casecmp?('FOO') # => true
4449 * 'foo'.casecmp?(1) # => nil
4450 *
4451 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4452 */
4453
4454static VALUE
4455rb_str_casecmp_p(VALUE str1, VALUE str2)
4456{
4457 VALUE s = rb_check_string_type(str2);
4458 if (NIL_P(s)) {
4459 return Qnil;
4460 }
4461 return str_casecmp_p(str1, s);
4462}
4463
4464static VALUE
4465str_casecmp_p(VALUE str1, VALUE str2)
4466{
4467 rb_encoding *enc;
4468 VALUE folded_str1, folded_str2;
4469 VALUE fold_opt = sym_fold;
4470
4471 enc = rb_enc_compatible(str1, str2);
4472 if (!enc) {
4473 return Qnil;
4474 }
4475
4476 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4477 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4478
4479 return rb_str_eql(folded_str1, folded_str2);
4480}
4481
4482static long
4483strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4484 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4485{
4486 const char *search_start = str_ptr;
4487 long pos, search_len = str_len - offset;
4488
4489 for (;;) {
4490 const char *t;
4491 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4492 if (pos < 0) return pos;
4493 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4494 if (t == search_start + pos) break;
4495 search_len -= t - search_start;
4496 if (search_len <= 0) return -1;
4497 offset += t - search_start;
4498 search_start = t;
4499 }
4500 return pos + offset;
4501}
4502
4503/* found index in byte */
4504#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4505#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4506
4507static long
4508rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4509{
4510 const char *str_ptr, *str_ptr_end, *sub_ptr;
4511 long str_len, sub_len;
4512 rb_encoding *enc;
4513
4514 enc = rb_enc_check(str, sub);
4515 if (is_broken_string(sub)) return -1;
4516
4517 str_ptr = RSTRING_PTR(str);
4518 str_ptr_end = RSTRING_END(str);
4519 str_len = RSTRING_LEN(str);
4520 sub_ptr = RSTRING_PTR(sub);
4521 sub_len = RSTRING_LEN(sub);
4522
4523 if (str_len < sub_len) return -1;
4524
4525 if (offset != 0) {
4526 long str_len_char, sub_len_char;
4527 int single_byte = single_byte_optimizable(str);
4528 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4529 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4530 if (offset < 0) {
4531 offset += str_len_char;
4532 if (offset < 0) return -1;
4533 }
4534 if (str_len_char - offset < sub_len_char) return -1;
4535 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4536 str_ptr += offset;
4537 }
4538 if (sub_len == 0) return offset;
4539
4540 /* need proceed one character at a time */
4541 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4542}
4543
4544
4545/*
4546 * call-seq:
4547 * index(pattern, offset = 0) -> integer or nil
4548 *
4549 * :include: doc/string/index.rdoc
4550 *
4551 */
4552
4553static VALUE
4554rb_str_index_m(int argc, VALUE *argv, VALUE str)
4555{
4556 VALUE sub;
4557 VALUE initpos;
4558 rb_encoding *enc = STR_ENC_GET(str);
4559 long pos;
4560
4561 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4562 long slen = str_strlen(str, enc); /* str's enc */
4563 pos = NUM2LONG(initpos);
4564 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4565 if (RB_TYPE_P(sub, T_REGEXP)) {
4567 }
4568 return Qnil;
4569 }
4570 }
4571 else {
4572 pos = 0;
4573 }
4574
4575 if (RB_TYPE_P(sub, T_REGEXP)) {
4576 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4577 enc, single_byte_optimizable(str));
4578
4579 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4580 VALUE match = rb_backref_get();
4581 struct re_registers *regs = RMATCH_REGS(match);
4582 pos = rb_str_sublen(str, BEG(0));
4583 return LONG2NUM(pos);
4584 }
4585 }
4586 else {
4587 StringValue(sub);
4588 pos = rb_str_index(str, sub, pos);
4589 if (pos >= 0) {
4590 pos = rb_str_sublen(str, pos);
4591 return LONG2NUM(pos);
4592 }
4593 }
4594 return Qnil;
4595}
4596
4597/* Ensure that the given pos is a valid character boundary.
4598 * Note that in this function, "character" means a code point
4599 * (Unicode scalar value), not a grapheme cluster.
4600 */
4601static void
4602str_ensure_byte_pos(VALUE str, long pos)
4603{
4604 if (!single_byte_optimizable(str)) {
4605 const char *s = RSTRING_PTR(str);
4606 const char *e = RSTRING_END(str);
4607 const char *p = s + pos;
4608 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4609 rb_raise(rb_eIndexError,
4610 "offset %ld does not land on character boundary", pos);
4611 }
4612 }
4613}
4614
4615/*
4616 * call-seq:
4617 * byteindex(object, offset = 0) -> integer or nil
4618 *
4619 * Returns the 0-based integer index of a substring of +self+
4620 * specified by +object+ (a string or Regexp) and +offset+,
4621 * or +nil+ if there is no such substring;
4622 * the returned index is the count of _bytes_ (not characters).
4623 *
4624 * When +object+ is a string,
4625 * returns the index of the first found substring equal to +object+:
4626 *
4627 * s = 'foo' # => "foo"
4628 * s.size # => 3 # Three 1-byte characters.
4629 * s.bytesize # => 3 # Three bytes.
4630 * s.byteindex('f') # => 0
4631 * s.byteindex('o') # => 1
4632 * s.byteindex('oo') # => 1
4633 * s.byteindex('ooo') # => nil
4634 *
4635 * When +object+ is a Regexp,
4636 * returns the index of the first found substring matching +object+;
4637 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4638 *
4639 * s = 'foo'
4640 * s.byteindex(/f/) # => 0
4641 * $~ # => #<MatchData "f">
4642 * s.byteindex(/o/) # => 1
4643 * s.byteindex(/oo/) # => 1
4644 * s.byteindex(/ooo/) # => nil
4645 * $~ # => nil
4646 *
4647 * \Integer argument +offset+, if given, specifies the 0-based index
4648 * of the byte where searching is to begin.
4649 *
4650 * When +offset+ is non-negative,
4651 * searching begins at byte position +offset+:
4652 *
4653 * s = 'foo'
4654 * s.byteindex('o', 1) # => 1
4655 * s.byteindex('o', 2) # => 2
4656 * s.byteindex('o', 3) # => nil
4657 *
4658 * When +offset+ is negative, counts backward from the end of +self+:
4659 *
4660 * s = 'foo'
4661 * s.byteindex('o', -1) # => 2
4662 * s.byteindex('o', -2) # => 1
4663 * s.byteindex('o', -3) # => 1
4664 * s.byteindex('o', -4) # => nil
4665 *
4666 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4667 *
4668 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4669 * s.size # => 2 # Two 3-byte characters.
4670 * s.bytesize # => 6 # Six bytes.
4671 * s.byteindex("\uFFFF") # => 0
4672 * s.byteindex("\uFFFF", 1) # Raises IndexError
4673 * s.byteindex("\uFFFF", 2) # Raises IndexError
4674 * s.byteindex("\uFFFF", 3) # => 3
4675 * s.byteindex("\uFFFF", 4) # Raises IndexError
4676 * s.byteindex("\uFFFF", 5) # Raises IndexError
4677 * s.byteindex("\uFFFF", 6) # => nil
4678 *
4679 * Related: see {Querying}[rdoc-ref:String@Querying].
4680 */
4681
4682static VALUE
4683rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4684{
4685 VALUE sub;
4686 VALUE initpos;
4687 long pos;
4688
4689 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4690 long slen = RSTRING_LEN(str);
4691 pos = NUM2LONG(initpos);
4692 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4693 if (RB_TYPE_P(sub, T_REGEXP)) {
4695 }
4696 return Qnil;
4697 }
4698 }
4699 else {
4700 pos = 0;
4701 }
4702
4703 str_ensure_byte_pos(str, pos);
4704
4705 if (RB_TYPE_P(sub, T_REGEXP)) {
4706 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4707 VALUE match = rb_backref_get();
4708 struct re_registers *regs = RMATCH_REGS(match);
4709 pos = BEG(0);
4710 return LONG2NUM(pos);
4711 }
4712 }
4713 else {
4714 StringValue(sub);
4715 pos = rb_str_byteindex(str, sub, pos);
4716 if (pos >= 0) return LONG2NUM(pos);
4717 }
4718 return Qnil;
4719}
4720
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: returns a pointer to the
 * last byte equal to chr within the first search_len bytes of search_str, or
 * a null pointer when there is no such byte.
 *
 * As with the standard memrchr, chr is converted to unsigned char before the
 * comparison, so callers need not mask it themselves.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    /* Walk backwards from one past the last byte of the search range. */
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4733
4734static long
4735str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4736{
4737 char *hit, *adjusted;
4738 int c;
4739 long slen, searchlen;
4740 char *sbeg, *e, *t;
4741
4742 sbeg = RSTRING_PTR(str);
4743 slen = RSTRING_LEN(sub);
4744 if (slen == 0) return s - sbeg;
4745 e = RSTRING_END(str);
4746 t = RSTRING_PTR(sub);
4747 c = *t & 0xff;
4748 searchlen = s - sbeg + 1;
4749
4750 if (memcmp(s, t, slen) == 0) {
4751 return s - sbeg;
4752 }
4753
4754 do {
4755 hit = memrchr(sbeg, c, searchlen);
4756 if (!hit) break;
4757 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4758 if (hit != adjusted) {
4759 searchlen = adjusted - sbeg;
4760 continue;
4761 }
4762 if (memcmp(hit, t, slen) == 0)
4763 return hit - sbeg;
4764 searchlen = adjusted - sbeg;
4765 } while (searchlen > 0);
4766
4767 return -1;
4768}
4769
4770/* found index in byte */
4771static long
4772rb_str_rindex(VALUE str, VALUE sub, long pos)
4773{
4774 long len, slen;
4775 char *sbeg, *s;
4776 rb_encoding *enc;
4777 int singlebyte;
4778
4779 enc = rb_enc_check(str, sub);
4780 if (is_broken_string(sub)) return -1;
4781 singlebyte = single_byte_optimizable(str);
4782 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4783 slen = str_strlen(sub, enc); /* rb_enc_check */
4784
4785 /* substring longer than string */
4786 if (len < slen) return -1;
4787 if (len - pos < slen) pos = len - slen;
4788 if (len == 0) return pos;
4789
4790 sbeg = RSTRING_PTR(str);
4791
4792 if (pos == 0) {
4793 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4794 return 0;
4795 else
4796 return -1;
4797 }
4798
4799 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4800 return str_rindex(str, sub, s, enc);
4801}
4802
4803/*
4804 * call-seq:
4805 * rindex(pattern, offset = self.length) -> integer or nil
4806 *
4807 * :include:doc/string/rindex.rdoc
4808 *
4809 */
4810
4811static VALUE
4812rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4813{
4814 VALUE sub;
4815 VALUE initpos;
4816 rb_encoding *enc = STR_ENC_GET(str);
4817 long pos, len = str_strlen(str, enc); /* str's enc */
4818
4819 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4820 pos = NUM2LONG(initpos);
4821 if (pos < 0 && (pos += len) < 0) {
4822 if (RB_TYPE_P(sub, T_REGEXP)) {
4824 }
4825 return Qnil;
4826 }
4827 if (pos > len) pos = len;
4828 }
4829 else {
4830 pos = len;
4831 }
4832
4833 if (RB_TYPE_P(sub, T_REGEXP)) {
4834 /* enc = rb_enc_check(str, sub); */
4835 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4836 enc, single_byte_optimizable(str));
4837
4838 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4839 VALUE match = rb_backref_get();
4840 struct re_registers *regs = RMATCH_REGS(match);
4841 pos = rb_str_sublen(str, BEG(0));
4842 return LONG2NUM(pos);
4843 }
4844 }
4845 else {
4846 StringValue(sub);
4847 pos = rb_str_rindex(str, sub, pos);
4848 if (pos >= 0) {
4849 pos = rb_str_sublen(str, pos);
4850 return LONG2NUM(pos);
4851 }
4852 }
4853 return Qnil;
4854}
4855
4856static long
4857rb_str_byterindex(VALUE str, VALUE sub, long pos)
4858{
4859 long len, slen;
4860 char *sbeg, *s;
4861 rb_encoding *enc;
4862
4863 enc = rb_enc_check(str, sub);
4864 if (is_broken_string(sub)) return -1;
4865 len = RSTRING_LEN(str);
4866 slen = RSTRING_LEN(sub);
4867
4868 /* substring longer than string */
4869 if (len < slen) return -1;
4870 if (len - pos < slen) pos = len - slen;
4871 if (len == 0) return pos;
4872
4873 sbeg = RSTRING_PTR(str);
4874
4875 if (pos == 0) {
4876 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4877 return 0;
4878 else
4879 return -1;
4880 }
4881
4882 s = sbeg + pos;
4883 return str_rindex(str, sub, s, enc);
4884}
4885
4886/*
4887 * call-seq:
4888 * byterindex(object, offset = self.bytesize) -> integer or nil
4889 *
4890 * Returns the 0-based integer index of a substring of +self+
4891 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4892 * or +nil+ if there is no such substring;
4893 * the returned index is the count of _bytes_ (not characters).
4894 *
4895 * When +object+ is a string,
4896 * returns the index of the _last_ found substring equal to +object+:
4897 *
4898 * s = 'foo' # => "foo"
4899 * s.size # => 3 # Three 1-byte characters.
4900 * s.bytesize # => 3 # Three bytes.
4901 * s.byterindex('f') # => 0
4902 * s.byterindex('o') # => 2
4903 * s.byterindex('oo') # => 1
4904 * s.byterindex('ooo') # => nil
4905 *
4906 * When +object+ is a Regexp,
4907 * returns the index of the last found substring matching +object+;
4908 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4909 *
4910 * s = 'foo'
4911 * s.byterindex(/f/) # => 0
4912 * $~ # => #<MatchData "f">
4913 * s.byterindex(/o/) # => 2
4914 * s.byterindex(/oo/) # => 1
4915 * s.byterindex(/ooo/) # => nil
4916 * $~ # => nil
4917 *
4918 * The last match means starting at the possible last position,
4919 * not the last of the longest matches:
4920 *
4921 * s = 'foo'
4922 * s.byterindex(/o+/) # => 2
4923 * $~ # => #<MatchData "o">
4924 *
4925 * To get the last longest match, use a negative lookbehind:
4926 *
4927 * s = 'foo'
4928 * s.byterindex(/(?<!o)o+/) # => 1
4929 * $~ # => #<MatchData "oo">
4930 *
4931 * Or use method #byteindex with negative lookahead:
4932 *
4933 * s = 'foo'
4934 * s.byteindex(/o+(?!.*o)/) # => 1
4935 * $~ # => #<MatchData "oo">
4936 *
4937 * \Integer argument +offset+, if given, specifies the 0-based index
4938 * of the byte where searching is to end.
4939 *
4940 * When +offset+ is non-negative,
4941 * searching ends at byte position +offset+:
4942 *
4943 * s = 'foo'
4944 * s.byterindex('o', 0) # => nil
4945 * s.byterindex('o', 1) # => 1
4946 * s.byterindex('o', 2) # => 2
4947 * s.byterindex('o', 3) # => 2
4948 *
4949 * When +offset+ is negative, counts backward from the end of +self+:
4950 *
4951 * s = 'foo'
4952 * s.byterindex('o', -1) # => 2
4953 * s.byterindex('o', -2) # => 1
4954 * s.byterindex('o', -3) # => nil
4955 *
4956 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4957 *
4958 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4959 * s.size # => 2 # Two 3-byte characters.
4960 * s.bytesize # => 6 # Six bytes.
4961 * s.byterindex("\uFFFF") # => 3
4962 * s.byterindex("\uFFFF", 1) # Raises IndexError
4963 * s.byterindex("\uFFFF", 2) # Raises IndexError
4964 * s.byterindex("\uFFFF", 3) # => 3
4965 * s.byterindex("\uFFFF", 4) # Raises IndexError
4966 * s.byterindex("\uFFFF", 5) # Raises IndexError
4967 * s.byterindex("\uFFFF", 6) # => nil
4968 *
4969 * Related: see {Querying}[rdoc-ref:String@Querying].
4970 */
4971
4972static VALUE
4973rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4974{
4975 VALUE sub;
4976 VALUE initpos;
4977 long pos, len = RSTRING_LEN(str);
4978
4979 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4980 pos = NUM2LONG(initpos);
4981 if (pos < 0 && (pos += len) < 0) {
4982 if (RB_TYPE_P(sub, T_REGEXP)) {
4984 }
4985 return Qnil;
4986 }
4987 if (pos > len) pos = len;
4988 }
4989 else {
4990 pos = len;
4991 }
4992
4993 str_ensure_byte_pos(str, pos);
4994
4995 if (RB_TYPE_P(sub, T_REGEXP)) {
4996 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4997 VALUE match = rb_backref_get();
4998 struct re_registers *regs = RMATCH_REGS(match);
4999 pos = BEG(0);
5000 return LONG2NUM(pos);
5001 }
5002 }
5003 else {
5004 StringValue(sub);
5005 pos = rb_str_byterindex(str, sub, pos);
5006 if (pos >= 0) return LONG2NUM(pos);
5007 }
5008 return Qnil;
5009}
5010
5011/*
5012 * call-seq:
5013 * self =~ object -> integer or nil
5014 *
5015 * When +object+ is a Regexp, returns the index of the first substring in +self+
5016 * matched by +object+,
5017 * or +nil+ if no match is found;
5018 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5019 *
5020 * 'foo' =~ /f/ # => 0
5021 * $~ # => #<MatchData "f">
5022 * 'foo' =~ /o/ # => 1
5023 * $~ # => #<MatchData "o">
5024 * 'foo' =~ /x/ # => nil
5025 * $~ # => nil
5026 *
5027 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5028 * (see Regexp#=~):
5029 *
5030 * number = nil
5031 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5032 * number # => nil # Not assigned.
5033 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5034 * number # => "9" # Assigned.
5035 *
5036 * If +object+ is not a Regexp, returns the value
5037 * returned by <tt>object =~ self</tt>.
5038 *
5039 * Related: see {Querying}[rdoc-ref:String@Querying].
5040 */
5041
5042static VALUE
5043rb_str_match(VALUE x, VALUE y)
5044{
5045 switch (OBJ_BUILTIN_TYPE(y)) {
5046 case T_STRING:
5047 rb_raise(rb_eTypeError, "type mismatch: String given");
5048
5049 case T_REGEXP:
5050 return rb_reg_match(y, x);
5051
5052 default:
5053 return rb_funcall(y, idEqTilde, 1, x);
5054 }
5055}
5056
5057
5058static VALUE get_pat(VALUE);
5059
5060
5061/*
5062 * call-seq:
5063 * match(pattern, offset = 0) -> matchdata or nil
5064 * match(pattern, offset = 0) {|matchdata| ... } -> object
5065 *
5066 * Creates a MatchData object based on +self+ and the given arguments;
5067 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5068 *
5069 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5070 *
5071 * regexp = Regexp.new(pattern)
5072 *
5073 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5074 * (see Regexp#match):
5075 *
5076 * matchdata = regexp.match(self[offset..])
5077 *
5078 * With no block given, returns the computed +matchdata+ or +nil+:
5079 *
5080 * 'foo'.match('f') # => #<MatchData "f">
5081 * 'foo'.match('o') # => #<MatchData "o">
5082 * 'foo'.match('x') # => nil
5083 * 'foo'.match('f', 1) # => nil
5084 * 'foo'.match('o', 1) # => #<MatchData "o">
5085 *
5086 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5087 * returns the block's return value:
5088 *
5089 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5090 *
5091 * With a block given and +nil+ +matchdata+, does not call the block:
5092 *
5093 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5094 *
5095 * Related: see {Querying}[rdoc-ref:String@Querying].
5096 */
5097
5098static VALUE
5099rb_str_match_m(int argc, VALUE *argv, VALUE str)
5100{
5101 VALUE re, result;
5102 if (argc < 1)
5103 rb_check_arity(argc, 1, 2);
5104 re = argv[0];
5105 argv[0] = str;
5106 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5107 if (!NIL_P(result) && rb_block_given_p()) {
5108 return rb_yield(result);
5109 }
5110 return result;
5111}
5112
5113/*
5114 * call-seq:
5115 * match?(pattern, offset = 0) -> true or false
5116 *
5117 * Returns whether a match is found for +self+ and the given arguments;
5118 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5119 *
5120 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5121 *
5122 * regexp = Regexp.new(pattern)
5123 *
5124 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5125 * +false+ otherwise:
5126 *
5127 * 'foo'.match?(/o/) # => true
5128 * 'foo'.match?('o') # => true
5129 * 'foo'.match?(/x/) # => false
5130 * 'foo'.match?('f', 1) # => false
5131 * 'foo'.match?('o', 1) # => true
5132 *
5133 * Related: see {Querying}[rdoc-ref:String@Querying].
5134 */
5135
5136static VALUE
5137rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5138{
5139 VALUE re;
5140 rb_check_arity(argc, 1, 2);
5141 re = get_pat(argv[0]);
5142 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5143}
5144
/* Result of stepping one encoded character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* bytes are not a valid character, or no neighbor code exists */
    NEIGHBOR_FOUND = 1,    /* neighbor written in place with the same byte length */
    NEIGHBOR_WRAPPED = 2   /* stepping overflowed the bytes or changed the byte length */
};
5150
5151static enum neighbor_char
5152enc_succ_char(char *p, long len, rb_encoding *enc)
5153{
5154 long i;
5155 int l;
5156
5157 if (rb_enc_mbminlen(enc) > 1) {
5158 /* wchar, trivial case */
5159 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5160 if (!MBCLEN_CHARFOUND_P(r)) {
5161 return NEIGHBOR_NOT_CHAR;
5162 }
5163 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5164 l = rb_enc_code_to_mbclen(c, enc);
5165 if (!l) return NEIGHBOR_NOT_CHAR;
5166 if (l != len) return NEIGHBOR_WRAPPED;
5167 rb_enc_mbcput(c, p, enc);
5168 r = rb_enc_precise_mbclen(p, p + len, enc);
5169 if (!MBCLEN_CHARFOUND_P(r)) {
5170 return NEIGHBOR_NOT_CHAR;
5171 }
5172 return NEIGHBOR_FOUND;
5173 }
5174 while (1) {
5175 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5176 p[i] = '\0';
5177 if (i < 0)
5178 return NEIGHBOR_WRAPPED;
5179 ++((unsigned char*)p)[i];
5180 l = rb_enc_precise_mbclen(p, p+len, enc);
5181 if (MBCLEN_CHARFOUND_P(l)) {
5182 l = MBCLEN_CHARFOUND_LEN(l);
5183 if (l == len) {
5184 return NEIGHBOR_FOUND;
5185 }
5186 else {
5187 memset(p+l, 0xff, len-l);
5188 }
5189 }
5190 if (MBCLEN_INVALID_P(l) && i < len-1) {
5191 long len2;
5192 int l2;
5193 for (len2 = len-1; 0 < len2; len2--) {
5194 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5195 if (!MBCLEN_INVALID_P(l2))
5196 break;
5197 }
5198 memset(p+len2+1, 0xff, len-(len2+1));
5199 }
5200 }
5201}
5202
5203static enum neighbor_char
5204enc_pred_char(char *p, long len, rb_encoding *enc)
5205{
5206 long i;
5207 int l;
5208 if (rb_enc_mbminlen(enc) > 1) {
5209 /* wchar, trivial case */
5210 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5211 if (!MBCLEN_CHARFOUND_P(r)) {
5212 return NEIGHBOR_NOT_CHAR;
5213 }
5214 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5215 if (!c) return NEIGHBOR_NOT_CHAR;
5216 --c;
5217 l = rb_enc_code_to_mbclen(c, enc);
5218 if (!l) return NEIGHBOR_NOT_CHAR;
5219 if (l != len) return NEIGHBOR_WRAPPED;
5220 rb_enc_mbcput(c, p, enc);
5221 r = rb_enc_precise_mbclen(p, p + len, enc);
5222 if (!MBCLEN_CHARFOUND_P(r)) {
5223 return NEIGHBOR_NOT_CHAR;
5224 }
5225 return NEIGHBOR_FOUND;
5226 }
5227 while (1) {
5228 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5229 p[i] = '\xff';
5230 if (i < 0)
5231 return NEIGHBOR_WRAPPED;
5232 --((unsigned char*)p)[i];
5233 l = rb_enc_precise_mbclen(p, p+len, enc);
5234 if (MBCLEN_CHARFOUND_P(l)) {
5235 l = MBCLEN_CHARFOUND_LEN(l);
5236 if (l == len) {
5237 return NEIGHBOR_FOUND;
5238 }
5239 else {
5240 memset(p+l, 0, len-l);
5241 }
5242 }
5243 if (MBCLEN_INVALID_P(l) && i < len-1) {
5244 long len2;
5245 int l2;
5246 for (len2 = len-1; 0 < len2; len2--) {
5247 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5248 if (!MBCLEN_INVALID_P(l2))
5249 break;
5250 }
5251 memset(p+len2+1, 0, len-(len2+1));
5252 }
5253 }
5254}
5255
5256/*
5257 overwrite +p+ by succeeding letter in +enc+ and returns
5258 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5259 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5260 assuming each ranges are successive, and mbclen
5261 never change in each ranges.
5262 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5263 character.
5264 */
5265static enum neighbor_char
5266enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5267{
5268 enum neighbor_char ret;
5269 unsigned int c;
5270 int ctype;
5271 int range;
5272 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5273
5274 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5275 int try;
5276 const int max_gaps = 1;
5277
5278 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5279 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5280 ctype = ONIGENC_CTYPE_DIGIT;
5281 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5282 ctype = ONIGENC_CTYPE_ALPHA;
5283 else
5284 return NEIGHBOR_NOT_CHAR;
5285
5286 MEMCPY(save, p, char, len);
5287 for (try = 0; try <= max_gaps; ++try) {
5288 ret = enc_succ_char(p, len, enc);
5289 if (ret == NEIGHBOR_FOUND) {
5290 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5291 if (rb_enc_isctype(c, ctype, enc))
5292 return NEIGHBOR_FOUND;
5293 }
5294 }
5295 MEMCPY(p, save, char, len);
5296 range = 1;
5297 while (1) {
5298 MEMCPY(save, p, char, len);
5299 ret = enc_pred_char(p, len, enc);
5300 if (ret == NEIGHBOR_FOUND) {
5301 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5302 if (!rb_enc_isctype(c, ctype, enc)) {
5303 MEMCPY(p, save, char, len);
5304 break;
5305 }
5306 }
5307 else {
5308 MEMCPY(p, save, char, len);
5309 break;
5310 }
5311 range++;
5312 }
5313 if (range == 1) {
5314 return NEIGHBOR_NOT_CHAR;
5315 }
5316
5317 if (ctype != ONIGENC_CTYPE_DIGIT) {
5318 MEMCPY(carry, p, char, len);
5319 return NEIGHBOR_WRAPPED;
5320 }
5321
5322 MEMCPY(carry, p, char, len);
5323 enc_succ_char(carry, len, enc);
5324 return NEIGHBOR_WRAPPED;
5325}
5326
5327
5328static VALUE str_succ(VALUE str);
5329
5330/*
5331 * call-seq:
5332 * succ -> new_str
5333 *
5334 * :include: doc/string/succ.rdoc
5335 *
5336 */
5337
5338VALUE
5340{
5341 VALUE str;
5342 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5343 rb_enc_cr_str_copy_for_substr(str, orig);
5344 return str_succ(str);
5345}
5346
5347static VALUE
5348str_succ(VALUE str)
5349{
5350 rb_encoding *enc;
5351 char *sbeg, *s, *e, *last_alnum = 0;
5352 int found_alnum = 0;
5353 long l, slen;
5354 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5355 long carry_pos = 0, carry_len = 1;
5356 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5357
5358 slen = RSTRING_LEN(str);
5359 if (slen == 0) return str;
5360
5361 enc = STR_ENC_GET(str);
5362 sbeg = RSTRING_PTR(str);
5363 s = e = sbeg + slen;
5364
5365 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5366 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5367 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5368 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5369 break;
5370 }
5371 }
5372 l = rb_enc_precise_mbclen(s, e, enc);
5373 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5374 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5375 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5376 switch (neighbor) {
5377 case NEIGHBOR_NOT_CHAR:
5378 continue;
5379 case NEIGHBOR_FOUND:
5380 return str;
5381 case NEIGHBOR_WRAPPED:
5382 last_alnum = s;
5383 break;
5384 }
5385 found_alnum = 1;
5386 carry_pos = s - sbeg;
5387 carry_len = l;
5388 }
5389 if (!found_alnum) { /* str contains no alnum */
5390 s = e;
5391 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5392 enum neighbor_char neighbor;
5393 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5394 l = rb_enc_precise_mbclen(s, e, enc);
5395 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5396 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5397 MEMCPY(tmp, s, char, l);
5398 neighbor = enc_succ_char(tmp, l, enc);
5399 switch (neighbor) {
5400 case NEIGHBOR_FOUND:
5401 MEMCPY(s, tmp, char, l);
5402 return str;
5403 break;
5404 case NEIGHBOR_WRAPPED:
5405 MEMCPY(s, tmp, char, l);
5406 break;
5407 case NEIGHBOR_NOT_CHAR:
5408 break;
5409 }
5410 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5411 /* wrapped to \0...\0. search next valid char. */
5412 enc_succ_char(s, l, enc);
5413 }
5414 if (!rb_enc_asciicompat(enc)) {
5415 MEMCPY(carry, s, char, l);
5416 carry_len = l;
5417 }
5418 carry_pos = s - sbeg;
5419 }
5421 }
5422 RESIZE_CAPA(str, slen + carry_len);
5423 sbeg = RSTRING_PTR(str);
5424 s = sbeg + carry_pos;
5425 memmove(s + carry_len, s, slen - carry_pos);
5426 memmove(s, carry, carry_len);
5427 slen += carry_len;
5428 STR_SET_LEN(str, slen);
5429 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5431 return str;
5432}
5433
5434
5435/*
5436 * call-seq:
5437 * succ! -> self
5438 *
5439 * Like String#succ, but modifies +self+ in place; returns +self+.
5440 *
5441 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5442 */
5443
5444static VALUE
5445rb_str_succ_bang(VALUE str)
5446{
5447 rb_str_modify(str);
5448 str_succ(str);
5449 return str;
5450}
5451
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5461
5462static int
5463str_upto_i(VALUE str, VALUE arg)
5464{
5465 rb_yield(str);
5466 return 0;
5467}
5468
5469/*
5470 * call-seq:
5471 * upto(other_string, exclusive = false) {|string| ... } -> self
5472 * upto(other_string, exclusive = false) -> new_enumerator
5473 *
5474 * :include: doc/string/upto.rdoc
5475 *
5476 */
5477
5478static VALUE
5479rb_str_upto(int argc, VALUE *argv, VALUE beg)
5480{
5481 VALUE end, exclusive;
5482
5483 rb_scan_args(argc, argv, "11", &end, &exclusive);
5484 RETURN_ENUMERATOR(beg, argc, argv);
5485 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5486}
5487
5488VALUE
5489rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5490{
5491 VALUE current, after_end;
5492 ID succ;
5493 int n, ascii;
5494 rb_encoding *enc;
5495
5496 CONST_ID(succ, "succ");
5497 StringValue(end);
5498 enc = rb_enc_check(beg, end);
5499 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5500 /* single character */
5501 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5502 char c = RSTRING_PTR(beg)[0];
5503 char e = RSTRING_PTR(end)[0];
5504
5505 if (c > e || (excl && c == e)) return beg;
5506 for (;;) {
5507 VALUE str = rb_enc_str_new(&c, 1, enc);
5509 if ((*each)(str, arg)) break;
5510 if (!excl && c == e) break;
5511 c++;
5512 if (excl && c == e) break;
5513 }
5514 return beg;
5515 }
5516 /* both edges are all digits */
5517 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5518 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5519 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5520 VALUE b, e;
5521 int width;
5522
5523 width = RSTRING_LENINT(beg);
5524 b = rb_str_to_inum(beg, 10, FALSE);
5525 e = rb_str_to_inum(end, 10, FALSE);
5526 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5527 long bi = FIX2LONG(b);
5528 long ei = FIX2LONG(e);
5529 rb_encoding *usascii = rb_usascii_encoding();
5530
5531 while (bi <= ei) {
5532 if (excl && bi == ei) break;
5533 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5534 bi++;
5535 }
5536 }
5537 else {
5538 ID op = excl ? '<' : idLE;
5539 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5540
5541 args[0] = INT2FIX(width);
5542 while (rb_funcall(b, op, 1, e)) {
5543 args[1] = b;
5544 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5545 b = rb_funcallv(b, succ, 0, 0);
5546 }
5547 }
5548 return beg;
5549 }
5550 /* normal case */
5551 n = rb_str_cmp(beg, end);
5552 if (n > 0 || (excl && n == 0)) return beg;
5553
5554 after_end = rb_funcallv(end, succ, 0, 0);
5555 current = str_duplicate(rb_cString, beg);
5556 while (!rb_str_equal(current, after_end)) {
5557 VALUE next = Qnil;
5558 if (excl || !rb_str_equal(current, end))
5559 next = rb_funcallv(current, succ, 0, 0);
5560 if ((*each)(current, arg)) break;
5561 if (NIL_P(next)) break;
5562 current = next;
5563 StringValue(current);
5564 if (excl && rb_str_equal(current, end)) break;
5565 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5566 break;
5567 }
5568
5569 return beg;
5570}
5571
5572VALUE
5573rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5574{
5575 VALUE current;
5576 ID succ;
5577
5578 CONST_ID(succ, "succ");
5579 /* both edges are all digits */
5580 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5581 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5582 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5583 int width = RSTRING_LENINT(beg);
5584 b = rb_str_to_inum(beg, 10, FALSE);
5585 if (FIXNUM_P(b)) {
5586 long bi = FIX2LONG(b);
5587 rb_encoding *usascii = rb_usascii_encoding();
5588
5589 while (FIXABLE(bi)) {
5590 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5591 bi++;
5592 }
5593 b = LONG2NUM(bi);
5594 }
5595 args[0] = INT2FIX(width);
5596 while (1) {
5597 args[1] = b;
5598 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5599 b = rb_funcallv(b, succ, 0, 0);
5600 }
5601 }
5602 /* normal case */
5603 current = str_duplicate(rb_cString, beg);
5604 while (1) {
5605 VALUE next = rb_funcallv(current, succ, 0, 0);
5606 if ((*each)(current, arg)) break;
5607 current = next;
5608 StringValue(current);
5609 if (RSTRING_LEN(current) == 0)
5610 break;
5611 }
5612
5613 return beg;
5614}
5615
5616static int
5617include_range_i(VALUE str, VALUE arg)
5618{
5619 VALUE *argp = (VALUE *)arg;
5620 if (!rb_equal(str, *argp)) return 0;
5621 *argp = Qnil;
5622 return 1;
5623}
5624
5625VALUE
5626rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5627{
5628 beg = rb_str_new_frozen(beg);
5629 StringValue(end);
5630 end = rb_str_new_frozen(end);
5631 if (NIL_P(val)) return Qfalse;
5632 val = rb_check_string_type(val);
5633 if (NIL_P(val)) return Qfalse;
5634 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5635 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5636 rb_enc_asciicompat(STR_ENC_GET(val))) {
5637 const char *bp = RSTRING_PTR(beg);
5638 const char *ep = RSTRING_PTR(end);
5639 const char *vp = RSTRING_PTR(val);
5640 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5641 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5642 return Qfalse;
5643 else {
5644 char b = *bp;
5645 char e = *ep;
5646 char v = *vp;
5647
5648 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5649 if (b <= v && v < e) return Qtrue;
5650 return RBOOL(!RTEST(exclusive) && v == e);
5651 }
5652 }
5653 }
5654#if 0
5655 /* both edges are all digits */
5656 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5657 all_digits_p(bp, RSTRING_LEN(beg)) &&
5658 all_digits_p(ep, RSTRING_LEN(end))) {
5659 /* TODO */
5660 }
5661#endif
5662 }
5663 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5664
5665 return RBOOL(NIL_P(val));
5666}
5667
5668static VALUE
5669rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5670{
5671 if (rb_reg_search(re, str, 0, 0) >= 0) {
5672 VALUE match = rb_backref_get();
5673 int nth = rb_reg_backref_number(match, backref);
5674 return rb_reg_nth_match(nth, match);
5675 }
5676 return Qnil;
5677}
5678
5679static VALUE
5680rb_str_aref(VALUE str, VALUE indx)
5681{
5682 long idx;
5683
5684 if (FIXNUM_P(indx)) {
5685 idx = FIX2LONG(indx);
5686 }
5687 else if (RB_TYPE_P(indx, T_REGEXP)) {
5688 return rb_str_subpat(str, indx, INT2FIX(0));
5689 }
5690 else if (RB_TYPE_P(indx, T_STRING)) {
5691 if (rb_str_index(str, indx, 0) != -1)
5692 return str_duplicate(rb_cString, indx);
5693 return Qnil;
5694 }
5695 else {
5696 /* check if indx is Range */
5697 long beg, len = str_strlen(str, NULL);
5698 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5699 case Qfalse:
5700 break;
5701 case Qnil:
5702 return Qnil;
5703 default:
5704 return rb_str_substr(str, beg, len);
5705 }
5706 idx = NUM2LONG(indx);
5707 }
5708
5709 return str_substr(str, idx, 1, FALSE);
5710}
5711
5712
5713/*
5714 * call-seq:
5715 * self[index] -> new_string or nil
5716 * self[start, length] -> new_string or nil
5717 * self[range] -> new_string or nil
5718 * self[regexp, capture = 0] -> new_string or nil
5719 * self[substring] -> new_string or nil
5720 *
5721 * :include: doc/string/aref.rdoc
5722 *
5723 */
5724
5725static VALUE
5726rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5727{
5728 if (argc == 2) {
5729 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5730 return rb_str_subpat(str, argv[0], argv[1]);
5731 }
5732 else {
5733 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5734 }
5735 }
5736 rb_check_arity(argc, 1, 2);
5737 return rb_str_aref(str, argv[0]);
5738}
5739
5740VALUE
5742{
5743 char *ptr = RSTRING_PTR(str);
5744 long olen = RSTRING_LEN(str), nlen;
5745
5746 str_modifiable(str);
5747 if (len > olen) len = olen;
5748 nlen = olen - len;
5749 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5750 char *oldptr = ptr;
5751 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5752 STR_SET_EMBED(str);
5753 ptr = RSTRING(str)->as.embed.ary;
5754 memmove(ptr, oldptr + len, nlen);
5755 if (fl == STR_NOEMBED) xfree(oldptr);
5756 }
5757 else {
5758 if (!STR_SHARED_P(str)) {
5759 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5760 rb_enc_cr_str_exact_copy(shared, str);
5761 OBJ_FREEZE(shared);
5762 }
5763 ptr = RSTRING(str)->as.heap.ptr += len;
5764 }
5765 STR_SET_LEN(str, nlen);
5766
5767 if (!SHARABLE_MIDDLE_SUBSTRING) {
5768 TERM_FILL(ptr + nlen, TERM_LEN(str));
5769 }
5771 return str;
5772}
5773
5774static void
5775rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5776{
5777 char *sptr;
5778 long slen;
5779 int cr;
5780
5781 if (beg == 0 && vlen == 0) {
5782 rb_str_drop_bytes(str, len);
5783 return;
5784 }
5785
5786 str_modify_keep_cr(str);
5787 RSTRING_GETMEM(str, sptr, slen);
5788 if (len < vlen) {
5789 /* expand string */
5790 RESIZE_CAPA(str, slen + vlen - len);
5791 sptr = RSTRING_PTR(str);
5792 }
5793
5795 cr = rb_enc_str_coderange(val);
5796 else
5798
5799 if (vlen != len) {
5800 memmove(sptr + beg + vlen,
5801 sptr + beg + len,
5802 slen - (beg + len));
5803 }
5804 if (vlen < beg && len < 0) {
5805 MEMZERO(sptr + slen, char, -len);
5806 }
5807 if (vlen > 0) {
5808 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5809 }
5810 slen += vlen - len;
5811 STR_SET_LEN(str, slen);
5812 TERM_FILL(&sptr[slen], TERM_LEN(str));
5813 ENC_CODERANGE_SET(str, cr);
5814}
5815
5816static inline void
5817rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5818{
5819 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5820}
5821
5822void
5823rb_str_update(VALUE str, long beg, long len, VALUE val)
5824{
5825 long slen;
5826 char *p, *e;
5827 rb_encoding *enc;
5828 int singlebyte = single_byte_optimizable(str);
5829 int cr;
5830
5831 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5832
5833 StringValue(val);
5834 enc = rb_enc_check(str, val);
5835 slen = str_strlen(str, enc); /* rb_enc_check */
5836
5837 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5838 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5839 }
5840 if (beg < 0) {
5841 beg += slen;
5842 }
5843 RUBY_ASSERT(beg >= 0);
5844 RUBY_ASSERT(beg <= slen);
5845
5846 if (len > slen - beg) {
5847 len = slen - beg;
5848 }
5849 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5850 if (!p) p = RSTRING_END(str);
5851 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5852 if (!e) e = RSTRING_END(str);
5853 /* error check */
5854 beg = p - RSTRING_PTR(str); /* physical position */
5855 len = e - p; /* physical length */
5856 rb_str_update_0(str, beg, len, val);
5857 rb_enc_associate(str, enc);
5859 if (cr != ENC_CODERANGE_BROKEN)
5860 ENC_CODERANGE_SET(str, cr);
5861}
5862
5863static void
5864rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5865{
5866 int nth;
5867 VALUE match;
5868 long start, end, len;
5869 rb_encoding *enc;
5870 struct re_registers *regs;
5871
5872 if (rb_reg_search(re, str, 0, 0) < 0) {
5873 rb_raise(rb_eIndexError, "regexp not matched");
5874 }
5875 match = rb_backref_get();
5876 nth = rb_reg_backref_number(match, backref);
5877 regs = RMATCH_REGS(match);
5878 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5879 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5880 }
5881 if (nth < 0) {
5882 nth += regs->num_regs;
5883 }
5884
5885 start = BEG(nth);
5886 if (start == -1) {
5887 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5888 }
5889 end = END(nth);
5890 len = end - start;
5891 StringValue(val);
5892 enc = rb_enc_check_str(str, val);
5893 rb_str_update_0(str, start, len, val);
5894 rb_enc_associate(str, enc);
5895}
5896
5897static VALUE
5898rb_str_aset(VALUE str, VALUE indx, VALUE val)
5899{
5900 long idx, beg;
5901
5902 switch (TYPE(indx)) {
5903 case T_REGEXP:
5904 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5905 return val;
5906
5907 case T_STRING:
5908 beg = rb_str_index(str, indx, 0);
5909 if (beg < 0) {
5910 rb_raise(rb_eIndexError, "string not matched");
5911 }
5912 beg = rb_str_sublen(str, beg);
5913 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5914 return val;
5915
5916 default:
5917 /* check if indx is Range */
5918 {
5919 long beg, len;
5920 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5921 rb_str_update(str, beg, len, val);
5922 return val;
5923 }
5924 }
5925 /* FALLTHROUGH */
5926
5927 case T_FIXNUM:
5928 idx = NUM2LONG(indx);
5929 rb_str_update(str, idx, 1, val);
5930 return val;
5931 }
5932}
5933
5934/*
5935 * call-seq:
5936 * self[index] = other_string -> new_string
5937 * self[start, length] = other_string -> new_string
5938 * self[range] = other_string -> new_string
5939 * self[regexp, capture = 0] = other_string -> new_string
5940 * self[substring] = other_string -> new_string
5941 *
5942 * :include: doc/string/aset.rdoc
5943 *
5944 */
5945
5946static VALUE
5947rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5948{
5949 if (argc == 3) {
5950 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5951 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5952 }
5953 else {
5954 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5955 }
5956 return argv[2];
5957 }
5958 rb_check_arity(argc, 2, 3);
5959 return rb_str_aset(str, argv[0], argv[1]);
5960}
5961
5962/*
5963 * call-seq:
5964 * insert(offset, other_string) -> self
5965 *
5966 * :include: doc/string/insert.rdoc
5967 *
5968 */
5969
5970static VALUE
5971rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5972{
5973 long pos = NUM2LONG(idx);
5974
5975 if (pos == -1) {
5976 return rb_str_append(str, str2);
5977 }
5978 else if (pos < 0) {
5979 pos++;
5980 }
5981 rb_str_update(str, pos, 0, str2);
5982 return str;
5983}
5984
5985
5986/*
5987 * call-seq:
5988 * slice!(index) -> new_string or nil
5989 * slice!(start, length) -> new_string or nil
5990 * slice!(range) -> new_string or nil
5991 * slice!(regexp, capture = 0) -> new_string or nil
5992 * slice!(substring) -> new_string or nil
5993 *
5994 * Like String#[] (and its alias String#slice), except that:
5995 *
5996 * - Performs substitutions in +self+ (not in a copy of +self+).
5997 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5998 *
5999 * A few examples:
6000 *
6001 * s = 'hello'
6002 * s.slice!('e') # => "e"
6003 * s # => "hllo"
6004 * s.slice!('e') # => nil
6005 * s # => "hllo"
6006 *
6007 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6008 */
6009
6010static VALUE
6011rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6012{
 /*
  * Implementation of String#slice!.  Each branch below turns the argument(s)
  * into an offset `beg` and a length `len`, then jumps to one of the shared
  * labels at the bottom.  The shared tail does pointer arithmetic on
  * RSTRING_PTR(str) with these values, i.e. it treats them as byte counts.
  * Returns the removed substring, or nil when nothing is selected.
  */
6013 VALUE result = Qnil;
6014 VALUE indx;
6015 long beg, len = 1;
6016 char *p;
6017
6018 rb_check_arity(argc, 1, 2);
6019 str_modify_keep_cr(str);
6020 indx = argv[0];
6021 if (RB_TYPE_P(indx, T_REGEXP)) {
 /* slice!(regexp, capture = 0): take the span of the requested group. */
6022 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6023 VALUE match = rb_backref_get();
6024 struct re_registers *regs = RMATCH_REGS(match);
6025 int nth = 0;
6026 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
 /* A negative group number is offset by the register count. */
6027 if ((nth += regs->num_regs) <= 0) return Qnil;
6028 }
6029 else if (nth >= regs->num_regs) return Qnil;
6030 beg = BEG(nth);
6031 len = END(nth) - beg;
6032 goto subseq;
6033 }
6034 else if (argc == 2) {
 /* slice!(start, length) */
6035 beg = NUM2LONG(indx);
6036 len = NUM2LONG(argv[1]);
6037 goto num_index;
6038 }
6039 else if (FIXNUM_P(indx)) {
 /* slice!(index): len keeps its initial value of 1. */
6040 beg = FIX2LONG(indx);
6041 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6042 if (!len) return Qnil;
6043 beg = p - RSTRING_PTR(str);
6044 goto subseq;
6045 }
6046 else if (RB_TYPE_P(indx, T_STRING)) {
 /* slice!(substring): the result is a copy of the argument itself, so the
  * subseq step is skipped. */
6047 beg = rb_str_index(str, indx, 0);
6048 if (beg == -1) return Qnil;
6049 len = RSTRING_LEN(indx);
6050 result = str_duplicate(rb_cString, indx);
6051 goto squash;
6052 }
6053 else {
 /* Either a Range, or some other object to be converted with NUM2LONG. */
6054 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6055 case Qnil:
6056 return Qnil;
6057 case Qfalse:
6058 beg = NUM2LONG(indx);
6059 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6060 if (!len) return Qnil;
6061 beg = p - RSTRING_PTR(str);
6062 goto subseq;
6063 default:
6064 goto num_index;
6065 }
6066 }
6067
 /* beg/len as given by the caller: let rb_str_subpos resolve them to a
  * pointer into the buffer (it may also adjust len). */
6068 num_index:
6069 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6070 beg = p - RSTRING_PTR(str);
6071
 /* Copy the selected bytes out as the return value. */
6072 subseq:
6073 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6074 rb_enc_cr_str_copy_for_substr(result, str);
6075
 /* Remove the selected bytes from str itself. */
6076 squash:
6077 if (len > 0) {
6078 if (beg == 0) {
6079 rb_str_drop_bytes(str, len);
6080 }
6081 else {
 /* Close the gap by sliding the tail down, then re-terminate. */
6082 char *sptr = RSTRING_PTR(str);
6083 long slen = RSTRING_LEN(str);
6084 if (beg + len > slen) /* pathological check */
6085 len = slen - beg;
6086 memmove(sptr + beg,
6087 sptr + beg + len,
6088 slen - (beg + len));
6089 slen -= len;
6090 STR_SET_LEN(str, slen);
6091 TERM_FILL(&sptr[slen], TERM_LEN(str));
6092 }
6093 }
6094 return result;
6095}
6096
6097static VALUE
6098get_pat(VALUE pat)
6099{
6100 VALUE val;
6101
6102 switch (OBJ_BUILTIN_TYPE(pat)) {
6103 case T_REGEXP:
6104 return pat;
6105
6106 case T_STRING:
6107 break;
6108
6109 default:
6110 val = rb_check_string_type(pat);
6111 if (NIL_P(val)) {
6112 Check_Type(pat, T_REGEXP);
6113 }
6114 pat = val;
6115 }
6116
6117 return rb_reg_regcomp(pat);
6118}
6119
6120static VALUE
6121get_pat_quoted(VALUE pat, int check)
6122{
6123 VALUE val;
6124
6125 switch (OBJ_BUILTIN_TYPE(pat)) {
6126 case T_REGEXP:
6127 return pat;
6128
6129 case T_STRING:
6130 break;
6131
6132 default:
6133 val = rb_check_string_type(pat);
6134 if (NIL_P(val)) {
6135 Check_Type(pat, T_REGEXP);
6136 }
6137 pat = val;
6138 }
6139 if (check && is_broken_string(pat)) {
6140 rb_exc_raise(rb_reg_check_preprocess(pat));
6141 }
6142 return pat;
6143}
6144
6145static long
6146rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6147{
6148 if (BUILTIN_TYPE(pat) == T_STRING) {
6149 pos = rb_str_byteindex(str, pat, pos);
6150 if (set_backref_str) {
6151 if (pos >= 0) {
6152 str = rb_str_new_frozen_String(str);
6153 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6154 if (match) {
6155 *match = match_data;
6156 }
6157 }
6158 else {
6160 }
6161 }
6162 return pos;
6163 }
6164 else {
6165 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6166 }
6167}
6168
6169static long
6170rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6171{
6172 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6173}
6174
6175
6176/*
6177 * call-seq:
6178 * sub!(pattern, replacement) -> self or nil
6179 * sub!(pattern) {|match| ... } -> self or nil
6180 *
6181 * Like String#sub, except that:
6182 *
6183 * - Changes are made to +self+, not to a copy of +self+.
6184 * - Returns +self+ if any changes are made, +nil+ otherwise.
6185 *
6186 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6187 */
6188
6189static VALUE
6190rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6191{
6192 VALUE pat, repl, hash = Qnil;
6193 int iter = 0;
6194 long plen;
6195 int min_arity = rb_block_given_p() ? 1 : 2;
6196 long beg;
6197
6198 rb_check_arity(argc, min_arity, 2);
6199 if (argc == 1) {
6200 iter = 1;
6201 }
6202 else {
6203 repl = argv[1];
6204 hash = rb_check_hash_type(argv[1]);
6205 if (NIL_P(hash)) {
6206 StringValue(repl);
6207 }
6208 }
6209
6210 pat = get_pat_quoted(argv[0], 1);
6211
6212 str_modifiable(str);
6213 beg = rb_pat_search(pat, str, 0, 1);
6214 if (beg >= 0) {
6215 rb_encoding *enc;
6216 int cr = ENC_CODERANGE(str);
6217 long beg0, end0;
6218 VALUE match, match0 = Qnil;
6219 struct re_registers *regs;
6220 char *p, *rp;
6221 long len, rlen;
6222
6223 match = rb_backref_get();
6224 regs = RMATCH_REGS(match);
6225 if (RB_TYPE_P(pat, T_STRING)) {
6226 beg0 = beg;
6227 end0 = beg0 + RSTRING_LEN(pat);
6228 match0 = pat;
6229 }
6230 else {
6231 beg0 = BEG(0);
6232 end0 = END(0);
6233 if (iter) match0 = rb_reg_nth_match(0, match);
6234 }
6235
6236 if (iter || !NIL_P(hash)) {
6237 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6238
6239 if (iter) {
6240 repl = rb_obj_as_string(rb_yield(match0));
6241 }
6242 else {
6243 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6244 repl = rb_obj_as_string(repl);
6245 }
6246 str_mod_check(str, p, len);
6247 rb_check_frozen(str);
6248 }
6249 else {
6250 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6251 }
6252
6253 enc = rb_enc_compatible(str, repl);
6254 if (!enc) {
6255 rb_encoding *str_enc = STR_ENC_GET(str);
6256 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6257 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6258 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6259 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6260 rb_enc_inspect_name(str_enc),
6261 rb_enc_inspect_name(STR_ENC_GET(repl)));
6262 }
6263 enc = STR_ENC_GET(repl);
6264 }
6265 rb_str_modify(str);
6266 rb_enc_associate(str, enc);
6268 int cr2 = ENC_CODERANGE(repl);
6269 if (cr2 == ENC_CODERANGE_BROKEN ||
6270 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6272 else
6273 cr = cr2;
6274 }
6275 plen = end0 - beg0;
6276 rlen = RSTRING_LEN(repl);
6277 len = RSTRING_LEN(str);
6278 if (rlen > plen) {
6279 RESIZE_CAPA(str, len + rlen - plen);
6280 }
6281 p = RSTRING_PTR(str);
6282 if (rlen != plen) {
6283 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6284 }
6285 rp = RSTRING_PTR(repl);
6286 memmove(p + beg0, rp, rlen);
6287 len += rlen - plen;
6288 STR_SET_LEN(str, len);
6289 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6290 ENC_CODERANGE_SET(str, cr);
6291
6292 RB_GC_GUARD(match);
6293
6294 return str;
6295 }
6296 return Qnil;
6297}
6298
6299
6300/*
6301 * call-seq:
6302 * sub(pattern, replacement) -> new_string
6303 * sub(pattern) {|match| ... } -> new_string
6304 *
6305 * :include: doc/string/sub.rdoc
6306 */
6307
6308static VALUE
6309rb_str_sub(int argc, VALUE *argv, VALUE str)
6310{
6311 str = str_duplicate(rb_cString, str);
6312 rb_str_sub_bang(argc, argv, str);
6313 return str;
6314}
6315
/*
 * Shared implementation behind rb_str_gsub (bang == 0) and rb_str_gsub_bang
 * (bang == 1).  Builds the substituted text in a fresh buffer `dest`; at the
 * end either replaces the contents of `str` with it (bang) or returns it.
 * When nothing matches, returns nil (bang) or a duplicate of `str`.
 */
6316static VALUE
6317str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6318{
6319 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6320 long beg, beg0, end0;
6321 long offset, blen, slen, len, last;
 /* Where each replacement comes from: STR = replacement string, ITER = block,
  * MAP = hash lookup, FAST_MAP = hash lookup using a stack-allocated key. */
6322 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6323 char *sp, *cp;
 /* Tri-state: -1 until the first rb_reg_regsub call in the loop decides it. */
6324 int need_backref_str = -1;
6325 rb_encoding *str_enc;
6326
6327 switch (argc) {
6328 case 1:
6329 RETURN_ENUMERATOR(str, argc, argv);
6330 mode = ITER;
6331 break;
6332 case 2:
6333 repl = argv[1];
6334 hash = rb_check_hash_type(argv[1]);
6335 if (NIL_P(hash)) {
6336 StringValue(repl);
6337 }
6338 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6339 mode = FAST_MAP;
6340 }
6341 else {
6342 mode = MAP;
6343 }
6344 break;
6345 default:
6346 rb_error_arity(argc, 1, 2);
6347 }
6348
6349 pat = get_pat_quoted(argv[0], 1);
6350 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6351
6352 if (beg < 0) {
6353 if (bang) return Qnil; /* no match, no substitution */
6354 return str_duplicate(rb_cString, str);
6355 }
6356
6357 offset = 0;
6358 blen = RSTRING_LEN(str) + 30; /* len + margin */
6359 dest = rb_str_buf_new(blen);
 /* sp/slen snapshot the buffer so str_mod_check can detect modification of
  * str by the block or the hash lookup; cp points at the next unread byte. */
6360 sp = RSTRING_PTR(str);
6361 slen = RSTRING_LEN(str);
6362 cp = sp;
6363 str_enc = STR_ENC_GET(str);
6364 rb_enc_associate(dest, str_enc);
6365 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6366
 /* One iteration per match: compute the replacement, append the text between
  * the previous match and this one, append the replacement, search again. */
6367 do {
6368 struct re_registers *regs = RMATCH_REGS(match);
6369 if (RB_TYPE_P(pat, T_STRING)) {
6370 beg0 = beg;
6371 end0 = beg0 + RSTRING_LEN(pat);
6372 match0 = pat;
6373 }
6374 else {
6375 beg0 = BEG(0);
6376 end0 = END(0);
6377 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6378 }
6379
6380 if (mode != STR) {
6381 if (mode == ITER) {
6382 val = rb_obj_as_string(rb_yield(match0));
6383 }
6384 else {
6385 struct RString fake_str = {RBASIC_INIT};
6386 VALUE key;
6387 if (mode == FAST_MAP) {
6388 // It is safe to use a fake_str here because we established that it won't escape,
6389 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6390 // default proc.
6391 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6392 }
6393 else {
6394 key = rb_str_subseq(str, beg0, end0 - beg0);
6395 }
6396 val = rb_hash_aref(hash, key);
6397 val = rb_obj_as_string(val);
6398 }
6399 str_mod_check(str, sp, slen);
6400 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6401 rb_raise(rb_eRuntimeError, "block should not cheat");
6402 }
6403 }
6404 else if (need_backref_str) {
6405 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
 /* First pass only: if rb_reg_regsub handed repl back unchanged, later
  * iterations use repl directly and stop requesting the backref string. */
6406 if (need_backref_str < 0) {
6407 need_backref_str = val != repl;
6408 }
6409 }
6410 else {
6411 val = repl;
6412 }
6413
6414 len = beg0 - offset; /* copy pre-match substr */
6415 if (len) {
6416 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6417 }
6418
6419 rb_str_buf_append(dest, val);
6420
 /* `last` remembers where this search started; it is used for the final
  * rb_pat_search0 call after the loop. */
6421 last = offset;
6422 offset = end0;
6423 if (beg0 == end0) {
6424 /*
6425 * Always consume at least one character of the input string
6426 * in order to prevent infinite loops.
6427 */
6428 if (RSTRING_LEN(str) <= end0) break;
6429 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6430 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6431 offset = end0 + len;
6432 }
6433 cp = RSTRING_PTR(str) + offset;
6434 if (offset > RSTRING_LEN(str)) break;
6435
6436 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6437 if (mode != FAST_MAP && mode != STR) {
6438 match = Qnil;
6439 }
6440 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6441
6442 RB_GC_GUARD(match);
6443 } while (beg >= 0);
6444
 /* Append whatever follows the last match. */
6445 if (RSTRING_LEN(str) > offset) {
6446 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6447 }
 /* Search once more from `last` with set_backref_str = 1; NOTE(review):
  * presumably this leaves the backref describing the final match - confirm. */
6448 rb_pat_search0(pat, str, last, 1, &match);
6449 if (bang) {
6450 str_shared_replace(str, dest);
6451 }
6452 else {
6453 str = dest;
6454 }
6455
6456 return str;
6457}
6458
6459
6460/*
6461 * call-seq:
6462 * gsub!(pattern, replacement) -> self or nil
6463 * gsub!(pattern) {|match| ... } -> self or nil
6464 * gsub!(pattern) -> an_enumerator
6465 *
6466 * Like String#gsub, except that:
6467 *
6468 * - Performs substitutions in +self+ (not in a copy of +self+).
6469 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6470 *
6471 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6472 */
6473
6474static VALUE
6475rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6476{
6477 str_modify_keep_cr(str);
6478 return str_gsub(argc, argv, str, 1);
6479}
6480
6481
6482/*
6483 * call-seq:
6484 * gsub(pattern, replacement) -> new_string
6485 * gsub(pattern) {|match| ... } -> new_string
6486 * gsub(pattern) -> enumerator
6487 *
6488 * Returns a copy of +self+ with zero or more substrings replaced.
6489 *
6490 * Argument +pattern+ may be a string or a Regexp;
6491 * argument +replacement+ may be a string or a Hash.
6492 * Varying types for the argument values makes this method very versatile.
6493 *
6494 * Below are some simple examples;
6495 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6496 *
6497 * With arguments +pattern+ and string +replacement+ given,
6498 * replaces each matching substring with the given +replacement+ string:
6499 *
6500 * s = 'abracadabra'
6501 * s.gsub('ab', 'AB') # => "ABracadABra"
6502 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6503 *
6504 * With arguments +pattern+ and hash +replacement+ given,
6505 * replaces each matching substring with a value from the given +replacement+ hash,
6506 * or removes it:
6507 *
6508 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6509 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6510 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6511 *
6512 * With argument +pattern+ and a block given,
6513 * calls the block with each matching substring;
6514 * replaces that substring with the block's return value:
6515 *
6516 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6517 * # => "ABrACADABrA"
6518 *
6519 * With argument +pattern+ and no block given,
6520 * returns a new Enumerator.
6521 *
6522 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6523 */
6524
6525static VALUE
6526rb_str_gsub(int argc, VALUE *argv, VALUE str)
6527{
6528 return str_gsub(argc, argv, str, 0);
6529}
6530
6531
6532/*
6533 * call-seq:
6534 * replace(other_string) -> self
6535 *
6536 * Replaces the contents of +self+ with the contents of +other_string+;
6537 * returns +self+:
6538 *
6539 * s = 'foo' # => "foo"
6540 * s.replace('bar') # => "bar"
6541 *
6542 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6543 */
6544
6545VALUE
6547{
6548 str_modifiable(str);
6549 if (str == str2) return str;
6550
6551 StringValue(str2);
6552 str_discard(str);
6553 return str_replace(str, str2);
6554}
6555
6556/*
6557 * call-seq:
6558 * clear -> self
6559 *
6560 * Removes the contents of +self+:
6561 *
6562 * s = 'foo'
6563 * s.clear # => ""
6564 * s # => ""
6565 *
6566 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6567 */
6568
6569static VALUE
6570rb_str_clear(VALUE str)
6571{
6572 str_discard(str);
6573 STR_SET_EMBED(str);
6574 STR_SET_LEN(str, 0);
6575 RSTRING_PTR(str)[0] = 0;
6576 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6578 else
6580 return str;
6581}
6582
6583/*
6584 * call-seq:
6585 * chr -> string
6586 *
6587 * :include: doc/string/chr.rdoc
6588 *
6589 */
6590
6591static VALUE
6592rb_str_chr(VALUE str)
6593{
6594 return rb_str_substr(str, 0, 1);
6595}
6596
6597/*
6598 * call-seq:
6599 * getbyte(index) -> integer or nil
6600 *
6601 * :include: doc/string/getbyte.rdoc
6602 *
6603 */
6604VALUE
6605rb_str_getbyte(VALUE str, VALUE index)
6606{
6607 long pos = NUM2LONG(index);
6608
6609 if (pos < 0)
6610 pos += RSTRING_LEN(str);
6611 if (pos < 0 || RSTRING_LEN(str) <= pos)
6612 return Qnil;
6613
6614 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6615}
6616
6617/*
6618 * call-seq:
6619 * setbyte(index, integer) -> integer
6620 *
6621 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6622 * returns +integer+:
6623 *
6624 * s = 'xyzzy'
6625 * s.setbyte(2, 129) # => 129
6626 * s # => "xy\x81zy"
6627 *
6628 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6629 */
6630VALUE
6631rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6632{
6633 long pos = NUM2LONG(index);
6634 long len = RSTRING_LEN(str);
6635 char *ptr, *head, *left = 0;
6636 rb_encoding *enc;
6637 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6638
6639 if (pos < -len || len <= pos)
6640 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6641 if (pos < 0)
6642 pos += len;
6643
6644 VALUE v = rb_to_int(value);
6645 VALUE w = rb_int_and(v, INT2FIX(0xff));
6646 char byte = (char)(NUM2INT(w) & 0xFF);
6647
6648 if (!str_independent(str))
6649 str_make_independent(str);
6650 enc = STR_ENC_GET(str);
6651 head = RSTRING_PTR(str);
6652 ptr = &head[pos];
6653 if (!STR_EMBED_P(str)) {
6654 cr = ENC_CODERANGE(str);
6655 switch (cr) {
6656 case ENC_CODERANGE_7BIT:
6657 left = ptr;
6658 *ptr = byte;
6659 if (ISASCII(byte)) goto end;
6660 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6661 if (!MBCLEN_CHARFOUND_P(nlen))
6663 else
6665 goto end;
6667 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6668 width = rb_enc_precise_mbclen(left, head+len, enc);
6669 *ptr = byte;
6670 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6671 if (!MBCLEN_CHARFOUND_P(nlen))
6673 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6675 goto end;
6676 }
6677 }
6679 *ptr = byte;
6680
6681 end:
6682 return value;
6683}
6684
6685static VALUE
6686str_byte_substr(VALUE str, long beg, long len, int empty)
6687{
6688 long n = RSTRING_LEN(str);
6689
6690 if (beg > n || len < 0) return Qnil;
6691 if (beg < 0) {
6692 beg += n;
6693 if (beg < 0) return Qnil;
6694 }
6695 if (len > n - beg)
6696 len = n - beg;
6697 if (len <= 0) {
6698 if (!empty) return Qnil;
6699 len = 0;
6700 }
6701
6702 VALUE str2 = str_subseq(str, beg, len);
6703
6704 str_enc_copy_direct(str2, str);
6705
6706 if (RSTRING_LEN(str2) == 0) {
6707 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6709 else
6711 }
6712 else {
6713 switch (ENC_CODERANGE(str)) {
6714 case ENC_CODERANGE_7BIT:
6716 break;
6717 default:
6719 break;
6720 }
6721 }
6722
6723 return str2;
6724}
6725
6726VALUE
6727rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6728{
6729 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6730}
6731
6732static VALUE
6733str_byte_aref(VALUE str, VALUE indx)
6734{
6735 long idx;
6736 if (FIXNUM_P(indx)) {
6737 idx = FIX2LONG(indx);
6738 }
6739 else {
6740 /* check if indx is Range */
6741 long beg, len = RSTRING_LEN(str);
6742
6743 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6744 case Qfalse:
6745 break;
6746 case Qnil:
6747 return Qnil;
6748 default:
6749 return str_byte_substr(str, beg, len, TRUE);
6750 }
6751
6752 idx = NUM2LONG(indx);
6753 }
6754 return str_byte_substr(str, idx, 1, FALSE);
6755}
6756
6757/*
6758 * call-seq:
6759 * byteslice(offset, length = 1) -> string or nil
6760 * byteslice(range) -> string or nil
6761 *
6762 * :include: doc/string/byteslice.rdoc
6763 */
6764
6765static VALUE
6766rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6767{
6768 if (argc == 2) {
6769 long beg = NUM2LONG(argv[0]);
6770 long len = NUM2LONG(argv[1]);
6771 return str_byte_substr(str, beg, len, TRUE);
6772 }
6773 rb_check_arity(argc, 1, 2);
6774 return str_byte_aref(str, argv[0]);
6775}
6776
6777static void
6778str_check_beg_len(VALUE str, long *beg, long *len)
6779{
6780 long end, slen = RSTRING_LEN(str);
6781
6782 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6783 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6784 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6785 }
6786 if (*beg < 0) {
6787 *beg += slen;
6788 }
6789 RUBY_ASSERT(*beg >= 0);
6790 RUBY_ASSERT(*beg <= slen);
6791
6792 if (*len > slen - *beg) {
6793 *len = slen - *beg;
6794 }
6795 end = *beg + *len;
6796 str_ensure_byte_pos(str, *beg);
6797 str_ensure_byte_pos(str, end);
6798}
6799
6800/*
6801 * call-seq:
6802 * bytesplice(offset, length, str) -> self
6803 * bytesplice(offset, length, str, str_offset, str_length) -> self
6804 * bytesplice(range, str) -> self
6805 * bytesplice(range, str, str_range) -> self
6806 *
6807 * :include: doc/string/bytesplice.rdoc
6808 */
6809
6810static VALUE
6811rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6812{
6813 long beg, len, vbeg, vlen;
6814 VALUE val;
6815 int cr;
6816
6817 rb_check_arity(argc, 2, 5);
6818 if (!(argc == 2 || argc == 3 || argc == 5)) {
6819 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6820 }
6821 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6822 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6823 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6824 rb_builtin_class_name(argv[0]));
6825 }
6826 val = argv[1];
6827 StringValue(val);
6828 if (argc == 2) {
6829 /* bytesplice(range, str) */
6830 vbeg = 0;
6831 vlen = RSTRING_LEN(val);
6832 }
6833 else {
6834 /* bytesplice(range, str, str_range) */
6835 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6836 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6837 rb_builtin_class_name(argv[2]));
6838 }
6839 }
6840 }
6841 else {
6842 beg = NUM2LONG(argv[0]);
6843 len = NUM2LONG(argv[1]);
6844 val = argv[2];
6845 StringValue(val);
6846 if (argc == 3) {
6847 /* bytesplice(index, length, str) */
6848 vbeg = 0;
6849 vlen = RSTRING_LEN(val);
6850 }
6851 else {
6852 /* bytesplice(index, length, str, str_index, str_length) */
6853 vbeg = NUM2LONG(argv[3]);
6854 vlen = NUM2LONG(argv[4]);
6855 }
6856 }
6857 str_check_beg_len(str, &beg, &len);
6858 str_check_beg_len(val, &vbeg, &vlen);
6859 str_modify_keep_cr(str);
6860
6861 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6862 rb_enc_associate(str, rb_enc_check(str, val));
6863 }
6864
6865 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6867 if (cr != ENC_CODERANGE_BROKEN)
6868 ENC_CODERANGE_SET(str, cr);
6869 return str;
6870}
6871
6872/*
6873 * call-seq:
6874 * reverse -> new_string
6875 *
6876 * Returns a new string with the characters from +self+ in reverse order.
6877 *
6878 * 'drawer'.reverse # => "reward"
6879 * 'reviled'.reverse # => "deliver"
6880 * 'stressed'.reverse # => "desserts"
6881 * 'semordnilaps'.reverse # => "spalindromes"
6882 *
6883 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6884 */
6885
6886static VALUE
6887rb_str_reverse(VALUE str)
6888{
6889 rb_encoding *enc;
6890 VALUE rev;
6891 char *s, *e, *p;
6892 int cr;
6893
6894 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6895 enc = STR_ENC_GET(str);
6896 rev = rb_str_new(0, RSTRING_LEN(str));
6897 s = RSTRING_PTR(str); e = RSTRING_END(str);
6898 p = RSTRING_END(rev);
6899 cr = ENC_CODERANGE(str);
6900
6901 if (RSTRING_LEN(str) > 1) {
6902 if (single_byte_optimizable(str)) {
6903 while (s < e) {
6904 *--p = *s++;
6905 }
6906 }
6907 else if (cr == ENC_CODERANGE_VALID) {
6908 while (s < e) {
6909 int clen = rb_enc_fast_mbclen(s, e, enc);
6910
6911 p -= clen;
6912 memcpy(p, s, clen);
6913 s += clen;
6914 }
6915 }
6916 else {
6917 cr = rb_enc_asciicompat(enc) ?
6919 while (s < e) {
6920 int clen = rb_enc_mbclen(s, e, enc);
6921
6922 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6923 p -= clen;
6924 memcpy(p, s, clen);
6925 s += clen;
6926 }
6927 }
6928 }
6929 STR_SET_LEN(rev, RSTRING_LEN(str));
6930 str_enc_copy_direct(rev, str);
6931 ENC_CODERANGE_SET(rev, cr);
6932
6933 return rev;
6934}
6935
6936
6937/*
6938 * call-seq:
6939 * reverse! -> self
6940 *
6941 * Returns +self+ with its characters reversed:
6942 *
6943 * 'drawer'.reverse! # => "reward"
6944 * 'reviled'.reverse! # => "deliver"
6945 * 'stressed'.reverse! # => "desserts"
6946 * 'semordnilaps'.reverse! # => "spalindromes"
6947 *
6948 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6949 */
6950
6951static VALUE
6952rb_str_reverse_bang(VALUE str)
6953{
6954 if (RSTRING_LEN(str) > 1) {
6955 if (single_byte_optimizable(str)) {
6956 char *s, *e, c;
6957
6958 str_modify_keep_cr(str);
6959 s = RSTRING_PTR(str);
6960 e = RSTRING_END(str) - 1;
6961 while (s < e) {
6962 c = *s;
6963 *s++ = *e;
6964 *e-- = c;
6965 }
6966 }
6967 else {
6968 str_shared_replace(str, rb_str_reverse(str));
6969 }
6970 }
6971 else {
6972 str_modify_keep_cr(str);
6973 }
6974 return str;
6975}
6976
6977
6978/*
6979 * call-seq:
6980 * include?(other_string) -> true or false
6981 *
6982 * Returns whether +self+ contains +other_string+:
6983 *
6984 * s = 'bar'
6985 * s.include?('ba') # => true
6986 * s.include?('ar') # => true
6987 * s.include?('bar') # => true
6988 * s.include?('a') # => true
6989 * s.include?('') # => true
6990 * s.include?('foo') # => false
6991 *
6992 * Related: see {Querying}[rdoc-ref:String@Querying].
6993 */
6994
6995VALUE
6996rb_str_include(VALUE str, VALUE arg)
6997{
6998 long i;
6999
7000 StringValue(arg);
7001 i = rb_str_index(str, arg, 0);
7002
7003 return RBOOL(i != -1);
7004}
7005
7006
7007/*
7008 * call-seq:
7009 * to_i(base = 10) -> integer
7010 *
7011 * Returns the result of interpreting leading characters in +self+
7012 * as an integer in the given +base+;
7013 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7014 *
7015 * '123456'.to_i # => 123456
7016 * '123def'.to_i(16) # => 1195503
7017 *
7018 * With +base+ zero given, +self+ may contain leading characters
7019 * to specify the actual base:
7020 *
7021 * '123def'.to_i(0) # => 123
7022 * '0123def'.to_i(0) # => 83
7023 * '0b123def'.to_i(0) # => 1
7024 * '0o123def'.to_i(0) # => 83
7025 * '0d123def'.to_i(0) # => 123
7026 * '0x123def'.to_i(0) # => 1195503
7027 *
7028 * Characters past a leading valid number (in the given +base+) are ignored:
7029 *
7030 * '12.345'.to_i # => 12
7031 * '12345'.to_i(2) # => 1
7032 *
7033 * Returns zero if there is no leading valid number:
7034 *
7035 * 'abcdef'.to_i # => 0
7036 * '2'.to_i(2) # => 0
7037 *
7038 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7039 */
7040
7041static VALUE
7042rb_str_to_i(int argc, VALUE *argv, VALUE str)
7043{
7044 int base = 10;
7045
7046 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7047 rb_raise(rb_eArgError, "invalid radix %d", base);
7048 }
7049 return rb_str_to_inum(str, base, FALSE);
7050}
7051
7052
7053/*
7054 * call-seq:
7055 * to_f -> float
7056 *
7057 * Returns the result of interpreting leading characters in +self+ as a Float:
7058 *
7059 * '3.14159'.to_f # => 3.14159
7060 * '1.234e-2'.to_f # => 0.01234
7061 *
7062 * Characters past a leading valid number are ignored:
7063 *
7064 * '3.14 (pi to two places)'.to_f # => 3.14
7065 *
7066 * Returns zero if there is no leading valid number:
7067 *
7068 * 'abcdef'.to_f # => 0.0
7069 *
7070 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7071 */
7072
7073static VALUE
7074rb_str_to_f(VALUE str)
7075{
7076 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7077}
7078
7079
7080/*
7081 * call-seq:
7082 * to_s -> self or new_string
7083 *
7084 * Returns +self+ if +self+ is a +String+,
7085 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7086 *
7087 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7088 */
7089
7090static VALUE
7091rb_str_to_s(VALUE str)
7092{
7093 if (rb_obj_class(str) != rb_cString) {
7094 return str_duplicate(rb_cString, str);
7095 }
7096 return str;
7097}
7098
7099#if 0
 /*
  * Disabled by the surrounding #if 0.  Encodes code point c in encoding enc
  * into a local buffer with rb_enc_mbcput and appends the
  * rb_enc_codelen(c, enc) resulting bytes to str.
  */
7100static void
7101str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
7102{
7103 char s[RUBY_MAX_CHAR_LEN];
7104 int n = rb_enc_codelen(c, enc);
7105
7106 rb_enc_mbcput(c, s, enc);
7107 rb_enc_str_buf_cat(str, s, n, enc);
7108}
7109#endif
7110
7111#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7112
7113int
7114rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7115{
7116 char buf[CHAR_ESC_LEN + 1];
7117 int l;
7118
7119#if SIZEOF_INT > 4
7120 c &= 0xffffffff;
7121#endif
7122 if (unicode_p) {
7123 if (c < 0x7F && ISPRINT(c)) {
7124 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7125 }
7126 else if (c < 0x10000) {
7127 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7128 }
7129 else {
7130 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7131 }
7132 }
7133 else {
7134 if (c < 0x100) {
7135 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7136 }
7137 else {
7138 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7139 }
7140 }
7141 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7142 rb_str_buf_cat(result, buf, l);
7143 return l;
7144}
7145
/*
 * Returns the backslash escape sequence for the control character +c+
 * ("\\n" for newline, "\\c?" for DEL, ...), or NULL when +c+ has no
 * dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7163
/*
 * Returns a new string in which the characters of str that are not printable
 * ASCII are replaced by escape sequences.  The result is tagged US-ASCII with
 * a 7-bit code range.
 *
 * The loop copies lazily: `prev` marks the start of a run of bytes that can
 * be copied verbatim, and the run is flushed whenever something has to be
 * escaped (and once more after the loop).
 */
7164VALUE
7165rb_str_escape(VALUE str)
7166{
7167 int encidx = ENCODING_GET(str);
7168 rb_encoding *enc = rb_enc_from_index(encidx);
7169 const char *p = RSTRING_PTR(str);
7170 const char *pend = RSTRING_END(str);
7171 const char *prev = p;
7172 char buf[CHAR_ESC_LEN + 1];
7173 VALUE result = rb_str_buf_new(0);
7174 int unicode_p = rb_enc_unicode_p(enc);
7175 int asciicompat = rb_enc_asciicompat(enc);
7176
7177 while (p < pend) {
7178 unsigned int c;
7179 const char *cc;
7180 int n = rb_enc_precise_mbclen(p, pend, enc);
7181 if (!MBCLEN_CHARFOUND_P(n)) {
 /* Not a complete character: emit up to rb_enc_mbminlen(enc) raw bytes
  * (bounded by the end of the string) as \xNN and resume after them. */
7182 if (p > prev) str_buf_cat(result, prev, p - prev);
7183 n = rb_enc_mbminlen(enc);
7184 if (pend < p + n)
7185 n = (int)(pend - p);
7186 while (n--) {
7187 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7188 str_buf_cat(result, buf, strlen(buf));
7189 prev = ++p;
7190 }
7191 continue;
7192 }
7193 n = MBCLEN_CHARFOUND_LEN(n);
7194 c = rb_enc_mbc_to_codepoint(p, pend, enc);
 /* p now points past the character; p - n is where it started. */
7195 p += n;
7196 cc = ruby_escaped_char(c);
7197 if (cc) {
 /* Character with a dedicated escape sequence. */
7198 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7199 str_buf_cat(result, cc, strlen(cc));
7200 prev = p;
7201 }
7202 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
 /* Printable ASCII: left in the pending run, copied on the next flush. */
7203 }
7204 else {
 /* Anything else gets a numeric escape. */
7205 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7206 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7207 prev = p;
7208 }
7209 }
7210 if (p > prev) str_buf_cat(result, prev, p - prev);
7211 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7212
7213 return result;
7214}
7215
7216/*
7217 * call-seq:
7218 * inspect -> string
7219 *
7220 * :include: doc/string/inspect.rdoc
7221 *
7222 */
7223
7224VALUE
7226{
7227 int encidx = ENCODING_GET(str);
7228 rb_encoding *enc = rb_enc_from_index(encidx);
7229 const char *p, *pend, *prev;
7230 char buf[CHAR_ESC_LEN + 1];
7231 VALUE result = rb_str_buf_new(0);
7232 rb_encoding *resenc = rb_default_internal_encoding();
7233 int unicode_p = rb_enc_unicode_p(enc);
7234 int asciicompat = rb_enc_asciicompat(enc);
7235
7236 if (resenc == NULL) resenc = rb_default_external_encoding();
7237 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7238 rb_enc_associate(result, resenc);
7239 str_buf_cat2(result, "\"");
7240
7241 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7242 prev = p;
7243 while (p < pend) {
7244 unsigned int c, cc;
7245 int n;
7246
7247 n = rb_enc_precise_mbclen(p, pend, enc);
7248 if (!MBCLEN_CHARFOUND_P(n)) {
7249 if (p > prev) str_buf_cat(result, prev, p - prev);
7250 n = rb_enc_mbminlen(enc);
7251 if (pend < p + n)
7252 n = (int)(pend - p);
7253 while (n--) {
7254 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7255 str_buf_cat(result, buf, strlen(buf));
7256 prev = ++p;
7257 }
7258 continue;
7259 }
7260 n = MBCLEN_CHARFOUND_LEN(n);
7261 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7262 p += n;
7263 if ((asciicompat || unicode_p) &&
7264 (c == '"'|| c == '\\' ||
7265 (c == '#' &&
7266 p < pend &&
7267 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7268 (cc = rb_enc_codepoint(p,pend,enc),
7269 (cc == '$' || cc == '@' || cc == '{'))))) {
7270 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7271 str_buf_cat2(result, "\\");
7272 if (asciicompat || enc == resenc) {
7273 prev = p - n;
7274 continue;
7275 }
7276 }
7277 switch (c) {
7278 case '\n': cc = 'n'; break;
7279 case '\r': cc = 'r'; break;
7280 case '\t': cc = 't'; break;
7281 case '\f': cc = 'f'; break;
7282 case '\013': cc = 'v'; break;
7283 case '\010': cc = 'b'; break;
7284 case '\007': cc = 'a'; break;
7285 case 033: cc = 'e'; break;
7286 default: cc = 0; break;
7287 }
7288 if (cc) {
7289 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7290 buf[0] = '\\';
7291 buf[1] = (char)cc;
7292 str_buf_cat(result, buf, 2);
7293 prev = p;
7294 continue;
7295 }
7296 /* The special casing of 0x85 (NEXT_LINE) here is because
7297 * Oniguruma historically treats it as printable, but it
7298 * doesn't match the print POSIX bracket class or character
7299 * property in regexps.
7300 *
7301 * See Ruby Bug #16842 for details:
7302 * https://bugs.ruby-lang.org/issues/16842
7303 */
7304 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7305 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7306 continue;
7307 }
7308 else {
7309 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7310 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7311 prev = p;
7312 continue;
7313 }
7314 }
7315 if (p > prev) str_buf_cat(result, prev, p - prev);
7316 str_buf_cat2(result, "\"");
7317
7318 return result;
7319}
7320
/* True when +p+ is still before +e+ and the byte at +p+ is '$', '@' or '{'.
 * rb_str_dump() applies it to the byte following a '#' to decide whether
 * that '#' has to be written with a backslash. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7322
7323/*
7324 * call-seq:
7325 * dump -> new_string
7326 *
7327 * :include: doc/string/dump.rdoc
7328 *
7329 */
7330
7331VALUE
7333{
7334 int encidx = rb_enc_get_index(str);
7335 rb_encoding *enc = rb_enc_from_index(encidx);
7336 long len;
7337 const char *p, *pend;
7338 char *q, *qend;
7339 VALUE result;
7340 int u8 = (encidx == rb_utf8_encindex());
7341 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7342
7343 len = 2; /* "" */
7344 if (!rb_enc_asciicompat(enc)) {
7345 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7346 len += strlen(enc->name);
7347 }
7348
7349 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7350 while (p < pend) {
7351 int clen;
7352 unsigned char c = *p++;
7353
7354 switch (c) {
7355 case '"': case '\\':
7356 case '\n': case '\r':
7357 case '\t': case '\f':
7358 case '\013': case '\010': case '\007': case '\033':
7359 clen = 2;
7360 break;
7361
7362 case '#':
7363 clen = IS_EVSTR(p, pend) ? 2 : 1;
7364 break;
7365
7366 default:
7367 if (ISPRINT(c)) {
7368 clen = 1;
7369 }
7370 else {
7371 if (u8 && c > 0x7F) { /* \u notation */
7372 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7373 if (MBCLEN_CHARFOUND_P(n)) {
7374 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7375 if (cc <= 0xFFFF)
7376 clen = 6; /* \uXXXX */
7377 else if (cc <= 0xFFFFF)
7378 clen = 9; /* \u{XXXXX} */
7379 else
7380 clen = 10; /* \u{XXXXXX} */
7381 p += MBCLEN_CHARFOUND_LEN(n)-1;
7382 break;
7383 }
7384 }
7385 clen = 4; /* \xNN */
7386 }
7387 break;
7388 }
7389
7390 if (clen > LONG_MAX - len) {
7391 rb_raise(rb_eRuntimeError, "string size too big");
7392 }
7393 len += clen;
7394 }
7395
7396 result = rb_str_new(0, len);
7397 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7398 q = RSTRING_PTR(result); qend = q + len + 1;
7399
7400 *q++ = '"';
7401 while (p < pend) {
7402 unsigned char c = *p++;
7403
7404 if (c == '"' || c == '\\') {
7405 *q++ = '\\';
7406 *q++ = c;
7407 }
7408 else if (c == '#') {
7409 if (IS_EVSTR(p, pend)) *q++ = '\\';
7410 *q++ = '#';
7411 }
7412 else if (c == '\n') {
7413 *q++ = '\\';
7414 *q++ = 'n';
7415 }
7416 else if (c == '\r') {
7417 *q++ = '\\';
7418 *q++ = 'r';
7419 }
7420 else if (c == '\t') {
7421 *q++ = '\\';
7422 *q++ = 't';
7423 }
7424 else if (c == '\f') {
7425 *q++ = '\\';
7426 *q++ = 'f';
7427 }
7428 else if (c == '\013') {
7429 *q++ = '\\';
7430 *q++ = 'v';
7431 }
7432 else if (c == '\010') {
7433 *q++ = '\\';
7434 *q++ = 'b';
7435 }
7436 else if (c == '\007') {
7437 *q++ = '\\';
7438 *q++ = 'a';
7439 }
7440 else if (c == '\033') {
7441 *q++ = '\\';
7442 *q++ = 'e';
7443 }
7444 else if (ISPRINT(c)) {
7445 *q++ = c;
7446 }
7447 else {
7448 *q++ = '\\';
7449 if (u8) {
7450 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7451 if (MBCLEN_CHARFOUND_P(n)) {
7452 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7453 p += n;
7454 if (cc <= 0xFFFF)
7455 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7456 else
7457 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7458 q += strlen(q);
7459 continue;
7460 }
7461 }
7462 snprintf(q, qend-q, "x%02X", c);
7463 q += 3;
7464 }
7465 }
7466 *q++ = '"';
7467 *q = '\0';
7468 if (!rb_enc_asciicompat(enc)) {
7469 snprintf(q, qend-q, nonascii_suffix, enc->name);
7470 encidx = rb_ascii8bit_encindex();
7471 }
7472 /* result from dump is ASCII */
7473 rb_enc_associate_index(result, encidx);
7475 return result;
7476}
7477
/*
 * Maps the letter of a one-character backslash escape (the 'n' of "\n",
 * the 'e' of "\e", ...) to the byte that escape stands for.
 *
 * Returns -1 for any other character, so the function never runs off its
 * end without a value.  undump_after_backslash() only passes the letters
 * handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7501
7502static void
7503undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7504{
7505 const char *s = *ss;
7506 unsigned int c;
7507 int codelen;
7508 size_t hexlen;
7509 unsigned char buf[6];
7510 static rb_encoding *enc_utf8 = NULL;
7511
7512 switch (*s) {
7513 case '\\':
7514 case '"':
7515 case '#':
7516 rb_str_cat(undumped, s, 1); /* cat itself */
7517 s++;
7518 break;
7519 case 'n':
7520 case 'r':
7521 case 't':
7522 case 'f':
7523 case 'v':
7524 case 'b':
7525 case 'a':
7526 case 'e':
7527 *buf = unescape_ascii(*s);
7528 rb_str_cat(undumped, (char *)buf, 1);
7529 s++;
7530 break;
7531 case 'u':
7532 if (*binary) {
7533 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7534 }
7535 *utf8 = true;
7536 if (++s >= s_end) {
7537 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7538 }
7539 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7540 if (*penc != enc_utf8) {
7541 *penc = enc_utf8;
7542 rb_enc_associate(undumped, enc_utf8);
7543 }
7544 if (*s == '{') { /* handle \u{...} form */
7545 s++;
7546 for (;;) {
7547 if (s >= s_end) {
7548 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7549 }
7550 if (*s == '}') {
7551 s++;
7552 break;
7553 }
7554 if (ISSPACE(*s)) {
7555 s++;
7556 continue;
7557 }
7558 c = scan_hex(s, s_end-s, &hexlen);
7559 if (hexlen == 0 || hexlen > 6) {
7560 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7561 }
7562 if (c > 0x10ffff) {
7563 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7564 }
7565 if (0xd800 <= c && c <= 0xdfff) {
7566 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7567 }
7568 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7569 rb_str_cat(undumped, (char *)buf, codelen);
7570 s += hexlen;
7571 }
7572 }
7573 else { /* handle \uXXXX form */
7574 c = scan_hex(s, 4, &hexlen);
7575 if (hexlen != 4) {
7576 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7577 }
7578 if (0xd800 <= c && c <= 0xdfff) {
7579 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7580 }
7581 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7582 rb_str_cat(undumped, (char *)buf, codelen);
7583 s += hexlen;
7584 }
7585 break;
7586 case 'x':
7587 if (++s >= s_end) {
7588 rb_raise(rb_eRuntimeError, "invalid hex escape");
7589 }
7590 *buf = scan_hex(s, 2, &hexlen);
7591 if (hexlen != 2) {
7592 rb_raise(rb_eRuntimeError, "invalid hex escape");
7593 }
7594 if (!ISASCII(*buf)) {
7595 if (*utf8) {
7596 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7597 }
7598 *binary = true;
7599 }
7600 rb_str_cat(undumped, (char *)buf, 1);
7601 s += hexlen;
7602 break;
7603 default:
7604 rb_str_cat(undumped, s-1, 2);
7605 s++;
7606 }
7607
7608 *ss = s;
7609}
7610
7611static VALUE rb_str_is_ascii_only_p(VALUE str);
7612
7613/*
7614 * call-seq:
7615 * undump -> new_string
7616 *
7617 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7618 *
7619 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7620 */
7621
7622static VALUE
7623str_undump(VALUE str)
7624{
7625 const char *s = RSTRING_PTR(str);
7626 const char *s_end = RSTRING_END(str);
7627 rb_encoding *enc = rb_enc_get(str);
7628 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7629 bool utf8 = false;
7630 bool binary = false;
7631 int w;
7632
7634 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7635 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7636 }
7637 if (!str_null_check(str, &w)) {
7638 rb_raise(rb_eRuntimeError, "string contains null byte");
7639 }
7640 if (RSTRING_LEN(str) < 2) goto invalid_format;
7641 if (*s != '"') goto invalid_format;
7642
7643 /* strip '"' at the start */
7644 s++;
7645
7646 for (;;) {
7647 if (s >= s_end) {
7648 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7649 }
7650
7651 if (*s == '"') {
7652 /* epilogue */
7653 s++;
7654 if (s == s_end) {
7655 /* ascii compatible dumped string */
7656 break;
7657 }
7658 else {
7659 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7660 static const char dup_suffix[] = ".dup";
7661 const char *encname;
7662 int encidx;
7663 ptrdiff_t size;
7664
7665 /* check separately for strings dumped by older versions */
7666 size = sizeof(dup_suffix) - 1;
7667 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7668
7669 size = sizeof(force_encoding_suffix) - 1;
7670 if (s_end - s <= size) goto invalid_format;
7671 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7672 s += size;
7673
7674 if (utf8) {
7675 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7676 }
7677
7678 encname = s;
7679 s = memchr(s, '"', s_end-s);
7680 size = s - encname;
7681 if (!s) goto invalid_format;
7682 if (s_end - s != 2) goto invalid_format;
7683 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7684
7685 encidx = rb_enc_find_index2(encname, (long)size);
7686 if (encidx < 0) {
7687 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7688 }
7689 rb_enc_associate_index(undumped, encidx);
7690 }
7691 break;
7692 }
7693
7694 if (*s == '\\') {
7695 s++;
7696 if (s >= s_end) {
7697 rb_raise(rb_eRuntimeError, "invalid escape");
7698 }
7699 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7700 }
7701 else {
7702 rb_str_cat(undumped, s++, 1);
7703 }
7704 }
7705
7706 RB_GC_GUARD(str);
7707
7708 return undumped;
7709invalid_format:
7710 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7711}
7712
7713static void
7714rb_str_check_dummy_enc(rb_encoding *enc)
7715{
7716 if (rb_enc_dummy_p(enc)) {
7717 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7718 rb_enc_name(enc));
7719 }
7720}
7721
7722static rb_encoding *
7723str_true_enc(VALUE str)
7724{
7725 rb_encoding *enc = STR_ENC_GET(str);
7726 rb_str_check_dummy_enc(enc);
7727 return enc;
7728}
7729
7730static OnigCaseFoldType
7731check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7732{
7733 if (argc==0)
7734 return flags;
7735 if (argc>2)
7736 rb_raise(rb_eArgError, "too many options");
7737 if (argv[0]==sym_turkic) {
7738 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7739 if (argc==2) {
7740 if (argv[1]==sym_lithuanian)
7741 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7742 else
7743 rb_raise(rb_eArgError, "invalid second option");
7744 }
7745 }
7746 else if (argv[0]==sym_lithuanian) {
7747 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7748 if (argc==2) {
7749 if (argv[1]==sym_turkic)
7750 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7751 else
7752 rb_raise(rb_eArgError, "invalid second option");
7753 }
7754 }
7755 else if (argc>1)
7756 rb_raise(rb_eArgError, "too many options");
7757 else if (argv[0]==sym_ascii)
7758 flags |= ONIGENC_CASE_ASCII_ONLY;
7759 else if (argv[0]==sym_fold) {
7760 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7761 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7762 else
7763 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7764 }
7765 else
7766 rb_raise(rb_eArgError, "invalid option");
7767 return flags;
7768}
7769
7770static inline bool
7771case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7772{
7773 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7774 return true;
7775 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7776}
7777
/* 16 bytes should be long enough to absorb any kind of single character length increase; 20 leaves some extra headroom */
7779#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7780#ifndef CASEMAP_DEBUG
7781# define CASEMAP_DEBUG 0
7782#endif
7783
7784struct mapping_buffer;
7785typedef struct mapping_buffer {
7786 size_t capa;
7787 size_t used;
7788 struct mapping_buffer *next;
7789 OnigUChar space[FLEX_ARY_LEN];
7791
7792static void
7793mapping_buffer_free(void *p)
7794{
7795 mapping_buffer *previous_buffer;
7796 mapping_buffer *current_buffer = p;
7797 while (current_buffer) {
7798 previous_buffer = current_buffer;
7799 current_buffer = current_buffer->next;
7800 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7801 }
7802}
7803
7804static const rb_data_type_t mapping_buffer_type = {
7805 "mapping_buffer",
7806 {0, mapping_buffer_free,},
7807 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7808};
7809
7810static VALUE
7811rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7812{
7813 VALUE target;
7814
7815 const OnigUChar *source_current, *source_end;
7816 int target_length = 0;
7817 VALUE buffer_anchor;
7818 mapping_buffer *current_buffer = 0;
7819 mapping_buffer **pre_buffer;
7820 size_t buffer_count = 0;
7821 int buffer_length_or_invalid;
7822
7823 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7824
7825 source_current = (OnigUChar*)RSTRING_PTR(source);
7826 source_end = (OnigUChar*)RSTRING_END(source);
7827
7828 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7829 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7830 while (source_current < source_end) {
7831 /* increase multiplier using buffer count to converge quickly */
7832 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7833 if (CASEMAP_DEBUG) {
7834 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7835 }
7836 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7837 *pre_buffer = current_buffer;
7838 pre_buffer = &current_buffer->next;
7839 current_buffer->next = NULL;
7840 current_buffer->capa = capa;
7841 buffer_length_or_invalid = enc->case_map(flags,
7842 &source_current, source_end,
7843 current_buffer->space,
7844 current_buffer->space+current_buffer->capa,
7845 enc);
7846 if (buffer_length_or_invalid < 0) {
7847 current_buffer = DATA_PTR(buffer_anchor);
7848 DATA_PTR(buffer_anchor) = 0;
7849 mapping_buffer_free(current_buffer);
7850 rb_raise(rb_eArgError, "input string invalid");
7851 }
7852 target_length += current_buffer->used = buffer_length_or_invalid;
7853 }
7854 if (CASEMAP_DEBUG) {
7855 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7856 }
7857
7858 if (buffer_count==1) {
7859 target = rb_str_new((const char*)current_buffer->space, target_length);
7860 }
7861 else {
7862 char *target_current;
7863
7864 target = rb_str_new(0, target_length);
7865 target_current = RSTRING_PTR(target);
7866 current_buffer = DATA_PTR(buffer_anchor);
7867 while (current_buffer) {
7868 memcpy(target_current, current_buffer->space, current_buffer->used);
7869 target_current += current_buffer->used;
7870 current_buffer = current_buffer->next;
7871 }
7872 }
7873 current_buffer = DATA_PTR(buffer_anchor);
7874 DATA_PTR(buffer_anchor) = 0;
7875 mapping_buffer_free(current_buffer);
7876
7877 RB_GC_GUARD(buffer_anchor);
7878
7879 /* TODO: check about string terminator character */
7880 str_enc_copy_direct(target, source);
7881 /*ENC_CODERANGE_SET(mapped, cr);*/
7882
7883 return target;
7884}
7885
7886static VALUE
7887rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7888{
7889 const OnigUChar *source_current, *source_end;
7890 OnigUChar *target_current, *target_end;
7891 long old_length = RSTRING_LEN(source);
7892 int length_or_invalid;
7893
7894 if (old_length == 0) return Qnil;
7895
7896 source_current = (OnigUChar*)RSTRING_PTR(source);
7897 source_end = (OnigUChar*)RSTRING_END(source);
7898 if (source == target) {
7899 target_current = (OnigUChar*)source_current;
7900 target_end = (OnigUChar*)source_end;
7901 }
7902 else {
7903 target_current = (OnigUChar*)RSTRING_PTR(target);
7904 target_end = (OnigUChar*)RSTRING_END(target);
7905 }
7906
7907 length_or_invalid = onigenc_ascii_only_case_map(flags,
7908 &source_current, source_end,
7909 target_current, target_end, enc);
7910 if (length_or_invalid < 0)
7911 rb_raise(rb_eArgError, "input string invalid");
7912 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7913 fprintf(stderr, "problem with rb_str_ascii_casemap"
7914 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7915 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7916 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7917 }
7918
7919 str_enc_copy(target, source);
7920
7921 return target;
7922}
7923
7924static bool
7925upcase_single(VALUE str)
7926{
7927 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7928 bool modified = false;
7929
7930 while (s < send) {
7931 unsigned int c = *(unsigned char*)s;
7932
7933 if ('a' <= c && c <= 'z') {
7934 *s = 'A' + (c - 'a');
7935 modified = true;
7936 }
7937 s++;
7938 }
7939 return modified;
7940}
7941
7942/*
7943 * call-seq:
7944 * upcase!(mapping) -> self or nil
7945 *
7946 * Like String#upcase, except that:
7947 *
7948 * - Changes character casings in +self+ (not in a copy of +self+).
7949 * - Returns +self+ if any changes are made, +nil+ otherwise.
7950 *
7951 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7952 */
7953
7954static VALUE
7955rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7956{
7957 rb_encoding *enc;
7958 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7959
7960 flags = check_case_options(argc, argv, flags);
7961 str_modify_keep_cr(str);
7962 enc = str_true_enc(str);
7963 if (case_option_single_p(flags, enc, str)) {
7964 if (upcase_single(str))
7965 flags |= ONIGENC_CASE_MODIFIED;
7966 }
7967 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7968 rb_str_ascii_casemap(str, str, &flags, enc);
7969 else
7970 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7971
7972 if (ONIGENC_CASE_MODIFIED&flags) return str;
7973 return Qnil;
7974}
7975
7976
7977/*
7978 * call-seq:
7979 * upcase(mapping = :ascii) -> new_string
7980 *
7981 * :include: doc/string/upcase.rdoc
7982 */
7983
7984static VALUE
7985rb_str_upcase(int argc, VALUE *argv, VALUE str)
7986{
7987 rb_encoding *enc;
7988 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7989 VALUE ret;
7990
7991 flags = check_case_options(argc, argv, flags);
7992 enc = str_true_enc(str);
7993 if (case_option_single_p(flags, enc, str)) {
7994 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7995 str_enc_copy_direct(ret, str);
7996 upcase_single(ret);
7997 }
7998 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7999 ret = rb_str_new(0, RSTRING_LEN(str));
8000 rb_str_ascii_casemap(str, ret, &flags, enc);
8001 }
8002 else {
8003 ret = rb_str_casemap(str, &flags, enc);
8004 }
8005
8006 return ret;
8007}
8008
8009static bool
8010downcase_single(VALUE str)
8011{
8012 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8013 bool modified = false;
8014
8015 while (s < send) {
8016 unsigned int c = *(unsigned char*)s;
8017
8018 if ('A' <= c && c <= 'Z') {
8019 *s = 'a' + (c - 'A');
8020 modified = true;
8021 }
8022 s++;
8023 }
8024
8025 return modified;
8026}
8027
8028/*
8029 * call-seq:
8030 * downcase!(mapping) -> self or nil
8031 *
8032 * Like String#downcase, except that:
8033 *
8034 * - Changes character casings in +self+ (not in a copy of +self+).
8035 * - Returns +self+ if any changes are made, +nil+ otherwise.
8036 *
8037 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8038 */
8039
8040static VALUE
8041rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8042{
8043 rb_encoding *enc;
8044 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8045
8046 flags = check_case_options(argc, argv, flags);
8047 str_modify_keep_cr(str);
8048 enc = str_true_enc(str);
8049 if (case_option_single_p(flags, enc, str)) {
8050 if (downcase_single(str))
8051 flags |= ONIGENC_CASE_MODIFIED;
8052 }
8053 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8054 rb_str_ascii_casemap(str, str, &flags, enc);
8055 else
8056 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8057
8058 if (ONIGENC_CASE_MODIFIED&flags) return str;
8059 return Qnil;
8060}
8061
8062
8063/*
8064 * call-seq:
8065 * downcase(mapping = :ascii) -> new_string
8066 *
8067 * :include: doc/string/downcase.rdoc
8068 *
8069 */
8070
8071static VALUE
8072rb_str_downcase(int argc, VALUE *argv, VALUE str)
8073{
8074 rb_encoding *enc;
8075 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8076 VALUE ret;
8077
8078 flags = check_case_options(argc, argv, flags);
8079 enc = str_true_enc(str);
8080 if (case_option_single_p(flags, enc, str)) {
8081 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8082 str_enc_copy_direct(ret, str);
8083 downcase_single(ret);
8084 }
8085 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8086 ret = rb_str_new(0, RSTRING_LEN(str));
8087 rb_str_ascii_casemap(str, ret, &flags, enc);
8088 }
8089 else {
8090 ret = rb_str_casemap(str, &flags, enc);
8091 }
8092
8093 return ret;
8094}
8095
8096
8097/*
8098 * call-seq:
8099 * capitalize!(mapping = :ascii) -> self or nil
8100 *
8101 * Like String#capitalize, except that:
8102 *
8103 * - Changes character casings in +self+ (not in a copy of +self+).
8104 * - Returns +self+ if any changes are made, +nil+ otherwise.
8105 *
8106 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8107 */
8108
8109static VALUE
8110rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8111{
8112 rb_encoding *enc;
8113 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8114
8115 flags = check_case_options(argc, argv, flags);
8116 str_modify_keep_cr(str);
8117 enc = str_true_enc(str);
8118 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8119 if (flags&ONIGENC_CASE_ASCII_ONLY)
8120 rb_str_ascii_casemap(str, str, &flags, enc);
8121 else
8122 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8123
8124 if (ONIGENC_CASE_MODIFIED&flags) return str;
8125 return Qnil;
8126}
8127
8128
8129/*
8130 * call-seq:
8131 * capitalize(mapping = :ascii) -> new_string
8132 *
8133 * :include: doc/string/capitalize.rdoc
8134 *
8135 */
8136
8137static VALUE
8138rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8139{
8140 rb_encoding *enc;
8141 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8142 VALUE ret;
8143
8144 flags = check_case_options(argc, argv, flags);
8145 enc = str_true_enc(str);
8146 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8147 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8148 ret = rb_str_new(0, RSTRING_LEN(str));
8149 rb_str_ascii_casemap(str, ret, &flags, enc);
8150 }
8151 else {
8152 ret = rb_str_casemap(str, &flags, enc);
8153 }
8154 return ret;
8155}
8156
8157
8158/*
8159 * call-seq:
8160 * swapcase!(mapping) -> self or nil
8161 *
8162 * Like String#swapcase, except that:
8163 *
8164 * - Changes are made to +self+, not to copy of +self+.
8165 * - Returns +self+ if any changes are made, +nil+ otherwise.
8166 *
8167 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8168 */
8169
8170static VALUE
8171rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8172{
8173 rb_encoding *enc;
8174 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8175
8176 flags = check_case_options(argc, argv, flags);
8177 str_modify_keep_cr(str);
8178 enc = str_true_enc(str);
8179 if (flags&ONIGENC_CASE_ASCII_ONLY)
8180 rb_str_ascii_casemap(str, str, &flags, enc);
8181 else
8182 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8183
8184 if (ONIGENC_CASE_MODIFIED&flags) return str;
8185 return Qnil;
8186}
8187
8188
8189/*
8190 * call-seq:
8191 * swapcase(mapping = :ascii) -> new_string
8192 *
8193 * :include: doc/string/swapcase.rdoc
8194 *
8195 */
8196
8197static VALUE
8198rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8199{
8200 rb_encoding *enc;
8201 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8202 VALUE ret;
8203
8204 flags = check_case_options(argc, argv, flags);
8205 enc = str_true_enc(str);
8206 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8207 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8208 ret = rb_str_new(0, RSTRING_LEN(str));
8209 rb_str_ascii_casemap(str, ret, &flags, enc);
8210 }
8211 else {
8212 ret = rb_str_casemap(str, &flags, enc);
8213 }
8214 return ret;
8215}
8216
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while an "x-y" range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8224
8225static unsigned int
8226trnext(struct tr *t, rb_encoding *enc)
8227{
8228 int n;
8229
8230 for (;;) {
8231 nextpart:
8232 if (!t->gen) {
8233 if (t->p == t->pend) return -1;
8234 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8235 t->p += n;
8236 }
8237 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8238 t->p += n;
8239 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8240 t->p += n;
8241 if (t->p < t->pend) {
8242 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8243 t->p += n;
8244 if (t->now > c) {
8245 if (t->now < 0x80 && c < 0x80) {
8246 rb_raise(rb_eArgError,
8247 "invalid range \"%c-%c\" in string transliteration",
8248 t->now, c);
8249 }
8250 else {
8251 rb_raise(rb_eArgError, "invalid range in string transliteration");
8252 }
8253 continue; /* not reached */
8254 }
8255 else if (t->now < c) {
8256 t->gen = 1;
8257 t->max = c;
8258 }
8259 }
8260 }
8261 return t->now;
8262 }
8263 else {
8264 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8265 if (t->now == t->max) {
8266 t->gen = 0;
8267 goto nextpart;
8268 }
8269 }
8270 if (t->now < t->max) {
8271 return t->now;
8272 }
8273 else {
8274 t->gen = 0;
8275 return t->max;
8276 }
8277 }
8278 }
8279}
8280
8281static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8282
8283static VALUE
8284tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8285{
8286 const unsigned int errc = -1;
8287 unsigned int trans[256];
8288 rb_encoding *enc, *e1, *e2;
8289 struct tr trsrc, trrepl;
8290 int cflag = 0;
8291 unsigned int c, c0, last = 0;
8292 int modify = 0, i, l;
8293 unsigned char *s, *send;
8294 VALUE hash = 0;
8295 int singlebyte = single_byte_optimizable(str);
8296 int termlen;
8297 int cr;
8298
8299#define CHECK_IF_ASCII(c) \
8300 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8301 (cr = ENC_CODERANGE_VALID) : 0)
8302
8303 StringValue(src);
8304 StringValue(repl);
8305 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8306 if (RSTRING_LEN(repl) == 0) {
8307 return rb_str_delete_bang(1, &src, str);
8308 }
8309
8310 cr = ENC_CODERANGE(str);
8311 e1 = rb_enc_check(str, src);
8312 e2 = rb_enc_check(str, repl);
8313 if (e1 == e2) {
8314 enc = e1;
8315 }
8316 else {
8317 enc = rb_enc_check(src, repl);
8318 }
8319 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8320 if (RSTRING_LEN(src) > 1 &&
8321 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8322 trsrc.p + l < trsrc.pend) {
8323 cflag = 1;
8324 trsrc.p += l;
8325 }
8326 trrepl.p = RSTRING_PTR(repl);
8327 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8328 trsrc.gen = trrepl.gen = 0;
8329 trsrc.now = trrepl.now = 0;
8330 trsrc.max = trrepl.max = 0;
8331
8332 if (cflag) {
8333 for (i=0; i<256; i++) {
8334 trans[i] = 1;
8335 }
8336 while ((c = trnext(&trsrc, enc)) != errc) {
8337 if (c < 256) {
8338 trans[c] = errc;
8339 }
8340 else {
8341 if (!hash) hash = rb_hash_new();
8342 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8343 }
8344 }
8345 while ((c = trnext(&trrepl, enc)) != errc)
8346 /* retrieve last replacer */;
8347 last = trrepl.now;
8348 for (i=0; i<256; i++) {
8349 if (trans[i] != errc) {
8350 trans[i] = last;
8351 }
8352 }
8353 }
8354 else {
8355 unsigned int r;
8356
8357 for (i=0; i<256; i++) {
8358 trans[i] = errc;
8359 }
8360 while ((c = trnext(&trsrc, enc)) != errc) {
8361 r = trnext(&trrepl, enc);
8362 if (r == errc) r = trrepl.now;
8363 if (c < 256) {
8364 trans[c] = r;
8365 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8366 }
8367 else {
8368 if (!hash) hash = rb_hash_new();
8369 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8370 }
8371 }
8372 }
8373
8374 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8375 cr = ENC_CODERANGE_7BIT;
8376 str_modify_keep_cr(str);
8377 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8378 termlen = rb_enc_mbminlen(enc);
8379 if (sflag) {
8380 int clen, tlen;
8381 long offset, max = RSTRING_LEN(str);
8382 unsigned int save = -1;
8383 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8384
8385 while (s < send) {
8386 int may_modify = 0;
8387
8388 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8389 if (!MBCLEN_CHARFOUND_P(r)) {
8390 xfree(buf);
8391 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8392 }
8393 clen = MBCLEN_CHARFOUND_LEN(r);
8394 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8395
8396 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8397
8398 s += clen;
8399 if (c < 256) {
8400 c = trans[c];
8401 }
8402 else if (hash) {
8403 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8404 if (NIL_P(tmp)) {
8405 if (cflag) c = last;
8406 else c = errc;
8407 }
8408 else if (cflag) c = errc;
8409 else c = NUM2INT(tmp);
8410 }
8411 else {
8412 c = errc;
8413 }
8414 if (c != (unsigned int)-1) {
8415 if (save == c) {
8416 CHECK_IF_ASCII(c);
8417 continue;
8418 }
8419 save = c;
8420 tlen = rb_enc_codelen(c, enc);
8421 modify = 1;
8422 }
8423 else {
8424 save = -1;
8425 c = c0;
8426 if (enc != e1) may_modify = 1;
8427 }
8428 if ((offset = t - buf) + tlen > max) {
8429 size_t MAYBE_UNUSED(old) = max + termlen;
8430 max = offset + tlen + (send - s);
8431 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8432 t = buf + offset;
8433 }
8434 rb_enc_mbcput(c, t, enc);
8435 if (may_modify && memcmp(s, t, tlen) != 0) {
8436 modify = 1;
8437 }
8438 CHECK_IF_ASCII(c);
8439 t += tlen;
8440 }
8441 if (!STR_EMBED_P(str)) {
8442 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8443 }
8444 TERM_FILL((char *)t, termlen);
8445 RSTRING(str)->as.heap.ptr = (char *)buf;
8446 STR_SET_LEN(str, t - buf);
8447 STR_SET_NOEMBED(str);
8448 RSTRING(str)->as.heap.aux.capa = max;
8449 }
8450 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8451 while (s < send) {
8452 c = (unsigned char)*s;
8453 if (trans[c] != errc) {
8454 if (!cflag) {
8455 c = trans[c];
8456 *s = c;
8457 modify = 1;
8458 }
8459 else {
8460 *s = last;
8461 modify = 1;
8462 }
8463 }
8464 CHECK_IF_ASCII(c);
8465 s++;
8466 }
8467 }
8468 else {
8469 int clen, tlen;
8470 long offset, max = (long)((send - s) * 1.2);
8471 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8472
8473 while (s < send) {
8474 int may_modify = 0;
8475
8476 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8477 if (!MBCLEN_CHARFOUND_P(r)) {
8478 xfree(buf);
8479 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8480 }
8481 clen = MBCLEN_CHARFOUND_LEN(r);
8482 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8483
8484 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8485
8486 if (c < 256) {
8487 c = trans[c];
8488 }
8489 else if (hash) {
8490 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8491 if (NIL_P(tmp)) {
8492 if (cflag) c = last;
8493 else c = errc;
8494 }
8495 else if (cflag) c = errc;
8496 else c = NUM2INT(tmp);
8497 }
8498 else {
8499 c = cflag ? last : errc;
8500 }
8501 if (c != errc) {
8502 tlen = rb_enc_codelen(c, enc);
8503 modify = 1;
8504 }
8505 else {
8506 c = c0;
8507 if (enc != e1) may_modify = 1;
8508 }
8509 if ((offset = t - buf) + tlen > max) {
8510 size_t MAYBE_UNUSED(old) = max + termlen;
8511 max = offset + tlen + (long)((send - s) * 1.2);
8512 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8513 t = buf + offset;
8514 }
8515 if (s != t) {
8516 rb_enc_mbcput(c, t, enc);
8517 if (may_modify && memcmp(s, t, tlen) != 0) {
8518 modify = 1;
8519 }
8520 }
8521 CHECK_IF_ASCII(c);
8522 s += clen;
8523 t += tlen;
8524 }
8525 if (!STR_EMBED_P(str)) {
8526 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8527 }
8528 TERM_FILL((char *)t, termlen);
8529 RSTRING(str)->as.heap.ptr = (char *)buf;
8530 STR_SET_LEN(str, t - buf);
8531 STR_SET_NOEMBED(str);
8532 RSTRING(str)->as.heap.aux.capa = max;
8533 }
8534
8535 if (modify) {
8536 if (cr != ENC_CODERANGE_BROKEN)
8537 ENC_CODERANGE_SET(str, cr);
8538 rb_enc_associate(str, enc);
8539 return str;
8540 }
8541 return Qnil;
8542}
8543
8544
8545/*
8546 * call-seq:
8547 * tr!(selector, replacements) -> self or nil
8548 *
8549 * Like String#tr, except:
8550 *
8551 * - Performs substitutions in +self+ (not in a copy of +self+).
8552 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8553 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8555 */
8556
8557static VALUE
8558rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8559{
8560 return tr_trans(str, src, repl, 0);
8561}
8562
8563
8564/*
8565 * call-seq:
8566 * tr(selector, replacements) -> new_string
8567 *
8568 * Returns a copy of +self+ with each character specified by string +selector+
8569 * translated to the corresponding character in string +replacements+.
8570 * The correspondence is _positional_:
8571 *
8572 * - Each occurrence of the first character specified by +selector+
8573 * is translated to the first character in +replacements+.
8574 * - Each occurrence of the second character specified by +selector+
8575 * is translated to the second character in +replacements+.
8576 * - And so on.
8577 *
8578 * Example:
8579 *
8580 * 'hello'.tr('el', 'ip') #=> "hippo"
8581 *
8582 * If +replacements+ is shorter than +selector+,
8583 * it is implicitly padded with its own last character:
8584 *
8585 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8586 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8587 *
8588 * Arguments +selector+ and +replacements+ must be valid character selectors
8589 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8590 * and may use any of its valid forms, including negation, ranges, and escapes:
8591 *
8592 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8593 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8594 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8595 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8596 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8597 *
8598 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8599 */
8600
8601static VALUE
8602rb_str_tr(VALUE str, VALUE src, VALUE repl)
8603{
8604 str = str_duplicate(rb_cString, str);
8605 tr_trans(str, src, repl, 0);
8606 return str;
8607}
8608
/* Number of table entries addressable by a single byte value (0..UCHAR_MAX). */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* The byte entries plus one extra slot, at index TR_TABLE_MAX, that holds a
 * flag used for code points too large for the byte part of the table. */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string into the lookup state used by delete, squeeze
 * and count.
 *
 * str     - the selector; a leading '^' (only when the selector is longer
 *           than one byte) marks it as negated.
 * stable  - table of TR_TABLE_SIZE entries.  Entries below TR_TABLE_MAX are
 *           ANDed with this selector's membership; the entry at
 *           TR_TABLE_MAX is set to the negation flag on the first call and
 *           cleared by a later non-negated selector.
 * first   - non-zero for the first selector: resets the byte entries to 1.
 * tablep  - in/out hash (0 when absent) for code points >= TR_TABLE_MAX
 *           named by non-negated selectors.
 * ctablep - in/out hash (0 when absent) for code points >= TR_TABLE_MAX
 *           named by negated selectors.
 * enc     - encoding used to decode +str+.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    /* Sentinel compared against trnext()'s result to detect the end. */
    const unsigned int errc = -1;
    /* Membership of byte-sized code points for this selector only. */
    char buf[TR_TABLE_MAX];
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* A lone "^" is not treated as negation: the length must exceed 1. */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        /* Start from "everything matches" so the AND below narrows it. */
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        stable[TR_TABLE_MAX] = 0;
    }
    /* Negated selectors start with every byte selected, others with none. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* Pick the hash for this selector lazily, on the first wide
             * code point; ptable remembers the hash left by earlier
             * selectors so the new entries can be filtered against it. */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* Negated: keep adding to the existing negated hash. */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* Non-negated: replace the previous hash with a new one. */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            /* Without a previous hash every key is recorded; otherwise a
             * non-negated key must already be in ptable and a negated key
             * must not be. */
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* Intersect this selector's byte membership with the accumulated one. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* A non-negated selector that recorded no wide code point drops the
     * non-negated hash. */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
8673
8674
8675static int
8676tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8677{
8678 if (c < TR_TABLE_MAX) {
8679 return table[c] != 0;
8680 }
8681 else {
8682 VALUE v = UINT2NUM(c);
8683
8684 if (del) {
8685 if (!NIL_P(rb_hash_lookup(del, v)) &&
8686 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8687 return TRUE;
8688 }
8689 }
8690 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8691 return FALSE;
8692 }
8693 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8694 }
8695}
8696
8697/*
8698 * call-seq:
8699 * delete!(*selectors) -> self or nil
8700 *
8701 * Like String#delete, but modifies +self+ in place;
8702 * returns +self+ if any characters were deleted, +nil+ otherwise.
8703 *
8704 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8705 */
8706
8707static VALUE
8708rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8709{
8710 char squeez[TR_TABLE_SIZE];
8711 rb_encoding *enc = 0;
8712 char *s, *send, *t;
8713 VALUE del = 0, nodel = 0;
8714 int modify = 0;
8715 int i, ascompat, cr;
8716
8717 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8719 for (i=0; i<argc; i++) {
8720 VALUE s = argv[i];
8721
8722 StringValue(s);
8723 enc = rb_enc_check(str, s);
8724 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8725 }
8726
8727 str_modify_keep_cr(str);
8728 ascompat = rb_enc_asciicompat(enc);
8729 s = t = RSTRING_PTR(str);
8730 send = RSTRING_END(str);
8731 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8732 while (s < send) {
8733 unsigned int c;
8734 int clen;
8735
8736 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8737 if (squeez[c]) {
8738 modify = 1;
8739 }
8740 else {
8741 if (t != s) *t = c;
8742 t++;
8743 }
8744 s++;
8745 }
8746 else {
8747 c = rb_enc_codepoint_len(s, send, &clen, enc);
8748
8749 if (tr_find(c, squeez, del, nodel)) {
8750 modify = 1;
8751 }
8752 else {
8753 if (t != s) rb_enc_mbcput(c, t, enc);
8754 t += clen;
8756 }
8757 s += clen;
8758 }
8759 }
8760 TERM_FILL(t, TERM_LEN(str));
8761 STR_SET_LEN(str, t - RSTRING_PTR(str));
8762 ENC_CODERANGE_SET(str, cr);
8763
8764 if (modify) return str;
8765 return Qnil;
8766}
8767
8768
8769/*
8770 * call-seq:
8771 * delete(*selectors) -> new_string
8772 *
8773 * :include: doc/string/delete.rdoc
8774 *
8775 */
8776
8777static VALUE
8778rb_str_delete(int argc, VALUE *argv, VALUE str)
8779{
8780 str = str_duplicate(rb_cString, str);
8781 rb_str_delete_bang(argc, argv, str);
8782 return str;
8783}
8784
8785
8786/*
8787 * call-seq:
8788 * squeeze!(*selectors) -> self or nil
8789 *
8790 * Like String#squeeze, except that:
8791 *
8792 * - Characters are squeezed in +self+ (not in a copy of +self+).
8793 * - Returns +self+ if any changes are made, +nil+ otherwise.
8794 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8796 */
8797
8798static VALUE
8799rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8800{
8801 char squeez[TR_TABLE_SIZE];
8802 rb_encoding *enc = 0;
8803 VALUE del = 0, nodel = 0;
8804 unsigned char *s, *send, *t;
8805 int i, modify = 0;
8806 int ascompat, singlebyte = single_byte_optimizable(str);
8807 unsigned int save;
8808
8809 if (argc == 0) {
8810 enc = STR_ENC_GET(str);
8811 }
8812 else {
8813 for (i=0; i<argc; i++) {
8814 VALUE s = argv[i];
8815
8816 StringValue(s);
8817 enc = rb_enc_check(str, s);
8818 if (singlebyte && !single_byte_optimizable(s))
8819 singlebyte = 0;
8820 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8821 }
8822 }
8823
8824 str_modify_keep_cr(str);
8825 s = t = (unsigned char *)RSTRING_PTR(str);
8826 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8827 send = (unsigned char *)RSTRING_END(str);
8828 save = -1;
8829 ascompat = rb_enc_asciicompat(enc);
8830
8831 if (singlebyte) {
8832 while (s < send) {
8833 unsigned int c = *s++;
8834 if (c != save || (argc > 0 && !squeez[c])) {
8835 *t++ = save = c;
8836 }
8837 }
8838 }
8839 else {
8840 while (s < send) {
8841 unsigned int c;
8842 int clen;
8843
8844 if (ascompat && (c = *s) < 0x80) {
8845 if (c != save || (argc > 0 && !squeez[c])) {
8846 *t++ = save = c;
8847 }
8848 s++;
8849 }
8850 else {
8851 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8852
8853 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8854 if (t != s) rb_enc_mbcput(c, t, enc);
8855 save = c;
8856 t += clen;
8857 }
8858 s += clen;
8859 }
8860 }
8861 }
8862
8863 TERM_FILL((char *)t, TERM_LEN(str));
8864 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8865 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8866 modify = 1;
8867 }
8868
8869 if (modify) return str;
8870 return Qnil;
8871}
8872
8873
8874/*
8875 * call-seq:
8876 * squeeze(*selectors) -> new_string
8877 *
8878 * :include: doc/string/squeeze.rdoc
8879 *
8880 */
8881
8882static VALUE
8883rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8884{
8885 str = str_duplicate(rb_cString, str);
8886 rb_str_squeeze_bang(argc, argv, str);
8887 return str;
8888}
8889
8890
8891/*
8892 * call-seq:
8893 * tr_s!(selector, replacements) -> self or nil
8894 *
8895 * Like String#tr_s, except:
8896 *
8897 * - Modifies +self+ in place (not a copy of +self+).
8898 * - Returns +self+ if any changes were made, +nil+ otherwise.
8899 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8901 */
8902
8903static VALUE
8904rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8905{
8906 return tr_trans(str, src, repl, 1);
8907}
8908
8909
8910/*
8911 * call-seq:
8912 * tr_s(selector, replacements) -> new_string
8913 *
8914 * Like String#tr, except:
8915 *
8916 * - Also squeezes the modified portions of the translated string;
8917 * see String#squeeze.
8918 * - Returns the translated and squeezed string.
8919 *
8920 * Examples:
8921 *
8922 * 'hello'.tr_s('l', 'r') #=> "hero"
8923 * 'hello'.tr_s('el', '-') #=> "h-o"
8924 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8925 *
8926 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8927 *
8928 */
8929
8930static VALUE
8931rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8932{
8933 str = str_duplicate(rb_cString, str);
8934 tr_trans(str, src, repl, 1);
8935 return str;
8936}
8937
8938
8939/*
8940 * call-seq:
8941 * count(*selectors) -> integer
8942 *
8943 * :include: doc/string/count.rdoc
8944 */
8945
8946static VALUE
8947rb_str_count(int argc, VALUE *argv, VALUE str)
8948{
8949 char table[TR_TABLE_SIZE];
8950 rb_encoding *enc = 0;
8951 VALUE del = 0, nodel = 0, tstr;
8952 char *s, *send;
8953 int i;
8954 int ascompat;
8955 size_t n = 0;
8956
8958
8959 tstr = argv[0];
8960 StringValue(tstr);
8961 enc = rb_enc_check(str, tstr);
8962 if (argc == 1) {
8963 const char *ptstr;
8964 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8965 (ptstr = RSTRING_PTR(tstr),
8966 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8967 !is_broken_string(str)) {
8968 int clen;
8969 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8970
8971 s = RSTRING_PTR(str);
8972 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8973 send = RSTRING_END(str);
8974 while (s < send) {
8975 if (*(unsigned char*)s++ == c) n++;
8976 }
8977 return SIZET2NUM(n);
8978 }
8979 }
8980
8981 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8982 for (i=1; i<argc; i++) {
8983 tstr = argv[i];
8984 StringValue(tstr);
8985 enc = rb_enc_check(str, tstr);
8986 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8987 }
8988
8989 s = RSTRING_PTR(str);
8990 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8991 send = RSTRING_END(str);
8992 ascompat = rb_enc_asciicompat(enc);
8993 while (s < send) {
8994 unsigned int c;
8995
8996 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8997 if (table[c]) {
8998 n++;
8999 }
9000 s++;
9001 }
9002 else {
9003 int clen;
9004 c = rb_enc_codepoint_len(s, send, &clen, enc);
9005 if (tr_find(c, table, del, nodel)) {
9006 n++;
9007 }
9008 s += clen;
9009 }
9010 }
9011
9012 return SIZET2NUM(n);
9013}
9014
9015static VALUE
9016rb_fs_check(VALUE val)
9017{
9018 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9019 val = rb_check_string_type(val);
9020 if (NIL_P(val)) return 0;
9021 }
9022 return val;
9023}
9024
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (0x09..0x0d and 0x20), 0 for every other byte value.  Entries that are not
 * listed are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

/* The argument is narrowed to unsigned char before indexing the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9045
9046static long
9047split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9048{
9049 if (empty_count >= 0 && len == 0) {
9050 return empty_count + 1;
9051 }
9052 if (empty_count > 0) {
9053 /* make different substrings */
9054 if (result) {
9055 do {
9056 rb_ary_push(result, str_new_empty_String(str));
9057 } while (--empty_count > 0);
9058 }
9059 else {
9060 do {
9061 rb_yield(str_new_empty_String(str));
9062 } while (--empty_count > 0);
9063 }
9064 }
9065 str = rb_str_subseq(str, beg, len);
9066 if (result) {
9067 rb_ary_push(result, str);
9068 }
9069 else {
9070 rb_yield(str);
9071 }
9072 return empty_count;
9073}
9074
/* Strategy chosen by rb_str_split_m for cutting the receiver into fields. */
typedef enum {
    SPLIT_TYPE_AWK,    /* split on runs of whitespace */
    SPLIT_TYPE_STRING, /* split on a literal string */
    SPLIT_TYPE_REGEXP, /* split on regexp matches */
    SPLIT_TYPE_CHARS   /* split into individual characters */
} split_type_t;
9078
9079static split_type_t
9080literal_split_pattern(VALUE spat, split_type_t default_type)
9081{
9082 rb_encoding *enc = STR_ENC_GET(spat);
9083 const char *ptr;
9084 long len;
9085 RSTRING_GETMEM(spat, ptr, len);
9086 if (len == 0) {
9087 /* Special case - split into chars */
9088 return SPLIT_TYPE_CHARS;
9089 }
9090 else if (rb_enc_asciicompat(enc)) {
9091 if (len == 1 && ptr[0] == ' ') {
9092 return SPLIT_TYPE_AWK;
9093 }
9094 }
9095 else {
9096 int l;
9097 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9098 return SPLIT_TYPE_AWK;
9099 }
9100 }
9101 return default_type;
9102}
9103
9104/*
9105 * call-seq:
9106 * split(field_sep = $;, limit = 0) -> array_of_substrings
9107 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9108 *
9109 * :include: doc/string/split.rdoc
9110 *
9111 */
9112
9113static VALUE
9114rb_str_split_m(int argc, VALUE *argv, VALUE str)
9115{
9116 rb_encoding *enc;
9117 VALUE spat;
9118 VALUE limit;
9119 split_type_t split_type;
9120 long beg, end, i = 0, empty_count = -1;
9121 int lim = 0;
9122 VALUE result, tmp;
9123
9124 result = rb_block_given_p() ? Qfalse : Qnil;
9125 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9126 lim = NUM2INT(limit);
9127 if (lim <= 0) limit = Qnil;
9128 else if (lim == 1) {
9129 if (RSTRING_LEN(str) == 0)
9130 return result ? rb_ary_new2(0) : str;
9131 tmp = str_duplicate(rb_cString, str);
9132 if (!result) {
9133 rb_yield(tmp);
9134 return str;
9135 }
9136 return rb_ary_new3(1, tmp);
9137 }
9138 i = 1;
9139 }
9140 if (NIL_P(limit) && !lim) empty_count = 0;
9141
9142 enc = STR_ENC_GET(str);
9143 split_type = SPLIT_TYPE_REGEXP;
9144 if (!NIL_P(spat)) {
9145 spat = get_pat_quoted(spat, 0);
9146 }
9147 else if (NIL_P(spat = rb_fs)) {
9148 split_type = SPLIT_TYPE_AWK;
9149 }
9150 else if (!(spat = rb_fs_check(spat))) {
9151 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9152 }
9153 else {
9154 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9155 }
9156 if (split_type != SPLIT_TYPE_AWK) {
9157 switch (BUILTIN_TYPE(spat)) {
9158 case T_REGEXP:
9159 rb_reg_options(spat); /* check if uninitialized */
9160 tmp = RREGEXP_SRC(spat);
9161 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9162 if (split_type == SPLIT_TYPE_AWK) {
9163 spat = tmp;
9164 split_type = SPLIT_TYPE_STRING;
9165 }
9166 break;
9167
9168 case T_STRING:
9169 mustnot_broken(spat);
9170 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9171 break;
9172
9173 default:
9175 }
9176 }
9177
9178#define SPLIT_STR(beg, len) ( \
9179 empty_count = split_string(result, str, beg, len, empty_count), \
9180 str_mod_check(str, str_start, str_len))
9181
9182 beg = 0;
9183 char *ptr = RSTRING_PTR(str);
9184 char *const str_start = ptr;
9185 const long str_len = RSTRING_LEN(str);
9186 char *const eptr = str_start + str_len;
9187 if (split_type == SPLIT_TYPE_AWK) {
9188 char *bptr = ptr;
9189 int skip = 1;
9190 unsigned int c;
9191
9192 if (result) result = rb_ary_new();
9193 end = beg;
9194 if (is_ascii_string(str)) {
9195 while (ptr < eptr) {
9196 c = (unsigned char)*ptr++;
9197 if (skip) {
9198 if (ascii_isspace(c)) {
9199 beg = ptr - bptr;
9200 }
9201 else {
9202 end = ptr - bptr;
9203 skip = 0;
9204 if (!NIL_P(limit) && lim <= i) break;
9205 }
9206 }
9207 else if (ascii_isspace(c)) {
9208 SPLIT_STR(beg, end-beg);
9209 skip = 1;
9210 beg = ptr - bptr;
9211 if (!NIL_P(limit)) ++i;
9212 }
9213 else {
9214 end = ptr - bptr;
9215 }
9216 }
9217 }
9218 else {
9219 while (ptr < eptr) {
9220 int n;
9221
9222 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9223 ptr += n;
9224 if (skip) {
9225 if (rb_isspace(c)) {
9226 beg = ptr - bptr;
9227 }
9228 else {
9229 end = ptr - bptr;
9230 skip = 0;
9231 if (!NIL_P(limit) && lim <= i) break;
9232 }
9233 }
9234 else if (rb_isspace(c)) {
9235 SPLIT_STR(beg, end-beg);
9236 skip = 1;
9237 beg = ptr - bptr;
9238 if (!NIL_P(limit)) ++i;
9239 }
9240 else {
9241 end = ptr - bptr;
9242 }
9243 }
9244 }
9245 }
9246 else if (split_type == SPLIT_TYPE_STRING) {
9247 char *substr_start = ptr;
9248 char *sptr = RSTRING_PTR(spat);
9249 long slen = RSTRING_LEN(spat);
9250
9251 if (result) result = rb_ary_new();
9252 mustnot_broken(str);
9253 enc = rb_enc_check(str, spat);
9254 while (ptr < eptr &&
9255 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9256 /* Check we are at the start of a char */
9257 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9258 if (t != ptr + end) {
9259 ptr = t;
9260 continue;
9261 }
9262 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9263 str_mod_check(spat, sptr, slen);
9264 ptr += end + slen;
9265 substr_start = ptr;
9266 if (!NIL_P(limit) && lim <= ++i) break;
9267 }
9268 beg = ptr - str_start;
9269 }
9270 else if (split_type == SPLIT_TYPE_CHARS) {
9271 int n;
9272
9273 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9274 mustnot_broken(str);
9275 enc = rb_enc_get(str);
9276 while (ptr < eptr &&
9277 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9278 SPLIT_STR(ptr - str_start, n);
9279 ptr += n;
9280 if (!NIL_P(limit) && lim <= ++i) break;
9281 }
9282 beg = ptr - str_start;
9283 }
9284 else {
9285 if (result) result = rb_ary_new();
9286 long len = RSTRING_LEN(str);
9287 long start = beg;
9288 long idx;
9289 int last_null = 0;
9290 struct re_registers *regs;
9291 VALUE match = 0;
9292
9293 for (; rb_reg_search(spat, str, start, 0) >= 0;
9294 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9295 match = rb_backref_get();
9296 if (!result) rb_match_busy(match);
9297 regs = RMATCH_REGS(match);
9298 end = BEG(0);
9299 if (start == end && BEG(0) == END(0)) {
9300 if (!ptr) {
9301 SPLIT_STR(0, 0);
9302 break;
9303 }
9304 else if (last_null == 1) {
9305 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9306 beg = start;
9307 }
9308 else {
9309 if (start == len)
9310 start++;
9311 else
9312 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9313 last_null = 1;
9314 continue;
9315 }
9316 }
9317 else {
9318 SPLIT_STR(beg, end-beg);
9319 beg = start = END(0);
9320 }
9321 last_null = 0;
9322
9323 for (idx=1; idx < regs->num_regs; idx++) {
9324 if (BEG(idx) == -1) continue;
9325 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9326 }
9327 if (!NIL_P(limit) && lim <= ++i) break;
9328 }
9329 if (match) rb_match_unbusy(match);
9330 }
9331 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9332 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9333 }
9334
9335 return result ? result : str;
9336}
9337
9338VALUE
9339rb_str_split(VALUE str, const char *sep0)
9340{
9341 VALUE sep;
9342
9343 StringValue(str);
9344 sep = rb_str_new_cstr(sep0);
9345 return rb_str_split_m(1, &sep, str);
9346}
9347
9348#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9349
9350static inline int
9351enumerator_element(VALUE ary, VALUE e)
9352{
9353 if (ary) {
9354 rb_ary_push(ary, e);
9355 return 0;
9356 }
9357 else {
9358 rb_yield(e);
9359 return 1;
9360 }
9361}
9362
9363#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9364
9365static const char *
9366chomp_newline(const char *p, const char *e, rb_encoding *enc)
9367{
9368 const char *prev = rb_enc_prev_char(p, e, e, enc);
9369 if (rb_enc_is_newline(prev, e, enc)) {
9370 e = prev;
9371 prev = rb_enc_prev_char(p, e, e, enc);
9372 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9373 e = prev;
9374 }
9375 return e;
9376}
9377
9378static VALUE
9379get_rs(void)
9380{
9381 VALUE rs = rb_rs;
9382 if (!NIL_P(rs) &&
9383 (!RB_TYPE_P(rs, T_STRING) ||
9384 RSTRING_LEN(rs) != 1 ||
9385 RSTRING_PTR(rs)[0] != '\n')) {
9386 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9387 }
9388 return rs;
9389}
9390
9391#define rb_rs get_rs()
9392
/*
 * Shared implementation of each_line and lines.
 *
 * argv holds the optional record separator and the optional chomp: keyword.
 * Each line is pushed onto +ary+ when it is non-zero and yielded otherwise
 * (see ENUM_ELEM).  Returns +ary+ when it is non-zero, else the original
 * receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* No positional argument: fall back to rb_rs. */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* From here on chomp is a plain C truth value (0 or 1). */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* A nil separator delivers the whole string as the only line. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Iterate over a frozen string so the pointers below stay valid. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of a '\r' at subend, or 0 if there is none. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the recorded end of line closes
                     * the paragraph. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* First non-newline character starts the paragraph. */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* After yielding, verify the frozen string is unchanged. */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* Remember whether the separator is exactly one newline character,
         * which selects chomp_newline() for the chomp: option. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* The default separator is re-encoded when the receiver's encoding is
     * not ASCII-compatible, so that the byte search below can match. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Ignore hits that do not start on a character boundary. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Trailing text without a terminating separator forms the last line. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9532
9533/*
9534 * call-seq:
9535 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9536 * each_line(record_separator = $/, chomp: false) -> enumerator
9537 *
9538 * :include: doc/string/each_line.rdoc
9539 *
9540 */
9541
9542static VALUE
9543rb_str_each_line(int argc, VALUE *argv, VALUE str)
9544{
9545 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9546 return rb_str_enumerate_lines(argc, argv, str, 0);
9547}
9548
9549/*
9550 * call-seq:
9551 * lines(record_separator = $/, chomp: false) -> array_of_strings
9552 *
9553 * Returns substrings ("lines") of +self+
9554 * according to the given arguments:
9555 *
9556 * s = <<~EOT
9557 * This is the first line.
9558 * This is line two.
9559 *
9560 * This is line four.
9561 * This is line five.
9562 * EOT
9563 *
9564 * With the default argument values:
9565 *
9566 * $/ # => "\n"
9567 * s.lines
9568 * # =>
9569 * ["This is the first line.\n",
9570 * "This is line two.\n",
9571 * "\n",
9572 * "This is line four.\n",
9573 * "This is line five.\n"]
9574 *
9575 * With a different +record_separator+:
9576 *
9577 * record_separator = ' is '
9578 * s.lines(record_separator)
9579 * # =>
9580 * ["This is ",
9581 * "the first line.\nThis is ",
9582 * "line two.\n\nThis is ",
9583 * "line four.\nThis is ",
9584 * "line five.\n"]
9585 *
9586 * With keyword argument +chomp+ as +true+,
9587 * removes the trailing newline from each line:
9588 *
9589 * s.lines(chomp: true)
9590 * # =>
9591 * ["This is the first line.",
9592 * "This is line two.",
9593 * "",
9594 * "This is line four.",
9595 * "This is line five."]
9596 *
9597 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9598 */
9599
9600static VALUE
9601rb_str_lines(int argc, VALUE *argv, VALUE str)
9602{
9603 VALUE ary = WANTARRAY("lines", 0);
9604 return rb_str_enumerate_lines(argc, argv, str, ary);
9605}
9606
9607static VALUE
9608rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9609{
9610 return LONG2FIX(RSTRING_LEN(str));
9611}
9612
9613static VALUE
9614rb_str_enumerate_bytes(VALUE str, VALUE ary)
9615{
9616 long i;
9617
9618 for (i=0; i<RSTRING_LEN(str); i++) {
9619 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9620 }
9621 if (ary)
9622 return ary;
9623 else
9624 return str;
9625}
9626
9627/*
9628 * call-seq:
9629 * each_byte {|byte| ... } -> self
9630 * each_byte -> enumerator
9631 *
9632 * :include: doc/string/each_byte.rdoc
9633 *
9634 */
9635
9636static VALUE
9637rb_str_each_byte(VALUE str)
9638{
9639 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9640 return rb_str_enumerate_bytes(str, 0);
9641}
9642
9643/*
9644 * call-seq:
9645 * bytes -> array_of_bytes
9646 *
9647 * :include: doc/string/bytes.rdoc
9648 *
9649 */
9650
9651static VALUE
9652rb_str_bytes(VALUE str)
9653{
9654 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9655 return rb_str_enumerate_bytes(str, ary);
9656}
9657
9658static VALUE
9659rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9660{
9661 return rb_str_length(str);
9662}
9663
9664static VALUE
9665rb_str_enumerate_chars(VALUE str, VALUE ary)
9666{
9667 VALUE orig = str;
9668 long i, len, n;
9669 const char *ptr;
9670 rb_encoding *enc;
9671
9672 str = rb_str_new_frozen(str);
9673 ptr = RSTRING_PTR(str);
9674 len = RSTRING_LEN(str);
9675 enc = rb_enc_get(str);
9676
9678 for (i = 0; i < len; i += n) {
9679 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9680 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9681 }
9682 }
9683 else {
9684 for (i = 0; i < len; i += n) {
9685 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9686 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9687 }
9688 }
9689 RB_GC_GUARD(str);
9690 if (ary)
9691 return ary;
9692 else
9693 return orig;
9694}
9695
9696/*
9697 * call-seq:
9698 * each_char {|char| ... } -> self
9699 * each_char -> enumerator
9700 *
9701 * :include: doc/string/each_char.rdoc
9702 *
9703 */
9704
9705static VALUE
9706rb_str_each_char(VALUE str)
9707{
9708 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9709 return rb_str_enumerate_chars(str, 0);
9710}
9711
9712/*
9713 * call-seq:
9714 * chars -> array_of_characters
9715 *
9716 * :include: doc/string/chars.rdoc
9717 *
9718 */
9719
9720static VALUE
9721rb_str_chars(VALUE str)
9722{
9723 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9724 return rb_str_enumerate_chars(str, ary);
9725}
9726
9727static VALUE
9728rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9729{
9730 VALUE orig = str;
9731 int n;
9732 unsigned int c;
9733 const char *ptr, *end;
9734 rb_encoding *enc;
9735
9736 if (single_byte_optimizable(str))
9737 return rb_str_enumerate_bytes(str, ary);
9738
9739 str = rb_str_new_frozen(str);
9740 ptr = RSTRING_PTR(str);
9741 end = RSTRING_END(str);
9742 enc = STR_ENC_GET(str);
9743
9744 while (ptr < end) {
9745 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9746 ENUM_ELEM(ary, UINT2NUM(c));
9747 ptr += n;
9748 }
9749 RB_GC_GUARD(str);
9750 if (ary)
9751 return ary;
9752 else
9753 return orig;
9754}
9755
9756/*
9757 * call-seq:
9758 * each_codepoint {|codepoint| ... } -> self
9759 * each_codepoint -> enumerator
9760 *
9761 * :include: doc/string/each_codepoint.rdoc
9762 *
9763 */
9764
9765static VALUE
9766rb_str_each_codepoint(VALUE str)
9767{
9768 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9769 return rb_str_enumerate_codepoints(str, 0);
9770}
9771
9772/*
9773 * call-seq:
9774 * codepoints -> array_of_integers
9775 *
9776 * :include: doc/string/codepoints.rdoc
9777 *
9778 */
9779
9780static VALUE
9781rb_str_codepoints(VALUE str)
9782{
9783 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9784 return rb_str_enumerate_codepoints(str, ary);
9785}
9786
9787static regex_t *
9788get_reg_grapheme_cluster(rb_encoding *enc)
9789{
9790 int encidx = rb_enc_to_index(enc);
9791
9792 const OnigUChar source_ascii[] = "\\X";
9793 const OnigUChar *source = source_ascii;
9794 size_t source_len = sizeof(source_ascii) - 1;
9795
9796 switch (encidx) {
9797#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9798#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9799#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9800#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9801#define CASE_UTF(e) \
9802 case ENCINDEX_UTF_##e: { \
9803 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9804 source = source_UTF_##e; \
9805 source_len = sizeof(source_UTF_##e); \
9806 break; \
9807 }
9808 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9809#undef CASE_UTF
9810#undef CHARS_16BE
9811#undef CHARS_16LE
9812#undef CHARS_32BE
9813#undef CHARS_32LE
9814 }
9815
9816 regex_t *reg_grapheme_cluster;
9817 OnigErrorInfo einfo;
9818 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9819 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9820 if (r) {
9821 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9822 onig_error_code_to_str(message, r, &einfo);
9823 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9824 }
9825
9826 return reg_grapheme_cluster;
9827}
9828
9829static regex_t *
9830get_cached_reg_grapheme_cluster(rb_encoding *enc)
9831{
9832 int encidx = rb_enc_to_index(enc);
9833 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9834
9835 if (encidx == rb_utf8_encindex()) {
9836 if (!reg_grapheme_cluster_utf8) {
9837 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9838 }
9839
9840 return reg_grapheme_cluster_utf8;
9841 }
9842
9843 return NULL;
9844}
9845
9846static VALUE
9847rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9848{
9849 size_t grapheme_cluster_count = 0;
9850 rb_encoding *enc = get_encoding(str);
9851 const char *ptr, *end;
9852
9853 if (!rb_enc_unicode_p(enc)) {
9854 return rb_str_length(str);
9855 }
9856
9857 bool cached_reg_grapheme_cluster = true;
9858 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9859 if (!reg_grapheme_cluster) {
9860 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9861 cached_reg_grapheme_cluster = false;
9862 }
9863
9864 ptr = RSTRING_PTR(str);
9865 end = RSTRING_END(str);
9866
9867 while (ptr < end) {
9868 OnigPosition len = onig_match(reg_grapheme_cluster,
9869 (const OnigUChar *)ptr, (const OnigUChar *)end,
9870 (const OnigUChar *)ptr, NULL, 0);
9871 if (len <= 0) break;
9872 grapheme_cluster_count++;
9873 ptr += len;
9874 }
9875
9876 if (!cached_reg_grapheme_cluster) {
9877 onig_free(reg_grapheme_cluster);
9878 }
9879
9880 return SIZET2NUM(grapheme_cluster_count);
9881}
9882
9883static VALUE
9884rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9885{
9886 VALUE orig = str;
9887 rb_encoding *enc = get_encoding(str);
9888 const char *ptr0, *ptr, *end;
9889
9890 if (!rb_enc_unicode_p(enc)) {
9891 return rb_str_enumerate_chars(str, ary);
9892 }
9893
9894 if (!ary) str = rb_str_new_frozen(str);
9895
9896 bool cached_reg_grapheme_cluster = true;
9897 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9898 if (!reg_grapheme_cluster) {
9899 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9900 cached_reg_grapheme_cluster = false;
9901 }
9902
9903 ptr0 = ptr = RSTRING_PTR(str);
9904 end = RSTRING_END(str);
9905
9906 while (ptr < end) {
9907 OnigPosition len = onig_match(reg_grapheme_cluster,
9908 (const OnigUChar *)ptr, (const OnigUChar *)end,
9909 (const OnigUChar *)ptr, NULL, 0);
9910 if (len <= 0) break;
9911 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9912 ptr += len;
9913 }
9914
9915 if (!cached_reg_grapheme_cluster) {
9916 onig_free(reg_grapheme_cluster);
9917 }
9918
9919 RB_GC_GUARD(str);
9920 if (ary)
9921 return ary;
9922 else
9923 return orig;
9924}
9925
9926/*
9927 * call-seq:
9928 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9929 * each_grapheme_cluster -> enumerator
9930 *
9931 * :include: doc/string/each_grapheme_cluster.rdoc
9932 *
9933 */
9934
9935static VALUE
9936rb_str_each_grapheme_cluster(VALUE str)
9937{
9938 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9939 return rb_str_enumerate_grapheme_clusters(str, 0);
9940}
9941
9942/*
9943 * call-seq:
9944 * grapheme_clusters -> array_of_grapheme_clusters
9945 *
9946 * :include: doc/string/grapheme_clusters.rdoc
9947 *
9948 */
9949
9950static VALUE
9951rb_str_grapheme_clusters(VALUE str)
9952{
9953 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9954 return rb_str_enumerate_grapheme_clusters(str, ary);
9955}
9956
9957static long
9958chopped_length(VALUE str)
9959{
9960 rb_encoding *enc = STR_ENC_GET(str);
9961 const char *p, *p2, *beg, *end;
9962
9963 beg = RSTRING_PTR(str);
9964 end = beg + RSTRING_LEN(str);
9965 if (beg >= end) return 0;
9966 p = rb_enc_prev_char(beg, end, end, enc);
9967 if (!p) return 0;
9968 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9969 p2 = rb_enc_prev_char(beg, p, end, enc);
9970 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9971 }
9972 return p - beg;
9973}
9974
9975/*
9976 * call-seq:
9977 * chop! -> self or nil
9978 *
9979 * Like String#chop, except that:
9980 *
9981 * - Removes trailing characters from +self+ (not from a copy of +self+).
9982 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9983 *
9984 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9985 */
9986
9987static VALUE
9988rb_str_chop_bang(VALUE str)
9989{
9990 str_modify_keep_cr(str);
9991 if (RSTRING_LEN(str) > 0) {
9992 long len;
9993 len = chopped_length(str);
9994 STR_SET_LEN(str, len);
9995 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9996 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9998 }
9999 return str;
10000 }
10001 return Qnil;
10002}
10003
10004
10005/*
10006 * call-seq:
10007 * chop -> new_string
10008 *
10009 * :include: doc/string/chop.rdoc
10010 *
10011 */
10012
10013static VALUE
10014rb_str_chop(VALUE str)
10015{
10016 return rb_str_subseq(str, 0, chopped_length(str));
10017}
10018
10019static long
10020smart_chomp(VALUE str, const char *e, const char *p)
10021{
10022 rb_encoding *enc = rb_enc_get(str);
10023 if (rb_enc_mbminlen(enc) > 1) {
10024 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10025 if (rb_enc_is_newline(pp, e, enc)) {
10026 e = pp;
10027 }
10028 pp = e - rb_enc_mbminlen(enc);
10029 if (pp >= p) {
10030 pp = rb_enc_left_char_head(p, pp, e, enc);
10031 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10032 e = pp;
10033 }
10034 }
10035 }
10036 else {
10037 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10038 case '\n':
10039 if (--e > p && *(e-1) == '\r') {
10040 --e;
10041 }
10042 break;
10043 case '\r':
10044 --e;
10045 break;
10046 }
10047 }
10048 return e - p;
10049}
10050
10051static long
10052chompped_length(VALUE str, VALUE rs)
10053{
10054 rb_encoding *enc;
10055 int newline;
10056 char *pp, *e, *rsptr;
10057 long rslen;
10058 char *const p = RSTRING_PTR(str);
10059 long len = RSTRING_LEN(str);
10060
10061 if (len == 0) return 0;
10062 e = p + len;
10063 if (rs == rb_default_rs) {
10064 return smart_chomp(str, e, p);
10065 }
10066
10067 enc = rb_enc_get(str);
10068 RSTRING_GETMEM(rs, rsptr, rslen);
10069 if (rslen == 0) {
10070 if (rb_enc_mbminlen(enc) > 1) {
10071 while (e > p) {
10072 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10073 if (!rb_enc_is_newline(pp, e, enc)) break;
10074 e = pp;
10075 pp -= rb_enc_mbminlen(enc);
10076 if (pp >= p) {
10077 pp = rb_enc_left_char_head(p, pp, e, enc);
10078 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10079 e = pp;
10080 }
10081 }
10082 }
10083 }
10084 else {
10085 while (e > p && *(e-1) == '\n') {
10086 --e;
10087 if (e > p && *(e-1) == '\r')
10088 --e;
10089 }
10090 }
10091 return e - p;
10092 }
10093 if (rslen > len) return len;
10094
10095 enc = rb_enc_get(rs);
10096 newline = rsptr[rslen-1];
10097 if (rslen == rb_enc_mbminlen(enc)) {
10098 if (rslen == 1) {
10099 if (newline == '\n')
10100 return smart_chomp(str, e, p);
10101 }
10102 else {
10103 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10104 return smart_chomp(str, e, p);
10105 }
10106 }
10107
10108 enc = rb_enc_check(str, rs);
10109 if (is_broken_string(rs)) {
10110 return len;
10111 }
10112 pp = e - rslen;
10113 if (p[len-1] == newline &&
10114 (rslen <= 1 ||
10115 memcmp(rsptr, pp, rslen) == 0)) {
10116 if (at_char_boundary(p, pp, e, enc))
10117 return len - rslen;
10118 RB_GC_GUARD(rs);
10119 }
10120 return len;
10121}
10122
10128static VALUE
10129chomp_rs(int argc, const VALUE *argv)
10130{
10131 rb_check_arity(argc, 0, 1);
10132 if (argc > 0) {
10133 VALUE rs = argv[0];
10134 if (!NIL_P(rs)) StringValue(rs);
10135 return rs;
10136 }
10137 else {
10138 return rb_rs;
10139 }
10140}
10141
10142VALUE
10143rb_str_chomp_string(VALUE str, VALUE rs)
10144{
10145 long olen = RSTRING_LEN(str);
10146 long len = chompped_length(str, rs);
10147 if (len >= olen) return Qnil;
10148 str_modify_keep_cr(str);
10149 STR_SET_LEN(str, len);
10150 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10151 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10153 }
10154 return str;
10155}
10156
10157/*
10158 * call-seq:
10159 * chomp!(line_sep = $/) -> self or nil
10160 *
10161 * Like String#chomp, except that:
10162 *
10163 * - Removes trailing characters from +self+ (not from a copy of +self+).
10164 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10165 *
10166 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10167 */
10168
10169static VALUE
10170rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10171{
10172 VALUE rs;
10173 str_modifiable(str);
10174 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10175 rs = chomp_rs(argc, argv);
10176 if (NIL_P(rs)) return Qnil;
10177 return rb_str_chomp_string(str, rs);
10178}
10179
10180
10181/*
10182 * call-seq:
10183 * chomp(line_sep = $/) -> new_string
10184 *
10185 * :include: doc/string/chomp.rdoc
10186 *
10187 */
10188
10189static VALUE
10190rb_str_chomp(int argc, VALUE *argv, VALUE str)
10191{
10192 VALUE rs = chomp_rs(argc, argv);
10193 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10194 return rb_str_subseq(str, 0, chompped_length(str, rs));
10195}
10196
10197static void
10198tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10199 VALUE str, int num_selectors, VALUE *selectors)
10200{
10201 int i;
10202
10203 for (i=0; i<num_selectors; i++) {
10204 VALUE selector = selectors[i];
10205 rb_encoding *enc;
10206
10207 StringValue(selector);
10208 enc = rb_enc_check(str, selector);
10209 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10210 }
10211}
10212
10213static long
10214lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10215{
10216 const char *const start = s;
10217
10218 if (!s || s >= e) return 0;
10219
10220 /* remove spaces at head */
10221 if (single_byte_optimizable(str)) {
10222 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10223 }
10224 else {
10225 while (s < e) {
10226 int n;
10227 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10228
10229 if (cc && !rb_isspace(cc)) break;
10230 s += n;
10231 }
10232 }
10233 return s - start;
10234}
10235
10236static long
10237lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10238 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10239{
10240 const char *const start = s;
10241
10242 if (!s || s >= e) return 0;
10243
10244 /* remove leading characters in the table */
10245 while (s < e) {
10246 int n;
10247 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10248
10249 if (!tr_find(cc, table, del, nodel)) break;
10250 s += n;
10251 }
10252 return s - start;
10253}
10254
10255/*
10256 * call-seq:
10257 * lstrip!(*selectors) -> self or nil
10258 *
10259 * Like String#lstrip, except that:
10260 *
10261 * - Performs stripping in +self+ (not in a copy of +self+).
10262 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10263 *
10264 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10265 */
10266
10267static VALUE
10268rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10269{
10270 rb_encoding *enc;
10271 char *start, *s;
10272 long olen, loffset;
10273
10274 str_modify_keep_cr(str);
10275 enc = STR_ENC_GET(str);
10276 RSTRING_GETMEM(str, start, olen);
10277 if (argc > 0) {
10278 char table[TR_TABLE_SIZE];
10279 VALUE del = 0, nodel = 0;
10280
10281 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10282 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10283 }
10284 else {
10285 loffset = lstrip_offset(str, start, start+olen, enc);
10286 }
10287
10288 if (loffset > 0) {
10289 long len = olen-loffset;
10290 s = start + loffset;
10291 memmove(start, s, len);
10292 STR_SET_LEN(str, len);
10293 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10294 return str;
10295 }
10296 return Qnil;
10297}
10298
10299
10300/*
10301 * call-seq:
10302 * lstrip(*selectors) -> new_string
10303 *
10304 * Returns a copy of +self+ with leading whitespace removed;
10305 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10306 *
10307 * whitespace = "\x00\t\n\v\f\r "
10308 * s = whitespace + 'abc' + whitespace
10309 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10310 * s.lstrip
10311 * # => "abc\u0000\t\n\v\f\r "
10312 *
10313 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10314 *
10315 * s = "---abc+++"
10316 * s.lstrip("-") # => "abc+++"
10317 *
10318 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10319 * and may use any of its valid forms, including negation, ranges, and escapes:
10320 *
10321 * "01234abc56789".lstrip("0-9") # "abc56789"
10322 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10323 *
10324 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10325 */
10326
10327static VALUE
10328rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10329{
10330 char *start;
10331 long len, loffset;
10332
10333 RSTRING_GETMEM(str, start, len);
10334 if (argc > 0) {
10335 char table[TR_TABLE_SIZE];
10336 VALUE del = 0, nodel = 0;
10337
10338 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10339 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10340 }
10341 else {
10342 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10343 }
10344 if (loffset <= 0) return str_duplicate(rb_cString, str);
10345 return rb_str_subseq(str, loffset, len - loffset);
10346}
10347
10348static long
10349rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10350{
10351 const char *t;
10352
10353 rb_str_check_dummy_enc(enc);
10355 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10356 }
10357 if (!s || s >= e) return 0;
10358 t = e;
10359
10360 /* remove trailing spaces or '\0's */
10361 if (single_byte_optimizable(str)) {
10362 unsigned char c;
10363 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10364 }
10365 else {
10366 char *tp;
10367
10368 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10369 unsigned int c = rb_enc_codepoint(tp, e, enc);
10370 if (c && !rb_isspace(c)) break;
10371 t = tp;
10372 }
10373 }
10374 return e - t;
10375}
10376
10377static long
10378rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10379 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10380{
10381 const char *t;
10382 char *tp;
10383
10384 rb_str_check_dummy_enc(enc);
10386 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10387 }
10388 if (!s || s >= e) return 0;
10389 t = e;
10390
10391 /* remove trailing characters in the table */
10392 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10393 unsigned int c = rb_enc_codepoint(tp, e, enc);
10394 if (!tr_find(c, table, del, nodel)) break;
10395 t = tp;
10396 }
10397
10398 return e - t;
10399}
10400
10401/*
10402 * call-seq:
10403 * rstrip!(*selectors) -> self or nil
10404 *
10405 * Like String#rstrip, except that:
10406 *
10407 * - Performs stripping in +self+ (not in a copy of +self+).
10408 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10409 *
10410 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10411 */
10412
10413static VALUE
10414rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10415{
10416 rb_encoding *enc;
10417 char *start;
10418 long olen, roffset;
10419
10420 str_modify_keep_cr(str);
10421 enc = STR_ENC_GET(str);
10422 RSTRING_GETMEM(str, start, olen);
10423 if (argc > 0) {
10424 char table[TR_TABLE_SIZE];
10425 VALUE del = 0, nodel = 0;
10426
10427 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10428 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10429 }
10430 else {
10431 roffset = rstrip_offset(str, start, start+olen, enc);
10432 }
10433 if (roffset > 0) {
10434 long len = olen - roffset;
10435
10436 STR_SET_LEN(str, len);
10437 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10438 return str;
10439 }
10440 return Qnil;
10441}
10442
10443
10444/*
10445 * call-seq:
10446 * rstrip(*selectors) -> new_string
10447 *
10448 * Returns a copy of +self+ with trailing whitespace removed;
10449 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10450 *
10451 * whitespace = "\x00\t\n\v\f\r "
10452 * s = whitespace + 'abc' + whitespace
10453 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10454 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10455 *
10456 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10457 *
10458 * s = "---abc+++"
10459 * s.rstrip("+") # => "---abc"
10460 *
10461 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10462 * and may use any of its valid forms, including negation, ranges, and escapes:
10463 *
10464 * "01234abc56789".rstrip("0-9") # "01234abc"
10465 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10466 *
10467 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10468 */
10469
10470static VALUE
10471rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10472{
10473 rb_encoding *enc;
10474 char *start;
10475 long olen, roffset;
10476
10477 enc = STR_ENC_GET(str);
10478 RSTRING_GETMEM(str, start, olen);
10479 if (argc > 0) {
10480 char table[TR_TABLE_SIZE];
10481 VALUE del = 0, nodel = 0;
10482
10483 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10484 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10485 }
10486 else {
10487 roffset = rstrip_offset(str, start, start+olen, enc);
10488 }
10489 if (roffset <= 0) return str_duplicate(rb_cString, str);
10490 return rb_str_subseq(str, 0, olen-roffset);
10491}
10492
10493
10494/*
10495 * call-seq:
10496 * strip!(*selectors) -> self or nil
10497 *
10498 * Like String#strip, except that:
10499 *
10500 * - Any modifications are made to +self+.
 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10502 *
10503 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10504 */
10505
10506static VALUE
10507rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10508{
10509 char *start;
10510 long olen, loffset, roffset;
10511 rb_encoding *enc;
10512
10513 str_modify_keep_cr(str);
10514 enc = STR_ENC_GET(str);
10515 RSTRING_GETMEM(str, start, olen);
10516
10517 if (argc > 0) {
10518 char table[TR_TABLE_SIZE];
10519 VALUE del = 0, nodel = 0;
10520
10521 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10522 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10523 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10524 }
10525 else {
10526 loffset = lstrip_offset(str, start, start+olen, enc);
10527 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10528 }
10529
10530 if (loffset > 0 || roffset > 0) {
10531 long len = olen-roffset;
10532 if (loffset > 0) {
10533 len -= loffset;
10534 memmove(start, start + loffset, len);
10535 }
10536 STR_SET_LEN(str, len);
10537 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10538 return str;
10539 }
10540 return Qnil;
10541}
10542
10543
10544/*
10545 * call-seq:
10546 * strip(*selectors) -> new_string
10547 *
10548 * Returns a copy of +self+ with leading and trailing whitespace removed;
10549 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10550 *
10551 * whitespace = "\x00\t\n\v\f\r "
10552 * s = whitespace + 'abc' + whitespace
10553 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10554 * s.strip # => "abc"
10555 *
10556 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10557 *
10558 * s = "---abc+++"
10559 * s.strip("-+") # => "abc"
10560 * s.strip("+-") # => "abc"
10561 *
10562 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10563 * and may use any of its valid forms, including negation, ranges, and escapes:
10564 *
10565 * "01234abc56789".strip("0-9") # "abc"
10566 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10567 *
10568 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10569 */
10570
10571static VALUE
10572rb_str_strip(int argc, VALUE *argv, VALUE str)
10573{
10574 char *start;
10575 long olen, loffset, roffset;
10576 rb_encoding *enc = STR_ENC_GET(str);
10577
10578 RSTRING_GETMEM(str, start, olen);
10579
10580 if (argc > 0) {
10581 char table[TR_TABLE_SIZE];
10582 VALUE del = 0, nodel = 0;
10583
10584 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10585 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10586 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10587 }
10588 else {
10589 loffset = lstrip_offset(str, start, start+olen, enc);
10590 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10591 }
10592
10593 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10594 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10595}
10596
10597static VALUE
10598scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10599{
10600 VALUE result = Qnil;
10601 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10602 if (pos >= 0) {
10603 VALUE match;
10604 struct re_registers *regs;
10605 if (BUILTIN_TYPE(pat) == T_STRING) {
10606 regs = NULL;
10607 end = pos + RSTRING_LEN(pat);
10608 }
10609 else {
10610 match = rb_backref_get();
10611 regs = RMATCH_REGS(match);
10612 pos = BEG(0);
10613 end = END(0);
10614 }
10615
10616 if (pos == end) {
10617 rb_encoding *enc = STR_ENC_GET(str);
10618 /*
10619 * Always consume at least one character of the input string
10620 */
10621 if (RSTRING_LEN(str) > end)
10622 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10623 RSTRING_END(str), enc);
10624 else
10625 *start = end + 1;
10626 }
10627 else {
10628 *start = end;
10629 }
10630
10631 if (!regs || regs->num_regs == 1) {
10632 result = rb_str_subseq(str, pos, end - pos);
10633 return result;
10634 }
10635 else {
10636 result = rb_ary_new2(regs->num_regs);
10637 for (int i = 1; i < regs->num_regs; i++) {
10638 VALUE s = Qnil;
10639 if (BEG(i) >= 0) {
10640 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10641 }
10642
10643 rb_ary_push(result, s);
10644 }
10645 }
10646
10647 RB_GC_GUARD(match);
10648 }
10649
10650 return result;
10651}
10652
10653
10654/*
10655 * call-seq:
10656 * scan(pattern) -> array_of_results
10657 * scan(pattern) {|result| ... } -> self
10658 *
10659 * :include: doc/string/scan.rdoc
10660 *
10661 */
10662
10663static VALUE
10664rb_str_scan(VALUE str, VALUE pat)
10665{
10666 VALUE result;
10667 long start = 0;
10668 long last = -1, prev = 0;
10669 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10670
10671 pat = get_pat_quoted(pat, 1);
10672 mustnot_broken(str);
10673 if (!rb_block_given_p()) {
10674 VALUE ary = rb_ary_new();
10675
10676 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10677 last = prev;
10678 prev = start;
10679 rb_ary_push(ary, result);
10680 }
10681 if (last >= 0) rb_pat_search(pat, str, last, 1);
10682 else rb_backref_set(Qnil);
10683 return ary;
10684 }
10685
10686 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10687 last = prev;
10688 prev = start;
10689 rb_yield(result);
10690 str_mod_check(str, p, len);
10691 }
10692 if (last >= 0) rb_pat_search(pat, str, last, 1);
10693 return str;
10694}
10695
10696
10697/*
10698 * call-seq:
10699 * hex -> integer
10700 *
10701 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10702 * returns its value as an integer.
10703 *
10704 * The leading substring is interpreted as hexadecimal when it begins with:
10705 *
 * - One or more characters representing hexadecimal digits
10707 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10708 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10709 *
10710 * 'f'.hex # => 15
10711 * '11'.hex # => 17
10712 * 'FFF'.hex # => 4095
10713 * 'fffg'.hex # => 4095
10714 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10715 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10716 * 'deadbeef'.hex # => 3735928559
10717 *
10718 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10719 *
10720 * '0xfff'.hex # => 4095
10721 * '0xfffg'.hex # => 4095
10722 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10724 *
10725 * '-fff'.hex # => -4095
10726 * '-0xFFF'.hex # => -4095
10727 *
10728 * For any substring not described above, returns zero:
10729 *
10730 * 'xxx'.hex # => 0
10731 * ''.hex # => 0
10732 *
10733 * Note that, unlike #oct, this method interprets only hexadecimal,
10734 * and not binary, octal, or decimal notations:
10735 *
10736 * '0b111'.hex # => 45329
10737 * '0o777'.hex # => 0
10738 * '0d999'.hex # => 55705
10739 *
10740 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10741 */
10742
10743static VALUE
10744rb_str_hex(VALUE str)
10745{
10746 return rb_str_to_inum(str, 16, FALSE);
10747}
10748
10749
10750/*
10751 * call-seq:
10752 * oct -> integer
10753 *
10754 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
 * returns its value as an integer.
10756 *
10757 * In brief:
10758 *
10759 * # Interpreted as octal.
10760 * '777'.oct # => 511
10761 * '777x'.oct # => 511
10762 * '0777'.oct # => 511
10763 * '0o777'.oct # => 511
10764 * '-777'.oct # => -511
10765 * # Not interpreted as octal.
10766 * '0b111'.oct # => 7 # Interpreted as binary.
10767 * '0d999'.oct # => 999 # Interpreted as decimal.
10768 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10769 *
10770 * The leading substring is interpreted as octal when it begins with:
10771 *
 * - One or more characters representing octal digits
10773 * (each in the range <tt>'0'..'7'</tt>);
10774 * the string to be interpreted ends at the first character that does not represent an octal digit:
10775 *
 * '7'.oct # => 7
10777 * '11'.oct # => 9
10778 * '777'.oct # => 511
10779 * '0777'.oct # => 511
10780 * '7778'.oct # => 511
10781 * '777x'.oct # => 511
10782 *
10783 * - <tt>'0o'</tt>, followed by one or more octal digits:
10784 *
10785 * '0o777'.oct # => 511
10786 * '0o7778'.oct # => 511
10787 *
10788 * The leading substring is _not_ interpreted as octal when it begins with:
10789 *
10790 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10791 * (each in the range <tt>'0'..'1'</tt>);
10792 * the string to be interpreted ends at the first character that does not represent a binary digit.
10793 * the string is interpreted as binary digits (base 2):
10794 *
10795 * '0b111'.oct # => 7
10796 * '0b1112'.oct # => 7
10797 *
10798 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10799 * (each in the range <tt>'0'..'9'</tt>);
10800 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10801 * the string is interpreted as decimal digits (base 10):
10802 *
10803 * '0d999'.oct # => 999
10804 * '0d999x'.oct # => 999
10805 *
10806 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10807 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10808 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10809 * the string is interpreted as hexadecimal digits (base 16):
10810 *
10811 * '0xfff'.oct # => 4095
10812 * '0xfffg'.oct # => 4095
10813 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10815 *
10816 * '-777'.oct # => -511
10817 * '-0777'.oct # => -511
10818 * '-0b111'.oct # => -7
10819 * '-0xfff'.oct # => -4095
10820 *
10821 * For any substring not described above, returns zero:
10822 *
10823 * 'foo'.oct # => 0
10824 * ''.oct # => 0
10825 *
10826 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10827 */
10828
10829static VALUE
10830rb_str_oct(VALUE str)
10831{
10832 return rb_str_to_inum(str, -8, FALSE);
10833}
10834
10835#ifndef HAVE_CRYPT_R
10836# include "ruby/thread_native.h"
10837# include "ruby/atomic.h"
10838
10839static struct {
10840 rb_nativethread_lock_t lock;
10841} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10842#endif
10843
10844/*
10845 * call-seq:
10846 * crypt(salt_str) -> new_string
10847 *
10848 * Returns the string generated by calling <code>crypt(3)</code>
10849 * standard library function with <code>str</code> and
10850 * <code>salt_str</code>, in this order, as its arguments. Please do
10851 * not use this method any longer. It is legacy; provided only for
10852 * backward compatibility with ruby scripts in earlier days. It is
10853 * bad to use in contemporary programs for several reasons:
10854 *
 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
 * run on. The generated string lacks data portability.
10857 *
10858 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10859 * (i.e. silently ends up in unexpected results).
10860 *
10861 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10862 * thread safe.
10863 *
10864 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10865 * very very weak. According to its manpage, Linux's traditional
10866 * <code>crypt(3)</code> output has only 2**56 variations; too
10867 * easy to brute force today. And this is the default behaviour.
10868 *
10869 * * In order to make things robust some OSes implement so-called
10870 * "modular" usage. To go through, you have to do a complex
10871 * build-up of the <code>salt_str</code> parameter, by hand.
10872 * Failure in generation of a proper salt string tends not to
10873 * yield any errors; typos in parameters are normally not
10874 * detectable.
10875 *
10876 * * For instance, in the following example, the second invocation
10877 * of String#crypt is wrong; it has a typo in "round=" (lacks
10878 * "s"). However the call does not fail and something unexpected
10879 * is generated.
10880 *
10881 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10882 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10883 *
10884 * * Even in the "modular" mode, some hash functions are considered
10885 * archaic and no longer recommended at all; for instance module
10886 * <code>$1$</code> is officially abandoned by its author: see
10887 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10888 * instance module <code>$3$</code> is considered completely
10889 * broken: see the manpage of FreeBSD.
10890 *
10891 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10892 * written above, <code>crypt(3)</code> on Mac OS never fails.
10893 * This means even if you build up a proper salt string it
10894 * generates a traditional DES hash anyways, and there is no way
10895 * for you to be aware of.
10896 *
10897 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10898 *
10899 * If for some reason you cannot migrate to other secure contemporary
10900 * password hashing algorithms, install the string-crypt gem and
10901 * <code>require 'string/crypt'</code> to continue using it.
10902 */
10903
10904static VALUE
10905rb_str_crypt(VALUE str, VALUE salt)
10906{
10907#ifdef HAVE_CRYPT_R
10908 VALUE databuf;
10909 struct crypt_data *data;
10910# define CRYPT_END() ALLOCV_END(databuf)
10911#else
10912 char *tmp_buf;
10913 extern char *crypt(const char *, const char *);
10914# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10915#endif
10916 VALUE result;
10917 const char *s, *saltp;
10918 char *res;
10919#ifdef BROKEN_CRYPT
10920 char salt_8bit_clean[3];
10921#endif
10922
10923 StringValue(salt);
10924 mustnot_wchar(str);
10925 mustnot_wchar(salt);
10926 s = StringValueCStr(str);
10927 saltp = RSTRING_PTR(salt);
10928 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10929 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10930 }
10931
10932#ifdef BROKEN_CRYPT
10933 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10934 salt_8bit_clean[0] = saltp[0] & 0x7f;
10935 salt_8bit_clean[1] = saltp[1] & 0x7f;
10936 salt_8bit_clean[2] = '\0';
10937 saltp = salt_8bit_clean;
10938 }
10939#endif
10940#ifdef HAVE_CRYPT_R
10941 data = ALLOCV(databuf, sizeof(struct crypt_data));
10942# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10943 data->initialized = 0;
10944# endif
10945 res = crypt_r(s, saltp, data);
10946#else
10947 rb_nativethread_lock_lock(&crypt_mutex.lock);
10948 res = crypt(s, saltp);
10949#endif
10950 if (!res) {
10951 int err = errno;
10952 CRYPT_END();
10953 rb_syserr_fail(err, "crypt");
10954 }
10955#ifdef HAVE_CRYPT_R
10956 result = rb_str_new_cstr(res);
10957 CRYPT_END();
10958#else
10959 // We need to copy this buffer because it's static and we need to unlock the mutex
10960 // before allocating a new object (the string to be returned). If we allocate while
10961 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10962 // if other ractors are waiting on this lock.
10963 size_t res_size = strlen(res)+1;
10964 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10965 memcpy(tmp_buf, res, res_size);
10966 res = tmp_buf;
10967 CRYPT_END();
10968 result = rb_str_new_cstr(res);
10969#endif
10970 return result;
10971}
10972
10973
10974/*
10975 * call-seq:
10976 * ord -> integer
10977 *
10978 * :include: doc/string/ord.rdoc
10979 *
10980 */
10981
10982static VALUE
10983rb_str_ord(VALUE s)
10984{
10985 unsigned int c;
10986
10987 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10988 return UINT2NUM(c);
10989}
10990/*
10991 * call-seq:
10992 * sum(n = 16) -> integer
10993 *
10994 * :include: doc/string/sum.rdoc
10995 *
10996 */
10997
10998static VALUE
10999rb_str_sum(int argc, VALUE *argv, VALUE str)
11000{
11001 int bits = 16;
11002 char *ptr, *p, *pend;
11003 long len;
11004 VALUE sum = INT2FIX(0);
11005 unsigned long sum0 = 0;
11006
11007 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11008 bits = 0;
11009 }
11010 ptr = p = RSTRING_PTR(str);
11011 len = RSTRING_LEN(str);
11012 pend = p + len;
11013
11014 while (p < pend) {
11015 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11016 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11017 str_mod_check(str, ptr, len);
11018 sum0 = 0;
11019 }
11020 sum0 += (unsigned char)*p;
11021 p++;
11022 }
11023
11024 if (bits == 0) {
11025 if (sum0) {
11026 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11027 }
11028 }
11029 else {
11030 if (sum == INT2FIX(0)) {
11031 if (bits < (int)sizeof(long)*CHAR_BIT) {
11032 sum0 &= (((unsigned long)1)<<bits)-1;
11033 }
11034 sum = LONG2FIX(sum0);
11035 }
11036 else {
11037 VALUE mod;
11038
11039 if (sum0) {
11040 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11041 }
11042
11043 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11044 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11045 sum = rb_funcall(sum, '&', 1, mod);
11046 }
11047 }
11048 return sum;
11049}
11050
11051static VALUE
11052rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11053{
11054 rb_encoding *enc;
11055 VALUE w;
11056 long width, len, flen = 1, fclen = 1;
11057 VALUE res;
11058 char *p;
11059 const char *f = " ";
11060 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11061 VALUE pad;
11062 int singlebyte = 1, cr;
11063 int termlen;
11064
11065 rb_scan_args(argc, argv, "11", &w, &pad);
11066 enc = STR_ENC_GET(str);
11067 termlen = rb_enc_mbminlen(enc);
11068 width = NUM2LONG(w);
11069 if (argc == 2) {
11070 StringValue(pad);
11071 enc = rb_enc_check(str, pad);
11072 f = RSTRING_PTR(pad);
11073 flen = RSTRING_LEN(pad);
11074 fclen = str_strlen(pad, enc); /* rb_enc_check */
11075 singlebyte = single_byte_optimizable(pad);
11076 if (flen == 0 || fclen == 0) {
11077 rb_raise(rb_eArgError, "zero width padding");
11078 }
11079 }
11080 len = str_strlen(str, enc); /* rb_enc_check */
11081 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11082 n = width - len;
11083 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11084 rlen = n - llen;
11085 cr = ENC_CODERANGE(str);
11086 if (flen > 1) {
11087 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11088 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11089 }
11090 size = RSTRING_LEN(str);
11091 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11092 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11093 (len += llen2 + rlen2) >= LONG_MAX - size) {
11094 rb_raise(rb_eArgError, "argument too big");
11095 }
11096 len += size;
11097 res = str_enc_new(rb_cString, 0, len, enc);
11098 p = RSTRING_PTR(res);
11099 if (flen <= 1) {
11100 memset(p, *f, llen);
11101 p += llen;
11102 }
11103 else {
11104 while (llen >= fclen) {
11105 memcpy(p,f,flen);
11106 p += flen;
11107 llen -= fclen;
11108 }
11109 if (llen > 0) {
11110 memcpy(p, f, llen2);
11111 p += llen2;
11112 }
11113 }
11114 memcpy(p, RSTRING_PTR(str), size);
11115 p += size;
11116 if (flen <= 1) {
11117 memset(p, *f, rlen);
11118 p += rlen;
11119 }
11120 else {
11121 while (rlen >= fclen) {
11122 memcpy(p,f,flen);
11123 p += flen;
11124 rlen -= fclen;
11125 }
11126 if (rlen > 0) {
11127 memcpy(p, f, rlen2);
11128 p += rlen2;
11129 }
11130 }
11131 TERM_FILL(p, termlen);
11132 STR_SET_LEN(res, p-RSTRING_PTR(res));
11133
11134 if (argc == 2)
11135 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11136 if (cr != ENC_CODERANGE_BROKEN)
11137 ENC_CODERANGE_SET(res, cr);
11138
11139 RB_GC_GUARD(pad);
11140 return res;
11141}
11142
11143
11144/*
11145 * call-seq:
11146 * ljust(width, pad_string = ' ') -> new_string
11147 *
11148 * :include: doc/string/ljust.rdoc
11149 *
11150 */
11151
11152static VALUE
11153rb_str_ljust(int argc, VALUE *argv, VALUE str)
11154{
11155 return rb_str_justify(argc, argv, str, 'l');
11156}
11157
11158/*
11159 * call-seq:
11160 * rjust(width, pad_string = ' ') -> new_string
11161 *
11162 * :include: doc/string/rjust.rdoc
11163 *
11164 */
11165
11166static VALUE
11167rb_str_rjust(int argc, VALUE *argv, VALUE str)
11168{
11169 return rb_str_justify(argc, argv, str, 'r');
11170}
11171
11172
11173/*
11174 * call-seq:
11175 * center(size, pad_string = ' ') -> new_string
11176 *
11177 * :include: doc/string/center.rdoc
11178 *
11179 */
11180
11181static VALUE
11182rb_str_center(int argc, VALUE *argv, VALUE str)
11183{
11184 return rb_str_justify(argc, argv, str, 'c');
11185}
11186
11187/*
11188 * call-seq:
11189 * partition(pattern) -> [pre_match, first_match, post_match]
11190 *
11191 * :include: doc/string/partition.rdoc
11192 *
11193 */
11194
11195static VALUE
11196rb_str_partition(VALUE str, VALUE sep)
11197{
11198 long pos;
11199
11200 sep = get_pat_quoted(sep, 0);
11201 if (RB_TYPE_P(sep, T_REGEXP)) {
11202 if (rb_reg_search(sep, str, 0, 0) < 0) {
11203 goto failed;
11204 }
11205 VALUE match = rb_backref_get();
11206 struct re_registers *regs = RMATCH_REGS(match);
11207
11208 pos = BEG(0);
11209 sep = rb_str_subseq(str, pos, END(0) - pos);
11210 }
11211 else {
11212 pos = rb_str_index(str, sep, 0);
11213 if (pos < 0) goto failed;
11214 }
11215 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11216 sep,
11217 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11218 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11219
11220 failed:
11221 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11222}
11223
11224/*
11225 * call-seq:
11226 * rpartition(pattern) -> [pre_match, last_match, post_match]
11227 *
11228 * :include: doc/string/rpartition.rdoc
11229 *
11230 */
11231
11232static VALUE
11233rb_str_rpartition(VALUE str, VALUE sep)
11234{
11235 long pos = RSTRING_LEN(str);
11236
11237 sep = get_pat_quoted(sep, 0);
11238 if (RB_TYPE_P(sep, T_REGEXP)) {
11239 if (rb_reg_search(sep, str, pos, 1) < 0) {
11240 goto failed;
11241 }
11242 VALUE match = rb_backref_get();
11243 struct re_registers *regs = RMATCH_REGS(match);
11244
11245 pos = BEG(0);
11246 sep = rb_str_subseq(str, pos, END(0) - pos);
11247 }
11248 else {
11249 pos = rb_str_sublen(str, pos);
11250 pos = rb_str_rindex(str, sep, pos);
11251 if (pos < 0) {
11252 goto failed;
11253 }
11254 }
11255
11256 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11257 sep,
11258 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11259 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11260 failed:
11261 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11262}
11263
11264/*
11265 * call-seq:
11266 * start_with?(*patterns) -> true or false
11267 *
11268 * :include: doc/string/start_with_p.rdoc
11269 *
11270 */
11271
11272static VALUE
11273rb_str_start_with(int argc, VALUE *argv, VALUE str)
11274{
11275 int i;
11276
11277 for (i=0; i<argc; i++) {
11278 VALUE tmp = argv[i];
11279 if (RB_TYPE_P(tmp, T_REGEXP)) {
11280 if (rb_reg_start_with_p(tmp, str))
11281 return Qtrue;
11282 }
11283 else {
11284 const char *p, *s, *e;
11285 long slen, tlen;
11286 rb_encoding *enc;
11287
11288 StringValue(tmp);
11289 enc = rb_enc_check(str, tmp);
11290 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11291 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11292 p = RSTRING_PTR(str);
11293 e = p + slen;
11294 s = p + tlen;
11295 if (!at_char_right_boundary(p, s, e, enc))
11296 continue;
11297 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11298 return Qtrue;
11299 }
11300 }
11301 return Qfalse;
11302}
11303
11304/*
11305 * call-seq:
11306 * end_with?(*strings) -> true or false
11307 *
11308 * :include: doc/string/end_with_p.rdoc
11309 *
11310 */
11311
11312static VALUE
11313rb_str_end_with(int argc, VALUE *argv, VALUE str)
11314{
11315 int i;
11316
11317 for (i=0; i<argc; i++) {
11318 VALUE tmp = argv[i];
11319 const char *p, *s, *e;
11320 long slen, tlen;
11321 rb_encoding *enc;
11322
11323 StringValue(tmp);
11324 enc = rb_enc_check(str, tmp);
11325 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11326 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11327 p = RSTRING_PTR(str);
11328 e = p + slen;
11329 s = e - tlen;
11330 if (!at_char_boundary(p, s, e, enc))
11331 continue;
11332 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11333 return Qtrue;
11334 }
11335 return Qfalse;
11336}
11337
11347static long
11348deleted_prefix_length(VALUE str, VALUE prefix)
11349{
11350 const char *strptr, *prefixptr;
11351 long olen, prefixlen;
11352 rb_encoding *enc = rb_enc_get(str);
11353
11354 StringValue(prefix);
11355
11356 if (!is_broken_string(prefix) ||
11357 !rb_enc_asciicompat(enc) ||
11358 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11359 enc = rb_enc_check(str, prefix);
11360 }
11361
11362 /* return 0 if not start with prefix */
11363 prefixlen = RSTRING_LEN(prefix);
11364 if (prefixlen <= 0) return 0;
11365 olen = RSTRING_LEN(str);
11366 if (olen < prefixlen) return 0;
11367 strptr = RSTRING_PTR(str);
11368 prefixptr = RSTRING_PTR(prefix);
11369 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11370 if (is_broken_string(prefix)) {
11371 if (!is_broken_string(str)) {
11372 /* prefix in a valid string cannot be broken */
11373 return 0;
11374 }
11375 const char *strend = strptr + olen;
11376 const char *after_prefix = strptr + prefixlen;
11377 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11378 /* prefix does not end at char-boundary */
11379 return 0;
11380 }
11381 }
11382 /* prefix part in `str` also should be valid. */
11383
11384 return prefixlen;
11385}
11386
11387/*
11388 * call-seq:
11389 * delete_prefix!(prefix) -> self or nil
11390 *
11391 * Like String#delete_prefix, except that +self+ is modified in place;
11392 * returns +self+ if the prefix is removed, +nil+ otherwise.
11393 *
11394 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11395 */
11396
11397static VALUE
11398rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11399{
11400 long prefixlen;
11401 str_modify_keep_cr(str);
11402
11403 prefixlen = deleted_prefix_length(str, prefix);
11404 if (prefixlen <= 0) return Qnil;
11405
11406 return rb_str_drop_bytes(str, prefixlen);
11407}
11408
11409/*
11410 * call-seq:
11411 * delete_prefix(prefix) -> new_string
11412 *
11413 * :include: doc/string/delete_prefix.rdoc
11414 *
11415 */
11416
11417static VALUE
11418rb_str_delete_prefix(VALUE str, VALUE prefix)
11419{
11420 long prefixlen;
11421
11422 prefixlen = deleted_prefix_length(str, prefix);
11423 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11424
11425 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11426}
11427
11437static long
11438deleted_suffix_length(VALUE str, VALUE suffix)
11439{
11440 const char *strptr, *suffixptr;
11441 long olen, suffixlen;
11442 rb_encoding *enc;
11443
11444 StringValue(suffix);
11445 if (is_broken_string(suffix)) return 0;
11446 enc = rb_enc_check(str, suffix);
11447
11448 /* return 0 if not start with suffix */
11449 suffixlen = RSTRING_LEN(suffix);
11450 if (suffixlen <= 0) return 0;
11451 olen = RSTRING_LEN(str);
11452 if (olen < suffixlen) return 0;
11453 strptr = RSTRING_PTR(str);
11454 suffixptr = RSTRING_PTR(suffix);
11455 const char *strend = strptr + olen;
11456 const char *before_suffix = strend - suffixlen;
11457 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11458 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11459
11460 return suffixlen;
11461}
11462
11463/*
11464 * call-seq:
11465 * delete_suffix!(suffix) -> self or nil
11466 *
11467 * Like String#delete_suffix, except that +self+ is modified in place;
11468 * returns +self+ if the suffix is removed, +nil+ otherwise.
11469 *
11470 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11471 */
11472
11473static VALUE
11474rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11475{
11476 long olen, suffixlen, len;
11477 str_modifiable(str);
11478
11479 suffixlen = deleted_suffix_length(str, suffix);
11480 if (suffixlen <= 0) return Qnil;
11481
11482 olen = RSTRING_LEN(str);
11483 str_modify_keep_cr(str);
11484 len = olen - suffixlen;
11485 STR_SET_LEN(str, len);
11486 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11487 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11489 }
11490 return str;
11491}
11492
11493/*
11494 * call-seq:
11495 * delete_suffix(suffix) -> new_string
11496 *
11497 * :include: doc/string/delete_suffix.rdoc
11498 *
11499 */
11500
11501static VALUE
11502rb_str_delete_suffix(VALUE str, VALUE suffix)
11503{
11504 long suffixlen;
11505
11506 suffixlen = deleted_suffix_length(str, suffix);
11507 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11508
11509 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11510}
11511
11512void
11513rb_str_setter(VALUE val, ID id, VALUE *var)
11514{
11515 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11516 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11517 }
11518 *var = val;
11519}
11520
11521static void
11522nil_setter_warning(ID id)
11523{
11524 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11525}
11526
11527void
11528rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11529{
11530 rb_str_setter(val, id, var);
11531 if (!NIL_P(*var)) {
11532 nil_setter_warning(id);
11533 }
11534}
11535
11536static void
11537rb_fs_setter(VALUE val, ID id, VALUE *var)
11538{
11539 val = rb_fs_check(val);
11540 if (!val) {
11541 rb_raise(rb_eTypeError,
11542 "value of %"PRIsVALUE" must be String or Regexp",
11543 rb_id2str(id));
11544 }
11545 if (!NIL_P(val)) {
11546 nil_setter_warning(id);
11547 }
11548 *var = val;
11549}
11550
11551
11552/*
11553 * call-seq:
11554 * force_encoding(encoding) -> self
11555 *
11556 * :include: doc/string/force_encoding.rdoc
11557 *
11558 */
11559
11560static VALUE
11561rb_str_force_encoding(VALUE str, VALUE enc)
11562{
11563 str_modifiable(str);
11564
11565 rb_encoding *encoding = rb_to_encoding(enc);
11566 int idx = rb_enc_to_index(encoding);
11567
11568 // If the encoding is unchanged, we do nothing.
11569 if (ENCODING_GET(str) == idx) {
11570 return str;
11571 }
11572
11573 rb_enc_associate_index(str, idx);
11574
11575 // If the coderange was 7bit and the new encoding is ASCII-compatible
11576 // we can keep the coderange.
11577 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11578 return str;
11579 }
11580
11582 return str;
11583}
11584
11585/*
11586 * call-seq:
11587 * b -> new_string
11588 *
11589 * :include: doc/string/b.rdoc
11590 *
11591 */
11592
11593static VALUE
11594rb_str_b(VALUE str)
11595{
11596 VALUE str2;
11597 if (STR_EMBED_P(str)) {
11598 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11599 }
11600 else {
11601 str2 = str_alloc_heap(rb_cString);
11602 }
11603 str_replace_shared_without_enc(str2, str);
11604
11605 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11606 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11607 // If we know the receiver's code range then we know the result's code range.
11608 int cr = ENC_CODERANGE(str);
11609 switch (cr) {
11610 case ENC_CODERANGE_7BIT:
11612 break;
11616 break;
11617 default:
11618 ENC_CODERANGE_CLEAR(str2);
11619 break;
11620 }
11621 }
11622
11623 return str2;
11624}
11625
11626/*
11627 * call-seq:
11628 * valid_encoding? -> true or false
11629 *
11630 * :include: doc/string/valid_encoding_p.rdoc
11631 *
11632 */
11633
11634static VALUE
11635rb_str_valid_encoding_p(VALUE str)
11636{
11637 int cr = rb_enc_str_coderange(str);
11638
11639 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11640}
11641
11642/*
11643 * call-seq:
11644 * ascii_only? -> true or false
11645 *
11646 * Returns whether +self+ contains only ASCII characters:
11647 *
11648 * 'abc'.ascii_only? # => true
11649 * "abc\u{6666}".ascii_only? # => false
11650 *
11651 * Related: see {Querying}[rdoc-ref:String@Querying].
11652 */
11653
11654static VALUE
11655rb_str_is_ascii_only_p(VALUE str)
11656{
11657 int cr = rb_enc_str_coderange(str);
11658
11659 return RBOOL(cr == ENC_CODERANGE_7BIT);
11660}
11661
11662VALUE
11664{
11665 static const char ellipsis[] = "...";
11666 const long ellipsislen = sizeof(ellipsis) - 1;
11667 rb_encoding *const enc = rb_enc_get(str);
11668 const long blen = RSTRING_LEN(str);
11669 const char *const p = RSTRING_PTR(str), *e = p + blen;
11670 VALUE estr, ret = 0;
11671
11672 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11673 if (len * rb_enc_mbminlen(enc) >= blen ||
11674 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11675 ret = str;
11676 }
11677 else if (len <= ellipsislen ||
11678 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11679 if (rb_enc_asciicompat(enc)) {
11680 ret = rb_str_new(ellipsis, len);
11681 rb_enc_associate(ret, enc);
11682 }
11683 else {
11684 estr = rb_usascii_str_new(ellipsis, len);
11685 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11686 }
11687 }
11688 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11689 rb_str_cat(ret, ellipsis, ellipsislen);
11690 }
11691 else {
11692 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11693 rb_enc_from_encoding(enc), 0, Qnil);
11694 rb_str_append(ret, estr);
11695 }
11696 return ret;
11697}
11698
11699static VALUE
11700str_compat_and_valid(VALUE str, rb_encoding *enc)
11701{
11702 int cr;
11703 str = StringValue(str);
11704 cr = rb_enc_str_coderange(str);
11705 if (cr == ENC_CODERANGE_BROKEN) {
11706 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11707 }
11708 else {
11709 rb_encoding *e = STR_ENC_GET(str);
11710 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11711 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11712 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11713 }
11714 }
11715 return str;
11716}
11717
11718static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11719
11720VALUE
11722{
11723 rb_encoding *enc = STR_ENC_GET(str);
11724 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11725}
11726
11727VALUE
11728rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11729{
11730 int cr = ENC_CODERANGE_UNKNOWN;
11731 if (enc == STR_ENC_GET(str)) {
11732 /* cached coderange makes sense only when enc equals the
11733 * actual encoding of str */
11734 cr = ENC_CODERANGE(str);
11735 }
11736 return enc_str_scrub(enc, str, repl, cr);
11737}
11738
11739static VALUE
11740enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11741{
11742 int encidx;
11743 VALUE buf = Qnil;
11744 const char *rep, *p, *e, *p1, *sp;
11745 long replen = -1;
11746 long slen;
11747
11748 if (rb_block_given_p()) {
11749 if (!NIL_P(repl))
11750 rb_raise(rb_eArgError, "both of block and replacement given");
11751 replen = 0;
11752 }
11753
11754 if (ENC_CODERANGE_CLEAN_P(cr))
11755 return Qnil;
11756
11757 if (!NIL_P(repl)) {
11758 repl = str_compat_and_valid(repl, enc);
11759 }
11760
11761 if (rb_enc_dummy_p(enc)) {
11762 return Qnil;
11763 }
11764 encidx = rb_enc_to_index(enc);
11765
11766#define DEFAULT_REPLACE_CHAR(str) do { \
11767 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11768 rep = replace; replen = (int)sizeof(replace); \
11769 } while (0)
11770
11771 slen = RSTRING_LEN(str);
11772 p = RSTRING_PTR(str);
11773 e = RSTRING_END(str);
11774 p1 = p;
11775 sp = p;
11776
11777 if (rb_enc_asciicompat(enc)) {
11778 int rep7bit_p;
11779 if (!replen) {
11780 rep = NULL;
11781 rep7bit_p = FALSE;
11782 }
11783 else if (!NIL_P(repl)) {
11784 rep = RSTRING_PTR(repl);
11785 replen = RSTRING_LEN(repl);
11786 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11787 }
11788 else if (encidx == rb_utf8_encindex()) {
11789 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11790 rep7bit_p = FALSE;
11791 }
11792 else {
11793 DEFAULT_REPLACE_CHAR("?");
11794 rep7bit_p = TRUE;
11795 }
11796 cr = ENC_CODERANGE_7BIT;
11797
11798 p = search_nonascii(p, e);
11799 if (!p) {
11800 p = e;
11801 }
11802 while (p < e) {
11803 int ret = rb_enc_precise_mbclen(p, e, enc);
11804 if (MBCLEN_NEEDMORE_P(ret)) {
11805 break;
11806 }
11807 else if (MBCLEN_CHARFOUND_P(ret)) {
11809 p += MBCLEN_CHARFOUND_LEN(ret);
11810 }
11811 else if (MBCLEN_INVALID_P(ret)) {
11812 /*
11813 * p1~p: valid ascii/multibyte chars
11814 * p ~e: invalid bytes + unknown bytes
11815 */
11816 long clen = rb_enc_mbmaxlen(enc);
11817 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11818 if (p > p1) {
11819 rb_str_buf_cat(buf, p1, p - p1);
11820 }
11821
11822 if (e - p < clen) clen = e - p;
11823 if (clen <= 2) {
11824 clen = 1;
11825 }
11826 else {
11827 const char *q = p;
11828 clen--;
11829 for (; clen > 1; clen--) {
11830 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11831 if (MBCLEN_NEEDMORE_P(ret)) break;
11832 if (MBCLEN_INVALID_P(ret)) continue;
11834 }
11835 }
11836 if (rep) {
11837 rb_str_buf_cat(buf, rep, replen);
11838 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11839 }
11840 else {
11841 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11842 str_mod_check(str, sp, slen);
11843 repl = str_compat_and_valid(repl, enc);
11844 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11847 }
11848 p += clen;
11849 p1 = p;
11850 p = search_nonascii(p, e);
11851 if (!p) {
11852 p = e;
11853 break;
11854 }
11855 }
11856 else {
11858 }
11859 }
11860 if (NIL_P(buf)) {
11861 if (p == e) {
11862 ENC_CODERANGE_SET(str, cr);
11863 return Qnil;
11864 }
11865 buf = rb_str_buf_new(RSTRING_LEN(str));
11866 }
11867 if (p1 < p) {
11868 rb_str_buf_cat(buf, p1, p - p1);
11869 }
11870 if (p < e) {
11871 if (rep) {
11872 rb_str_buf_cat(buf, rep, replen);
11873 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11874 }
11875 else {
11876 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11877 str_mod_check(str, sp, slen);
11878 repl = str_compat_and_valid(repl, enc);
11879 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11882 }
11883 }
11884 }
11885 else {
11886 /* ASCII incompatible */
11887 long mbminlen = rb_enc_mbminlen(enc);
11888 if (!replen) {
11889 rep = NULL;
11890 }
11891 else if (!NIL_P(repl)) {
11892 rep = RSTRING_PTR(repl);
11893 replen = RSTRING_LEN(repl);
11894 }
11895 else if (encidx == ENCINDEX_UTF_16BE) {
11896 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11897 }
11898 else if (encidx == ENCINDEX_UTF_16LE) {
11899 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11900 }
11901 else if (encidx == ENCINDEX_UTF_32BE) {
11902 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11903 }
11904 else if (encidx == ENCINDEX_UTF_32LE) {
11905 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11906 }
11907 else {
11908 DEFAULT_REPLACE_CHAR("?");
11909 }
11910
11911 while (p < e) {
11912 int ret = rb_enc_precise_mbclen(p, e, enc);
11913 if (MBCLEN_NEEDMORE_P(ret)) {
11914 break;
11915 }
11916 else if (MBCLEN_CHARFOUND_P(ret)) {
11917 p += MBCLEN_CHARFOUND_LEN(ret);
11918 }
11919 else if (MBCLEN_INVALID_P(ret)) {
11920 const char *q = p;
11921 long clen = rb_enc_mbmaxlen(enc);
11922 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11923 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11924
11925 if (e - p < clen) clen = e - p;
11926 if (clen <= mbminlen * 2) {
11927 clen = mbminlen;
11928 }
11929 else {
11930 clen -= mbminlen;
11931 for (; clen > mbminlen; clen-=mbminlen) {
11932 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11933 if (MBCLEN_NEEDMORE_P(ret)) break;
11934 if (MBCLEN_INVALID_P(ret)) continue;
11936 }
11937 }
11938 if (rep) {
11939 rb_str_buf_cat(buf, rep, replen);
11940 }
11941 else {
11942 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11943 str_mod_check(str, sp, slen);
11944 repl = str_compat_and_valid(repl, enc);
11945 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11946 }
11947 p += clen;
11948 p1 = p;
11949 }
11950 else {
11952 }
11953 }
11954 if (NIL_P(buf)) {
11955 if (p == e) {
11957 return Qnil;
11958 }
11959 buf = rb_str_buf_new(RSTRING_LEN(str));
11960 }
11961 if (p1 < p) {
11962 rb_str_buf_cat(buf, p1, p - p1);
11963 }
11964 if (p < e) {
11965 if (rep) {
11966 rb_str_buf_cat(buf, rep, replen);
11967 }
11968 else {
11969 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11970 str_mod_check(str, sp, slen);
11971 repl = str_compat_and_valid(repl, enc);
11972 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11973 }
11974 }
11976 }
11977 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11978 return buf;
11979}
11980
11981/*
11982 * call-seq:
11983 * scrub(replacement_string = default_replacement_string) -> new_string
11984 * scrub{|sequence| ... } -> new_string
11985 *
11986 * :include: doc/string/scrub.rdoc
11987 *
11988 */
11989static VALUE
11990str_scrub(int argc, VALUE *argv, VALUE str)
11991{
11992 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11993 VALUE new = rb_str_scrub(str, repl);
11994 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11995}
11996
11997/*
11998 * call-seq:
11999 * scrub!(replacement_string = default_replacement_string) -> self
12000 * scrub!{|sequence| ... } -> self
12001 *
12002 * Like String#scrub, except that:
12003 *
12004 * - Any replacements are made in +self+.
12005 * - Returns +self+.
12006 *
12007 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12008 *
12009 */
12010static VALUE
12011str_scrub_bang(int argc, VALUE *argv, VALUE str)
12012{
12013 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12014 VALUE new = rb_str_scrub(str, repl);
12015 if (!NIL_P(new)) rb_str_replace(str, new);
12016 return str;
12017}
12018
12019static ID id_normalize;
12020static ID id_normalized_p;
12021static VALUE mUnicodeNormalize;
12022
12023static VALUE
12024unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12025{
12026 static int UnicodeNormalizeRequired = 0;
12027 VALUE argv2[2];
12028
12029 if (!UnicodeNormalizeRequired) {
12030 rb_require("unicode_normalize/normalize.rb");
12031 UnicodeNormalizeRequired = 1;
12032 }
12033 argv2[0] = str;
12034 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12035 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12036}
12037
12038/*
12039 * call-seq:
12040 * unicode_normalize(form = :nfc) -> string
12041 *
12042 * :include: doc/string/unicode_normalize.rdoc
12043 *
12044 */
12045static VALUE
12046rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12047{
12048 return unicode_normalize_common(argc, argv, str, id_normalize);
12049}
12050
12051/*
12052 * call-seq:
12053 * unicode_normalize!(form = :nfc) -> self
12054 *
12055 * Like String#unicode_normalize, except that the normalization
12056 * is performed on +self+ (not on a copy of +self+).
12057 *
12058 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12059 *
12060 */
12061static VALUE
12062rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12063{
12064 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12065}
12066
12067/* call-seq:
12068 * unicode_normalized?(form = :nfc) -> true or false
12069 *
12070 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12071 * see String#unicode_normalize.
12072 *
12073 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12074 *
12075 * Examples:
12076 *
12077 * "a\u0300".unicode_normalized? # => false
12078 * "a\u0300".unicode_normalized?(:nfd) # => true
12079 * "\u00E0".unicode_normalized? # => true
12080 * "\u00E0".unicode_normalized?(:nfd) # => false
12081 *
12082 *
12083 * Raises an exception if +self+ is not in a Unicode encoding:
12084 *
12085 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12086 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12087 *
12088 * Related: see {Querying}[rdoc-ref:String@Querying].
12089 */
12090static VALUE
12091rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12092{
12093 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12094}
12095
12096/**********************************************************************
12097 * Document-class: Symbol
12098 *
12099 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12100 *
12101 * You can create a +Symbol+ object explicitly with:
12102 *
12103 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12104 *
12105 * The same +Symbol+ object will be
12106 * created for a given name or string for the duration of a program's
12107 * execution, regardless of the context or meaning of that name. Thus
12108 * if <code>Fred</code> is a constant in one context, a method in
12109 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12110 * will be the same object in all three contexts.
12111 *
12112 * module One
12113 * class Fred
12114 * end
12115 * $f1 = :Fred
12116 * end
12117 * module Two
12118 * Fred = 1
12119 * $f2 = :Fred
12120 * end
12121 * def Fred()
12122 * end
12123 * $f3 = :Fred
12124 * $f1.object_id #=> 2514190
12125 * $f2.object_id #=> 2514190
12126 * $f3.object_id #=> 2514190
12127 *
12128 * Constant, method, and variable names are returned as symbols:
12129 *
12130 * module One
12131 * Two = 2
12132 * def three; 3 end
12133 * @four = 4
12134 * @@five = 5
12135 * $six = 6
12136 * end
12137 * seven = 7
12138 *
12139 * One.constants
12140 * # => [:Two]
12141 * One.instance_methods(true)
12142 * # => [:three]
12143 * One.instance_variables
12144 * # => [:@four]
12145 * One.class_variables
12146 * # => [:@@five]
12147 * global_variables.grep(/six/)
12148 * # => [:$six]
12149 * local_variables
12150 * # => [:seven]
12151 *
12152 * A +Symbol+ object differs from a String object in that
12153 * a +Symbol+ object represents an identifier, while a String object
12154 * represents text or data.
12155 *
12156 * == What's Here
12157 *
12158 * First, what's elsewhere. Class +Symbol+:
12159 *
12160 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12161 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12162 *
12163 * Here, class +Symbol+ provides methods that are useful for:
12164 *
12165 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12166 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12167 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12168 *
12169 * === Methods for Querying
12170 *
12171 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12172 * - #=~: Returns the index of the first substring in symbol that matches a
12173 * given Regexp or other object; returns +nil+ if no match is found.
12174 * - #[], #slice : Returns a substring of symbol
12175 * determined by a given index, start/length, or range, or string.
12176 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12177 * - #encoding: Returns the Encoding object that represents the encoding
12178 * of symbol.
12179 * - #end_with?: Returns +true+ if symbol ends with
12180 * any of the given strings.
12181 * - #match: Returns a MatchData object if symbol
12182 * matches a given Regexp; +nil+ otherwise.
12183 * - #match?: Returns +true+ if symbol
12184 * matches a given Regexp; +false+ otherwise.
12185 * - #length, #size: Returns the number of characters in symbol.
12186 * - #start_with?: Returns +true+ if symbol starts with
12187 * any of the given strings.
12188 *
12189 * === Methods for Comparing
12190 *
12191 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12192 * or larger than a given symbol.
12193 * - #==, #===: Returns +true+ if a given symbol has the same content and
12194 * encoding.
12195 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12196 * is smaller than, equal to, or larger than a given symbol.
12197 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12198 * after Unicode case folding; +false+ otherwise.
12199 *
12200 * === Methods for Converting
12201 *
12202 * - #capitalize: Returns symbol with the first character upcased
12203 * and all other characters downcased.
12204 * - #downcase: Returns symbol with all characters downcased.
12205 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12206 * - #name: Returns the frozen string corresponding to symbol.
12207 * - #succ, #next: Returns the symbol that is the successor to symbol.
12208 * - #swapcase: Returns symbol with all upcase characters downcased
12209 * and all downcase characters upcased.
12210 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12211 * - #to_s, #id2name: Returns the string corresponding to +self+.
12212 * - #to_sym, #intern: Returns +self+.
12213 * - #upcase: Returns symbol with all characters upcased.
12214 *
12215 */
12216
12217
12218/*
12219 * call-seq:
12220 * symbol == object -> true or false
12221 *
12222 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12223 */
12224
#define sym_equal rb_obj_equal /* Symbol#== and Symbol#=== reuse the generic object-identity test */
12226
12227static int
12228sym_printable(const char *s, const char *send, rb_encoding *enc)
12229{
12230 while (s < send) {
12231 int n;
12232 int c = rb_enc_precise_mbclen(s, send, enc);
12233
12234 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12235 n = MBCLEN_CHARFOUND_LEN(c);
12236 c = rb_enc_mbc_to_codepoint(s, send, enc);
12237 if (!rb_enc_isprint(c, enc)) return FALSE;
12238 s += n;
12239 }
12240 return TRUE;
12241}
12242
12243int
12244rb_str_symname_p(VALUE sym)
12245{
12246 rb_encoding *enc;
12247 const char *ptr;
12248 long len;
12249 rb_encoding *resenc = rb_default_internal_encoding();
12250
12251 if (resenc == NULL) resenc = rb_default_external_encoding();
12252 enc = STR_ENC_GET(sym);
12253 ptr = RSTRING_PTR(sym);
12254 len = RSTRING_LEN(sym);
12255 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12256 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12257 return FALSE;
12258 }
12259 return TRUE;
12260}
12261
12262VALUE
12263rb_str_quote_unprintable(VALUE str)
12264{
12265 rb_encoding *enc;
12266 const char *ptr;
12267 long len;
12268 rb_encoding *resenc;
12269
12270 Check_Type(str, T_STRING);
12271 resenc = rb_default_internal_encoding();
12272 if (resenc == NULL) resenc = rb_default_external_encoding();
12273 enc = STR_ENC_GET(str);
12274 ptr = RSTRING_PTR(str);
12275 len = RSTRING_LEN(str);
12276 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12277 !sym_printable(ptr, ptr + len, enc)) {
12278 return rb_str_escape(str);
12279 }
12280 return str;
12281}
12282
12283VALUE
12284rb_id_quote_unprintable(ID id)
12285{
12286 VALUE str = rb_id2str(id);
12287 if (!rb_str_symname_p(str)) {
12288 return rb_str_escape(str);
12289 }
12290 return str;
12291}
12292
12293/*
12294 * call-seq:
12295 * inspect -> string
12296 *
12297 * Returns a string representation of +self+ (including the leading colon):
12298 *
12299 * :foo.inspect # => ":foo"
12300 *
12301 * Related: Symbol#to_s, Symbol#name.
12302 *
12303 */
12304
12305static VALUE
12306sym_inspect(VALUE sym)
12307{
12308 VALUE str = rb_sym2str(sym);
12309 const char *ptr;
12310 long len;
12311 char *dest;
12312
12313 if (!rb_str_symname_p(str)) {
12314 str = rb_str_inspect(str);
12315 len = RSTRING_LEN(str);
12316 rb_str_resize(str, len + 1);
12317 dest = RSTRING_PTR(str);
12318 memmove(dest + 1, dest, len);
12319 }
12320 else {
12321 rb_encoding *enc = STR_ENC_GET(str);
12322 VALUE orig_str = str;
12323
12324 len = RSTRING_LEN(orig_str);
12325 str = rb_enc_str_new(0, len + 1, enc);
12326
12327 // Get data pointer after allocation
12328 ptr = RSTRING_PTR(orig_str);
12329 dest = RSTRING_PTR(str);
12330 memcpy(dest + 1, ptr, len);
12331
12332 RB_GC_GUARD(orig_str);
12333 }
12334 dest[0] = ':';
12335
12337
12338 return str;
12339}
12340
12341VALUE
12343{
12344 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12345 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12346 return str;
12347}
12348
12349VALUE
12350rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12351{
12352 VALUE obj;
12353
12354 if (argc < 1) {
12355 rb_raise(rb_eArgError, "no receiver given");
12356 }
12357 obj = argv[0];
12358 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12359}
12360
12361/*
12362 * call-seq:
12363 * succ
12364 *
12365 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12366 *
12367 * :foo.succ # => :fop
12368 *
12369 * Related: String#succ.
12370 */
12371
12372static VALUE
12373sym_succ(VALUE sym)
12374{
12375 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12376}
12377
12378/*
12379 * call-seq:
12380 * self <=> other -> -1, 0, 1, or nil
12381 *
12382 * Compares +self+ and +other+, using String#<=>.
12383 *
12384 * Returns:
12385 *
12386 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12387 * - +nil+, otherwise.
12388 *
12389 * Examples:
12390 *
12391 * :bar <=> :foo # => -1
12392 * :foo <=> :foo # => 0
12393 * :foo <=> :bar # => 1
12394 * :foo <=> 'bar' # => nil
12395 *
12396 * \Class \Symbol includes module Comparable,
12397 * each of whose methods uses Symbol#<=> for comparison.
12398 *
12399 * Related: String#<=>.
12400 */
12401
12402static VALUE
12403sym_cmp(VALUE sym, VALUE other)
12404{
12405 if (!SYMBOL_P(other)) {
12406 return Qnil;
12407 }
12408 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12409}
12410
12411/*
12412 * call-seq:
12413 * casecmp(object) -> -1, 0, 1, or nil
12414 *
12415 * :include: doc/symbol/casecmp.rdoc
12416 *
12417 */
12418
12419static VALUE
12420sym_casecmp(VALUE sym, VALUE other)
12421{
12422 if (!SYMBOL_P(other)) {
12423 return Qnil;
12424 }
12425 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12426}
12427
12428/*
12429 * call-seq:
12430 * casecmp?(object) -> true, false, or nil
12431 *
12432 * :include: doc/symbol/casecmp_p.rdoc
12433 *
12434 */
12435
12436static VALUE
12437sym_casecmp_p(VALUE sym, VALUE other)
12438{
12439 if (!SYMBOL_P(other)) {
12440 return Qnil;
12441 }
12442 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12443}
12444
12445/*
12446 * call-seq:
12447 * symbol =~ object -> integer or nil
12448 *
12449 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12450 * including possible updates to global variables;
12451 * see String#=~.
12452 *
12453 */
12454
12455static VALUE
12456sym_match(VALUE sym, VALUE other)
12457{
12458 return rb_str_match(rb_sym2str(sym), other);
12459}
12460
12461/*
12462 * call-seq:
12463 * match(pattern, offset = 0) -> matchdata or nil
12464 * match(pattern, offset = 0) {|matchdata| } -> object
12465 *
12466 * Equivalent to <tt>self.to_s.match</tt>,
12467 * including possible updates to global variables;
12468 * see String#match.
12469 *
12470 */
12471
12472static VALUE
12473sym_match_m(int argc, VALUE *argv, VALUE sym)
12474{
12475 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12476}
12477
12478/*
12479 * call-seq:
12480 * match?(pattern, offset) -> true or false
12481 *
12482 * Equivalent to <tt>sym.to_s.match?</tt>;
12483 * see String#match?.
12484 *
12485 */
12486
12487static VALUE
12488sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12489{
12490 return rb_str_match_m_p(argc, argv, sym);
12491}
12492
12493/*
12494 * call-seq:
12495 * symbol[index] -> string or nil
12496 * symbol[start, length] -> string or nil
12497 * symbol[range] -> string or nil
12498 * symbol[regexp, capture = 0] -> string or nil
12499 * symbol[substring] -> string or nil
12500 *
12501 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12502 *
12503 */
12504
12505static VALUE
12506sym_aref(int argc, VALUE *argv, VALUE sym)
12507{
12508 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12509}
12510
12511/*
12512 * call-seq:
12513 * length -> integer
12514 *
12515 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12516 */
12517
12518static VALUE
12519sym_length(VALUE sym)
12520{
12521 return rb_str_length(rb_sym2str(sym));
12522}
12523
12524/*
12525 * call-seq:
12526 * empty? -> true or false
12527 *
12528 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12529 *
12530 */
12531
12532static VALUE
12533sym_empty(VALUE sym)
12534{
12535 return rb_str_empty(rb_sym2str(sym));
12536}
12537
12538/*
12539 * call-seq:
12540 * upcase(mapping) -> symbol
12541 *
12542 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12543 *
12544 * See String#upcase.
12545 *
12546 */
12547
12548static VALUE
12549sym_upcase(int argc, VALUE *argv, VALUE sym)
12550{
12551 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12552}
12553
12554/*
12555 * call-seq:
12556 * downcase(mapping) -> symbol
12557 *
12558 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12559 *
12560 * See String#downcase.
12561 *
12562 * Related: Symbol#upcase.
12563 *
12564 */
12565
12566static VALUE
12567sym_downcase(int argc, VALUE *argv, VALUE sym)
12568{
12569 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12570}
12571
12572/*
12573 * call-seq:
12574 * capitalize(mapping) -> symbol
12575 *
12576 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12577 *
12578 * See String#capitalize.
12579 *
12580 */
12581
12582static VALUE
12583sym_capitalize(int argc, VALUE *argv, VALUE sym)
12584{
12585 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12586}
12587
12588/*
12589 * call-seq:
12590 * swapcase(mapping) -> symbol
12591 *
12592 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12593 *
12594 * See String#swapcase.
12595 *
12596 */
12597
12598static VALUE
12599sym_swapcase(int argc, VALUE *argv, VALUE sym)
12600{
12601 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12602}
12603
12604/*
12605 * call-seq:
12606 * start_with?(*string_or_regexp) -> true or false
12607 *
12608 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12609 *
12610 */
12611
12612static VALUE
12613sym_start_with(int argc, VALUE *argv, VALUE sym)
12614{
12615 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12616}
12617
12618/*
12619 * call-seq:
12620 * end_with?(*strings) -> true or false
12621 *
12622 *
12623 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12624 *
12625 */
12626
12627static VALUE
12628sym_end_with(int argc, VALUE *argv, VALUE sym)
12629{
12630 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12631}
12632
12633/*
12634 * call-seq:
12635 * encoding -> encoding
12636 *
12637 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12638 *
12639 */
12640
12641static VALUE
12642sym_encoding(VALUE sym)
12643{
12644 return rb_obj_encoding(rb_sym2str(sym));
12645}
12646
12647static VALUE
12648string_for_symbol(VALUE name)
12649{
12650 if (!RB_TYPE_P(name, T_STRING)) {
12651 VALUE tmp = rb_check_string_type(name);
12652 if (NIL_P(tmp)) {
12653 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12654 name);
12655 }
12656 name = tmp;
12657 }
12658 return name;
12659}
12660
12661ID
12663{
12664 if (SYMBOL_P(name)) {
12665 return SYM2ID(name);
12666 }
12667 name = string_for_symbol(name);
12668 return rb_intern_str(name);
12669}
12670
12671VALUE
12673{
12674 if (SYMBOL_P(name)) {
12675 return name;
12676 }
12677 name = string_for_symbol(name);
12678 return rb_str_intern(name);
12679}
12680
12681/*
12682 * call-seq:
12683 * Symbol.all_symbols -> array_of_symbols
12684 *
12685 * Returns an array of all symbols currently in Ruby's symbol table:
12686 *
12687 * Symbol.all_symbols.size # => 9334
12688 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12689 *
12690 */
12691
12692static VALUE
12693sym_all_symbols(VALUE _)
12694{
12695 return rb_sym_all_symbols();
12696}
12697
12698VALUE
12699rb_str_to_interned_str(VALUE str)
12700{
12701 return rb_fstring(str);
12702}
12703
12704VALUE
12705rb_interned_str(const char *ptr, long len)
12706{
12707 struct RString fake_str = {RBASIC_INIT};
12708 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12709}
12710
12711VALUE
12713{
12714 return rb_interned_str(ptr, strlen(ptr));
12715}
12716
12717VALUE
12718rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12719{
12720 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12721 rb_enc_autoload(enc);
12722 }
12723
12724 struct RString fake_str = {RBASIC_INIT};
12725 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12726}
12727
12728VALUE
12729rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12730{
12731 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12732 rb_enc_autoload(enc);
12733 }
12734
12735 struct RString fake_str = {RBASIC_INIT};
12736 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12737 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12738 return str;
12739}
12740
12741VALUE
12743{
12744 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12745}
12746
#if USE_YJIT || USE_ZJIT
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    /*
     * Fast path: when +str+ is ASCII-8BIT and the code is in [0, 0xff),
     * append it as a single byte.  Everything else -- including the value
     * 0xff itself -- takes the general rb_str_concat() path.
     */
    const int binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(binary)) {
        /* The integer conversion happens only on this branch. */
        const ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(0 <= code && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12763
12764static int
12765fstring_set_class_i(VALUE *str, void *data)
12766{
12767 RBASIC_SET_CLASS(*str, rb_cString);
12768
12769 return ST_CONTINUE;
12770}
12771
12772void
12773Init_String(void)
12774{
12775 rb_cString = rb_define_class("String", rb_cObject);
12776
12777 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12778
12780 rb_define_alloc_func(rb_cString, empty_str_alloc);
12781 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12782 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12783 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12785 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12786 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12789 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12790 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12791 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12792 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12795 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12796 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12797 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12798 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12801 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12802 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12803 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12804 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12805 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12807 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12809 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12810 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12811 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12812 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12813 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12814 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12815 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12816 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12817 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12818 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12819 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12820 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12821 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12822 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12824 rb_define_method(rb_cString, "+@", str_uplus, 0);
12825 rb_define_method(rb_cString, "-@", str_uminus, 0);
12826 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12827 rb_define_alias(rb_cString, "dedup", "-@");
12828
12829 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12830 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12831 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12832 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12835 rb_define_method(rb_cString, "undump", str_undump, 0);
12836
12837 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12838 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12839 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12840 sym_fold = ID2SYM(rb_intern_const("fold"));
12841
12842 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12843 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12844 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12845 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12846
12847 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12848 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12849 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12850 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12851
12852 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12853 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12854 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12855 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12856 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12857 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12858 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12859 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12860 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12861 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12862 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12863 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12865 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12866 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12867 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12868 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12869 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12870
12871 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12872 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12873 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12874
12875 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12876
12877 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12878 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12879 rb_define_method(rb_cString, "center", rb_str_center, -1);
12880
12881 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12882 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12883 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12884 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12885 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12886 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12887 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12888 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12889 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12890
12891 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12892 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12893 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12894 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12895 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12896 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12897 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12898 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12899 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12900
12901 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12902 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12903 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12904 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12905 rb_define_method(rb_cString, "count", rb_str_count, -1);
12906
12907 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12908 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12909 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12910 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12911
12912 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12913 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12914 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12915 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12916 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12917
12918 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12919
12920 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12921 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12922
12923 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12924 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12925
12926 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12927 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12928 rb_define_method(rb_cString, "b", rb_str_b, 0);
12929 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12930 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12931
12932 /* define UnicodeNormalize module here so that we don't have to look it up */
12933 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12934 id_normalize = rb_intern_const("normalize");
12935 id_normalized_p = rb_intern_const("normalized?");
12936
12937 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12938 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12939 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12940
12941 rb_fs = Qnil;
12942 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12943 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12944 rb_gc_register_address(&rb_fs);
12945
12946 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12950 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12951
12952 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12953 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12954 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12955 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12956 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12957 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12958
12959 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12960 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12961 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12962 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12963
12964 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12965 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12966 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12967 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12968 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12969 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12970 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12971
12972 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12973 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12974 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12975 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12976
12977 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12978 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12979
12980 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12981}
12982
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1796
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1589
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1702
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2956
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2768
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3246
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1010
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3035
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2208
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2226
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3622
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3306
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1342
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:947
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1207
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3028
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1226
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12718
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2334
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3732
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1155
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1447
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1348
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:966
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12742
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:831
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1752
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1512
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2487
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3797
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1423
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12342
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2560
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1399
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1746
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3056
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5339
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4160
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3153
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11663
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1788
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1189
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1001
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1518
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1996
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4146
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3565
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2423
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2014
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6546
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3161
rb_gvar_setter_t rb_str_setter
This is an rb_gvar_setter_t that rejects non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12712
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1429
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3763
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3103
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4267
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3387
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7225
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2790
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12705
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4214
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4034
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4189
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3739
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3278
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5823
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11721
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1702
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2950
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3250
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3369
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1201
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2744
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7332
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1411
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1718
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2437
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5741
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9339
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1195
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1850
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2017
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2096
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3402
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1650
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12672
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12662
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries whether the passed object has previously been classified as shareable.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1441
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2927
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2809
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1435
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2822
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1779
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:461
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:208
Definition string.c:8219
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113