Ruby 4.0.0dev (2025-12-14 revision e8d32dddc04b34e2454b1c37b271bc242dddb06e)
string.c (e8d32dddc04b34e2454b1c37b271bc242dddb06e)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Start / end offset of capture `no`.  Both expand to an access through
 * a variable named `regs`, which must be in scope at the use site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])

/* Remove any macro definitions of these names for the rest of this file
 * (presumably so that the same-named functions can be defined below). */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new
#undef rb_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
83
86
/* Flags of RString
 *
 * 0:     STR_SHARED (equal to ELTS_SHARED)
 *            The string is shared.  The buffer this string points to is
 *            owned by another string (the shared root).
 * 1:     RSTRING_NOEMBED
 *            The string is not embedded.  When a string is embedded, the
 *            contents follow the header.  When a string is not embedded,
 *            the contents are in a separately allocated buffer.
 * 2:     STR_CHILLED_LITERAL (will be frozen in a future version)
 *            The string was allocated as a literal in a file without an
 *            explicit `frozen_string_literal` comment.
 *            It emits a deprecation warning when mutated for the first time.
 * 3:     STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
 *            The string was allocated by the `Symbol#to_s` method.
 *            It emits a deprecation warning when mutated for the first time.
 * 4:     STR_PRECOMPUTED_HASH
 *            The string is embedded and has its precomputed hashcode stored
 *            after the terminator.
 * 5:     STR_SHARED_ROOT
 *            Other strings may point to the contents of this string.  When
 *            this flag is set, STR_SHARED must not be set.
 * 6:     STR_BORROWED
 *            When RSTRING_NOEMBED is set and klass is 0, this string is
 *            unsafe to be unshared by rb_str_tmp_frozen_release.
 * 7:     STR_TMPLOCK
 *            The pointer to the buffer is passed to a system call such as
 *            read(2).  Any modification and realloc is prohibited.
 * 8-9:   ENC_CODERANGE
 *            Stores the coderange of the string.
 * 10-16: ENCODING
 *            Stores the encoding of the string.
 * 17:    RSTRING_FSTR
 *            The string is a fstring.  The string is deduplicated in the
 *            fstring table.
 * 18:    STR_NOFREE
 *            Do not free this string's buffer when the string is reclaimed
 *            by the garbage collector.  Used when the string buffer is a C
 *            string literal.
 * 19:    STR_FAKESTR
 *            The string is not allocated or managed by the garbage
 *            collector.  Typically, the string object header (struct
 *            RString) is temporarily allocated on the C stack.
 */
130
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits (FL_USER slots). */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT      FL_USER5
#define STR_BORROWED         FL_USER6
#define STR_TMPLOCK          FL_USER7
#define STR_NOFREE           FL_USER18
#define STR_FAKESTR          FL_USER19

/* Set STR_NOEMBED on `str` and clear the sharing-related flags. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Clear STR_NOEMBED on `str`; STR_SHARED and STR_NOFREE are dropped too. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` as the length of `str`; no validation is performed. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator width of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero the terminator at `ptr`.  The first byte is always cleared; the
 * full `termlen` bytes are cleared only when `termlen` exceeds 1. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    term_fill_ptr[0] = '\0';\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
} while (0)
177
/* Resize the buffer of `str` to hold `capacity` bytes plus the
 * terminator implied by the string's current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents move to a freshly allocated heap buffer and
 *   the string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place; the string must
 *   not be STR_SHARED.
 *
 * `capacity` and `termlen` are parenthesized at every use so that any
 * expression can be passed safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Record `shared_str` as the shared root of `str`.  Nothing happens for
 * fake strings.  Otherwise `str` gets STR_SHARED, `shared_str` gets
 * STR_SHARED_ROOT, and a root whose class is 0 is also marked
 * STR_BORROWED.  In debug builds the buffer of `str` is asserted to lie
 * within the buffer of `shared_str`. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
213
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Size of the heap buffer: recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/* With SHARABLE_MIDDLE_SUBSTRING disabled (the default), a substring is
 * sharable only when it extends to `end`; when enabled, every substring
 * is considered sharable. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/* True when a string of `len` bytes plus a `termlen`-byte terminator
 * has an embedded size that the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t embed_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(embed_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/* Print a diagnostic to stderr saying that `func` is about to return
 * NULL, so a crash right after it can be traced back to the string. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
367#if SIZEOF_LONG == SIZEOF_VOIDP
368#define PRECOMPUTED_FAKESTR_HASH 1
369#else
370#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options passed from register_fstring() to the table's create callback
 * through the opaque `data` pointer. */
struct fstr_create_arg {
    /* For a fake string: duplicate its bytes into the new string (true)
     * or reference the original buffer as a static string (false). */
    bool copy;
    /* When copying, prefer an embedded string with room for the
     * precomputed hash. */
    bool force_precompute_hash;
};
487
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
617 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
618
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
620
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
622
623 FL_UNSET(obj, RSTRING_FSTR);
624}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const uintptr_t *s, *t;
720
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (uintptr_t *)(value)
764#endif
765 s = aligned_ptr(p);
766 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
768 for (;s < t; s++) {
769 if (*s & NONASCII_MASK) {
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
946{
947 int cr = ENC_CODERANGE(str);
948
949 if (cr == ENC_CODERANGE_UNKNOWN) {
950 cr = enc_coderange_scan(str, get_encoding(str));
951 ENC_CODERANGE_SET(str, cr);
952 }
953 return cr;
954}
955
956static inline bool
957rb_enc_str_asciicompat(VALUE str)
958{
959 int encindex = ENCODING_GET_INLINED(str);
960 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
961}
962
963int
965{
966 switch(ENC_CODERANGE(str)) {
968 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
970 return true;
971 default:
972 return false;
973 }
974}
975
976static inline void
977str_mod_check(VALUE s, const char *p, long len)
978{
979 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
980 rb_raise(rb_eRuntimeError, "string modified");
981 }
982}
983
984static size_t
985str_capacity(VALUE str, const int termlen)
986{
987 if (STR_EMBED_P(str)) {
988 return str_embed_capa(str) - termlen;
989 }
990 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
991 return RSTRING(str)->len;
992 }
993 else {
994 return RSTRING(str)->as.heap.aux.capa;
995 }
996}
997
998size_t
1000{
1001 return str_capacity(str, TERM_LEN(str));
1002}
1003
1004static inline void
1005must_not_null(const char *ptr)
1006{
1007 if (!ptr) {
1008 rb_raise(rb_eArgError, "NULL pointer given");
1009 }
1010}
1011
1012static inline VALUE
1013str_alloc_embed(VALUE klass, size_t capa)
1014{
1015 size_t size = rb_str_embed_size(capa, 0);
1016 RUBY_ASSERT(size > 0);
1017 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1018
1019 NEWOBJ_OF(str, struct RString, klass,
1021
1022 str->len = 0;
1023 str->as.embed.ary[0] = 0;
1024
1025 return (VALUE)str;
1026}
1027
1028static inline VALUE
1029str_alloc_heap(VALUE klass)
1030{
1031 NEWOBJ_OF(str, struct RString, klass,
1032 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1033
1034 str->len = 0;
1035 str->as.heap.aux.capa = 0;
1036 str->as.heap.ptr = NULL;
1037
1038 return (VALUE)str;
1039}
1040
1041static inline VALUE
1042empty_str_alloc(VALUE klass)
1043{
1044 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1045 VALUE str = str_alloc_embed(klass, 0);
1046 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1048 return str;
1049}
1050
1051static VALUE
1052str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1053{
1054 VALUE str;
1055
1056 if (len < 0) {
1057 rb_raise(rb_eArgError, "negative string size (or size too big)");
1058 }
1059
1060 if (enc == NULL) {
1061 enc = rb_ascii8bit_encoding();
1062 }
1063
1064 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1065
1066 int termlen = rb_enc_mbminlen(enc);
1067
1068 if (STR_EMBEDDABLE_P(len, termlen)) {
1069 str = str_alloc_embed(klass, len + termlen);
1070 if (len == 0) {
1071 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1072 }
1073 }
1074 else {
1075 str = str_alloc_heap(klass);
1076 RSTRING(str)->as.heap.aux.capa = len;
1077 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1078 * integer overflow. If we can STATIC_ASSERT that, the following
1079 * mul_add_mul can be reverted to a simple ALLOC_N. */
1080 RSTRING(str)->as.heap.ptr =
1081 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1082 }
1083
1084 rb_enc_raw_set(str, enc);
1085
1086 if (ptr) {
1087 memcpy(RSTRING_PTR(str), ptr, len);
1088 }
1089 else {
1090 memset(RSTRING_PTR(str), 0, len);
1091 }
1092
1093 STR_SET_LEN(str, len);
1094 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1095 return str;
1096}
1097
1098static VALUE
1099str_new(VALUE klass, const char *ptr, long len)
1100{
1101 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1102}
1103
1104VALUE
1105rb_str_new(const char *ptr, long len)
1106{
1107 return str_new(rb_cString, ptr, len);
1108}
1109
1110VALUE
1111rb_usascii_str_new(const char *ptr, long len)
1112{
1113 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1114}
1115
1116VALUE
1117rb_utf8_str_new(const char *ptr, long len)
1118{
1119 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1120}
1121
1122VALUE
1123rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1124{
1125 return str_enc_new(rb_cString, ptr, len, enc);
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1133 * memory regions, and that cannot be detected by the MSAN. Just
1134 * trust the programmer that the argument passed here is a sane C
1135 * string. */
1136 __msan_unpoison_string(ptr);
1137 return rb_str_new(ptr, strlen(ptr));
1138}
1139
1140VALUE
1142{
1143 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1144}
1145
1146VALUE
1148{
1149 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1150}
1151
1152VALUE
1154{
1155 must_not_null(ptr);
1156 if (rb_enc_mbminlen(enc) != 1) {
1157 rb_raise(rb_eArgError, "wchar encoding given");
1158 }
1159 return rb_enc_str_new(ptr, strlen(ptr), enc);
1160}
1161
1162static VALUE
1163str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1164{
1165 VALUE str;
1166
1167 if (len < 0) {
1168 rb_raise(rb_eArgError, "negative string size (or size too big)");
1169 }
1170
1171 if (!ptr) {
1172 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1173 }
1174 else {
1175 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1176 str = str_alloc_heap(klass);
1177 RSTRING(str)->len = len;
1178 RSTRING(str)->as.heap.ptr = (char *)ptr;
1179 RSTRING(str)->as.heap.aux.capa = len;
1180 RBASIC(str)->flags |= STR_NOFREE;
1181 rb_enc_associate_index(str, encindex);
1182 }
1183 return str;
1184}
1185
1186VALUE
1187rb_str_new_static(const char *ptr, long len)
1188{
1189 return str_new_static(rb_cString, ptr, len, 0);
1190}
1191
1192VALUE
1194{
1195 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1196}
1197
1198VALUE
1200{
1201 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1202}
1203
1204VALUE
1206{
1207 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1208}
1209
1210static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1211 rb_encoding *from, rb_encoding *to,
1212 int ecflags, VALUE ecopts);
1213
1214static inline bool
1215is_enc_ascii_string(VALUE str, rb_encoding *enc)
1216{
1217 int encidx = rb_enc_to_index(enc);
1218 if (rb_enc_get_index(str) == encidx)
1219 return is_ascii_string(str);
1220 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1221}
1222
1223VALUE
1224rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1225{
1226 long len;
1227 const char *ptr;
1228 VALUE newstr;
1229
1230 if (!to) return str;
1231 if (!from) from = rb_enc_get(str);
1232 if (from == to) return str;
1233 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1234 rb_is_ascii8bit_enc(to)) {
1235 if (STR_ENC_GET(str) != to) {
1236 str = rb_str_dup(str);
1237 rb_enc_associate(str, to);
1238 }
1239 return str;
1240 }
1241
1242 RSTRING_GETMEM(str, ptr, len);
1243 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1244 from, to, ecflags, ecopts);
1245 if (NIL_P(newstr)) {
1246 /* some error, return original */
1247 return str;
1248 }
1249 return newstr;
1250}
1251
1252VALUE
1253rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1254 rb_encoding *from, int ecflags, VALUE ecopts)
1255{
1256 long olen;
1257
1258 olen = RSTRING_LEN(newstr);
1259 if (ofs < -olen || olen < ofs)
1260 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1261 if (ofs < 0) ofs += olen;
1262 if (!from) {
1263 STR_SET_LEN(newstr, ofs);
1264 return rb_str_cat(newstr, ptr, len);
1265 }
1266
1267 rb_str_modify(newstr);
1268 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1269 rb_enc_get(newstr),
1270 ecflags, ecopts);
1271}
1272
1273VALUE
1274rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1275{
1276 STR_SET_LEN(str, 0);
1277 rb_enc_associate(str, enc);
1278 rb_str_cat(str, ptr, len);
1279 return str;
1280}
1281
1282static VALUE
1283str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1284 rb_encoding *from, rb_encoding *to,
1285 int ecflags, VALUE ecopts)
1286{
1287 rb_econv_t *ec;
1289 long olen;
1290 VALUE econv_wrapper;
1291 const unsigned char *start, *sp;
1292 unsigned char *dest, *dp;
1293 size_t converted_output = (size_t)ofs;
1294
1295 olen = rb_str_capacity(newstr);
1296
1297 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1298 RBASIC_CLEAR_CLASS(econv_wrapper);
1299 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1300 if (!ec) return Qnil;
1301 DATA_PTR(econv_wrapper) = ec;
1302
1303 sp = (unsigned char*)ptr;
1304 start = sp;
1305 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1306 (dp = dest + converted_output),
1307 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1309 /* destination buffer short */
1310 size_t converted_input = sp - start;
1311 size_t rest = len - converted_input;
1312 converted_output = dp - dest;
1313 rb_str_set_len(newstr, converted_output);
1314 if (converted_input && converted_output &&
1315 rest < (LONG_MAX / converted_output)) {
1316 rest = (rest * converted_output) / converted_input;
1317 }
1318 else {
1319 rest = olen;
1320 }
1321 olen += rest < 2 ? 2 : rest;
1322 rb_str_resize(newstr, olen);
1323 }
1324 DATA_PTR(econv_wrapper) = 0;
1325 RB_GC_GUARD(econv_wrapper);
1326 rb_econv_close(ec);
1327 switch (ret) {
1328 case econv_finished:
1329 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1330 rb_str_set_len(newstr, len);
1331 rb_enc_associate(newstr, to);
1332 return newstr;
1333
1334 default:
1335 return Qnil;
1336 }
1337}
1338
1339VALUE
1341{
1342 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1343}
1344
1345VALUE
1347{
1348 rb_encoding *ienc;
1349 VALUE str;
1350 const int eidx = rb_enc_to_index(eenc);
1351
1352 if (!ptr) {
1353 return rb_enc_str_new(ptr, len, eenc);
1354 }
1355
1356 /* ASCII-8BIT case, no conversion */
1357 if ((eidx == rb_ascii8bit_encindex()) ||
1358 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1359 return rb_str_new(ptr, len);
1360 }
1361 /* no default_internal or same encoding, no conversion */
1362 ienc = rb_default_internal_encoding();
1363 if (!ienc || eenc == ienc) {
1364 return rb_enc_str_new(ptr, len, eenc);
1365 }
1366 /* ASCII compatible, and ASCII only string, no conversion in
1367 * default_internal */
1368 if ((eidx == rb_ascii8bit_encindex()) ||
1369 (eidx == rb_usascii_encindex()) ||
1370 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1371 return rb_enc_str_new(ptr, len, ienc);
1372 }
1373 /* convert from the given encoding to default_internal */
1374 str = rb_enc_str_new(NULL, 0, ienc);
1375 /* when the conversion failed for some reason, just ignore the
1376 * default_internal and result in the given encoding as-is. */
1377 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1378 rb_str_initialize(str, ptr, len, eenc);
1379 }
1380 return str;
1381}
1382
1383VALUE
1384rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1385{
1386 int eidx = rb_enc_to_index(eenc);
1387 if (eidx == rb_usascii_encindex() &&
1388 !is_ascii_string(str)) {
1389 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1390 return str;
1391 }
1392 rb_enc_associate_index(str, eidx);
1393 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1394}
1395
1396VALUE
1397rb_external_str_new(const char *ptr, long len)
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1406}
1407
1408VALUE
1409rb_locale_str_new(const char *ptr, long len)
1410{
1411 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1424}
1425
1426VALUE
1428{
1429 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1430}
1431
1432VALUE
1434{
1435 return rb_str_export_to_enc(str, rb_default_external_encoding());
1436}
1437
1438VALUE
1440{
1441 return rb_str_export_to_enc(str, rb_locale_encoding());
1442}
1443
1444VALUE
1446{
1447 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1448}
1449
1450static VALUE
1451str_replace_shared_without_enc(VALUE str2, VALUE str)
1452{
1453 const int termlen = TERM_LEN(str);
1454 char *ptr;
1455 long len;
1456
1457 RSTRING_GETMEM(str, ptr, len);
1458 if (str_embed_capa(str2) >= len + termlen) {
1459 char *ptr2 = RSTRING(str2)->as.embed.ary;
1460 STR_SET_EMBED(str2);
1461 memcpy(ptr2, RSTRING_PTR(str), len);
1462 TERM_FILL(ptr2+len, termlen);
1463 }
1464 else {
1465 VALUE root;
1466 if (STR_SHARED_P(str)) {
1467 root = RSTRING(str)->as.heap.aux.shared;
1468 RSTRING_GETMEM(str, ptr, len);
1469 }
1470 else {
1471 root = rb_str_new_frozen(str);
1472 RSTRING_GETMEM(root, ptr, len);
1473 }
1474 RUBY_ASSERT(OBJ_FROZEN(root));
1475
1476 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1477 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1478 rb_fatal("about to free a possible shared root");
1479 }
1480 char *ptr2 = STR_HEAP_PTR(str2);
1481 if (ptr2 != ptr) {
1482 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1483 }
1484 }
1485 FL_SET(str2, STR_NOEMBED);
1486 RSTRING(str2)->as.heap.ptr = ptr;
1487 STR_SET_SHARED(str2, root);
1488 }
1489
1490 STR_SET_LEN(str2, len);
1491
1492 return str2;
1493}
1494
1495static VALUE
1496str_replace_shared(VALUE str2, VALUE str)
1497{
1498 str_replace_shared_without_enc(str2, str);
1499 rb_enc_cr_str_exact_copy(str2, str);
1500 return str2;
1501}
1502
1503static VALUE
1504str_new_shared(VALUE klass, VALUE str)
1505{
1506 return str_replace_shared(str_alloc_heap(klass), str);
1507}
1508
1509VALUE
1511{
1512 return str_new_shared(rb_obj_class(str), str);
1513}
1514
1515VALUE
1517{
1518 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1519 return str_new_frozen(rb_obj_class(orig), orig);
1520}
1521
1522static VALUE
1523rb_str_new_frozen_String(VALUE orig)
1524{
1525 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1526 return str_new_frozen(rb_cString, orig);
1527}
1528
1529
1530VALUE
1531rb_str_frozen_bare_string(VALUE orig)
1532{
1533 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1534 return str_new_frozen(rb_cString, orig);
1535}
1536
1537VALUE
1538rb_str_tmp_frozen_acquire(VALUE orig)
1539{
1540 if (OBJ_FROZEN_RAW(orig)) return orig;
1541 return str_new_frozen_buffer(0, orig, FALSE);
1542}
1543
1544VALUE
1545rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1546{
1547 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1548 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1549
1550 VALUE str = str_alloc_heap(0);
1551 OBJ_FREEZE(str);
1552 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1553 FL_SET(str, STR_SHARED_ROOT);
1554
1555 size_t capa = str_capacity(orig, TERM_LEN(orig));
1556
1557 /* If the string is embedded then we want to create a copy that is heap
1558 * allocated. If the string is shared then the shared root must be
1559 * embedded, so we want to create a copy. If the string is a shared root
1560 * then it must be embedded, so we want to create a copy. */
1561 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1562 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1563 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1564 }
1565 else {
1566 /* orig must be heap allocated and not shared, so we can safely transfer
1567 * the pointer to str. */
1568 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1569 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1570 RBASIC(orig)->flags &= ~STR_NOFREE;
1571 STR_SET_SHARED(orig, str);
1572 if (RB_OBJ_SHAREABLE_P(orig)) {
1573 RB_OBJ_SET_SHAREABLE(str);
1574 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1575 }
1576 }
1577
1578 RSTRING(str)->len = RSTRING(orig)->len;
1579 RSTRING(str)->as.heap.aux.capa = capa;
1580
1581 return str;
1582}
1583
1584void
1585rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1586{
1587 if (RBASIC_CLASS(tmp) != 0)
1588 return;
1589
1590 if (STR_EMBED_P(tmp)) {
1592 }
1593 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1594 !OBJ_FROZEN_RAW(orig)) {
1595 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1596
1597 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1598 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1599 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1600
1601 /* Unshare orig since the root (tmp) only has this one child. */
1602 FL_UNSET_RAW(orig, STR_SHARED);
1603 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1604 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1606
1607 /* Make tmp embedded and empty so it is safe for sweeping. */
1608 STR_SET_EMBED(tmp);
1609 STR_SET_LEN(tmp, 0);
1610 }
1611 }
1612}
1613
1614static VALUE
1615str_new_frozen(VALUE klass, VALUE orig)
1616{
1617 return str_new_frozen_buffer(klass, orig, TRUE);
1618}
1619
1620static VALUE
1621heap_str_make_shared(VALUE klass, VALUE orig)
1622{
1623 RUBY_ASSERT(!STR_EMBED_P(orig));
1624 RUBY_ASSERT(!STR_SHARED_P(orig));
1626
1627 VALUE str = str_alloc_heap(klass);
1628 STR_SET_LEN(str, RSTRING_LEN(orig));
1629 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1630 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1631 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1632 RBASIC(orig)->flags &= ~STR_NOFREE;
1633 STR_SET_SHARED(orig, str);
1634 if (klass == 0)
1635 FL_UNSET_RAW(str, STR_BORROWED);
1636 return str;
1637}
1638
1639static VALUE
1640str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1641{
1642 VALUE str;
1643
1644 long len = RSTRING_LEN(orig);
1645 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1646 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1647
1648 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1649 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1650 RUBY_ASSERT(STR_EMBED_P(str));
1651 }
1652 else {
1653 if (FL_TEST_RAW(orig, STR_SHARED)) {
1654 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1655 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1656 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1657 RUBY_ASSERT(ofs >= 0);
1658 RUBY_ASSERT(rest >= 0);
1659 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1661
1662 if ((ofs > 0) || (rest > 0) ||
1663 (klass != RBASIC(shared)->klass) ||
1664 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1665 str = str_new_shared(klass, shared);
1666 RUBY_ASSERT(!STR_EMBED_P(str));
1667 RSTRING(str)->as.heap.ptr += ofs;
1668 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1669 }
1670 else {
1671 if (RBASIC_CLASS(shared) == 0)
1672 FL_SET_RAW(shared, STR_BORROWED);
1673 return shared;
1674 }
1675 }
1676 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1677 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1678 STR_SET_EMBED(str);
1679 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1680 STR_SET_LEN(str, RSTRING_LEN(orig));
1681 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1682 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1683 }
1684 else {
1685 if (RB_OBJ_SHAREABLE_P(orig)) {
1686 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 }
1688 else {
1689 str = heap_str_make_shared(klass, orig);
1690 }
1691 }
1692 }
1693
1694 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1695 OBJ_FREEZE(str);
1696 return str;
1697}
1698
1699VALUE
1700rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1701{
1702 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1703}
1704
1705static VALUE
1706str_new_empty_String(VALUE str)
1707{
1708 VALUE v = rb_str_new(0, 0);
1709 rb_enc_copy(v, str);
1710 return v;
1711}
1712
1713#define STR_BUF_MIN_SIZE 63
1714
1715VALUE
1717{
1718 if (STR_EMBEDDABLE_P(capa, 1)) {
1719 return str_alloc_embed(rb_cString, capa + 1);
1720 }
1721
1722 VALUE str = str_alloc_heap(rb_cString);
1723
1724 RSTRING(str)->as.heap.aux.capa = capa;
1725 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1726 RSTRING(str)->as.heap.ptr[0] = '\0';
1727
1728 return str;
1729}
1730
1731VALUE
1733{
1734 VALUE str;
1735 long len = strlen(ptr);
1736
1737 str = rb_str_buf_new(len);
1738 rb_str_buf_cat(str, ptr, len);
1739
1740 return str;
1741}
1742
1743VALUE
1745{
1746 return str_new(0, 0, len);
1747}
1748
1749void
1751{
1752 if (STR_EMBED_P(str)) {
1753 RB_DEBUG_COUNTER_INC(obj_str_embed);
1754 }
1755 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1756 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1757 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1758 }
1759 else {
1760 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1761 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1762 }
1763}
1764
1765size_t
1766rb_str_memsize(VALUE str)
1767{
1768 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1769 return STR_HEAP_SIZE(str);
1770 }
1771 else {
1772 return 0;
1773 }
1774}
1775
1776VALUE
1778{
1779 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1780}
1781
1782static inline void str_discard(VALUE str);
1783static void str_shared_replace(VALUE str, VALUE str2);
1784
1785void
1787{
1788 if (str != str2) str_shared_replace(str, str2);
1789}
1790
1791static void
1792str_shared_replace(VALUE str, VALUE str2)
1793{
1794 rb_encoding *enc;
1795 int cr;
1796 int termlen;
1797
1798 RUBY_ASSERT(str2 != str);
1799 enc = STR_ENC_GET(str2);
1800 cr = ENC_CODERANGE(str2);
1801 str_discard(str);
1802 termlen = rb_enc_mbminlen(enc);
1803
1804 STR_SET_LEN(str, RSTRING_LEN(str2));
1805
1806 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1807 STR_SET_EMBED(str);
1808 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812 else {
1813 if (STR_EMBED_P(str2)) {
1814 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1815 long len = RSTRING_LEN(str2);
1816 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1817
1818 char *new_ptr = ALLOC_N(char, len + termlen);
1819 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1820 RSTRING(str2)->as.heap.ptr = new_ptr;
1821 STR_SET_LEN(str2, len);
1822 RSTRING(str2)->as.heap.aux.capa = len;
1823 STR_SET_NOEMBED(str2);
1824 }
1825
1826 STR_SET_NOEMBED(str);
1827 FL_UNSET(str, STR_SHARED);
1828 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1829
1830 if (FL_TEST(str2, STR_SHARED)) {
1831 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1832 STR_SET_SHARED(str, shared);
1833 }
1834 else {
1835 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1836 }
1837
1838 /* abandon str2 */
1839 STR_SET_EMBED(str2);
1840 RSTRING_PTR(str2)[0] = 0;
1841 STR_SET_LEN(str2, 0);
1842 rb_enc_associate(str, enc);
1843 ENC_CODERANGE_SET(str, cr);
1844 }
1845}
1846
1847VALUE
1849{
1850 VALUE str;
1851
1852 if (RB_TYPE_P(obj, T_STRING)) {
1853 return obj;
1854 }
1855 str = rb_funcall(obj, idTo_s, 0);
1856 return rb_obj_as_string_result(str, obj);
1857}
1858
1859VALUE
1860rb_obj_as_string_result(VALUE str, VALUE obj)
1861{
1862 if (!RB_TYPE_P(str, T_STRING))
1863 return rb_any_to_s(obj);
1864 return str;
1865}
1866
1867static VALUE
1868str_replace(VALUE str, VALUE str2)
1869{
1870 long len;
1871
1872 len = RSTRING_LEN(str2);
1873 if (STR_SHARED_P(str2)) {
1874 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1876 STR_SET_NOEMBED(str);
1877 STR_SET_LEN(str, len);
1878 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1879 STR_SET_SHARED(str, shared);
1880 rb_enc_cr_str_exact_copy(str, str2);
1881 }
1882 else {
1883 str_replace_shared(str, str2);
1884 }
1885
1886 return str;
1887}
1888
1889static inline VALUE
1890ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1891{
1892 size_t size = rb_str_embed_size(capa, 0);
1893 RUBY_ASSERT(size > 0);
1894 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1895
1896 NEWOBJ_OF(str, struct RString, klass,
1898
1899 str->len = 0;
1900
1901 return (VALUE)str;
1902}
1903
1904static inline VALUE
1905ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1906{
1907 NEWOBJ_OF(str, struct RString, klass,
1908 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1909
1910 str->as.heap.aux.capa = 0;
1911 str->as.heap.ptr = NULL;
1912
1913 return (VALUE)str;
1914}
1915
1916static inline VALUE
1917str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1918{
1919 int encidx = 0;
1920 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1921 encidx = rb_enc_get_index(str);
1922 flags &= ~ENCODING_MASK;
1923 }
1924 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1925 if (encidx) rb_enc_associate_index(dup, encidx);
1926 return dup;
1927}
1928
1929static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1930
1931static inline VALUE
1932str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1933{
1934 VALUE flags = FL_TEST_RAW(str, flag_mask);
1935 long len = RSTRING_LEN(str);
1936
1937 RUBY_ASSERT(STR_EMBED_P(dup));
1938 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1939 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1940 STR_SET_LEN(dup, RSTRING_LEN(str));
1941 return str_duplicate_setup_encoding(str, dup, flags);
1942}
1943
1944static inline VALUE
1945str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1946{
1947 VALUE flags = FL_TEST_RAW(str, flag_mask);
1948 VALUE root = str;
1949 if (FL_TEST_RAW(str, STR_SHARED)) {
1950 root = RSTRING(str)->as.heap.aux.shared;
1951 }
1952 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1953 root = str = str_new_frozen(klass, str);
1954 flags = FL_TEST_RAW(str, flag_mask);
1955 }
1956 RUBY_ASSERT(!STR_SHARED_P(root));
1958
1959 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1960 FL_SET(root, STR_SHARED_ROOT);
1961 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1962 flags |= RSTRING_NOEMBED | STR_SHARED;
1963
1964 STR_SET_LEN(dup, RSTRING_LEN(str));
1965 return str_duplicate_setup_encoding(str, dup, flags);
1966}
1967
1968static inline VALUE
1969str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1970{
1971 if (STR_EMBED_P(str)) {
1972 return str_duplicate_setup_embed(klass, str, dup);
1973 }
1974 else {
1975 return str_duplicate_setup_heap(klass, str, dup);
1976 }
1977}
1978
1979static inline VALUE
1980str_duplicate(VALUE klass, VALUE str)
1981{
1982 VALUE dup;
1983 if (STR_EMBED_P(str)) {
1984 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1985 }
1986 else {
1987 dup = str_alloc_heap(klass);
1988 }
1989
1990 return str_duplicate_setup(klass, str, dup);
1991}
1992
1993VALUE
1995{
1996 return str_duplicate(rb_obj_class(str), str);
1997}
1998
1999/* :nodoc: */
2000VALUE
2001rb_str_dup_m(VALUE str)
2002{
2003 if (LIKELY(BARE_STRING_P(str))) {
2004 return str_duplicate(rb_cString, str);
2005 }
2006 else {
2007 return rb_obj_dup(str);
2008 }
2009}
2010
2011VALUE
2013{
2014 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2015 return str_duplicate(rb_cString, str);
2016}
2017
2018VALUE
2019rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2020{
2021 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2022 VALUE new_str, klass = rb_cString;
2023
2024 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2025 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2026 str_duplicate_setup_embed(klass, str, new_str);
2027 }
2028 else {
2029 new_str = ec_str_alloc_heap(ec, klass);
2030 str_duplicate_setup_heap(klass, str, new_str);
2031 }
2032 if (chilled) {
2033 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2034 }
2035 return new_str;
2036}
2037
2038VALUE
2039rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2040{
2041 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2042 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2043 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2044 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2045 return rb_str_freeze(str);
2046}
2047
2048/*
2049 * The documentation block below uses an include (instead of inline text)
2050 * because the included text has non-ASCII characters (which are not allowed in a C file).
2051 */
2052
2053/*
2054 *
2055 * call-seq:
2056 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2057 *
2058 * :include: doc/string/new.rdoc
2059 *
2060 */
2061
2062static VALUE
2063rb_str_init(int argc, VALUE *argv, VALUE str)
2064{
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, venc, vcapa;
2067 VALUE kwargs[2];
2068 rb_encoding *enc = 0;
2069 int n;
2070
2071 if (!keyword_ids[0]) {
2072 keyword_ids[0] = rb_id_encoding();
2073 CONST_ID(keyword_ids[1], "capacity");
2074 }
2075
2076 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2077 if (!NIL_P(opt)) {
2078 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2079 venc = kwargs[0];
2080 vcapa = kwargs[1];
2081 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2082 enc = rb_to_encoding(venc);
2083 }
2084 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2085 long capa = NUM2LONG(vcapa);
2086 long len = 0;
2087 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2088
2089 if (capa < STR_BUF_MIN_SIZE) {
2090 capa = STR_BUF_MIN_SIZE;
2091 }
2092 if (n == 1) {
2093 StringValue(orig);
2094 len = RSTRING_LEN(orig);
2095 if (capa < len) {
2096 capa = len;
2097 }
2098 if (orig == str) n = 0;
2099 }
2100 str_modifiable(str);
2101 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2102 /* make noembed always */
2103 const size_t size = (size_t)capa + termlen;
2104 const char *const old_ptr = RSTRING_PTR(str);
2105 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2106 char *new_ptr = ALLOC_N(char, size);
2107 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2108 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2109 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2110 RSTRING(str)->as.heap.ptr = new_ptr;
2111 }
2112 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2113 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2114 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2115 }
2116 STR_SET_LEN(str, len);
2117 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2118 if (n == 1) {
2119 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2120 rb_enc_cr_str_exact_copy(str, orig);
2121 }
2122 FL_SET(str, STR_NOEMBED);
2123 RSTRING(str)->as.heap.aux.capa = capa;
2124 }
2125 else if (n == 1) {
2126 rb_str_replace(str, orig);
2127 }
2128 if (enc) {
2129 rb_enc_associate(str, enc);
2131 }
2132 }
2133 else if (n == 1) {
2134 rb_str_replace(str, orig);
2135 }
2136 return str;
2137}
2138
2139/* :nodoc: */
2140static VALUE
2141rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2142{
2143 if (klass != rb_cString) {
2144 return rb_class_new_instance_pass_kw(argc, argv, klass);
2145 }
2146
2147 static ID keyword_ids[2];
2148 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2149 VALUE kwargs[2];
2150 rb_encoding *enc = NULL;
2151
2152 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2153 if (NIL_P(opt)) {
2154 return rb_class_new_instance_pass_kw(argc, argv, klass);
2155 }
2156
2157 keyword_ids[0] = rb_id_encoding();
2158 CONST_ID(keyword_ids[1], "capacity");
2159 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2160 encoding = kwargs[0];
2161 capacity = kwargs[1];
2162
2163 if (n == 1) {
2164 orig = StringValue(orig);
2165 }
2166 else {
2167 orig = Qnil;
2168 }
2169
2170 if (UNDEF_P(encoding)) {
2171 if (!NIL_P(orig)) {
2172 encoding = rb_obj_encoding(orig);
2173 }
2174 }
2175
2176 if (!UNDEF_P(encoding)) {
2177 enc = rb_to_encoding(encoding);
2178 }
2179
2180 // If capacity is nil, we're basically just duping `orig`.
2181 if (UNDEF_P(capacity)) {
2182 if (NIL_P(orig)) {
2183 VALUE empty_str = str_new(klass, "", 0);
2184 if (enc) {
2185 rb_enc_associate(empty_str, enc);
2186 }
2187 return empty_str;
2188 }
2189 VALUE copy = str_duplicate(klass, orig);
2190 rb_enc_associate(copy, enc);
2191 ENC_CODERANGE_CLEAR(copy);
2192 return copy;
2193 }
2194
2195 long capa = 0;
2196 capa = NUM2LONG(capacity);
2197 if (capa < 0) {
2198 capa = 0;
2199 }
2200
2201 if (!NIL_P(orig)) {
2202 long orig_capa = rb_str_capacity(orig);
2203 if (orig_capa > capa) {
2204 capa = orig_capa;
2205 }
2206 }
2207
2208 VALUE str = str_enc_new(klass, NULL, capa, enc);
2209 STR_SET_LEN(str, 0);
2210 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2211
2212 if (!NIL_P(orig)) {
2213 rb_str_buf_append(str, orig);
2214 }
2215
2216 return str;
2217}
2218
2219#ifdef NONASCII_MASK
2220#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2221
2222/*
2223 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2224 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2225 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2226 *
2227 * if (!(byte & 0x80))
2228 * byte |= 0x40; // turn on bit6
2229 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2230 *
2231 * This function calculates whether a byte is leading or not for all bytes
2232 * in the argument word by concurrently using the above logic, and then
2233 * adds up the number of leading bytes in the word.
2234 */
2235static inline uintptr_t
2236count_utf8_lead_bytes_with_word(const uintptr_t *s)
2237{
2238 uintptr_t d = *s;
2239
2240 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2241 d = (d>>6) | (~d>>7);
2242 d &= NONASCII_MASK >> 7;
2243
2244 /* Gather all bytes. */
2245#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2246 /* use only if it can use POPCNT */
2247 return rb_popcount_intptr(d);
2248#else
2249 d += (d>>8);
2250 d += (d>>16);
2251# if SIZEOF_VOIDP == 8
2252 d += (d>>32);
2253# endif
2254 return (d&0xF);
2255#endif
2256}
2257#endif
2258
2259static inline long
2260enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2261{
2262 long c;
2263 const char *q;
2264
2265 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266 long diff = (long)(e - p);
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268 }
2269#ifdef NONASCII_MASK
2270 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2271 uintptr_t len = 0;
2272 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2273 const uintptr_t *s, *t;
2274 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2275 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2276 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2277 while (p < (const char *)s) {
2278 if (is_utf8_lead_byte(*p)) len++;
2279 p++;
2280 }
2281 while (s < t) {
2282 len += count_utf8_lead_bytes_with_word(s);
2283 s++;
2284 }
2285 p = (const char *)s;
2286 }
2287 while (p < e) {
2288 if (is_utf8_lead_byte(*p)) len++;
2289 p++;
2290 }
2291 return (long)len;
2292 }
2293#endif
2294 else if (rb_enc_asciicompat(enc)) {
2295 c = 0;
2296 if (ENC_CODERANGE_CLEAN_P(cr)) {
2297 while (p < e) {
2298 if (ISASCII(*p)) {
2299 q = search_nonascii(p, e);
2300 if (!q)
2301 return c + (e - p);
2302 c += q - p;
2303 p = q;
2304 }
2305 p += rb_enc_fast_mbclen(p, e, enc);
2306 c++;
2307 }
2308 }
2309 else {
2310 while (p < e) {
2311 if (ISASCII(*p)) {
2312 q = search_nonascii(p, e);
2313 if (!q)
2314 return c + (e - p);
2315 c += q - p;
2316 p = q;
2317 }
2318 p += rb_enc_mbclen(p, e, enc);
2319 c++;
2320 }
2321 }
2322 return c;
2323 }
2324
2325 for (c=0; p<e; c++) {
2326 p += rb_enc_mbclen(p, e, enc);
2327 }
2328 return c;
2329}
2330
2331long
2332rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2333{
2334 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2335}
2336
2337/* To get strlen with cr
2338 * Note that given cr is not used.
2339 */
2340long
2341rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2342{
2343 long c;
2344 const char *q;
2345 int ret;
2346
2347 *cr = 0;
2348 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2349 long diff = (long)(e - p);
2350 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2351 }
2352 else if (rb_enc_asciicompat(enc)) {
2353 c = 0;
2354 while (p < e) {
2355 if (ISASCII(*p)) {
2356 q = search_nonascii(p, e);
2357 if (!q) {
2358 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2359 return c + (e - p);
2360 }
2361 c += q - p;
2362 p = q;
2363 }
2364 ret = rb_enc_precise_mbclen(p, e, enc);
2365 if (MBCLEN_CHARFOUND_P(ret)) {
2366 *cr |= ENC_CODERANGE_VALID;
2367 p += MBCLEN_CHARFOUND_LEN(ret);
2368 }
2369 else {
2371 p++;
2372 }
2373 c++;
2374 }
2375 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2376 return c;
2377 }
2378
2379 for (c=0; p<e; c++) {
2380 ret = rb_enc_precise_mbclen(p, e, enc);
2381 if (MBCLEN_CHARFOUND_P(ret)) {
2382 *cr |= ENC_CODERANGE_VALID;
2383 p += MBCLEN_CHARFOUND_LEN(ret);
2384 }
2385 else {
2387 if (p + rb_enc_mbminlen(enc) <= e)
2388 p += rb_enc_mbminlen(enc);
2389 else
2390 p = e;
2391 }
2392 }
2393 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2394 return c;
2395}
2396
2397/* enc must be str's enc or rb_enc_check(str, str2) */
2398static long
2399str_strlen(VALUE str, rb_encoding *enc)
2400{
2401 const char *p, *e;
2402 int cr;
2403
2404 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2405 if (!enc) enc = STR_ENC_GET(str);
2406 p = RSTRING_PTR(str);
2407 e = RSTRING_END(str);
2408 cr = ENC_CODERANGE(str);
2409
2410 if (cr == ENC_CODERANGE_UNKNOWN) {
2411 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2412 if (cr) ENC_CODERANGE_SET(str, cr);
2413 return n;
2414 }
2415 else {
2416 return enc_strlen(p, e, enc, cr);
2417 }
2418}
2419
2420long
2422{
2423 return str_strlen(str, NULL);
2424}
2425
2426/*
2427 * call-seq:
2428 * length -> integer
2429 *
2430 * :include: doc/string/length.rdoc
2431 *
2432 */
2433
2434VALUE
2436{
2437 return LONG2NUM(str_strlen(str, NULL));
2438}
2439
2440/*
2441 * call-seq:
2442 * bytesize -> integer
2443 *
2444 * :include: doc/string/bytesize.rdoc
2445 *
2446 */
2447
2448VALUE
2449rb_str_bytesize(VALUE str)
2450{
2451 return LONG2NUM(RSTRING_LEN(str));
2452}
2453
2454/*
2455 * call-seq:
2456 * empty? -> true or false
2457 *
2458 * Returns whether the length of +self+ is zero:
2459 *
2460 * 'hello'.empty? # => false
2461 * ' '.empty? # => false
2462 * ''.empty? # => true
2463 *
2464 * Related: see {Querying}[rdoc-ref:String@Querying].
2465 */
2466
2467static VALUE
2468rb_str_empty(VALUE str)
2469{
2470 return RBOOL(RSTRING_LEN(str) == 0);
2471}
2472
2473/*
2474 * call-seq:
2475 * self + other_string -> new_string
2476 *
2477 * Returns a new string containing +other_string+ concatenated to +self+:
2478 *
2479 * 'Hello from ' + self.to_s # => "Hello from main"
2480 *
2481 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2482 */
2483
2484VALUE
2486{
2487 VALUE str3;
2488 rb_encoding *enc;
2489 char *ptr1, *ptr2, *ptr3;
2490 long len1, len2;
2491 int termlen;
2492
2493 StringValue(str2);
2494 enc = rb_enc_check_str(str1, str2);
2495 RSTRING_GETMEM(str1, ptr1, len1);
2496 RSTRING_GETMEM(str2, ptr2, len2);
2497 termlen = rb_enc_mbminlen(enc);
2498 if (len1 > LONG_MAX - len2) {
2499 rb_raise(rb_eArgError, "string size too big");
2500 }
2501 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2502 ptr3 = RSTRING_PTR(str3);
2503 memcpy(ptr3, ptr1, len1);
2504 memcpy(ptr3+len1, ptr2, len2);
2505 TERM_FILL(&ptr3[len1+len2], termlen);
2506
2507 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2509 RB_GC_GUARD(str1);
2510 RB_GC_GUARD(str2);
2511 return str3;
2512}
2513
2514/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2515VALUE
2516rb_str_opt_plus(VALUE str1, VALUE str2)
2517{
2520 long len1, len2;
2521 MAYBE_UNUSED(char) *ptr1, *ptr2;
2522 RSTRING_GETMEM(str1, ptr1, len1);
2523 RSTRING_GETMEM(str2, ptr2, len2);
2524 int enc1 = rb_enc_get_index(str1);
2525 int enc2 = rb_enc_get_index(str2);
2526
2527 if (enc1 < 0) {
2528 return Qundef;
2529 }
2530 else if (enc2 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc1 != enc2) {
2534 return Qundef;
2535 }
2536 else if (len1 > LONG_MAX - len2) {
2537 return Qundef;
2538 }
2539 else {
2540 return rb_str_plus(str1, str2);
2541 }
2542
2543}
2544
2545/*
2546 * call-seq:
2547 * self * n -> new_string
2548 *
2549 * Returns a new string containing +n+ copies of +self+:
2550 *
2551 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2552 * 'No!' * 0 # => ""
2553 *
2554 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2555 */
2556
2557VALUE
2559{
2560 VALUE str2;
2561 long n, len;
2562 char *ptr2;
2563 int termlen;
2564
2565 if (times == INT2FIX(1)) {
2566 return str_duplicate(rb_cString, str);
2567 }
2568 if (times == INT2FIX(0)) {
2569 str2 = str_alloc_embed(rb_cString, 0);
2570 rb_enc_copy(str2, str);
2571 return str2;
2572 }
2573 len = NUM2LONG(times);
2574 if (len < 0) {
2575 rb_raise(rb_eArgError, "negative argument");
2576 }
2577 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2578 if (STR_EMBEDDABLE_P(len, 1)) {
2579 str2 = str_alloc_embed(rb_cString, len + 1);
2580 memset(RSTRING_PTR(str2), 0, len + 1);
2581 }
2582 else {
2583 str2 = str_alloc_heap(rb_cString);
2584 RSTRING(str2)->as.heap.aux.capa = len;
2585 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2586 }
2587 STR_SET_LEN(str2, len);
2588 rb_enc_copy(str2, str);
2589 return str2;
2590 }
2591 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2592 rb_raise(rb_eArgError, "argument too big");
2593 }
2594
2595 len *= RSTRING_LEN(str);
2596 termlen = TERM_LEN(str);
2597 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2598 ptr2 = RSTRING_PTR(str2);
2599 if (len) {
2600 n = RSTRING_LEN(str);
2601 memcpy(ptr2, RSTRING_PTR(str), n);
2602 while (n <= len/2) {
2603 memcpy(ptr2 + n, ptr2, n);
2604 n *= 2;
2605 }
2606 memcpy(ptr2 + n, ptr2, len-n);
2607 }
2608 STR_SET_LEN(str2, len);
2609 TERM_FILL(&ptr2[len], termlen);
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2611
2612 return str2;
2613}
2614
2615/*
2616 * call-seq:
2617 * self % object -> new_string
2618 *
2619 * Returns the result of formatting +object+ into the format specifications
2620 * contained in +self+
2621 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2622 *
2623 * '%05d' % 123 # => "00123"
2624 *
2625 * If +self+ contains multiple format specifications,
2626 * +object+ must be an array or hash containing the objects to be formatted:
2627 *
2628 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2629 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2630 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2631 *
2632 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2633 */
2634
2635static VALUE
2636rb_str_format_m(VALUE str, VALUE arg)
2637{
2638 VALUE tmp = rb_check_array_type(arg);
2639
2640 if (!NIL_P(tmp)) {
2641 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2642 }
2643 return rb_str_format(1, &arg, str);
2644}
2645
2646static inline void
2647rb_check_lockedtmp(VALUE str)
2648{
2649 if (FL_TEST(str, STR_TMPLOCK)) {
2650 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2651 }
2652}
2653
2654// If none of these flags are set, we know we have an modifiable string.
2655// If any is set, we need to do more detailed checks.
2656#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2657static inline void
2658str_modifiable(VALUE str)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2663 if (CHILLED_STRING_P(str)) {
2664 CHILLED_STRING_MUTATED(str);
2665 }
2666 rb_check_lockedtmp(str);
2667 rb_check_frozen(str);
2668 }
2669}
2670
2671static inline int
2672str_dependent_p(VALUE str)
2673{
2674 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2675 return FALSE;
2676 }
2677 else {
2678 return TRUE;
2679 }
2680}
2681
2682// If none of these flags are set, we know we have an independent string.
2683// If any is set, we need to do more detailed checks.
2684#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2685static inline int
2686str_independent(VALUE str)
2687{
2688 RUBY_ASSERT(ruby_thread_has_gvl_p());
2689
2690 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2691 str_modifiable(str);
2692 return !str_dependent_p(str);
2693 }
2694 return TRUE;
2695}
2696
2697static void
2698str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2699{
2700 RUBY_ASSERT(ruby_thread_has_gvl_p());
2701
2702 char *ptr;
2703 char *oldptr;
2704 long capa = len + expand;
2705
2706 if (len > capa) len = capa;
2707
2708 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2709 ptr = RSTRING(str)->as.heap.ptr;
2710 STR_SET_EMBED(str);
2711 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2712 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2713 STR_SET_LEN(str, len);
2714 return;
2715 }
2716
2717 ptr = ALLOC_N(char, (size_t)capa + termlen);
2718 oldptr = RSTRING_PTR(str);
2719 if (oldptr) {
2720 memcpy(ptr, oldptr, len);
2721 }
2722 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2723 xfree(oldptr);
2724 }
2725 STR_SET_NOEMBED(str);
2726 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2727 TERM_FILL(ptr + len, termlen);
2728 RSTRING(str)->as.heap.ptr = ptr;
2729 STR_SET_LEN(str, len);
2730 RSTRING(str)->as.heap.aux.capa = capa;
2731}
2732
2733void
2734rb_str_modify(VALUE str)
2735{
2736 if (!str_independent(str))
2737 str_make_independent(str);
2739}
2740
2741void
2743{
2744 RUBY_ASSERT(ruby_thread_has_gvl_p());
2745
2746 int termlen = TERM_LEN(str);
2747 long len = RSTRING_LEN(str);
2748
2749 if (expand < 0) {
2750 rb_raise(rb_eArgError, "negative expanding string size");
2751 }
2752 if (expand >= LONG_MAX - len) {
2753 rb_raise(rb_eArgError, "string size too big");
2754 }
2755
2756 if (!str_independent(str)) {
2757 str_make_independent_expand(str, len, expand, termlen);
2758 }
2759 else if (expand > 0) {
2760 RESIZE_CAPA_TERM(str, len + expand, termlen);
2761 }
2763}
2764
2765/* As rb_str_modify(), but don't clear coderange */
2766static void
2767str_modify_keep_cr(VALUE str)
2768{
2769 if (!str_independent(str))
2770 str_make_independent(str);
2772 /* Force re-scan later */
2774}
2775
2776static inline void
2777str_discard(VALUE str)
2778{
2779 str_modifiable(str);
2780 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2781 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2782 RSTRING(str)->as.heap.ptr = 0;
2783 STR_SET_LEN(str, 0);
2784 }
2785}
2786
2787void
2789{
2790 int encindex = rb_enc_get_index(str);
2791
2792 if (RB_UNLIKELY(encindex == -1)) {
2793 rb_raise(rb_eTypeError, "not encoding capable object");
2794 }
2795
2796 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2797 return;
2798 }
2799
2800 rb_encoding *enc = rb_enc_from_index(encindex);
2801 if (!rb_enc_asciicompat(enc)) {
2802 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2803 }
2804}
2805
2806VALUE
2808{
2809 RUBY_ASSERT(ruby_thread_has_gvl_p());
2810
2811 VALUE s = *ptr;
2812 if (!RB_TYPE_P(s, T_STRING)) {
2813 s = rb_str_to_str(s);
2814 *ptr = s;
2815 }
2816 return s;
2817}
2818
2819char *
2821{
2822 VALUE str = rb_string_value(ptr);
2823 return RSTRING_PTR(str);
2824}
2825
/* Returns 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    for (int i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2834
2835static const char *
2836str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2837{
2838 const char *e = s + len;
2839
2840 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2841 if (zero_filled(s, minlen)) return s;
2842 }
2843 return 0;
2844}
2845
2846static char *
2847str_fill_term(VALUE str, char *s, long len, int termlen)
2848{
2849 /* This function assumes that (capa + termlen) bytes of memory
2850 * is allocated, like many other functions in this file.
2851 */
2852 if (str_dependent_p(str)) {
2853 if (!zero_filled(s + len, termlen))
2854 str_make_independent_expand(str, len, 0L, termlen);
2855 }
2856 else {
2857 TERM_FILL(s + len, termlen);
2858 return s;
2859 }
2860 return RSTRING_PTR(str);
2861}
2862
2863void
2864rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2865{
2866 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2867 long len = RSTRING_LEN(str);
2868
2869 RUBY_ASSERT(capa >= len);
2870 if (capa - len < termlen) {
2871 rb_check_lockedtmp(str);
2872 str_make_independent_expand(str, len, 0L, termlen);
2873 }
2874 else if (str_dependent_p(str)) {
2875 if (termlen > oldtermlen)
2876 str_make_independent_expand(str, len, 0L, termlen);
2877 }
2878 else {
2879 if (!STR_EMBED_P(str)) {
2880 /* modify capa instead of realloc */
2881 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2882 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2883 }
2884 if (termlen > oldtermlen) {
2885 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2886 }
2887 }
2888
2889 return;
2890}
2891
2892static char *
2893str_null_check(VALUE str, int *w)
2894{
2895 char *s = RSTRING_PTR(str);
2896 long len = RSTRING_LEN(str);
2897 rb_encoding *enc = rb_enc_get(str);
2898 const int minlen = rb_enc_mbminlen(enc);
2899
2900 if (minlen > 1) {
2901 *w = 1;
2902 if (str_null_char(s, len, minlen, enc)) {
2903 return NULL;
2904 }
2905 return str_fill_term(str, s, len, minlen);
2906 }
2907 *w = 0;
2908 if (!s || memchr(s, 0, len)) {
2909 return NULL;
2910 }
2911 if (s[len]) {
2912 s = str_fill_term(str, s, len, minlen);
2913 }
2914 return s;
2915}
2916
2917char *
2918rb_str_to_cstr(VALUE str)
2919{
2920 int w;
2921 return str_null_check(str, &w);
2922}
2923
2924char *
2926{
2927 VALUE str = rb_string_value(ptr);
2928 int w;
2929 char *s = str_null_check(str, &w);
2930 if (!s) {
2931 if (w) {
2932 rb_raise(rb_eArgError, "string contains null char");
2933 }
2934 rb_raise(rb_eArgError, "string contains null byte");
2935 }
2936 return s;
2937}
2938
2939char *
2940rb_str_fill_terminator(VALUE str, const int newminlen)
2941{
2942 char *s = RSTRING_PTR(str);
2943 long len = RSTRING_LEN(str);
2944 return str_fill_term(str, s, len, newminlen);
2945}
2946
2947VALUE
2949{
2950 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2951 return str;
2952}
2953
2954/*
2955 * call-seq:
2956 * String.try_convert(object) -> object, new_string, or nil
2957 *
2958 * Attempts to convert the given +object+ to a string.
2959 *
2960 * If +object+ is already a string, returns +object+, unmodified.
2961 *
2962 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2963 * calls <tt>object.to_str</tt> and returns the result.
2964 *
2965 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2966 *
2967 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2968 */
2969static VALUE
2970rb_str_s_try_convert(VALUE dummy, VALUE str)
2971{
2972 return rb_check_string_type(str);
2973}
2974
2975static char*
2976str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2977{
2978 long nth = *nthp;
2979 if (rb_enc_mbmaxlen(enc) == 1) {
2980 p += nth;
2981 }
2982 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2983 p += nth * rb_enc_mbmaxlen(enc);
2984 }
2985 else if (rb_enc_asciicompat(enc)) {
2986 const char *p2, *e2;
2987 int n;
2988
2989 while (p < e && 0 < nth) {
2990 e2 = p + nth;
2991 if (e < e2) {
2992 *nthp = nth;
2993 return (char *)e;
2994 }
2995 if (ISASCII(*p)) {
2996 p2 = search_nonascii(p, e2);
2997 if (!p2) {
2998 nth -= e2 - p;
2999 *nthp = nth;
3000 return (char *)e2;
3001 }
3002 nth -= p2 - p;
3003 p = p2;
3004 }
3005 n = rb_enc_mbclen(p, e, enc);
3006 p += n;
3007 nth--;
3008 }
3009 *nthp = nth;
3010 if (nth != 0) {
3011 return (char *)e;
3012 }
3013 return (char *)p;
3014 }
3015 else {
3016 while (p < e && nth--) {
3017 p += rb_enc_mbclen(p, e, enc);
3018 }
3019 }
3020 if (p > e) p = e;
3021 *nthp = nth;
3022 return (char*)p;
3023}
3024
3025char*
3026rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3027{
3028 return str_nth_len(p, e, &nth, enc);
3029}
3030
3031static char*
3032str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3033{
3034 if (singlebyte)
3035 p += nth;
3036 else {
3037 p = str_nth_len(p, e, &nth, enc);
3038 }
3039 if (!p) return 0;
3040 if (p > e) p = e;
3041 return (char *)p;
3042}
3043
3044/* char offset to byte offset */
3045static long
3046str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3047{
3048 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3049 if (!pp) return e - p;
3050 return pp - p;
3051}
3052
3053long
3054rb_str_offset(VALUE str, long pos)
3055{
3056 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3057 STR_ENC_GET(str), single_byte_optimizable(str));
3058}
3059
3060#ifdef NONASCII_MASK
3061static char *
3062str_utf8_nth(const char *p, const char *e, long *nthp)
3063{
3064 long nth = *nthp;
3065 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3066 const uintptr_t *s, *t;
3067 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3068 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3069 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3070 while (p < (const char *)s) {
3071 if (is_utf8_lead_byte(*p)) nth--;
3072 p++;
3073 }
3074 do {
3075 nth -= count_utf8_lead_bytes_with_word(s);
3076 s++;
3077 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3078 p = (char *)s;
3079 }
3080 while (p < e) {
3081 if (is_utf8_lead_byte(*p)) {
3082 if (nth == 0) break;
3083 nth--;
3084 }
3085 p++;
3086 }
3087 *nthp = nth;
3088 return (char *)p;
3089}
3090
/* Byte distance from p to the position nth UTF-8 characters later, as
 * located by str_utf8_nth() within [p, e).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long count = nth;
    const char *pos = str_utf8_nth(p, e, &count);

    return pos - p;
}
3097#endif
3098
3099/* byte offset to char offset */
3100long
3101rb_str_sublen(VALUE str, long pos)
3102{
3103 if (single_byte_optimizable(str) || pos < 0)
3104 return pos;
3105 else {
3106 char *p = RSTRING_PTR(str);
3107 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3108 }
3109}
3110
3111static VALUE
3112str_subseq(VALUE str, long beg, long len)
3113{
3114 VALUE str2;
3115
3116 RUBY_ASSERT(beg >= 0);
3117 RUBY_ASSERT(len >= 0);
3118 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3119
3120 const int termlen = TERM_LEN(str);
3121 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3122 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3123 RB_GC_GUARD(str);
3124 return str2;
3125 }
3126
3127 str2 = str_alloc_heap(rb_cString);
3128 if (str_embed_capa(str2) >= len + termlen) {
3129 char *ptr2 = RSTRING(str2)->as.embed.ary;
3130 STR_SET_EMBED(str2);
3131 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3132 TERM_FILL(ptr2+len, termlen);
3133
3134 STR_SET_LEN(str2, len);
3135 RB_GC_GUARD(str);
3136 }
3137 else {
3138 str_replace_shared(str2, str);
3139 RUBY_ASSERT(!STR_EMBED_P(str2));
3140 ENC_CODERANGE_CLEAR(str2);
3141 RSTRING(str2)->as.heap.ptr += beg;
3142 if (RSTRING_LEN(str2) > len) {
3143 STR_SET_LEN(str2, len);
3144 }
3145 }
3146
3147 return str2;
3148}
3149
3150VALUE
3151rb_str_subseq(VALUE str, long beg, long len)
3152{
3153 VALUE str2 = str_subseq(str, beg, len);
3154 rb_enc_cr_str_copy_for_substr(str2, str);
3155 return str2;
3156}
3157
3158char *
3159rb_str_subpos(VALUE str, long beg, long *lenp)
3160{
3161 long len = *lenp;
3162 long slen = -1L;
3163 const long blen = RSTRING_LEN(str);
3164 rb_encoding *enc = STR_ENC_GET(str);
3165 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3166
3167 if (len < 0) return 0;
3168 if (beg < 0 && -beg < 0) return 0;
3169 if (!blen) {
3170 len = 0;
3171 }
3172 if (single_byte_optimizable(str)) {
3173 if (beg > blen) return 0;
3174 if (beg < 0) {
3175 beg += blen;
3176 if (beg < 0) return 0;
3177 }
3178 if (len > blen - beg)
3179 len = blen - beg;
3180 if (len < 0) return 0;
3181 p = s + beg;
3182 goto end;
3183 }
3184 if (beg < 0) {
3185 if (len > -beg) len = -beg;
3186 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3187 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3188 beg = -beg;
3189 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3190 p = e;
3191 if (!p) return 0;
3192 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3193 if (!p) return 0;
3194 len = e - p;
3195 goto end;
3196 }
3197 else {
3198 slen = str_strlen(str, enc);
3199 beg += slen;
3200 if (beg < 0) return 0;
3201 p = s + beg;
3202 if (len == 0) goto end;
3203 }
3204 }
3205 else if (beg > 0 && beg > blen) {
3206 return 0;
3207 }
3208 if (len == 0) {
3209 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3210 p = s + beg;
3211 }
3212#ifdef NONASCII_MASK
3213 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3214 enc == rb_utf8_encoding()) {
3215 p = str_utf8_nth(s, e, &beg);
3216 if (beg > 0) return 0;
3217 len = str_utf8_offset(p, e, len);
3218 }
3219#endif
3220 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3221 int char_sz = rb_enc_mbmaxlen(enc);
3222
3223 p = s + beg * char_sz;
3224 if (p > e) {
3225 return 0;
3226 }
3227 else if (len * char_sz > e - p)
3228 len = e - p;
3229 else
3230 len *= char_sz;
3231 }
3232 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3233 if (beg > 0) return 0;
3234 len = 0;
3235 }
3236 else {
3237 len = str_offset(p, e, len, enc, 0);
3238 }
3239 end:
3240 *lenp = len;
3241 RB_GC_GUARD(str);
3242 return p;
3243}
3244
3245static VALUE str_substr(VALUE str, long beg, long len, int empty);
3246
3247VALUE
3248rb_str_substr(VALUE str, long beg, long len)
3249{
3250 return str_substr(str, beg, len, TRUE);
3251}
3252
3253VALUE
3254rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3255{
3256 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3257}
3258
3259static VALUE
3260str_substr(VALUE str, long beg, long len, int empty)
3261{
3262 char *p = rb_str_subpos(str, beg, &len);
3263
3264 if (!p) return Qnil;
3265 if (!len && !empty) return Qnil;
3266
3267 beg = p - RSTRING_PTR(str);
3268
3269 VALUE str2 = str_subseq(str, beg, len);
3270 rb_enc_cr_str_copy_for_substr(str2, str);
3271 return str2;
3272}
3273
3274/* :nodoc: */
3275VALUE
3277{
3278 if (CHILLED_STRING_P(str)) {
3279 FL_UNSET_RAW(str, STR_CHILLED);
3280 }
3281
3282 if (OBJ_FROZEN(str)) return str;
3283 rb_str_resize(str, RSTRING_LEN(str));
3284 return rb_obj_freeze(str);
3285}
3286
3287/*
3288 * call-seq:
3289 * +string -> new_string or self
3290 *
3291 * Returns +self+ if +self+ is not frozen and can be mutated
3292 * without warning issuance.
3293 *
3294 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3295 *
3296 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3297 */
3298static VALUE
3299str_uplus(VALUE str)
3300{
3301 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3302 return rb_str_dup(str);
3303 }
3304 else {
3305 return str;
3306 }
3307}
3308
3309/*
3310 * call-seq:
3311 * -self -> frozen_string
3312 *
3313 * Returns a frozen string equal to +self+.
3314 *
3315 * The returned string is +self+ if and only if all of the following are true:
3316 *
3317 * - +self+ is already frozen.
3318 * - +self+ is an instance of \String (rather than of a subclass of \String)
3319 * - +self+ has no instance variables set on it.
3320 *
3321 * Otherwise, the returned string is a frozen copy of +self+.
3322 *
3323 * Returning +self+, when possible, saves duplicating +self+;
3324 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3325 *
3326 * It may also save duplicating other, already-existing, strings:
3327 *
3328 * s0 = 'foo'
3329 * s1 = 'foo'
3330 * s0.object_id == s1.object_id # => false
3331 * (-s0).object_id == (-s1).object_id # => true
3332 *
3333 * Note that method #-@ is convenient for defining a constant:
3334 *
3335 * FileName = -'config/database.yml'
3336 *
3337 * While its alias #dedup is better suited for chaining:
3338 *
3339 * 'foo'.dedup.gsub!('o')
3340 *
3341 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3342 */
3343static VALUE
3344str_uminus(VALUE str)
3345{
3346 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3347 str = rb_str_dup(str);
3348 }
3349 return rb_fstring(str);
3350}
3351
3352RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3353#define rb_str_dup_frozen rb_str_new_frozen
3354
3355VALUE
3357{
3358 rb_check_frozen(str);
3359 if (FL_TEST(str, STR_TMPLOCK)) {
3360 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3361 }
3362 FL_SET(str, STR_TMPLOCK);
3363 return str;
3364}
3365
3366VALUE
3368{
3369 rb_check_frozen(str);
3370 if (!FL_TEST(str, STR_TMPLOCK)) {
3371 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3372 }
3373 FL_UNSET(str, STR_TMPLOCK);
3374 return str;
3375}
3376
3377VALUE
3378rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3379{
3380 rb_str_locktmp(str);
3381 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3382}
3383
3384void
3386{
3387 RUBY_ASSERT(ruby_thread_has_gvl_p());
3388
3389 long capa;
3390 const int termlen = TERM_LEN(str);
3391
3392 str_modifiable(str);
3393 if (STR_SHARED_P(str)) {
3394 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3395 }
3396 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3397 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3398 }
3399
3400 int cr = ENC_CODERANGE(str);
3401 if (len == 0) {
3402 /* Empty string does not contain non-ASCII */
3404 }
3405 else if (cr == ENC_CODERANGE_UNKNOWN) {
3406 /* Leave unknown. */
3407 }
3408 else if (len > RSTRING_LEN(str)) {
3409 if (ENC_CODERANGE_CLEAN_P(cr)) {
3410 /* Update the coderange regarding the extended part. */
3411 const char *const prev_end = RSTRING_END(str);
3412 const char *const new_end = RSTRING_PTR(str) + len;
3413 rb_encoding *enc = rb_enc_get(str);
3414 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3415 ENC_CODERANGE_SET(str, cr);
3416 }
3417 else if (cr == ENC_CODERANGE_BROKEN) {
3418 /* May be valid now, by appended part. */
3420 }
3421 }
3422 else if (len < RSTRING_LEN(str)) {
3423 if (cr != ENC_CODERANGE_7BIT) {
3424 /* ASCII-only string is keeping after truncated. Valid
3425 * and broken may be invalid or valid, leave unknown. */
3427 }
3428 }
3429
3430 STR_SET_LEN(str, len);
3431 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3432}
3433
3434VALUE
3435rb_str_resize(VALUE str, long len)
3436{
3437 if (len < 0) {
3438 rb_raise(rb_eArgError, "negative string size (or size too big)");
3439 }
3440
3441 int independent = str_independent(str);
3442 long slen = RSTRING_LEN(str);
3443 const int termlen = TERM_LEN(str);
3444
3445 if (slen > len || (termlen != 1 && slen < len)) {
3447 }
3448
3449 {
3450 long capa;
3451 if (STR_EMBED_P(str)) {
3452 if (len == slen) return str;
3453 if (str_embed_capa(str) >= len + termlen) {
3454 STR_SET_LEN(str, len);
3455 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3456 return str;
3457 }
3458 str_make_independent_expand(str, slen, len - slen, termlen);
3459 }
3460 else if (str_embed_capa(str) >= len + termlen) {
3461 char *ptr = STR_HEAP_PTR(str);
3462 STR_SET_EMBED(str);
3463 if (slen > len) slen = len;
3464 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3465 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3466 STR_SET_LEN(str, len);
3467 if (independent) ruby_xfree(ptr);
3468 return str;
3469 }
3470 else if (!independent) {
3471 if (len == slen) return str;
3472 str_make_independent_expand(str, slen, len - slen, termlen);
3473 }
3474 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3475 (capa - len) > (len < 1024 ? len : 1024)) {
3476 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3477 (size_t)len + termlen, STR_HEAP_SIZE(str));
3478 RSTRING(str)->as.heap.aux.capa = len;
3479 }
3480 else if (len == slen) return str;
3481 STR_SET_LEN(str, len);
3482 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3483 }
3484 return str;
3485}
3486
3487static void
3488str_ensure_available_capa(VALUE str, long len)
3489{
3490 str_modify_keep_cr(str);
3491
3492 const int termlen = TERM_LEN(str);
3493 long olen = RSTRING_LEN(str);
3494
3495 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498
3499 long total = olen + len;
3500 long capa = str_capacity(str, termlen);
3501
3502 if (capa < total) {
3503 if (total >= LONG_MAX / 2) {
3504 capa = total;
3505 }
3506 while (total > capa) {
3507 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3508 }
3509 RESIZE_CAPA_TERM(str, capa, termlen);
3510 }
3511}
3512
3513static VALUE
3514str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3515{
3516 if (keep_cr) {
3517 str_modify_keep_cr(str);
3518 }
3519 else {
3520 rb_str_modify(str);
3521 }
3522 if (len == 0) return 0;
3523
3524 long total, olen, off = -1;
3525 char *sptr;
3526 const int termlen = TERM_LEN(str);
3527
3528 RSTRING_GETMEM(str, sptr, olen);
3529 if (ptr >= sptr && ptr <= sptr + olen) {
3530 off = ptr - sptr;
3531 }
3532
3533 long capa = str_capacity(str, termlen);
3534
3535 if (olen > LONG_MAX - len) {
3536 rb_raise(rb_eArgError, "string sizes too big");
3537 }
3538 total = olen + len;
3539 if (capa < total) {
3540 if (total >= LONG_MAX / 2) {
3541 capa = total;
3542 }
3543 while (total > capa) {
3544 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3545 }
3546 RESIZE_CAPA_TERM(str, capa, termlen);
3547 sptr = RSTRING_PTR(str);
3548 }
3549 if (off != -1) {
3550 ptr = sptr + off;
3551 }
3552 memcpy(sptr + olen, ptr, len);
3553 STR_SET_LEN(str, total);
3554 TERM_FILL(sptr + total, termlen); /* sentinel */
3555
3556 return str;
3557}
3558
3559#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3560#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3561
3562VALUE
3563rb_str_cat(VALUE str, const char *ptr, long len)
3564{
3565 if (len == 0) return str;
3566 if (len < 0) {
3567 rb_raise(rb_eArgError, "negative string size (or size too big)");
3568 }
3569 return str_buf_cat(str, ptr, len);
3570}
3571
3572VALUE
3573rb_str_cat_cstr(VALUE str, const char *ptr)
3574{
3575 must_not_null(ptr);
3576 return rb_str_buf_cat(str, ptr, strlen(ptr));
3577}
3578
3579static void
3580rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3581{
3582 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3583
3584 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3585 if (UNLIKELY(!str_independent(str))) {
3586 str_make_independent(str);
3587 }
3588
3589 long string_length = -1;
3590 const int null_terminator_length = 1;
3591 char *sptr;
3592 RSTRING_GETMEM(str, sptr, string_length);
3593
3594 // Ensure the resulting string wouldn't be too long.
3595 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3596 rb_raise(rb_eArgError, "string sizes too big");
3597 }
3598
3599 long string_capacity = str_capacity(str, null_terminator_length);
3600
3601 // Get the code range before any modifications since those might clear the code range.
3602 int cr = ENC_CODERANGE(str);
3603
3604 // Check if the string has spare string_capacity to write the new byte.
3605 if (LIKELY(string_capacity >= string_length + 1)) {
3606 // In fast path we can write the new byte and note the string's new length.
3607 sptr[string_length] = byte;
3608 STR_SET_LEN(str, string_length + 1);
3609 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3610 }
3611 else {
3612 // If there's not enough string_capacity, make a call into the general string concatenation function.
3613 str_buf_cat(str, (char *)&byte, 1);
3614 }
3615
3616 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3617 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3618 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3619 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3620 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3621 if (ISASCII(byte)) {
3623 }
3624 else {
3626
3627 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3628 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3629 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3630 }
3631 }
3632 }
3633}
3634
3635RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3636RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3637RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3638
3639static VALUE
3640rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3641 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3642{
3643 int str_encindex = ENCODING_GET(str);
3644 int res_encindex;
3645 int str_cr, res_cr;
3646 rb_encoding *str_enc, *ptr_enc;
3647
3648 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3649
3650 if (str_encindex == ptr_encindex) {
3651 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3652 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3653 }
3654 }
3655 else {
3656 str_enc = rb_enc_from_index(str_encindex);
3657 ptr_enc = rb_enc_from_index(ptr_encindex);
3658 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3659 if (len == 0)
3660 return str;
3661 if (RSTRING_LEN(str) == 0) {
3662 rb_str_buf_cat(str, ptr, len);
3663 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3664 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3665 return str;
3666 }
3667 goto incompatible;
3668 }
3669 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3670 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3671 }
3672 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3673 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3674 str_cr = rb_enc_str_coderange(str);
3675 }
3676 }
3677 }
3678 if (ptr_cr_ret)
3679 *ptr_cr_ret = ptr_cr;
3680
3681 if (str_encindex != ptr_encindex &&
3682 str_cr != ENC_CODERANGE_7BIT &&
3683 ptr_cr != ENC_CODERANGE_7BIT) {
3684 str_enc = rb_enc_from_index(str_encindex);
3685 ptr_enc = rb_enc_from_index(ptr_encindex);
3686 goto incompatible;
3687 }
3688
3689 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3690 res_encindex = str_encindex;
3691 res_cr = ENC_CODERANGE_UNKNOWN;
3692 }
3693 else if (str_cr == ENC_CODERANGE_7BIT) {
3694 if (ptr_cr == ENC_CODERANGE_7BIT) {
3695 res_encindex = str_encindex;
3696 res_cr = ENC_CODERANGE_7BIT;
3697 }
3698 else {
3699 res_encindex = ptr_encindex;
3700 res_cr = ptr_cr;
3701 }
3702 }
3703 else if (str_cr == ENC_CODERANGE_VALID) {
3704 res_encindex = str_encindex;
3705 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3706 res_cr = str_cr;
3707 else
3708 res_cr = ptr_cr;
3709 }
3710 else { /* str_cr == ENC_CODERANGE_BROKEN */
3711 res_encindex = str_encindex;
3712 res_cr = str_cr;
3713 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3714 }
3715
3716 if (len < 0) {
3717 rb_raise(rb_eArgError, "negative string size (or size too big)");
3718 }
3719 str_buf_cat(str, ptr, len);
3720 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3721 return str;
3722
3723 incompatible:
3724 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3725 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3727}
3728
3729VALUE
3730rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3731{
3732 return rb_enc_cr_str_buf_cat(str, ptr, len,
3733 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3734}
3735
3736VALUE
3738{
3739 /* ptr must reference NUL terminated ASCII string. */
3740 int encindex = ENCODING_GET(str);
3741 rb_encoding *enc = rb_enc_from_index(encindex);
3742 if (rb_enc_asciicompat(enc)) {
3743 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3744 encindex, ENC_CODERANGE_7BIT, 0);
3745 }
3746 else {
3747 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3748 while (*ptr) {
3749 unsigned int c = (unsigned char)*ptr;
3750 int len = rb_enc_codelen(c, enc);
3751 rb_enc_mbcput(c, buf, enc);
3752 rb_enc_cr_str_buf_cat(str, buf, len,
3753 encindex, ENC_CODERANGE_VALID, 0);
3754 ptr++;
3755 }
3756 return str;
3757 }
3758}
3759
3760VALUE
3762{
3763 int str2_cr = rb_enc_str_coderange(str2);
3764
3765 if (str_enc_fastpath(str)) {
3766 switch (str2_cr) {
3767 case ENC_CODERANGE_7BIT:
3768 // If RHS is 7bit we can do simple concatenation
3769 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3770 RB_GC_GUARD(str2);
3771 return str;
3773 // If RHS is valid, we can do simple concatenation if encodings are the same
3774 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3775 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3776 int str_cr = ENC_CODERANGE(str);
3777 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3778 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3779 }
3780 RB_GC_GUARD(str2);
3781 return str;
3782 }
3783 }
3784 }
3785
3786 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3787 ENCODING_GET(str2), str2_cr, &str2_cr);
3788
3789 ENC_CODERANGE_SET(str2, str2_cr);
3790
3791 return str;
3792}
3793
3794VALUE
3796{
3797 StringValue(str2);
3798 return rb_str_buf_append(str, str2);
3799}
3800
3801VALUE
3802rb_str_concat_literals(size_t num, const VALUE *strary)
3803{
3804 VALUE str;
3805 size_t i, s = 0;
3806 unsigned long len = 1;
3807
3808 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3809 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3810
3811 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3812 str = rb_str_buf_new(len);
3813 str_enc_copy_direct(str, strary[0]);
3814
3815 for (i = s; i < num; ++i) {
3816 const VALUE v = strary[i];
3817 int encidx = ENCODING_GET(v);
3818
3819 rb_str_buf_append(str, v);
3820 if (encidx != ENCINDEX_US_ASCII) {
3821 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3822 rb_enc_set_index(str, encidx);
3823 }
3824 }
3825 return str;
3826}
3827
3828/*
3829 * call-seq:
3830 * concat(*objects) -> string
3831 *
3832 * :include: doc/string/concat.rdoc
3833 */
3834static VALUE
3835rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3836{
3837 str_modifiable(str);
3838
3839 if (argc == 1) {
3840 return rb_str_concat(str, argv[0]);
3841 }
3842 else if (argc > 1) {
3843 int i;
3844 VALUE arg_str = rb_str_tmp_new(0);
3845 rb_enc_copy(arg_str, str);
3846 for (i = 0; i < argc; i++) {
3847 rb_str_concat(arg_str, argv[i]);
3848 }
3849 rb_str_buf_append(str, arg_str);
3850 }
3851
3852 return str;
3853}
3854
3855/*
3856 * call-seq:
3857 * append_as_bytes(*objects) -> self
3858 *
3859 * Concatenates each object in +objects+ into +self+; returns +self+;
3860 * performs no encoding validation or conversion:
3861 *
3862 * s = 'foo'
3863 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3864 * s.valid_encoding? # => false
3865 * s.append_as_bytes("\xAC 12")
3866 * s.valid_encoding? # => true
3867 *
3868 * When a given object is an integer,
3869 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3871 * appends only the low-order byte (similar to String#setbyte):
3872 *
3873 * s = ""
3874 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3875 * s.bytesize # => 2
3876 *
3877 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3878 */
3879
3880VALUE
3881rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3882{
3883 long needed_capacity = 0;
3884 volatile VALUE t0;
3885 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3886
3887 for (int index = 0; index < argc; index++) {
3888 VALUE obj = argv[index];
3889 enum ruby_value_type type = types[index] = rb_type(obj);
3890 switch (type) {
3891 case T_FIXNUM:
3892 case T_BIGNUM:
3893 needed_capacity++;
3894 break;
3895 case T_STRING:
3896 needed_capacity += RSTRING_LEN(obj);
3897 break;
3898 default:
3899 rb_raise(
3901 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3902 rb_obj_class(obj)
3903 );
3904 break;
3905 }
3906 }
3907
3908 str_ensure_available_capa(str, needed_capacity);
3909 char *sptr = RSTRING_END(str);
3910
3911 for (int index = 0; index < argc; index++) {
3912 VALUE obj = argv[index];
3913 enum ruby_value_type type = types[index];
3914 switch (type) {
3915 case T_FIXNUM:
3916 case T_BIGNUM: {
3917 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3918 char byte = (char)(NUM2INT(obj) & 0xFF);
3919 *sptr = byte;
3920 sptr++;
3921 break;
3922 }
3923 case T_STRING: {
3924 const char *ptr;
3925 long len;
3926 RSTRING_GETMEM(obj, ptr, len);
3927 memcpy(sptr, ptr, len);
3928 sptr += len;
3929 break;
3930 }
3931 default:
3932 rb_bug("append_as_bytes arguments should have been validated");
3933 }
3934 }
3935
3936 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3937 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3938
3939 int cr = ENC_CODERANGE(str);
3940 switch (cr) {
3941 case ENC_CODERANGE_7BIT: {
3942 for (int index = 0; index < argc; index++) {
3943 VALUE obj = argv[index];
3944 enum ruby_value_type type = types[index];
3945 switch (type) {
3946 case T_FIXNUM:
3947 case T_BIGNUM: {
3948 if (!ISASCII(NUM2INT(obj))) {
3949 goto clear_cr;
3950 }
3951 break;
3952 }
3953 case T_STRING: {
3954 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3955 goto clear_cr;
3956 }
3957 break;
3958 }
3959 default:
3960 rb_bug("append_as_bytes arguments should have been validated");
3961 }
3962 }
3963 break;
3964 }
3966 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3967 goto keep_cr;
3968 }
3969 else {
3970 goto clear_cr;
3971 }
3972 break;
3973 default:
3974 goto clear_cr;
3975 break;
3976 }
3977
3978 RB_GC_GUARD(t0);
3979
3980 clear_cr:
3981 // If no fast path was hit, we clear the coderange.
3982 // append_as_bytes is predominantly meant to be used in
3983 // buffering situation, hence it's likely the coderange
3984 // will never be scanned, so it's not worth spending time
3985 // precomputing the coderange except for simple and common
3986 // situations.
3988 keep_cr:
3989 return str;
3990}
3991
3992/*
3993 * call-seq:
3994 * self << object -> self
3995 *
3996 * Appends a string representation of +object+ to +self+;
3997 * returns +self+.
3998 *
3999 * If +object+ is a string, appends it to +self+:
4000 *
4001 * s = 'foo'
4002 * s << 'bar' # => "foobar"
4003 * s # => "foobar"
4004 *
4005 * If +object+ is an integer,
4006 * its value is considered a codepoint;
4007 * converts the value to a character before concatenating:
4008 *
4009 * s = 'foo'
4010 * s << 33 # => "foo!"
4011 *
 *  Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
 *  and the encoding of +self+ is Encoding::US_ASCII,
 *  changes the encoding to Encoding::ASCII_8BIT:
4015 *
4016 * s = 'foo'.encode(Encoding::US_ASCII)
4017 * s.encoding # => #<Encoding:US-ASCII>
4018 * s << 0xff # => "foo\xFF"
4019 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4020 *
4021 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4022 *
4023 * s = 'foo'
 *    s.encoding # => #<Encoding:UTF-8>
4025 * s << 0x00110000 # 1114112 out of char range (RangeError)
4026 * s = 'foo'.encode(Encoding::EUC_JP)
4027 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4028 *
4029 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4030 */
4031VALUE
4033{
4034 unsigned int code;
4035 rb_encoding *enc = STR_ENC_GET(str1);
4036 int encidx;
4037
4038 if (RB_INTEGER_TYPE_P(str2)) {
4039 if (rb_num_to_uint(str2, &code) == 0) {
4040 }
4041 else if (FIXNUM_P(str2)) {
4042 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4043 }
4044 else {
4045 rb_raise(rb_eRangeError, "bignum out of char range");
4046 }
4047 }
4048 else {
4049 return rb_str_append(str1, str2);
4050 }
4051
4052 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4053
4054 if (encidx >= 0) {
4055 rb_str_buf_cat_byte(str1, (unsigned char)code);
4056 }
4057 else {
4058 long pos = RSTRING_LEN(str1);
4059 int cr = ENC_CODERANGE(str1);
4060 int len;
4061 char *buf;
4062
4063 switch (len = rb_enc_codelen(code, enc)) {
4064 case ONIGERR_INVALID_CODE_POINT_VALUE:
4065 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4066 break;
4067 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4068 case 0:
4069 rb_raise(rb_eRangeError, "%u out of char range", code);
4070 break;
4071 }
4072 buf = ALLOCA_N(char, len + 1);
4073 rb_enc_mbcput(code, buf, enc);
4074 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4075 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4076 }
4077 rb_str_resize(str1, pos+len);
4078 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4079 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4081 }
4082 else if (cr == ENC_CODERANGE_BROKEN) {
4084 }
4085 ENC_CODERANGE_SET(str1, cr);
4086 }
4087 return str1;
4088}
4089
4090int
4091rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4092{
4093 int encidx = rb_enc_to_index(enc);
4094
4095 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4096 /* US-ASCII automatically extended to ASCII-8BIT */
4097 if (code > 0xFF) {
4098 rb_raise(rb_eRangeError, "%u out of char range", code);
4099 }
4100 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4101 return ENCINDEX_ASCII_8BIT;
4102 }
4103 return encidx;
4104 }
4105 else {
4106 return -1;
4107 }
4108}
4109
4110/*
4111 * call-seq:
 *    prepend(*other_strings) -> self
4113 *
4114 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4115 *
4116 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4117 *
4118 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4119 *
4120 */
4121
4122static VALUE
4123rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4124{
4125 str_modifiable(str);
4126
4127 if (argc == 1) {
4128 rb_str_update(str, 0L, 0L, argv[0]);
4129 }
4130 else if (argc > 1) {
4131 int i;
4132 VALUE arg_str = rb_str_tmp_new(0);
4133 rb_enc_copy(arg_str, str);
4134 for (i = 0; i < argc; i++) {
4135 rb_str_append(arg_str, argv[i]);
4136 }
4137 rb_str_update(str, 0L, 0L, arg_str);
4138 }
4139
4140 return str;
4141}
4142
4143st_index_t
4145{
4146 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4147 st_index_t precomputed_hash;
4148 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4149
4150 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4151 return precomputed_hash;
4152 }
4153
4154 return str_do_hash(str);
4155}
4156
4157int
4159{
4160 long len1, len2;
4161 const char *ptr1, *ptr2;
4162 RSTRING_GETMEM(str1, ptr1, len1);
4163 RSTRING_GETMEM(str2, ptr2, len2);
4164 return (len1 != len2 ||
4165 !rb_str_comparable(str1, str2) ||
4166 memcmp(ptr1, ptr2, len1) != 0);
4167}
4168
4169/*
4170 * call-seq:
4171 * hash -> integer
4172 *
4173 * :include: doc/string/hash.rdoc
4174 *
4175 */
4176
4177static VALUE
4178rb_str_hash_m(VALUE str)
4179{
4180 st_index_t hval = rb_str_hash(str);
4181 return ST2FIX(hval);
4182}
4183
4184#define lesser(a,b) (((a)>(b))?(b):(a))
4185
4186int
4188{
4189 int idx1, idx2;
4190 int rc1, rc2;
4191
4192 if (RSTRING_LEN(str1) == 0) return TRUE;
4193 if (RSTRING_LEN(str2) == 0) return TRUE;
4194 idx1 = ENCODING_GET(str1);
4195 idx2 = ENCODING_GET(str2);
4196 if (idx1 == idx2) return TRUE;
4197 rc1 = rb_enc_str_coderange(str1);
4198 rc2 = rb_enc_str_coderange(str2);
4199 if (rc1 == ENC_CODERANGE_7BIT) {
4200 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4201 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4202 return TRUE;
4203 }
4204 if (rc2 == ENC_CODERANGE_7BIT) {
4205 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4206 return TRUE;
4207 }
4208 return FALSE;
4209}
4210
4211int
4213{
4214 long len1, len2;
4215 const char *ptr1, *ptr2;
4216 int retval;
4217
4218 if (str1 == str2) return 0;
4219 RSTRING_GETMEM(str1, ptr1, len1);
4220 RSTRING_GETMEM(str2, ptr2, len2);
4221 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4222 if (len1 == len2) {
4223 if (!rb_str_comparable(str1, str2)) {
4224 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4225 return 1;
4226 return -1;
4227 }
4228 return 0;
4229 }
4230 if (len1 > len2) return 1;
4231 return -1;
4232 }
4233 if (retval > 0) return 1;
4234 return -1;
4235}
4236
4237/*
4238 * call-seq:
4239 * self == object -> true or false
4240 *
4241 * Returns whether +object+ is equal to +self+.
4242 *
4243 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4244 *
4245 * s = 'foo'
4246 * s == 'foo' # => true
4247 * s == 'food' # => false
4248 * s == 'FOO' # => false
4249 *
4250 * Returns +false+ if the two strings' encodings are not compatible:
4251 *
4252 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4253 *
4254 * When +object+ is not a string:
4255 *
4256 * - If +object+ responds to method <tt>to_str</tt>,
4257 * <tt>object == self</tt> is called and its return value is returned.
4258 * - If +object+ does not respond to <tt>to_str</tt>,
4259 * +false+ is returned.
4260 *
4261 * Related: {Comparing}[rdoc-ref:String@Comparing].
4262 */
4263
4264VALUE
4266{
4267 if (str1 == str2) return Qtrue;
4268 if (!RB_TYPE_P(str2, T_STRING)) {
4269 if (!rb_respond_to(str2, idTo_str)) {
4270 return Qfalse;
4271 }
4272 return rb_equal(str2, str1);
4273 }
4274 return rb_str_eql_internal(str1, str2);
4275}
4276
4277/*
4278 * call-seq:
4279 * eql?(object) -> true or false
4280 *
4281 * :include: doc/string/eql_p.rdoc
4282 *
4283 */
4284
4285VALUE
4286rb_str_eql(VALUE str1, VALUE str2)
4287{
4288 if (str1 == str2) return Qtrue;
4289 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4290 return rb_str_eql_internal(str1, str2);
4291}
4292
4293/*
4294 * call-seq:
4295 * self <=> other_string -> -1, 0, 1, or nil
4296 *
4297 * Compares +self+ and +other_string+, returning:
4298 *
4299 * - -1 if +other_string+ is larger.
4300 * - 0 if the two are equal.
4301 * - 1 if +other_string+ is smaller.
4302 * - +nil+ if the two are incomparable.
4303 *
4304 * Examples:
4305 *
4306 * 'foo' <=> 'foo' # => 0
4307 * 'foo' <=> 'food' # => -1
4308 * 'food' <=> 'foo' # => 1
4309 * 'FOO' <=> 'foo' # => -1
4310 * 'foo' <=> 'FOO' # => 1
4311 * 'foo' <=> 1 # => nil
4312 *
4313 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4314 */
4315
4316static VALUE
4317rb_str_cmp_m(VALUE str1, VALUE str2)
4318{
4319 int result;
4320 VALUE s = rb_check_string_type(str2);
4321 if (NIL_P(s)) {
4322 return rb_invcmp(str1, str2);
4323 }
4324 result = rb_str_cmp(str1, s);
4325 return INT2FIX(result);
4326}
4327
4328static VALUE str_casecmp(VALUE str1, VALUE str2);
4329static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4330
4331/*
4332 * call-seq:
4333 * casecmp(other_string) -> -1, 0, 1, or nil
4334 *
4335 * Ignoring case, compares +self+ and +other_string+; returns:
4336 *
4337 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4338 * - 0 if the two are equal.
4339 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4340 * - +nil+ if the two are incomparable.
4341 *
4342 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4343 *
4344 * Examples:
4345 *
4346 * 'foo'.casecmp('goo') # => -1
4347 * 'goo'.casecmp('foo') # => 1
4348 * 'foo'.casecmp('food') # => -1
4349 * 'food'.casecmp('foo') # => 1
4350 * 'FOO'.casecmp('foo') # => 0
4351 * 'foo'.casecmp('FOO') # => 0
4352 * 'foo'.casecmp(1) # => nil
4353 *
4354 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4355 */
4356
4357static VALUE
4358rb_str_casecmp(VALUE str1, VALUE str2)
4359{
4360 VALUE s = rb_check_string_type(str2);
4361 if (NIL_P(s)) {
4362 return Qnil;
4363 }
4364 return str_casecmp(str1, s);
4365}
4366
4367static VALUE
4368str_casecmp(VALUE str1, VALUE str2)
4369{
4370 long len;
4371 rb_encoding *enc;
4372 const char *p1, *p1end, *p2, *p2end;
4373
4374 enc = rb_enc_compatible(str1, str2);
4375 if (!enc) {
4376 return Qnil;
4377 }
4378
4379 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4380 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4381 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4382 while (p1 < p1end && p2 < p2end) {
4383 if (*p1 != *p2) {
4384 unsigned int c1 = TOLOWER(*p1 & 0xff);
4385 unsigned int c2 = TOLOWER(*p2 & 0xff);
4386 if (c1 != c2)
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4388 }
4389 p1++;
4390 p2++;
4391 }
4392 }
4393 else {
4394 while (p1 < p1end && p2 < p2end) {
4395 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4396 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4397
4398 if (0 <= c1 && 0 <= c2) {
4399 c1 = TOLOWER(c1);
4400 c2 = TOLOWER(c2);
4401 if (c1 != c2)
4402 return INT2FIX(c1 < c2 ? -1 : 1);
4403 }
4404 else {
4405 int r;
4406 l1 = rb_enc_mbclen(p1, p1end, enc);
4407 l2 = rb_enc_mbclen(p2, p2end, enc);
4408 len = l1 < l2 ? l1 : l2;
4409 r = memcmp(p1, p2, len);
4410 if (r != 0)
4411 return INT2FIX(r < 0 ? -1 : 1);
4412 if (l1 != l2)
4413 return INT2FIX(l1 < l2 ? -1 : 1);
4414 }
4415 p1 += l1;
4416 p2 += l2;
4417 }
4418 }
4419 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4420 if (p1 == p1end) return INT2FIX(-1);
4421 return INT2FIX(1);
4422}
4423
4424/*
4425 * call-seq:
4426 * casecmp?(other_string) -> true, false, or nil
4427 *
4428 * Returns +true+ if +self+ and +other_string+ are equal after
4429 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4430 *
4431 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4432 *
4433 * Examples:
4434 *
4435 * 'foo'.casecmp?('goo') # => false
4436 * 'goo'.casecmp?('foo') # => false
4437 * 'foo'.casecmp?('food') # => false
4438 * 'food'.casecmp?('foo') # => false
4439 * 'FOO'.casecmp?('foo') # => true
4440 * 'foo'.casecmp?('FOO') # => true
4441 * 'foo'.casecmp?(1) # => nil
4442 *
4443 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4444 */
4445
4446static VALUE
4447rb_str_casecmp_p(VALUE str1, VALUE str2)
4448{
4449 VALUE s = rb_check_string_type(str2);
4450 if (NIL_P(s)) {
4451 return Qnil;
4452 }
4453 return str_casecmp_p(str1, s);
4454}
4455
4456static VALUE
4457str_casecmp_p(VALUE str1, VALUE str2)
4458{
4459 rb_encoding *enc;
4460 VALUE folded_str1, folded_str2;
4461 VALUE fold_opt = sym_fold;
4462
4463 enc = rb_enc_compatible(str1, str2);
4464 if (!enc) {
4465 return Qnil;
4466 }
4467
4468 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4469 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4470
4471 return rb_str_eql(folded_str1, folded_str2);
4472}
4473
4474static long
4475strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4476 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4477{
4478 const char *search_start = str_ptr;
4479 long pos, search_len = str_len - offset;
4480
4481 for (;;) {
4482 const char *t;
4483 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4484 if (pos < 0) return pos;
4485 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4486 if (t == search_start + pos) break;
4487 search_len -= t - search_start;
4488 if (search_len <= 0) return -1;
4489 offset += t - search_start;
4490 search_start = t;
4491 }
4492 return pos + offset;
4493}
4494
4495/* found index in byte */
4496#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4497#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4498
4499static long
4500rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4501{
4502 const char *str_ptr, *str_ptr_end, *sub_ptr;
4503 long str_len, sub_len;
4504 rb_encoding *enc;
4505
4506 enc = rb_enc_check(str, sub);
4507 if (is_broken_string(sub)) return -1;
4508
4509 str_ptr = RSTRING_PTR(str);
4510 str_ptr_end = RSTRING_END(str);
4511 str_len = RSTRING_LEN(str);
4512 sub_ptr = RSTRING_PTR(sub);
4513 sub_len = RSTRING_LEN(sub);
4514
4515 if (str_len < sub_len) return -1;
4516
4517 if (offset != 0) {
4518 long str_len_char, sub_len_char;
4519 int single_byte = single_byte_optimizable(str);
4520 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4521 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4522 if (offset < 0) {
4523 offset += str_len_char;
4524 if (offset < 0) return -1;
4525 }
4526 if (str_len_char - offset < sub_len_char) return -1;
4527 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4528 str_ptr += offset;
4529 }
4530 if (sub_len == 0) return offset;
4531
4532 /* need proceed one character at a time */
4533 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4534}
4535
4536
4537/*
4538 * call-seq:
4539 * index(pattern, offset = 0) -> integer or nil
4540 *
4541 * :include: doc/string/index.rdoc
4542 *
4543 */
4544
4545static VALUE
4546rb_str_index_m(int argc, VALUE *argv, VALUE str)
4547{
4548 VALUE sub;
4549 VALUE initpos;
4550 rb_encoding *enc = STR_ENC_GET(str);
4551 long pos;
4552
4553 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4554 long slen = str_strlen(str, enc); /* str's enc */
4555 pos = NUM2LONG(initpos);
4556 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4557 if (RB_TYPE_P(sub, T_REGEXP)) {
4559 }
4560 return Qnil;
4561 }
4562 }
4563 else {
4564 pos = 0;
4565 }
4566
4567 if (RB_TYPE_P(sub, T_REGEXP)) {
4568 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4569 enc, single_byte_optimizable(str));
4570
4571 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4572 VALUE match = rb_backref_get();
4573 struct re_registers *regs = RMATCH_REGS(match);
4574 pos = rb_str_sublen(str, BEG(0));
4575 return LONG2NUM(pos);
4576 }
4577 }
4578 else {
4579 StringValue(sub);
4580 pos = rb_str_index(str, sub, pos);
4581 if (pos >= 0) {
4582 pos = rb_str_sublen(str, pos);
4583 return LONG2NUM(pos);
4584 }
4585 }
4586 return Qnil;
4587}
4588
4589/* Ensure that the given pos is a valid character boundary.
4590 * Note that in this function, "character" means a code point
4591 * (Unicode scalar value), not a grapheme cluster.
4592 */
4593static void
4594str_ensure_byte_pos(VALUE str, long pos)
4595{
4596 if (!single_byte_optimizable(str)) {
4597 const char *s = RSTRING_PTR(str);
4598 const char *e = RSTRING_END(str);
4599 const char *p = s + pos;
4600 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4601 rb_raise(rb_eIndexError,
4602 "offset %ld does not land on character boundary", pos);
4603 }
4604 }
4605}
4606
4607/*
4608 * call-seq:
4609 * byteindex(object, offset = 0) -> integer or nil
4610 *
4611 * Returns the 0-based integer index of a substring of +self+
4612 * specified by +object+ (a string or Regexp) and +offset+,
4613 * or +nil+ if there is no such substring;
4614 * the returned index is the count of _bytes_ (not characters).
4615 *
4616 * When +object+ is a string,
4617 * returns the index of the first found substring equal to +object+:
4618 *
4619 * s = 'foo' # => "foo"
4620 * s.size # => 3 # Three 1-byte characters.
4621 * s.bytesize # => 3 # Three bytes.
4622 * s.byteindex('f') # => 0
4623 * s.byteindex('o') # => 1
4624 * s.byteindex('oo') # => 1
4625 * s.byteindex('ooo') # => nil
4626 *
4627 * When +object+ is a Regexp,
4628 * returns the index of the first found substring matching +object+;
4629 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4630 *
4631 * s = 'foo'
4632 * s.byteindex(/f/) # => 0
4633 * $~ # => #<MatchData "f">
4634 * s.byteindex(/o/) # => 1
4635 * s.byteindex(/oo/) # => 1
4636 * s.byteindex(/ooo/) # => nil
4637 * $~ # => nil
4638 *
4639 * \Integer argument +offset+, if given, specifies the 0-based index
4640 * of the byte where searching is to begin.
4641 *
4642 * When +offset+ is non-negative,
4643 * searching begins at byte position +offset+:
4644 *
4645 * s = 'foo'
4646 * s.byteindex('o', 1) # => 1
4647 * s.byteindex('o', 2) # => 2
4648 * s.byteindex('o', 3) # => nil
4649 *
4650 * When +offset+ is negative, counts backward from the end of +self+:
4651 *
4652 * s = 'foo'
4653 * s.byteindex('o', -1) # => 2
4654 * s.byteindex('o', -2) # => 1
4655 * s.byteindex('o', -3) # => 1
4656 * s.byteindex('o', -4) # => nil
4657 *
4658 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4659 *
4660 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4661 * s.size # => 2 # Two 3-byte characters.
4662 * s.bytesize # => 6 # Six bytes.
4663 * s.byteindex("\uFFFF") # => 0
4664 * s.byteindex("\uFFFF", 1) # Raises IndexError
4665 * s.byteindex("\uFFFF", 2) # Raises IndexError
4666 * s.byteindex("\uFFFF", 3) # => 3
4667 * s.byteindex("\uFFFF", 4) # Raises IndexError
4668 * s.byteindex("\uFFFF", 5) # Raises IndexError
4669 * s.byteindex("\uFFFF", 6) # => nil
4670 *
4671 * Related: see {Querying}[rdoc-ref:String@Querying].
4672 */
4673
4674static VALUE
4675rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4676{
4677 VALUE sub;
4678 VALUE initpos;
4679 long pos;
4680
4681 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4682 long slen = RSTRING_LEN(str);
4683 pos = NUM2LONG(initpos);
4684 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4685 if (RB_TYPE_P(sub, T_REGEXP)) {
4687 }
4688 return Qnil;
4689 }
4690 }
4691 else {
4692 pos = 0;
4693 }
4694
4695 str_ensure_byte_pos(str, pos);
4696
4697 if (RB_TYPE_P(sub, T_REGEXP)) {
4698 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4699 VALUE match = rb_backref_get();
4700 struct re_registers *regs = RMATCH_REGS(match);
4701 pos = BEG(0);
4702 return LONG2NUM(pos);
4703 }
4704 }
4705 else {
4706 StringValue(sub);
4707 pos = rb_str_byteindex(str, sub, pos);
4708 if (pos >= 0) return LONG2NUM(pos);
4709 }
4710 return Qnil;
4711}
4712
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3).
 *
 * Scans the first +search_len+ bytes of +search_str+ backwards and returns
 * a pointer to the last byte equal to +chr+, or NULL when there is none
 * (or when +search_len+ is not positive). Like the libc function, +chr+
 * is converted to unsigned char before comparing.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;

    if (search_len <= 0) return NULL;

    const char *ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
4725
4726static long
4727str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4728{
4729 char *hit, *adjusted;
4730 int c;
4731 long slen, searchlen;
4732 char *sbeg, *e, *t;
4733
4734 sbeg = RSTRING_PTR(str);
4735 slen = RSTRING_LEN(sub);
4736 if (slen == 0) return s - sbeg;
4737 e = RSTRING_END(str);
4738 t = RSTRING_PTR(sub);
4739 c = *t & 0xff;
4740 searchlen = s - sbeg + 1;
4741
4742 if (memcmp(s, t, slen) == 0) {
4743 return s - sbeg;
4744 }
4745
4746 do {
4747 hit = memrchr(sbeg, c, searchlen);
4748 if (!hit) break;
4749 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4750 if (hit != adjusted) {
4751 searchlen = adjusted - sbeg;
4752 continue;
4753 }
4754 if (memcmp(hit, t, slen) == 0)
4755 return hit - sbeg;
4756 searchlen = adjusted - sbeg;
4757 } while (searchlen > 0);
4758
4759 return -1;
4760}
4761
4762/* found index in byte */
4763static long
4764rb_str_rindex(VALUE str, VALUE sub, long pos)
4765{
4766 long len, slen;
4767 char *sbeg, *s;
4768 rb_encoding *enc;
4769 int singlebyte;
4770
4771 enc = rb_enc_check(str, sub);
4772 if (is_broken_string(sub)) return -1;
4773 singlebyte = single_byte_optimizable(str);
4774 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4775 slen = str_strlen(sub, enc); /* rb_enc_check */
4776
4777 /* substring longer than string */
4778 if (len < slen) return -1;
4779 if (len - pos < slen) pos = len - slen;
4780 if (len == 0) return pos;
4781
4782 sbeg = RSTRING_PTR(str);
4783
4784 if (pos == 0) {
4785 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4786 return 0;
4787 else
4788 return -1;
4789 }
4790
4791 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4792 return str_rindex(str, sub, s, enc);
4793}
4794
4795/*
4796 * call-seq:
4797 * rindex(pattern, offset = self.length) -> integer or nil
4798 *
4799 * :include:doc/string/rindex.rdoc
4800 *
4801 */
4802
4803static VALUE
4804rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4805{
4806 VALUE sub;
4807 VALUE initpos;
4808 rb_encoding *enc = STR_ENC_GET(str);
4809 long pos, len = str_strlen(str, enc); /* str's enc */
4810
4811 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4812 pos = NUM2LONG(initpos);
4813 if (pos < 0 && (pos += len) < 0) {
4814 if (RB_TYPE_P(sub, T_REGEXP)) {
4816 }
4817 return Qnil;
4818 }
4819 if (pos > len) pos = len;
4820 }
4821 else {
4822 pos = len;
4823 }
4824
4825 if (RB_TYPE_P(sub, T_REGEXP)) {
4826 /* enc = rb_enc_check(str, sub); */
4827 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4828 enc, single_byte_optimizable(str));
4829
4830 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4831 VALUE match = rb_backref_get();
4832 struct re_registers *regs = RMATCH_REGS(match);
4833 pos = rb_str_sublen(str, BEG(0));
4834 return LONG2NUM(pos);
4835 }
4836 }
4837 else {
4838 StringValue(sub);
4839 pos = rb_str_rindex(str, sub, pos);
4840 if (pos >= 0) {
4841 pos = rb_str_sublen(str, pos);
4842 return LONG2NUM(pos);
4843 }
4844 }
4845 return Qnil;
4846}
4847
4848static long
4849rb_str_byterindex(VALUE str, VALUE sub, long pos)
4850{
4851 long len, slen;
4852 char *sbeg, *s;
4853 rb_encoding *enc;
4854
4855 enc = rb_enc_check(str, sub);
4856 if (is_broken_string(sub)) return -1;
4857 len = RSTRING_LEN(str);
4858 slen = RSTRING_LEN(sub);
4859
4860 /* substring longer than string */
4861 if (len < slen) return -1;
4862 if (len - pos < slen) pos = len - slen;
4863 if (len == 0) return pos;
4864
4865 sbeg = RSTRING_PTR(str);
4866
4867 if (pos == 0) {
4868 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4869 return 0;
4870 else
4871 return -1;
4872 }
4873
4874 s = sbeg + pos;
4875 return str_rindex(str, sub, s, enc);
4876}
4877
4878/*
4879 * call-seq:
4880 * byterindex(object, offset = self.bytesize) -> integer or nil
4881 *
4882 * Returns the 0-based integer index of a substring of +self+
4883 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4884 * or +nil+ if there is no such substring;
4885 * the returned index is the count of _bytes_ (not characters).
4886 *
4887 * When +object+ is a string,
4888 * returns the index of the _last_ found substring equal to +object+:
4889 *
4890 * s = 'foo' # => "foo"
4891 * s.size # => 3 # Three 1-byte characters.
4892 * s.bytesize # => 3 # Three bytes.
4893 * s.byterindex('f') # => 0
 *   s.byterindex('o') # => 2
 *   s.byterindex('oo') # => 1
 *   s.byterindex('ooo') # => nil
4897 *
4898 * When +object+ is a Regexp,
4899 * returns the index of the last found substring matching +object+;
4900 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4901 *
4902 * s = 'foo'
4903 * s.byterindex(/f/) # => 0
4904 * $~ # => #<MatchData "f">
4905 * s.byterindex(/o/) # => 2
4906 * s.byterindex(/oo/) # => 1
4907 * s.byterindex(/ooo/) # => nil
4908 * $~ # => nil
4909 *
4910 * The last match means starting at the possible last position,
4911 * not the last of the longest matches:
4912 *
4913 * s = 'foo'
4914 * s.byterindex(/o+/) # => 2
4915 * $~ #=> #<MatchData "o">
4916 *
4917 * To get the last longest match, use a negative lookbehind:
4918 *
4919 * s = 'foo'
4920 * s.byterindex(/(?<!o)o+/) # => 1
4921 * $~ # => #<MatchData "oo">
4922 *
4923 * Or use method #byteindex with negative lookahead:
4924 *
4925 * s = 'foo'
4926 * s.byteindex(/o+(?!.*o)/) # => 1
4927 * $~ #=> #<MatchData "oo">
4928 *
4929 * \Integer argument +offset+, if given, specifies the 0-based index
4930 * of the byte where searching is to end.
4931 *
4932 * When +offset+ is non-negative,
4933 * searching ends at byte position +offset+:
4934 *
4935 * s = 'foo'
4936 * s.byterindex('o', 0) # => nil
4937 * s.byterindex('o', 1) # => 1
4938 * s.byterindex('o', 2) # => 2
4939 * s.byterindex('o', 3) # => 2
4940 *
4941 * When +offset+ is negative, counts backward from the end of +self+:
4942 *
4943 * s = 'foo'
4944 * s.byterindex('o', -1) # => 2
4945 * s.byterindex('o', -2) # => 1
4946 * s.byterindex('o', -3) # => nil
4947 *
4948 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4949 *
4950 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4951 * s.size # => 2 # Two 3-byte characters.
4952 * s.bytesize # => 6 # Six bytes.
4953 * s.byterindex("\uFFFF") # => 3
4954 * s.byterindex("\uFFFF", 1) # Raises IndexError
4955 * s.byterindex("\uFFFF", 2) # Raises IndexError
4956 * s.byterindex("\uFFFF", 3) # => 3
4957 * s.byterindex("\uFFFF", 4) # Raises IndexError
4958 * s.byterindex("\uFFFF", 5) # Raises IndexError
4959 * s.byterindex("\uFFFF", 6) # => nil
4960 *
4961 * Related: see {Querying}[rdoc-ref:String@Querying].
4962 */
4963
4964static VALUE
4965rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4966{
4967 VALUE sub;
4968 VALUE initpos;
4969 long pos, len = RSTRING_LEN(str);
4970
4971 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4972 pos = NUM2LONG(initpos);
4973 if (pos < 0 && (pos += len) < 0) {
4974 if (RB_TYPE_P(sub, T_REGEXP)) {
4976 }
4977 return Qnil;
4978 }
4979 if (pos > len) pos = len;
4980 }
4981 else {
4982 pos = len;
4983 }
4984
4985 str_ensure_byte_pos(str, pos);
4986
4987 if (RB_TYPE_P(sub, T_REGEXP)) {
4988 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4989 VALUE match = rb_backref_get();
4990 struct re_registers *regs = RMATCH_REGS(match);
4991 pos = BEG(0);
4992 return LONG2NUM(pos);
4993 }
4994 }
4995 else {
4996 StringValue(sub);
4997 pos = rb_str_byterindex(str, sub, pos);
4998 if (pos >= 0) return LONG2NUM(pos);
4999 }
5000 return Qnil;
5001}
5002
5003/*
5004 * call-seq:
5005 * self =~ object -> integer or nil
5006 *
5007 * When +object+ is a Regexp, returns the index of the first substring in +self+
5008 * matched by +object+,
5009 * or +nil+ if no match is found;
5010 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5011 *
5012 * 'foo' =~ /f/ # => 0
5013 * $~ # => #<MatchData "f">
5014 * 'foo' =~ /o/ # => 1
5015 * $~ # => #<MatchData "o">
5016 * 'foo' =~ /x/ # => nil
5017 * $~ # => nil
5018 *
5019 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5020 * (see Regexp#=~):
5021 *
5022 * number = nil
5023 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5024 * number # => nil # Not assigned.
5025 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5026 * number # => "9" # Assigned.
5027 *
5028 * If +object+ is not a Regexp, returns the value
5029 * returned by <tt>object =~ self</tt>.
5030 *
5031 * Related: see {Querying}[rdoc-ref:String@Querying].
5032 */
5033
5034static VALUE
5035rb_str_match(VALUE x, VALUE y)
5036{
5037 switch (OBJ_BUILTIN_TYPE(y)) {
5038 case T_STRING:
5039 rb_raise(rb_eTypeError, "type mismatch: String given");
5040
5041 case T_REGEXP:
5042 return rb_reg_match(y, x);
5043
5044 default:
5045 return rb_funcall(y, idEqTilde, 1, x);
5046 }
5047}
5048
5049
5050static VALUE get_pat(VALUE);
5051
5052
5053/*
5054 * call-seq:
5055 * match(pattern, offset = 0) -> matchdata or nil
5056 * match(pattern, offset = 0) {|matchdata| ... } -> object
5057 *
5058 * Creates a MatchData object based on +self+ and the given arguments;
5059 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5060 *
5061 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5062 *
5063 * regexp = Regexp.new(pattern)
5064 *
5065 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5066 * (see Regexp#match):
5067 *
5068 * matchdata = regexp.match(self[offset..])
5069 *
5070 * With no block given, returns the computed +matchdata+ or +nil+:
5071 *
5072 * 'foo'.match('f') # => #<MatchData "f">
5073 * 'foo'.match('o') # => #<MatchData "o">
5074 * 'foo'.match('x') # => nil
5075 * 'foo'.match('f', 1) # => nil
5076 * 'foo'.match('o', 1) # => #<MatchData "o">
5077 *
5078 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5079 * returns the block's return value:
5080 *
5081 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5082 *
5083 * With a block given and +nil+ +matchdata+, does not call the block:
5084 *
5085 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5086 *
5087 * Related: see {Querying}[rdoc-ref:String@Querying].
5088 */
5089
5090static VALUE
5091rb_str_match_m(int argc, VALUE *argv, VALUE str)
5092{
5093 VALUE re, result;
5094 if (argc < 1)
5095 rb_check_arity(argc, 1, 2);
5096 re = argv[0];
5097 argv[0] = str;
5098 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5099 if (!NIL_P(result) && rb_block_given_p()) {
5100 return rb_yield(result);
5101 }
5102 return result;
5103}
5104
5105/*
5106 * call-seq:
5107 * match?(pattern, offset = 0) -> true or false
5108 *
5109 * Returns whether a match is found for +self+ and the given arguments;
5110 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5111 *
5112 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5113 *
5114 * regexp = Regexp.new(pattern)
5115 *
5116 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5117 * +false+ otherwise:
5118 *
5119 * 'foo'.match?(/o/) # => true
5120 * 'foo'.match?('o') # => true
5121 * 'foo'.match?(/x/) # => false
5122 * 'foo'.match?('f', 1) # => false
5123 * 'foo'.match?('o', 1) # => true
5124 *
5125 * Related: see {Querying}[rdoc-ref:String@Querying].
5126 */
5127
5128static VALUE
5129rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5130{
5131 VALUE re;
5132 rb_check_arity(argc, 1, 2);
5133 re = get_pat(argv[0]);
5134 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5135}
5136
/* Result of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no usable neighbor character could be produced */
    NEIGHBOR_FOUND,     /* the buffer now holds the neighbor character */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
5142
5143static enum neighbor_char
5144enc_succ_char(char *p, long len, rb_encoding *enc)
5145{
5146 long i;
5147 int l;
5148
5149 if (rb_enc_mbminlen(enc) > 1) {
5150 /* wchar, trivial case */
5151 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5152 if (!MBCLEN_CHARFOUND_P(r)) {
5153 return NEIGHBOR_NOT_CHAR;
5154 }
5155 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5156 l = rb_enc_code_to_mbclen(c, enc);
5157 if (!l) return NEIGHBOR_NOT_CHAR;
5158 if (l != len) return NEIGHBOR_WRAPPED;
5159 rb_enc_mbcput(c, p, enc);
5160 r = rb_enc_precise_mbclen(p, p + len, enc);
5161 if (!MBCLEN_CHARFOUND_P(r)) {
5162 return NEIGHBOR_NOT_CHAR;
5163 }
5164 return NEIGHBOR_FOUND;
5165 }
5166 while (1) {
5167 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5168 p[i] = '\0';
5169 if (i < 0)
5170 return NEIGHBOR_WRAPPED;
5171 ++((unsigned char*)p)[i];
5172 l = rb_enc_precise_mbclen(p, p+len, enc);
5173 if (MBCLEN_CHARFOUND_P(l)) {
5174 l = MBCLEN_CHARFOUND_LEN(l);
5175 if (l == len) {
5176 return NEIGHBOR_FOUND;
5177 }
5178 else {
5179 memset(p+l, 0xff, len-l);
5180 }
5181 }
5182 if (MBCLEN_INVALID_P(l) && i < len-1) {
5183 long len2;
5184 int l2;
5185 for (len2 = len-1; 0 < len2; len2--) {
5186 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5187 if (!MBCLEN_INVALID_P(l2))
5188 break;
5189 }
5190 memset(p+len2+1, 0xff, len-(len2+1));
5191 }
5192 }
5193}
5194
5195static enum neighbor_char
5196enc_pred_char(char *p, long len, rb_encoding *enc)
5197{
5198 long i;
5199 int l;
5200 if (rb_enc_mbminlen(enc) > 1) {
5201 /* wchar, trivial case */
5202 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5203 if (!MBCLEN_CHARFOUND_P(r)) {
5204 return NEIGHBOR_NOT_CHAR;
5205 }
5206 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5207 if (!c) return NEIGHBOR_NOT_CHAR;
5208 --c;
5209 l = rb_enc_code_to_mbclen(c, enc);
5210 if (!l) return NEIGHBOR_NOT_CHAR;
5211 if (l != len) return NEIGHBOR_WRAPPED;
5212 rb_enc_mbcput(c, p, enc);
5213 r = rb_enc_precise_mbclen(p, p + len, enc);
5214 if (!MBCLEN_CHARFOUND_P(r)) {
5215 return NEIGHBOR_NOT_CHAR;
5216 }
5217 return NEIGHBOR_FOUND;
5218 }
5219 while (1) {
5220 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5221 p[i] = '\xff';
5222 if (i < 0)
5223 return NEIGHBOR_WRAPPED;
5224 --((unsigned char*)p)[i];
5225 l = rb_enc_precise_mbclen(p, p+len, enc);
5226 if (MBCLEN_CHARFOUND_P(l)) {
5227 l = MBCLEN_CHARFOUND_LEN(l);
5228 if (l == len) {
5229 return NEIGHBOR_FOUND;
5230 }
5231 else {
5232 memset(p+l, 0, len-l);
5233 }
5234 }
5235 if (MBCLEN_INVALID_P(l) && i < len-1) {
5236 long len2;
5237 int l2;
5238 for (len2 = len-1; 0 < len2; len2--) {
5239 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5240 if (!MBCLEN_INVALID_P(l2))
5241 break;
5242 }
5243 memset(p+len2+1, 0, len-(len2+1));
5244 }
5245 }
5246}
5247
5248/*
5249 overwrite +p+ by succeeding letter in +enc+ and returns
5250 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5251 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5252 assuming each ranges are successive, and mbclen
5253 never change in each ranges.
5254 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5255 character.
5256 */
5257static enum neighbor_char
5258enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5259{
5260 enum neighbor_char ret;
5261 unsigned int c;
5262 int ctype;
5263 int range;
5264 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5265
5266 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5267 int try;
5268 const int max_gaps = 1;
5269
5270 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5271 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5272 ctype = ONIGENC_CTYPE_DIGIT;
5273 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5274 ctype = ONIGENC_CTYPE_ALPHA;
5275 else
5276 return NEIGHBOR_NOT_CHAR;
5277
5278 MEMCPY(save, p, char, len);
5279 for (try = 0; try <= max_gaps; ++try) {
5280 ret = enc_succ_char(p, len, enc);
5281 if (ret == NEIGHBOR_FOUND) {
5282 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5283 if (rb_enc_isctype(c, ctype, enc))
5284 return NEIGHBOR_FOUND;
5285 }
5286 }
5287 MEMCPY(p, save, char, len);
5288 range = 1;
5289 while (1) {
5290 MEMCPY(save, p, char, len);
5291 ret = enc_pred_char(p, len, enc);
5292 if (ret == NEIGHBOR_FOUND) {
5293 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5294 if (!rb_enc_isctype(c, ctype, enc)) {
5295 MEMCPY(p, save, char, len);
5296 break;
5297 }
5298 }
5299 else {
5300 MEMCPY(p, save, char, len);
5301 break;
5302 }
5303 range++;
5304 }
5305 if (range == 1) {
5306 return NEIGHBOR_NOT_CHAR;
5307 }
5308
5309 if (ctype != ONIGENC_CTYPE_DIGIT) {
5310 MEMCPY(carry, p, char, len);
5311 return NEIGHBOR_WRAPPED;
5312 }
5313
5314 MEMCPY(carry, p, char, len);
5315 enc_succ_char(carry, len, enc);
5316 return NEIGHBOR_WRAPPED;
5317}
5318
5319
5320static VALUE str_succ(VALUE str);
5321
5322/*
5323 * call-seq:
5324 * succ -> new_str
5325 *
5326 * :include: doc/string/succ.rdoc
5327 *
5328 */
5329
5330VALUE
5332{
5333 VALUE str;
5334 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5335 rb_enc_cr_str_copy_for_substr(str, orig);
5336 return str_succ(str);
5337}
5338
5339static VALUE
5340str_succ(VALUE str)
5341{
5342 rb_encoding *enc;
5343 char *sbeg, *s, *e, *last_alnum = 0;
5344 int found_alnum = 0;
5345 long l, slen;
5346 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5347 long carry_pos = 0, carry_len = 1;
5348 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5349
5350 slen = RSTRING_LEN(str);
5351 if (slen == 0) return str;
5352
5353 enc = STR_ENC_GET(str);
5354 sbeg = RSTRING_PTR(str);
5355 s = e = sbeg + slen;
5356
5357 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5358 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5359 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5360 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5361 break;
5362 }
5363 }
5364 l = rb_enc_precise_mbclen(s, e, enc);
5365 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5366 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5367 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5368 switch (neighbor) {
5369 case NEIGHBOR_NOT_CHAR:
5370 continue;
5371 case NEIGHBOR_FOUND:
5372 return str;
5373 case NEIGHBOR_WRAPPED:
5374 last_alnum = s;
5375 break;
5376 }
5377 found_alnum = 1;
5378 carry_pos = s - sbeg;
5379 carry_len = l;
5380 }
5381 if (!found_alnum) { /* str contains no alnum */
5382 s = e;
5383 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5384 enum neighbor_char neighbor;
5385 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5386 l = rb_enc_precise_mbclen(s, e, enc);
5387 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5388 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5389 MEMCPY(tmp, s, char, l);
5390 neighbor = enc_succ_char(tmp, l, enc);
5391 switch (neighbor) {
5392 case NEIGHBOR_FOUND:
5393 MEMCPY(s, tmp, char, l);
5394 return str;
5395 break;
5396 case NEIGHBOR_WRAPPED:
5397 MEMCPY(s, tmp, char, l);
5398 break;
5399 case NEIGHBOR_NOT_CHAR:
5400 break;
5401 }
5402 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5403 /* wrapped to \0...\0. search next valid char. */
5404 enc_succ_char(s, l, enc);
5405 }
5406 if (!rb_enc_asciicompat(enc)) {
5407 MEMCPY(carry, s, char, l);
5408 carry_len = l;
5409 }
5410 carry_pos = s - sbeg;
5411 }
5413 }
5414 RESIZE_CAPA(str, slen + carry_len);
5415 sbeg = RSTRING_PTR(str);
5416 s = sbeg + carry_pos;
5417 memmove(s + carry_len, s, slen - carry_pos);
5418 memmove(s, carry, carry_len);
5419 slen += carry_len;
5420 STR_SET_LEN(str, slen);
5421 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5423 return str;
5424}
5425
5426
5427/*
5428 * call-seq:
5429 * succ! -> self
5430 *
5431 * Like String#succ, but modifies +self+ in place; returns +self+.
5432 *
5433 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5434 */
5435
5436static VALUE
5437rb_str_succ_bang(VALUE str)
5438{
5439 rb_str_modify(str);
5440 str_succ(str);
5441 return str;
5442}
5443
/* Returns 1 if each of the first +len+ bytes of +s+ is a digit, 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5453
5454static int
5455str_upto_i(VALUE str, VALUE arg)
5456{
5457 rb_yield(str);
5458 return 0;
5459}
5460
5461/*
5462 * call-seq:
5463 * upto(other_string, exclusive = false) {|string| ... } -> self
5464 * upto(other_string, exclusive = false) -> new_enumerator
5465 *
5466 * :include: doc/string/upto.rdoc
5467 *
5468 */
5469
5470static VALUE
5471rb_str_upto(int argc, VALUE *argv, VALUE beg)
5472{
5473 VALUE end, exclusive;
5474
5475 rb_scan_args(argc, argv, "11", &end, &exclusive);
5476 RETURN_ENUMERATOR(beg, argc, argv);
5477 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5478}
5479
5480VALUE
5481rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5482{
5483 VALUE current, after_end;
5484 ID succ;
5485 int n, ascii;
5486 rb_encoding *enc;
5487
5488 CONST_ID(succ, "succ");
5489 StringValue(end);
5490 enc = rb_enc_check(beg, end);
5491 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5492 /* single character */
5493 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5494 char c = RSTRING_PTR(beg)[0];
5495 char e = RSTRING_PTR(end)[0];
5496
5497 if (c > e || (excl && c == e)) return beg;
5498 for (;;) {
5499 VALUE str = rb_enc_str_new(&c, 1, enc);
5501 if ((*each)(str, arg)) break;
5502 if (!excl && c == e) break;
5503 c++;
5504 if (excl && c == e) break;
5505 }
5506 return beg;
5507 }
5508 /* both edges are all digits */
5509 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5510 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5511 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5512 VALUE b, e;
5513 int width;
5514
5515 width = RSTRING_LENINT(beg);
5516 b = rb_str_to_inum(beg, 10, FALSE);
5517 e = rb_str_to_inum(end, 10, FALSE);
5518 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5519 long bi = FIX2LONG(b);
5520 long ei = FIX2LONG(e);
5521 rb_encoding *usascii = rb_usascii_encoding();
5522
5523 while (bi <= ei) {
5524 if (excl && bi == ei) break;
5525 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5526 bi++;
5527 }
5528 }
5529 else {
5530 ID op = excl ? '<' : idLE;
5531 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5532
5533 args[0] = INT2FIX(width);
5534 while (rb_funcall(b, op, 1, e)) {
5535 args[1] = b;
5536 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5537 b = rb_funcallv(b, succ, 0, 0);
5538 }
5539 }
5540 return beg;
5541 }
5542 /* normal case */
5543 n = rb_str_cmp(beg, end);
5544 if (n > 0 || (excl && n == 0)) return beg;
5545
5546 after_end = rb_funcallv(end, succ, 0, 0);
5547 current = str_duplicate(rb_cString, beg);
5548 while (!rb_str_equal(current, after_end)) {
5549 VALUE next = Qnil;
5550 if (excl || !rb_str_equal(current, end))
5551 next = rb_funcallv(current, succ, 0, 0);
5552 if ((*each)(current, arg)) break;
5553 if (NIL_P(next)) break;
5554 current = next;
5555 StringValue(current);
5556 if (excl && rb_str_equal(current, end)) break;
5557 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5558 break;
5559 }
5560
5561 return beg;
5562}
5563
5564VALUE
5565rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5566{
5567 VALUE current;
5568 ID succ;
5569
5570 CONST_ID(succ, "succ");
5571 /* both edges are all digits */
5572 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5573 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5574 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5575 int width = RSTRING_LENINT(beg);
5576 b = rb_str_to_inum(beg, 10, FALSE);
5577 if (FIXNUM_P(b)) {
5578 long bi = FIX2LONG(b);
5579 rb_encoding *usascii = rb_usascii_encoding();
5580
5581 while (FIXABLE(bi)) {
5582 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5583 bi++;
5584 }
5585 b = LONG2NUM(bi);
5586 }
5587 args[0] = INT2FIX(width);
5588 while (1) {
5589 args[1] = b;
5590 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5591 b = rb_funcallv(b, succ, 0, 0);
5592 }
5593 }
5594 /* normal case */
5595 current = str_duplicate(rb_cString, beg);
5596 while (1) {
5597 VALUE next = rb_funcallv(current, succ, 0, 0);
5598 if ((*each)(current, arg)) break;
5599 current = next;
5600 StringValue(current);
5601 if (RSTRING_LEN(current) == 0)
5602 break;
5603 }
5604
5605 return beg;
5606}
5607
5608static int
5609include_range_i(VALUE str, VALUE arg)
5610{
5611 VALUE *argp = (VALUE *)arg;
5612 if (!rb_equal(str, *argp)) return 0;
5613 *argp = Qnil;
5614 return 1;
5615}
5616
5617VALUE
5618rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5619{
5620 beg = rb_str_new_frozen(beg);
5621 StringValue(end);
5622 end = rb_str_new_frozen(end);
5623 if (NIL_P(val)) return Qfalse;
5624 val = rb_check_string_type(val);
5625 if (NIL_P(val)) return Qfalse;
5626 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5627 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5628 rb_enc_asciicompat(STR_ENC_GET(val))) {
5629 const char *bp = RSTRING_PTR(beg);
5630 const char *ep = RSTRING_PTR(end);
5631 const char *vp = RSTRING_PTR(val);
5632 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5633 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5634 return Qfalse;
5635 else {
5636 char b = *bp;
5637 char e = *ep;
5638 char v = *vp;
5639
5640 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5641 if (b <= v && v < e) return Qtrue;
5642 return RBOOL(!RTEST(exclusive) && v == e);
5643 }
5644 }
5645 }
5646#if 0
5647 /* both edges are all digits */
5648 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5649 all_digits_p(bp, RSTRING_LEN(beg)) &&
5650 all_digits_p(ep, RSTRING_LEN(end))) {
5651 /* TODO */
5652 }
5653#endif
5654 }
5655 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5656
5657 return RBOOL(NIL_P(val));
5658}
5659
5660static VALUE
5661rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5662{
5663 if (rb_reg_search(re, str, 0, 0) >= 0) {
5664 VALUE match = rb_backref_get();
5665 int nth = rb_reg_backref_number(match, backref);
5666 return rb_reg_nth_match(nth, match);
5667 }
5668 return Qnil;
5669}
5670
5671static VALUE
5672rb_str_aref(VALUE str, VALUE indx)
5673{
5674 long idx;
5675
5676 if (FIXNUM_P(indx)) {
5677 idx = FIX2LONG(indx);
5678 }
5679 else if (RB_TYPE_P(indx, T_REGEXP)) {
5680 return rb_str_subpat(str, indx, INT2FIX(0));
5681 }
5682 else if (RB_TYPE_P(indx, T_STRING)) {
5683 if (rb_str_index(str, indx, 0) != -1)
5684 return str_duplicate(rb_cString, indx);
5685 return Qnil;
5686 }
5687 else {
5688 /* check if indx is Range */
5689 long beg, len = str_strlen(str, NULL);
5690 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5691 case Qfalse:
5692 break;
5693 case Qnil:
5694 return Qnil;
5695 default:
5696 return rb_str_substr(str, beg, len);
5697 }
5698 idx = NUM2LONG(indx);
5699 }
5700
5701 return str_substr(str, idx, 1, FALSE);
5702}
5703
5704
5705/*
5706 * call-seq:
5707 * self[index] -> new_string or nil
5708 * self[start, length] -> new_string or nil
5709 * self[range] -> new_string or nil
5710 * self[regexp, capture = 0] -> new_string or nil
5711 * self[substring] -> new_string or nil
5712 *
5713 * :include: doc/string/aref.rdoc
5714 *
5715 */
5716
5717static VALUE
5718rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5719{
5720 if (argc == 2) {
5721 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5722 return rb_str_subpat(str, argv[0], argv[1]);
5723 }
5724 else {
5725 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5726 }
5727 }
5728 rb_check_arity(argc, 1, 2);
5729 return rb_str_aref(str, argv[0]);
5730}
5731
5732VALUE
5734{
5735 char *ptr = RSTRING_PTR(str);
5736 long olen = RSTRING_LEN(str), nlen;
5737
5738 str_modifiable(str);
5739 if (len > olen) len = olen;
5740 nlen = olen - len;
5741 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5742 char *oldptr = ptr;
5743 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5744 STR_SET_EMBED(str);
5745 ptr = RSTRING(str)->as.embed.ary;
5746 memmove(ptr, oldptr + len, nlen);
5747 if (fl == STR_NOEMBED) xfree(oldptr);
5748 }
5749 else {
5750 if (!STR_SHARED_P(str)) {
5751 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5752 rb_enc_cr_str_exact_copy(shared, str);
5753 OBJ_FREEZE(shared);
5754 }
5755 ptr = RSTRING(str)->as.heap.ptr += len;
5756 }
5757 STR_SET_LEN(str, nlen);
5758
5759 if (!SHARABLE_MIDDLE_SUBSTRING) {
5760 TERM_FILL(ptr + nlen, TERM_LEN(str));
5761 }
5763 return str;
5764}
5765
5766static void
5767rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5768{
5769 char *sptr;
5770 long slen;
5771 int cr;
5772
5773 if (beg == 0 && vlen == 0) {
5774 rb_str_drop_bytes(str, len);
5775 return;
5776 }
5777
5778 str_modify_keep_cr(str);
5779 RSTRING_GETMEM(str, sptr, slen);
5780 if (len < vlen) {
5781 /* expand string */
5782 RESIZE_CAPA(str, slen + vlen - len);
5783 sptr = RSTRING_PTR(str);
5784 }
5785
5787 cr = rb_enc_str_coderange(val);
5788 else
5790
5791 if (vlen != len) {
5792 memmove(sptr + beg + vlen,
5793 sptr + beg + len,
5794 slen - (beg + len));
5795 }
5796 if (vlen < beg && len < 0) {
5797 MEMZERO(sptr + slen, char, -len);
5798 }
5799 if (vlen > 0) {
5800 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5801 }
5802 slen += vlen - len;
5803 STR_SET_LEN(str, slen);
5804 TERM_FILL(&sptr[slen], TERM_LEN(str));
5805 ENC_CODERANGE_SET(str, cr);
5806}
5807
5808static inline void
5809rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5810{
5811 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5812}
5813
5814void
5815rb_str_update(VALUE str, long beg, long len, VALUE val)
5816{
5817 long slen;
5818 char *p, *e;
5819 rb_encoding *enc;
5820 int singlebyte = single_byte_optimizable(str);
5821 int cr;
5822
5823 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5824
5825 StringValue(val);
5826 enc = rb_enc_check(str, val);
5827 slen = str_strlen(str, enc); /* rb_enc_check */
5828
5829 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5830 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5831 }
5832 if (beg < 0) {
5833 beg += slen;
5834 }
5835 RUBY_ASSERT(beg >= 0);
5836 RUBY_ASSERT(beg <= slen);
5837
5838 if (len > slen - beg) {
5839 len = slen - beg;
5840 }
5841 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5842 if (!p) p = RSTRING_END(str);
5843 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5844 if (!e) e = RSTRING_END(str);
5845 /* error check */
5846 beg = p - RSTRING_PTR(str); /* physical position */
5847 len = e - p; /* physical length */
5848 rb_str_update_0(str, beg, len, val);
5849 rb_enc_associate(str, enc);
5851 if (cr != ENC_CODERANGE_BROKEN)
5852 ENC_CODERANGE_SET(str, cr);
5853}
5854
5855static void
5856rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5857{
5858 int nth;
5859 VALUE match;
5860 long start, end, len;
5861 rb_encoding *enc;
5862 struct re_registers *regs;
5863
5864 if (rb_reg_search(re, str, 0, 0) < 0) {
5865 rb_raise(rb_eIndexError, "regexp not matched");
5866 }
5867 match = rb_backref_get();
5868 nth = rb_reg_backref_number(match, backref);
5869 regs = RMATCH_REGS(match);
5870 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5871 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5872 }
5873 if (nth < 0) {
5874 nth += regs->num_regs;
5875 }
5876
5877 start = BEG(nth);
5878 if (start == -1) {
5879 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5880 }
5881 end = END(nth);
5882 len = end - start;
5883 StringValue(val);
5884 enc = rb_enc_check_str(str, val);
5885 rb_str_update_0(str, start, len, val);
5886 rb_enc_associate(str, enc);
5887}
5888
5889static VALUE
5890rb_str_aset(VALUE str, VALUE indx, VALUE val)
5891{
5892 long idx, beg;
5893
5894 switch (TYPE(indx)) {
5895 case T_REGEXP:
5896 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5897 return val;
5898
5899 case T_STRING:
5900 beg = rb_str_index(str, indx, 0);
5901 if (beg < 0) {
5902 rb_raise(rb_eIndexError, "string not matched");
5903 }
5904 beg = rb_str_sublen(str, beg);
5905 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5906 return val;
5907
5908 default:
5909 /* check if indx is Range */
5910 {
5911 long beg, len;
5912 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5913 rb_str_update(str, beg, len, val);
5914 return val;
5915 }
5916 }
5917 /* FALLTHROUGH */
5918
5919 case T_FIXNUM:
5920 idx = NUM2LONG(indx);
5921 rb_str_update(str, idx, 1, val);
5922 return val;
5923 }
5924}
5925
5926/*
5927 * call-seq:
5928 * self[index] = other_string -> new_string
5929 * self[start, length] = other_string -> new_string
5930 * self[range] = other_string -> new_string
5931 * self[regexp, capture = 0] = other_string -> new_string
5932 * self[substring] = other_string -> new_string
5933 *
5934 * :include: doc/string/aset.rdoc
5935 *
5936 */
5937
5938static VALUE
5939rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5940{
5941 if (argc == 3) {
5942 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5943 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5944 }
5945 else {
5946 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5947 }
5948 return argv[2];
5949 }
5950 rb_check_arity(argc, 2, 3);
5951 return rb_str_aset(str, argv[0], argv[1]);
5952}
5953
5954/*
5955 * call-seq:
5956 * insert(offset, other_string) -> self
5957 *
5958 * :include: doc/string/insert.rdoc
5959 *
5960 */
5961
5962static VALUE
5963rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5964{
5965 long pos = NUM2LONG(idx);
5966
5967 if (pos == -1) {
5968 return rb_str_append(str, str2);
5969 }
5970 else if (pos < 0) {
5971 pos++;
5972 }
5973 rb_str_update(str, pos, 0, str2);
5974 return str;
5975}
5976
5977
5978/*
5979 * call-seq:
5980 * slice!(index) -> new_string or nil
5981 * slice!(start, length) -> new_string or nil
5982 * slice!(range) -> new_string or nil
5983 * slice!(regexp, capture = 0) -> new_string or nil
5984 * slice!(substring) -> new_string or nil
5985 *
5986 * Like String#[] (and its alias String#slice), except that:
5987 *
5988 * - Performs substitutions in +self+ (not in a copy of +self+).
5989 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5990 *
5991 * A few examples:
5992 *
5993 * s = 'hello'
5994 * s.slice!('e') # => "e"
5995 * s # => "hllo"
5996 * s.slice!('e') # => nil
5997 * s # => "hllo"
5998 *
5999 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6000 */
6001
6002static VALUE
6003rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6004{
6005 VALUE result = Qnil;
6006 VALUE indx;
6007 long beg, len = 1;
6008 char *p;
6009
6010 rb_check_arity(argc, 1, 2);
6011 str_modify_keep_cr(str);
6012 indx = argv[0];
6013 if (RB_TYPE_P(indx, T_REGEXP)) {
6014 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6015 VALUE match = rb_backref_get();
6016 struct re_registers *regs = RMATCH_REGS(match);
6017 int nth = 0;
6018 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6019 if ((nth += regs->num_regs) <= 0) return Qnil;
6020 }
6021 else if (nth >= regs->num_regs) return Qnil;
6022 beg = BEG(nth);
6023 len = END(nth) - beg;
6024 goto subseq;
6025 }
6026 else if (argc == 2) {
6027 beg = NUM2LONG(indx);
6028 len = NUM2LONG(argv[1]);
6029 goto num_index;
6030 }
6031 else if (FIXNUM_P(indx)) {
6032 beg = FIX2LONG(indx);
6033 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6034 if (!len) return Qnil;
6035 beg = p - RSTRING_PTR(str);
6036 goto subseq;
6037 }
6038 else if (RB_TYPE_P(indx, T_STRING)) {
6039 beg = rb_str_index(str, indx, 0);
6040 if (beg == -1) return Qnil;
6041 len = RSTRING_LEN(indx);
6042 result = str_duplicate(rb_cString, indx);
6043 goto squash;
6044 }
6045 else {
6046 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6047 case Qnil:
6048 return Qnil;
6049 case Qfalse:
6050 beg = NUM2LONG(indx);
6051 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6052 if (!len) return Qnil;
6053 beg = p - RSTRING_PTR(str);
6054 goto subseq;
6055 default:
6056 goto num_index;
6057 }
6058 }
6059
6060 num_index:
6061 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6062 beg = p - RSTRING_PTR(str);
6063
6064 subseq:
6065 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6066 rb_enc_cr_str_copy_for_substr(result, str);
6067
6068 squash:
6069 if (len > 0) {
6070 if (beg == 0) {
6071 rb_str_drop_bytes(str, len);
6072 }
6073 else {
6074 char *sptr = RSTRING_PTR(str);
6075 long slen = RSTRING_LEN(str);
6076 if (beg + len > slen) /* pathological check */
6077 len = slen - beg;
6078 memmove(sptr + beg,
6079 sptr + beg + len,
6080 slen - (beg + len));
6081 slen -= len;
6082 STR_SET_LEN(str, slen);
6083 TERM_FILL(&sptr[slen], TERM_LEN(str));
6084 }
6085 }
6086 return result;
6087}
6088
6089static VALUE
6090get_pat(VALUE pat)
6091{
6092 VALUE val;
6093
6094 switch (OBJ_BUILTIN_TYPE(pat)) {
6095 case T_REGEXP:
6096 return pat;
6097
6098 case T_STRING:
6099 break;
6100
6101 default:
6102 val = rb_check_string_type(pat);
6103 if (NIL_P(val)) {
6104 Check_Type(pat, T_REGEXP);
6105 }
6106 pat = val;
6107 }
6108
6109 return rb_reg_regcomp(pat);
6110}
6111
6112static VALUE
6113get_pat_quoted(VALUE pat, int check)
6114{
6115 VALUE val;
6116
6117 switch (OBJ_BUILTIN_TYPE(pat)) {
6118 case T_REGEXP:
6119 return pat;
6120
6121 case T_STRING:
6122 break;
6123
6124 default:
6125 val = rb_check_string_type(pat);
6126 if (NIL_P(val)) {
6127 Check_Type(pat, T_REGEXP);
6128 }
6129 pat = val;
6130 }
6131 if (check && is_broken_string(pat)) {
6132 rb_exc_raise(rb_reg_check_preprocess(pat));
6133 }
6134 return pat;
6135}
6136
6137static long
6138rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6139{
6140 if (BUILTIN_TYPE(pat) == T_STRING) {
6141 pos = rb_str_byteindex(str, pat, pos);
6142 if (set_backref_str) {
6143 if (pos >= 0) {
6144 str = rb_str_new_frozen_String(str);
6145 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6146 if (match) {
6147 *match = match_data;
6148 }
6149 }
6150 else {
6152 }
6153 }
6154 return pos;
6155 }
6156 else {
6157 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6158 }
6159}
6160
6161static long
6162rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6163{
6164 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6165}
6166
6167
6168/*
6169 * call-seq:
6170 * sub!(pattern, replacement) -> self or nil
6171 * sub!(pattern) {|match| ... } -> self or nil
6172 *
6173 * Like String#sub, except that:
6174 *
 *  - Changes are made to +self+, not to a copy of +self+.
6176 * - Returns +self+ if any changes are made, +nil+ otherwise.
6177 *
6178 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6179 */
6180
6181static VALUE
6182rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6183{
6184 VALUE pat, repl, hash = Qnil;
6185 int iter = 0;
6186 long plen;
6187 int min_arity = rb_block_given_p() ? 1 : 2;
6188 long beg;
6189
6190 rb_check_arity(argc, min_arity, 2);
6191 if (argc == 1) {
6192 iter = 1;
6193 }
6194 else {
6195 repl = argv[1];
6196 hash = rb_check_hash_type(argv[1]);
6197 if (NIL_P(hash)) {
6198 StringValue(repl);
6199 }
6200 }
6201
6202 pat = get_pat_quoted(argv[0], 1);
6203
6204 str_modifiable(str);
6205 beg = rb_pat_search(pat, str, 0, 1);
6206 if (beg >= 0) {
6207 rb_encoding *enc;
6208 int cr = ENC_CODERANGE(str);
6209 long beg0, end0;
6210 VALUE match, match0 = Qnil;
6211 struct re_registers *regs;
6212 char *p, *rp;
6213 long len, rlen;
6214
6215 match = rb_backref_get();
6216 regs = RMATCH_REGS(match);
6217 if (RB_TYPE_P(pat, T_STRING)) {
6218 beg0 = beg;
6219 end0 = beg0 + RSTRING_LEN(pat);
6220 match0 = pat;
6221 }
6222 else {
6223 beg0 = BEG(0);
6224 end0 = END(0);
6225 if (iter) match0 = rb_reg_nth_match(0, match);
6226 }
6227
6228 if (iter || !NIL_P(hash)) {
6229 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6230
6231 if (iter) {
6232 repl = rb_obj_as_string(rb_yield(match0));
6233 }
6234 else {
6235 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6236 repl = rb_obj_as_string(repl);
6237 }
6238 str_mod_check(str, p, len);
6239 rb_check_frozen(str);
6240 }
6241 else {
6242 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6243 }
6244
6245 enc = rb_enc_compatible(str, repl);
6246 if (!enc) {
6247 rb_encoding *str_enc = STR_ENC_GET(str);
6248 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6249 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6250 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6251 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6252 rb_enc_inspect_name(str_enc),
6253 rb_enc_inspect_name(STR_ENC_GET(repl)));
6254 }
6255 enc = STR_ENC_GET(repl);
6256 }
6257 rb_str_modify(str);
6258 rb_enc_associate(str, enc);
6260 int cr2 = ENC_CODERANGE(repl);
6261 if (cr2 == ENC_CODERANGE_BROKEN ||
6262 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6264 else
6265 cr = cr2;
6266 }
6267 plen = end0 - beg0;
6268 rlen = RSTRING_LEN(repl);
6269 len = RSTRING_LEN(str);
6270 if (rlen > plen) {
6271 RESIZE_CAPA(str, len + rlen - plen);
6272 }
6273 p = RSTRING_PTR(str);
6274 if (rlen != plen) {
6275 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6276 }
6277 rp = RSTRING_PTR(repl);
6278 memmove(p + beg0, rp, rlen);
6279 len += rlen - plen;
6280 STR_SET_LEN(str, len);
6281 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6282 ENC_CODERANGE_SET(str, cr);
6283
6284 RB_GC_GUARD(match);
6285
6286 return str;
6287 }
6288 return Qnil;
6289}
6290
6291
6292/*
6293 * call-seq:
6294 * sub(pattern, replacement) -> new_string
6295 * sub(pattern) {|match| ... } -> new_string
6296 *
6297 * :include: doc/string/sub.rdoc
6298 */
6299
6300static VALUE
6301rb_str_sub(int argc, VALUE *argv, VALUE str)
6302{
6303 str = str_duplicate(rb_cString, str);
6304 rb_str_sub_bang(argc, argv, str);
6305 return str;
6306}
6307
6308static VALUE
6309str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6310{
6311 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6312 long beg, beg0, end0;
6313 long offset, blen, slen, len, last;
6314 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6315 char *sp, *cp;
6316 int need_backref_str = -1;
6317 rb_encoding *str_enc;
6318
6319 switch (argc) {
6320 case 1:
6321 RETURN_ENUMERATOR(str, argc, argv);
6322 mode = ITER;
6323 break;
6324 case 2:
6325 repl = argv[1];
6326 hash = rb_check_hash_type(argv[1]);
6327 if (NIL_P(hash)) {
6328 StringValue(repl);
6329 }
6330 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6331 mode = FAST_MAP;
6332 }
6333 else {
6334 mode = MAP;
6335 }
6336 break;
6337 default:
6338 rb_error_arity(argc, 1, 2);
6339 }
6340
6341 pat = get_pat_quoted(argv[0], 1);
6342 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6343
6344 if (beg < 0) {
6345 if (bang) return Qnil; /* no match, no substitution */
6346 return str_duplicate(rb_cString, str);
6347 }
6348
6349 offset = 0;
6350 blen = RSTRING_LEN(str) + 30; /* len + margin */
6351 dest = rb_str_buf_new(blen);
6352 sp = RSTRING_PTR(str);
6353 slen = RSTRING_LEN(str);
6354 cp = sp;
6355 str_enc = STR_ENC_GET(str);
6356 rb_enc_associate(dest, str_enc);
6357 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6358
6359 do {
6360 struct re_registers *regs = RMATCH_REGS(match);
6361 if (RB_TYPE_P(pat, T_STRING)) {
6362 beg0 = beg;
6363 end0 = beg0 + RSTRING_LEN(pat);
6364 match0 = pat;
6365 }
6366 else {
6367 beg0 = BEG(0);
6368 end0 = END(0);
6369 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6370 }
6371
6372 if (mode != STR) {
6373 if (mode == ITER) {
6374 val = rb_obj_as_string(rb_yield(match0));
6375 }
6376 else {
6377 struct RString fake_str = {RBASIC_INIT};
6378 VALUE key;
6379 if (mode == FAST_MAP) {
6380 // It is safe to use a fake_str here because we established that it won't escape,
6381 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6382 // default proc.
6383 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6384 }
6385 else {
6386 key = rb_str_subseq(str, beg0, end0 - beg0);
6387 }
6388 val = rb_hash_aref(hash, key);
6389 val = rb_obj_as_string(val);
6390 }
6391 str_mod_check(str, sp, slen);
6392 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6393 rb_raise(rb_eRuntimeError, "block should not cheat");
6394 }
6395 }
6396 else if (need_backref_str) {
6397 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6398 if (need_backref_str < 0) {
6399 need_backref_str = val != repl;
6400 }
6401 }
6402 else {
6403 val = repl;
6404 }
6405
6406 len = beg0 - offset; /* copy pre-match substr */
6407 if (len) {
6408 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6409 }
6410
6411 rb_str_buf_append(dest, val);
6412
6413 last = offset;
6414 offset = end0;
6415 if (beg0 == end0) {
6416 /*
6417 * Always consume at least one character of the input string
6418 * in order to prevent infinite loops.
6419 */
6420 if (RSTRING_LEN(str) <= end0) break;
6421 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6422 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6423 offset = end0 + len;
6424 }
6425 cp = RSTRING_PTR(str) + offset;
6426 if (offset > RSTRING_LEN(str)) break;
6427
6428 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6429 if (mode != FAST_MAP && mode != STR) {
6430 match = Qnil;
6431 }
6432 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6433
6434 RB_GC_GUARD(match);
6435 } while (beg >= 0);
6436
6437 if (RSTRING_LEN(str) > offset) {
6438 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6439 }
6440 rb_pat_search0(pat, str, last, 1, &match);
6441 if (bang) {
6442 str_shared_replace(str, dest);
6443 }
6444 else {
6445 str = dest;
6446 }
6447
6448 return str;
6449}
6450
6451
6452/*
6453 * call-seq:
6454 * gsub!(pattern, replacement) -> self or nil
6455 * gsub!(pattern) {|match| ... } -> self or nil
6456 * gsub!(pattern) -> an_enumerator
6457 *
6458 * Like String#gsub, except that:
6459 *
6460 * - Performs substitutions in +self+ (not in a copy of +self+).
 *  - Returns +self+ if any substitutions were made, +nil+ otherwise.
6462 *
6463 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6464 */
6465
6466static VALUE
6467rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6468{
6469 str_modify_keep_cr(str);
6470 return str_gsub(argc, argv, str, 1);
6471}
6472
6473
6474/*
6475 * call-seq:
6476 * gsub(pattern, replacement) -> new_string
6477 * gsub(pattern) {|match| ... } -> new_string
6478 * gsub(pattern) -> enumerator
6479 *
6480 * Returns a copy of +self+ with zero or more substrings replaced.
6481 *
6482 * Argument +pattern+ may be a string or a Regexp;
6483 * argument +replacement+ may be a string or a Hash.
6484 * Varying types for the argument values makes this method very versatile.
6485 *
6486 * Below are some simple examples;
6487 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6488 *
6489 * With arguments +pattern+ and string +replacement+ given,
6490 * replaces each matching substring with the given +replacement+ string:
6491 *
6492 * s = 'abracadabra'
6493 * s.gsub('ab', 'AB') # => "ABracadABra"
6494 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6495 *
6496 * With arguments +pattern+ and hash +replacement+ given,
6497 * replaces each matching substring with a value from the given +replacement+ hash,
6498 * or removes it:
6499 *
6500 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6501 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6502 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6503 *
6504 * With argument +pattern+ and a block given,
6505 * calls the block with each matching substring;
6506 * replaces that substring with the block's return value:
6507 *
6508 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6509 * # => "ABrACADABrA"
6510 *
6511 * With argument +pattern+ and no block given,
6512 * returns a new Enumerator.
6513 *
6514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6515 */
6516
6517static VALUE
6518rb_str_gsub(int argc, VALUE *argv, VALUE str)
6519{
6520 return str_gsub(argc, argv, str, 0);
6521}
6522
6523
6524/*
6525 * call-seq:
6526 * replace(other_string) -> self
6527 *
6528 * Replaces the contents of +self+ with the contents of +other_string+;
6529 * returns +self+:
6530 *
6531 * s = 'foo' # => "foo"
6532 * s.replace('bar') # => "bar"
6533 *
6534 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6535 */
6536
6537VALUE
6539{
6540 str_modifiable(str);
6541 if (str == str2) return str;
6542
6543 StringValue(str2);
6544 str_discard(str);
6545 return str_replace(str, str2);
6546}
6547
6548/*
6549 * call-seq:
6550 * clear -> self
6551 *
6552 * Removes the contents of +self+:
6553 *
6554 * s = 'foo'
6555 * s.clear # => ""
6556 * s # => ""
6557 *
6558 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6559 */
6560
6561static VALUE
6562rb_str_clear(VALUE str)
6563{
6564 str_discard(str);
6565 STR_SET_EMBED(str);
6566 STR_SET_LEN(str, 0);
6567 RSTRING_PTR(str)[0] = 0;
6568 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6570 else
6572 return str;
6573}
6574
6575/*
6576 * call-seq:
6577 * chr -> string
6578 *
6579 * :include: doc/string/chr.rdoc
6580 *
6581 */
6582
6583static VALUE
6584rb_str_chr(VALUE str)
6585{
6586 return rb_str_substr(str, 0, 1);
6587}
6588
6589/*
6590 * call-seq:
6591 * getbyte(index) -> integer or nil
6592 *
6593 * :include: doc/string/getbyte.rdoc
6594 *
6595 */
6596VALUE
6597rb_str_getbyte(VALUE str, VALUE index)
6598{
6599 long pos = NUM2LONG(index);
6600
6601 if (pos < 0)
6602 pos += RSTRING_LEN(str);
6603 if (pos < 0 || RSTRING_LEN(str) <= pos)
6604 return Qnil;
6605
6606 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6607}
6608
6609/*
6610 * call-seq:
6611 * setbyte(index, integer) -> integer
6612 *
6613 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6614 * returns +integer+:
6615 *
6616 * s = 'xyzzy'
6617 * s.setbyte(2, 129) # => 129
6618 * s # => "xy\x81zy"
6619 *
6620 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6621 */
6622VALUE
6623rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6624{
6625 long pos = NUM2LONG(index);
6626 long len = RSTRING_LEN(str);
6627 char *ptr, *head, *left = 0;
6628 rb_encoding *enc;
6629 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6630
6631 if (pos < -len || len <= pos)
6632 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6633 if (pos < 0)
6634 pos += len;
6635
6636 VALUE v = rb_to_int(value);
6637 VALUE w = rb_int_and(v, INT2FIX(0xff));
6638 char byte = (char)(NUM2INT(w) & 0xFF);
6639
6640 if (!str_independent(str))
6641 str_make_independent(str);
6642 enc = STR_ENC_GET(str);
6643 head = RSTRING_PTR(str);
6644 ptr = &head[pos];
6645 if (!STR_EMBED_P(str)) {
6646 cr = ENC_CODERANGE(str);
6647 switch (cr) {
6648 case ENC_CODERANGE_7BIT:
6649 left = ptr;
6650 *ptr = byte;
6651 if (ISASCII(byte)) goto end;
6652 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6653 if (!MBCLEN_CHARFOUND_P(nlen))
6655 else
6657 goto end;
6659 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6660 width = rb_enc_precise_mbclen(left, head+len, enc);
6661 *ptr = byte;
6662 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6663 if (!MBCLEN_CHARFOUND_P(nlen))
6665 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6667 goto end;
6668 }
6669 }
6671 *ptr = byte;
6672
6673 end:
6674 return value;
6675}
6676
6677static VALUE
6678str_byte_substr(VALUE str, long beg, long len, int empty)
6679{
6680 long n = RSTRING_LEN(str);
6681
6682 if (beg > n || len < 0) return Qnil;
6683 if (beg < 0) {
6684 beg += n;
6685 if (beg < 0) return Qnil;
6686 }
6687 if (len > n - beg)
6688 len = n - beg;
6689 if (len <= 0) {
6690 if (!empty) return Qnil;
6691 len = 0;
6692 }
6693
6694 VALUE str2 = str_subseq(str, beg, len);
6695
6696 str_enc_copy_direct(str2, str);
6697
6698 if (RSTRING_LEN(str2) == 0) {
6699 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6701 else
6703 }
6704 else {
6705 switch (ENC_CODERANGE(str)) {
6706 case ENC_CODERANGE_7BIT:
6708 break;
6709 default:
6711 break;
6712 }
6713 }
6714
6715 return str2;
6716}
6717
6718VALUE
6719rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6720{
6721 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6722}
6723
6724static VALUE
6725str_byte_aref(VALUE str, VALUE indx)
6726{
6727 long idx;
6728 if (FIXNUM_P(indx)) {
6729 idx = FIX2LONG(indx);
6730 }
6731 else {
6732 /* check if indx is Range */
6733 long beg, len = RSTRING_LEN(str);
6734
6735 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6736 case Qfalse:
6737 break;
6738 case Qnil:
6739 return Qnil;
6740 default:
6741 return str_byte_substr(str, beg, len, TRUE);
6742 }
6743
6744 idx = NUM2LONG(indx);
6745 }
6746 return str_byte_substr(str, idx, 1, FALSE);
6747}
6748
6749/*
6750 * call-seq:
6751 * byteslice(offset, length = 1) -> string or nil
6752 * byteslice(range) -> string or nil
6753 *
6754 * :include: doc/string/byteslice.rdoc
6755 */
6756
6757static VALUE
6758rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6759{
6760 if (argc == 2) {
6761 long beg = NUM2LONG(argv[0]);
6762 long len = NUM2LONG(argv[1]);
6763 return str_byte_substr(str, beg, len, TRUE);
6764 }
6765 rb_check_arity(argc, 1, 2);
6766 return str_byte_aref(str, argv[0]);
6767}
6768
6769static void
6770str_check_beg_len(VALUE str, long *beg, long *len)
6771{
6772 long end, slen = RSTRING_LEN(str);
6773
6774 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6775 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6776 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6777 }
6778 if (*beg < 0) {
6779 *beg += slen;
6780 }
6781 RUBY_ASSERT(*beg >= 0);
6782 RUBY_ASSERT(*beg <= slen);
6783
6784 if (*len > slen - *beg) {
6785 *len = slen - *beg;
6786 }
6787 end = *beg + *len;
6788 str_ensure_byte_pos(str, *beg);
6789 str_ensure_byte_pos(str, end);
6790}
6791
6792/*
6793 * call-seq:
6794 * bytesplice(offset, length, str) -> self
6795 * bytesplice(offset, length, str, str_offset, str_length) -> self
6796 * bytesplice(range, str) -> self
6797 * bytesplice(range, str, str_range) -> self
6798 *
6799 * :include: doc/string/bytesplice.rdoc
6800 */
6801
6802static VALUE
6803rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6804{
6805 long beg, len, vbeg, vlen;
6806 VALUE val;
6807 int cr;
6808
6809 rb_check_arity(argc, 2, 5);
6810 if (!(argc == 2 || argc == 3 || argc == 5)) {
6811 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6812 }
6813 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6814 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6815 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6816 rb_builtin_class_name(argv[0]));
6817 }
6818 val = argv[1];
6819 StringValue(val);
6820 if (argc == 2) {
6821 /* bytesplice(range, str) */
6822 vbeg = 0;
6823 vlen = RSTRING_LEN(val);
6824 }
6825 else {
6826 /* bytesplice(range, str, str_range) */
6827 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6828 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6829 rb_builtin_class_name(argv[2]));
6830 }
6831 }
6832 }
6833 else {
6834 beg = NUM2LONG(argv[0]);
6835 len = NUM2LONG(argv[1]);
6836 val = argv[2];
6837 StringValue(val);
6838 if (argc == 3) {
6839 /* bytesplice(index, length, str) */
6840 vbeg = 0;
6841 vlen = RSTRING_LEN(val);
6842 }
6843 else {
6844 /* bytesplice(index, length, str, str_index, str_length) */
6845 vbeg = NUM2LONG(argv[3]);
6846 vlen = NUM2LONG(argv[4]);
6847 }
6848 }
6849 str_check_beg_len(str, &beg, &len);
6850 str_check_beg_len(val, &vbeg, &vlen);
6851 str_modify_keep_cr(str);
6852
6853 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6854 rb_enc_associate(str, rb_enc_check(str, val));
6855 }
6856
6857 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6859 if (cr != ENC_CODERANGE_BROKEN)
6860 ENC_CODERANGE_SET(str, cr);
6861 return str;
6862}
6863
6864/*
6865 * call-seq:
6866 * reverse -> new_string
6867 *
6868 * Returns a new string with the characters from +self+ in reverse order.
6869 *
6870 * 'drawer'.reverse # => "reward"
6871 * 'reviled'.reverse # => "deliver"
6872 * 'stressed'.reverse # => "desserts"
6873 * 'semordnilaps'.reverse # => "spalindromes"
6874 *
6875 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6876 */
6877
6878static VALUE
6879rb_str_reverse(VALUE str)
6880{
6881 rb_encoding *enc;
6882 VALUE rev;
6883 char *s, *e, *p;
6884 int cr;
6885
6886 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6887 enc = STR_ENC_GET(str);
6888 rev = rb_str_new(0, RSTRING_LEN(str));
6889 s = RSTRING_PTR(str); e = RSTRING_END(str);
6890 p = RSTRING_END(rev);
6891 cr = ENC_CODERANGE(str);
6892
6893 if (RSTRING_LEN(str) > 1) {
6894 if (single_byte_optimizable(str)) {
6895 while (s < e) {
6896 *--p = *s++;
6897 }
6898 }
6899 else if (cr == ENC_CODERANGE_VALID) {
6900 while (s < e) {
6901 int clen = rb_enc_fast_mbclen(s, e, enc);
6902
6903 p -= clen;
6904 memcpy(p, s, clen);
6905 s += clen;
6906 }
6907 }
6908 else {
6909 cr = rb_enc_asciicompat(enc) ?
6911 while (s < e) {
6912 int clen = rb_enc_mbclen(s, e, enc);
6913
6914 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6915 p -= clen;
6916 memcpy(p, s, clen);
6917 s += clen;
6918 }
6919 }
6920 }
6921 STR_SET_LEN(rev, RSTRING_LEN(str));
6922 str_enc_copy_direct(rev, str);
6923 ENC_CODERANGE_SET(rev, cr);
6924
6925 return rev;
6926}
6927
6928
6929/*
6930 * call-seq:
6931 * reverse! -> self
6932 *
6933 * Returns +self+ with its characters reversed:
6934 *
6935 * 'drawer'.reverse! # => "reward"
6936 * 'reviled'.reverse! # => "deliver"
6937 * 'stressed'.reverse! # => "desserts"
6938 * 'semordnilaps'.reverse! # => "spalindromes"
6939 *
6940 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6941 */
6942
6943static VALUE
6944rb_str_reverse_bang(VALUE str)
6945{
6946 if (RSTRING_LEN(str) > 1) {
6947 if (single_byte_optimizable(str)) {
6948 char *s, *e, c;
6949
6950 str_modify_keep_cr(str);
6951 s = RSTRING_PTR(str);
6952 e = RSTRING_END(str) - 1;
6953 while (s < e) {
6954 c = *s;
6955 *s++ = *e;
6956 *e-- = c;
6957 }
6958 }
6959 else {
6960 str_shared_replace(str, rb_str_reverse(str));
6961 }
6962 }
6963 else {
6964 str_modify_keep_cr(str);
6965 }
6966 return str;
6967}
6968
6969
6970/*
6971 * call-seq:
6972 * include?(other_string) -> true or false
6973 *
6974 * Returns whether +self+ contains +other_string+:
6975 *
6976 * s = 'bar'
6977 * s.include?('ba') # => true
6978 * s.include?('ar') # => true
6979 * s.include?('bar') # => true
6980 * s.include?('a') # => true
6981 * s.include?('') # => true
6982 * s.include?('foo') # => false
6983 *
6984 * Related: see {Querying}[rdoc-ref:String@Querying].
6985 */
6986
6987VALUE
6988rb_str_include(VALUE str, VALUE arg)
6989{
6990 long i;
6991
6992 StringValue(arg);
6993 i = rb_str_index(str, arg, 0);
6994
6995 return RBOOL(i != -1);
6996}
6997
6998
6999/*
7000 * call-seq:
7001 * to_i(base = 10) -> integer
7002 *
7003 * Returns the result of interpreting leading characters in +self+
7004 * as an integer in the given +base+;
7005 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7006 *
7007 * '123456'.to_i # => 123456
7008 * '123def'.to_i(16) # => 1195503
7009 *
 *  With +base+ zero given, +self+ may contain leading characters
7011 * to specify the actual base:
7012 *
7013 * '123def'.to_i(0) # => 123
7014 * '0123def'.to_i(0) # => 83
7015 * '0b123def'.to_i(0) # => 1
7016 * '0o123def'.to_i(0) # => 83
7017 * '0d123def'.to_i(0) # => 123
7018 * '0x123def'.to_i(0) # => 1195503
7019 *
7020 * Characters past a leading valid number (in the given +base+) are ignored:
7021 *
7022 * '12.345'.to_i # => 12
7023 * '12345'.to_i(2) # => 1
7024 *
7025 * Returns zero if there is no leading valid number:
7026 *
7027 * 'abcdef'.to_i # => 0
7028 * '2'.to_i(2) # => 0
7029 *
7030 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7031 */
7032
7033static VALUE
7034rb_str_to_i(int argc, VALUE *argv, VALUE str)
7035{
7036 int base = 10;
7037
7038 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7039 rb_raise(rb_eArgError, "invalid radix %d", base);
7040 }
7041 return rb_str_to_inum(str, base, FALSE);
7042}
7043
7044
7045/*
7046 * call-seq:
7047 * to_f -> float
7048 *
7049 * Returns the result of interpreting leading characters in +self+ as a Float:
7050 *
7051 * '3.14159'.to_f # => 3.14159
7052 * '1.234e-2'.to_f # => 0.01234
7053 *
7054 * Characters past a leading valid number are ignored:
7055 *
7056 * '3.14 (pi to two places)'.to_f # => 3.14
7057 *
7058 * Returns zero if there is no leading valid number:
7059 *
7060 * 'abcdef'.to_f # => 0.0
7061 *
7062 * See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7063 */
7064
7065static VALUE
7066rb_str_to_f(VALUE str)
7067{
7068 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7069}
7070
7071
7072/*
7073 * call-seq:
7074 * to_s -> self or new_string
7075 *
 *  Returns +self+ if +self+ is a +String+,
 *  or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7078 *
7079 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7080 */
7081
7082static VALUE
7083rb_str_to_s(VALUE str)
7084{
7085 if (rb_obj_class(str) != rb_cString) {
7086 return str_duplicate(rb_cString, str);
7087 }
7088 return str;
7089}
7090
#if 0
/*
 * Currently compiled out by the surrounding "#if 0".
 *
 * Encodes code point +c+ in +enc+ with rb_enc_mbcput() into a temporary
 * buffer and appends the rb_enc_codelen() bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7102
/*
 * Buffer size, including the terminating NUL, of the longest escape
 * written by the escaping helpers in this file: "\x{" + 8 hex digits of
 * a 32-bit unsigned int + "}" + "\0".
 */
#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7104
7105int
7106rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7107{
7108 char buf[CHAR_ESC_LEN + 1];
7109 int l;
7110
7111#if SIZEOF_INT > 4
7112 c &= 0xffffffff;
7113#endif
7114 if (unicode_p) {
7115 if (c < 0x7F && ISPRINT(c)) {
7116 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7117 }
7118 else if (c < 0x10000) {
7119 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7120 }
7121 else {
7122 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7123 }
7124 }
7125 else {
7126 if (c < 0x100) {
7127 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7128 }
7129 else {
7130 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7131 }
7132 }
7133 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7134 rb_str_buf_cat(result, buf, l);
7135 return l;
7136}
7137
/*
 * Returns the backslash escape sequence used for the control character
 * +c+ as a static NUL-terminated string, or NULL when +c+ has no entry in
 * the table below.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
7155
7156VALUE
7157rb_str_escape(VALUE str)
7158{
7159 int encidx = ENCODING_GET(str);
7160 rb_encoding *enc = rb_enc_from_index(encidx);
7161 const char *p = RSTRING_PTR(str);
7162 const char *pend = RSTRING_END(str);
7163 const char *prev = p;
7164 char buf[CHAR_ESC_LEN + 1];
7165 VALUE result = rb_str_buf_new(0);
7166 int unicode_p = rb_enc_unicode_p(enc);
7167 int asciicompat = rb_enc_asciicompat(enc);
7168
7169 while (p < pend) {
7170 unsigned int c;
7171 const char *cc;
7172 int n = rb_enc_precise_mbclen(p, pend, enc);
7173 if (!MBCLEN_CHARFOUND_P(n)) {
7174 if (p > prev) str_buf_cat(result, prev, p - prev);
7175 n = rb_enc_mbminlen(enc);
7176 if (pend < p + n)
7177 n = (int)(pend - p);
7178 while (n--) {
7179 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7180 str_buf_cat(result, buf, strlen(buf));
7181 prev = ++p;
7182 }
7183 continue;
7184 }
7185 n = MBCLEN_CHARFOUND_LEN(n);
7186 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7187 p += n;
7188 cc = ruby_escaped_char(c);
7189 if (cc) {
7190 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7191 str_buf_cat(result, cc, strlen(cc));
7192 prev = p;
7193 }
7194 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7195 }
7196 else {
7197 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7198 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7199 prev = p;
7200 }
7201 }
7202 if (p > prev) str_buf_cat(result, prev, p - prev);
7203 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7204
7205 return result;
7206}
7207
7208/*
7209 * call-seq:
7210 * inspect -> string
7211 *
7212 * :include: doc/string/inspect.rdoc
7213 *
7214 */
7215
7216VALUE
7218{
7219 int encidx = ENCODING_GET(str);
7220 rb_encoding *enc = rb_enc_from_index(encidx);
7221 const char *p, *pend, *prev;
7222 char buf[CHAR_ESC_LEN + 1];
7223 VALUE result = rb_str_buf_new(0);
7224 rb_encoding *resenc = rb_default_internal_encoding();
7225 int unicode_p = rb_enc_unicode_p(enc);
7226 int asciicompat = rb_enc_asciicompat(enc);
7227
7228 if (resenc == NULL) resenc = rb_default_external_encoding();
7229 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7230 rb_enc_associate(result, resenc);
7231 str_buf_cat2(result, "\"");
7232
7233 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7234 prev = p;
7235 while (p < pend) {
7236 unsigned int c, cc;
7237 int n;
7238
7239 n = rb_enc_precise_mbclen(p, pend, enc);
7240 if (!MBCLEN_CHARFOUND_P(n)) {
7241 if (p > prev) str_buf_cat(result, prev, p - prev);
7242 n = rb_enc_mbminlen(enc);
7243 if (pend < p + n)
7244 n = (int)(pend - p);
7245 while (n--) {
7246 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7247 str_buf_cat(result, buf, strlen(buf));
7248 prev = ++p;
7249 }
7250 continue;
7251 }
7252 n = MBCLEN_CHARFOUND_LEN(n);
7253 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7254 p += n;
7255 if ((asciicompat || unicode_p) &&
7256 (c == '"'|| c == '\\' ||
7257 (c == '#' &&
7258 p < pend &&
7259 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7260 (cc = rb_enc_codepoint(p,pend,enc),
7261 (cc == '$' || cc == '@' || cc == '{'))))) {
7262 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7263 str_buf_cat2(result, "\\");
7264 if (asciicompat || enc == resenc) {
7265 prev = p - n;
7266 continue;
7267 }
7268 }
7269 switch (c) {
7270 case '\n': cc = 'n'; break;
7271 case '\r': cc = 'r'; break;
7272 case '\t': cc = 't'; break;
7273 case '\f': cc = 'f'; break;
7274 case '\013': cc = 'v'; break;
7275 case '\010': cc = 'b'; break;
7276 case '\007': cc = 'a'; break;
7277 case 033: cc = 'e'; break;
7278 default: cc = 0; break;
7279 }
7280 if (cc) {
7281 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7282 buf[0] = '\\';
7283 buf[1] = (char)cc;
7284 str_buf_cat(result, buf, 2);
7285 prev = p;
7286 continue;
7287 }
7288 /* The special casing of 0x85 (NEXT_LINE) here is because
7289 * Oniguruma historically treats it as printable, but it
7290 * doesn't match the print POSIX bracket class or character
7291 * property in regexps.
7292 *
7293 * See Ruby Bug #16842 for details:
7294 * https://bugs.ruby-lang.org/issues/16842
7295 */
7296 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7297 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7298 continue;
7299 }
7300 else {
7301 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7302 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7303 prev = p;
7304 continue;
7305 }
7306 }
7307 if (p > prev) str_buf_cat(result, prev, p - prev);
7308 str_buf_cat2(result, "\"");
7309
7310 return result;
7311}
7312
/*
 * True when +p+ is still before the end pointer +e+ and the byte at +p+
 * is '$', '@' or '{'.  Note that +p+ is evaluated more than once.
 */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7314
7315/*
7316 * call-seq:
7317 * dump -> new_string
7318 *
7319 * :include: doc/string/dump.rdoc
7320 *
7321 */
7322
7323VALUE
7325{
7326 int encidx = rb_enc_get_index(str);
7327 rb_encoding *enc = rb_enc_from_index(encidx);
7328 long len;
7329 const char *p, *pend;
7330 char *q, *qend;
7331 VALUE result;
7332 int u8 = (encidx == rb_utf8_encindex());
7333 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7334
7335 len = 2; /* "" */
7336 if (!rb_enc_asciicompat(enc)) {
7337 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7338 len += strlen(enc->name);
7339 }
7340
7341 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7342 while (p < pend) {
7343 int clen;
7344 unsigned char c = *p++;
7345
7346 switch (c) {
7347 case '"': case '\\':
7348 case '\n': case '\r':
7349 case '\t': case '\f':
7350 case '\013': case '\010': case '\007': case '\033':
7351 clen = 2;
7352 break;
7353
7354 case '#':
7355 clen = IS_EVSTR(p, pend) ? 2 : 1;
7356 break;
7357
7358 default:
7359 if (ISPRINT(c)) {
7360 clen = 1;
7361 }
7362 else {
7363 if (u8 && c > 0x7F) { /* \u notation */
7364 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7365 if (MBCLEN_CHARFOUND_P(n)) {
7366 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7367 if (cc <= 0xFFFF)
7368 clen = 6; /* \uXXXX */
7369 else if (cc <= 0xFFFFF)
7370 clen = 9; /* \u{XXXXX} */
7371 else
7372 clen = 10; /* \u{XXXXXX} */
7373 p += MBCLEN_CHARFOUND_LEN(n)-1;
7374 break;
7375 }
7376 }
7377 clen = 4; /* \xNN */
7378 }
7379 break;
7380 }
7381
7382 if (clen > LONG_MAX - len) {
7383 rb_raise(rb_eRuntimeError, "string size too big");
7384 }
7385 len += clen;
7386 }
7387
7388 result = rb_str_new(0, len);
7389 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7390 q = RSTRING_PTR(result); qend = q + len + 1;
7391
7392 *q++ = '"';
7393 while (p < pend) {
7394 unsigned char c = *p++;
7395
7396 if (c == '"' || c == '\\') {
7397 *q++ = '\\';
7398 *q++ = c;
7399 }
7400 else if (c == '#') {
7401 if (IS_EVSTR(p, pend)) *q++ = '\\';
7402 *q++ = '#';
7403 }
7404 else if (c == '\n') {
7405 *q++ = '\\';
7406 *q++ = 'n';
7407 }
7408 else if (c == '\r') {
7409 *q++ = '\\';
7410 *q++ = 'r';
7411 }
7412 else if (c == '\t') {
7413 *q++ = '\\';
7414 *q++ = 't';
7415 }
7416 else if (c == '\f') {
7417 *q++ = '\\';
7418 *q++ = 'f';
7419 }
7420 else if (c == '\013') {
7421 *q++ = '\\';
7422 *q++ = 'v';
7423 }
7424 else if (c == '\010') {
7425 *q++ = '\\';
7426 *q++ = 'b';
7427 }
7428 else if (c == '\007') {
7429 *q++ = '\\';
7430 *q++ = 'a';
7431 }
7432 else if (c == '\033') {
7433 *q++ = '\\';
7434 *q++ = 'e';
7435 }
7436 else if (ISPRINT(c)) {
7437 *q++ = c;
7438 }
7439 else {
7440 *q++ = '\\';
7441 if (u8) {
7442 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7443 if (MBCLEN_CHARFOUND_P(n)) {
7444 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7445 p += n;
7446 if (cc <= 0xFFFF)
7447 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7448 else
7449 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7450 q += strlen(q);
7451 continue;
7452 }
7453 }
7454 snprintf(q, qend-q, "x%02X", c);
7455 q += 3;
7456 }
7457 }
7458 *q++ = '"';
7459 *q = '\0';
7460 if (!rb_enc_asciicompat(enc)) {
7461 snprintf(q, qend-q, nonascii_suffix, enc->name);
7462 encidx = rb_ascii8bit_encindex();
7463 }
7464 /* result from dump is ASCII */
7465 rb_enc_associate_index(result, encidx);
7467 return result;
7468}
7469
/*
 * Maps the letter that follows a backslash in a dumped string
 * ('n', 'r', 't', 'f', 'v', 'b', 'a', 'e') to the control character it
 * stands for.
 *
 * Returns -1 for any other value, so the function never falls off its end
 * without returning.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* not one of the escape letters handled above */
    return -1;
}
7493
7494static void
7495undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7496{
7497 const char *s = *ss;
7498 unsigned int c;
7499 int codelen;
7500 size_t hexlen;
7501 unsigned char buf[6];
7502 static rb_encoding *enc_utf8 = NULL;
7503
7504 switch (*s) {
7505 case '\\':
7506 case '"':
7507 case '#':
7508 rb_str_cat(undumped, s, 1); /* cat itself */
7509 s++;
7510 break;
7511 case 'n':
7512 case 'r':
7513 case 't':
7514 case 'f':
7515 case 'v':
7516 case 'b':
7517 case 'a':
7518 case 'e':
7519 *buf = unescape_ascii(*s);
7520 rb_str_cat(undumped, (char *)buf, 1);
7521 s++;
7522 break;
7523 case 'u':
7524 if (*binary) {
7525 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7526 }
7527 *utf8 = true;
7528 if (++s >= s_end) {
7529 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7530 }
7531 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7532 if (*penc != enc_utf8) {
7533 *penc = enc_utf8;
7534 rb_enc_associate(undumped, enc_utf8);
7535 }
7536 if (*s == '{') { /* handle \u{...} form */
7537 s++;
7538 for (;;) {
7539 if (s >= s_end) {
7540 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7541 }
7542 if (*s == '}') {
7543 s++;
7544 break;
7545 }
7546 if (ISSPACE(*s)) {
7547 s++;
7548 continue;
7549 }
7550 c = scan_hex(s, s_end-s, &hexlen);
7551 if (hexlen == 0 || hexlen > 6) {
7552 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7553 }
7554 if (c > 0x10ffff) {
7555 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7556 }
7557 if (0xd800 <= c && c <= 0xdfff) {
7558 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7559 }
7560 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7561 rb_str_cat(undumped, (char *)buf, codelen);
7562 s += hexlen;
7563 }
7564 }
7565 else { /* handle \uXXXX form */
7566 c = scan_hex(s, 4, &hexlen);
7567 if (hexlen != 4) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7569 }
7570 if (0xd800 <= c && c <= 0xdfff) {
7571 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7572 }
7573 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7574 rb_str_cat(undumped, (char *)buf, codelen);
7575 s += hexlen;
7576 }
7577 break;
7578 case 'x':
7579 if (++s >= s_end) {
7580 rb_raise(rb_eRuntimeError, "invalid hex escape");
7581 }
7582 *buf = scan_hex(s, 2, &hexlen);
7583 if (hexlen != 2) {
7584 rb_raise(rb_eRuntimeError, "invalid hex escape");
7585 }
7586 if (!ISASCII(*buf)) {
7587 if (*utf8) {
7588 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7589 }
7590 *binary = true;
7591 }
7592 rb_str_cat(undumped, (char *)buf, 1);
7593 s += hexlen;
7594 break;
7595 default:
7596 rb_str_cat(undumped, s-1, 2);
7597 s++;
7598 }
7599
7600 *ss = s;
7601}
7602
/* Forward declaration; str_undump below calls this. */
7603static VALUE rb_str_is_ascii_only_p(VALUE str);
7604
7605/*
7606 * call-seq:
7607 * undump -> new_string
7608 *
7609 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7610 *
7611 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7612 */
7613
7614static VALUE
7615str_undump(VALUE str)
7616{
7617 const char *s = RSTRING_PTR(str);
7618 const char *s_end = RSTRING_END(str);
7619 rb_encoding *enc = rb_enc_get(str);
7620 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7621 bool utf8 = false;
7622 bool binary = false;
7623 int w;
7624
7626 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7627 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7628 }
7629 if (!str_null_check(str, &w)) {
7630 rb_raise(rb_eRuntimeError, "string contains null byte");
7631 }
7632 if (RSTRING_LEN(str) < 2) goto invalid_format;
7633 if (*s != '"') goto invalid_format;
7634
7635 /* strip '"' at the start */
7636 s++;
7637
7638 for (;;) {
7639 if (s >= s_end) {
7640 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7641 }
7642
7643 if (*s == '"') {
7644 /* epilogue */
7645 s++;
7646 if (s == s_end) {
7647 /* ascii compatible dumped string */
7648 break;
7649 }
7650 else {
7651 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7652 static const char dup_suffix[] = ".dup";
7653 const char *encname;
7654 int encidx;
7655 ptrdiff_t size;
7656
7657 /* check separately for strings dumped by older versions */
7658 size = sizeof(dup_suffix) - 1;
7659 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7660
7661 size = sizeof(force_encoding_suffix) - 1;
7662 if (s_end - s <= size) goto invalid_format;
7663 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7664 s += size;
7665
7666 if (utf8) {
7667 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7668 }
7669
7670 encname = s;
7671 s = memchr(s, '"', s_end-s);
7672 size = s - encname;
7673 if (!s) goto invalid_format;
7674 if (s_end - s != 2) goto invalid_format;
7675 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7676
7677 encidx = rb_enc_find_index2(encname, (long)size);
7678 if (encidx < 0) {
7679 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7680 }
7681 rb_enc_associate_index(undumped, encidx);
7682 }
7683 break;
7684 }
7685
7686 if (*s == '\\') {
7687 s++;
7688 if (s >= s_end) {
7689 rb_raise(rb_eRuntimeError, "invalid escape");
7690 }
7691 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7692 }
7693 else {
7694 rb_str_cat(undumped, s++, 1);
7695 }
7696 }
7697
7698 RB_GC_GUARD(str);
7699
7700 return undumped;
7701invalid_format:
7702 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7703}
7704
7705static void
7706rb_str_check_dummy_enc(rb_encoding *enc)
7707{
7708 if (rb_enc_dummy_p(enc)) {
7709 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7710 rb_enc_name(enc));
7711 }
7712}
7713
7714static rb_encoding *
7715str_true_enc(VALUE str)
7716{
7717 rb_encoding *enc = STR_ENC_GET(str);
7718 rb_str_check_dummy_enc(enc);
7719 return enc;
7720}
7721
7722static OnigCaseFoldType
7723check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7724{
7725 if (argc==0)
7726 return flags;
7727 if (argc>2)
7728 rb_raise(rb_eArgError, "too many options");
7729 if (argv[0]==sym_turkic) {
7730 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7731 if (argc==2) {
7732 if (argv[1]==sym_lithuanian)
7733 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7734 else
7735 rb_raise(rb_eArgError, "invalid second option");
7736 }
7737 }
7738 else if (argv[0]==sym_lithuanian) {
7739 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7740 if (argc==2) {
7741 if (argv[1]==sym_turkic)
7742 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7743 else
7744 rb_raise(rb_eArgError, "invalid second option");
7745 }
7746 }
7747 else if (argc>1)
7748 rb_raise(rb_eArgError, "too many options");
7749 else if (argv[0]==sym_ascii)
7750 flags |= ONIGENC_CASE_ASCII_ONLY;
7751 else if (argv[0]==sym_fold) {
7752 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7753 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7754 else
7755 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7756 }
7757 else
7758 rb_raise(rb_eArgError, "invalid option");
7759 return flags;
7760}
7761
7762static inline bool
7763case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7764{
7765 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7766 return true;
7767 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7768}
7769
7770/* Extra bytes added to every case-mapping buffer allocated in rb_str_casemap.
 * 16 should be long enough to absorb any kind of single character length increase.
 * NOTE(review): the value defined below is 20, not 16 -- confirm which is intended. */
7771#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When non-zero, the case-mapping code below prints buffer statistics to stderr. */
7772#ifndef CASEMAP_DEBUG
7773# define CASEMAP_DEBUG 0
7774#endif
7775
7776struct mapping_buffer;
7777typedef struct mapping_buffer {
7778 size_t capa;
7779 size_t used;
7780 struct mapping_buffer *next;
7781 OnigUChar space[FLEX_ARY_LEN];
7783
7784static void
7785mapping_buffer_free(void *p)
7786{
7787 mapping_buffer *previous_buffer;
7788 mapping_buffer *current_buffer = p;
7789 while (current_buffer) {
7790 previous_buffer = current_buffer;
7791 current_buffer = current_buffer->next;
7792 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7793 }
7794}
7795
/* Typed-data descriptor for the object that anchors a mapping_buffer chain in
 * rb_str_casemap.  Only the free slot is set (mapping_buffer_free); the slot
 * before it is 0 -- presumably the mark function, confirm against
 * rb_data_type_t. */
7796static const rb_data_type_t mapping_buffer_type = {
7797 "mapping_buffer",
7798 {0, mapping_buffer_free,},
7799 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7800};
7801
/*
 * Runs source through enc->case_map and returns the mapped bytes as a new
 * string whose encoding is copied from source.
 *
 * flags is passed by pointer to case_map; callers in this file test
 * ONIGENC_CASE_MODIFIED in it afterwards.  An empty source is returned as a
 * duplicate without mapping.  Raises ArgumentError ("input string invalid")
 * when case_map reports a negative length.
 *
 * Output is collected in a chain of mapping_buffer nodes: each loop
 * iteration allocates one more node and lets case_map continue from where it
 * stopped, until the whole source has been consumed.
 */
7802static VALUE
7803rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7804{
7805 VALUE target;
7806
7807 const OnigUChar *source_current, *source_end;
 /* NOTE(review): the total is accumulated in an int while sizes elsewhere are size_t/long -- confirm overflow is impossible */
7808 int target_length = 0;
7809 VALUE buffer_anchor;
7810 mapping_buffer *current_buffer = 0;
7811 mapping_buffer **pre_buffer;
7812 size_t buffer_count = 0;
7813 int buffer_length_or_invalid;
7814
7815 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7816
7817 source_current = (OnigUChar*)RSTRING_PTR(source);
7818 source_end = (OnigUChar*)RSTRING_END(source);
7819
 /* The chain hangs off a typed-data object whose free function is
  * mapping_buffer_free -- presumably so it is reclaimed if an exception
  * unwinds past this function.  pre_buffer always points at the link
  * where the next node must be stored. */
7820 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7821 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7822 while (source_current < source_end) {
7823 /* increase multiplier using buffer count to converge quickly */
7824 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7825 if (CASEMAP_DEBUG) {
7826 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7827 }
7828 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7829 *pre_buffer = current_buffer;
7830 pre_buffer = &current_buffer->next;
7831 current_buffer->next = NULL;
7832 current_buffer->capa = capa;
 /* case_map receives &source_current so it can advance the read position */
7833 buffer_length_or_invalid = enc->case_map(flags,
7834 &source_current, source_end,
7835 current_buffer->space,
7836 current_buffer->space+current_buffer->capa,
7837 enc);
7838 if (buffer_length_or_invalid < 0) {
 /* detach the chain from the anchor before freeing it by hand */
7839 current_buffer = DATA_PTR(buffer_anchor);
7840 DATA_PTR(buffer_anchor) = 0;
7841 mapping_buffer_free(current_buffer);
7842 rb_raise(rb_eArgError, "input string invalid");
7843 }
7844 target_length += current_buffer->used = buffer_length_or_invalid;
7845 }
7846 if (CASEMAP_DEBUG) {
7847 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7848 }
7849
 /* single node: copy straight out of it; otherwise concatenate all nodes */
7850 if (buffer_count==1) {
7851 target = rb_str_new((const char*)current_buffer->space, target_length);
7852 }
7853 else {
7854 char *target_current;
7855
7856 target = rb_str_new(0, target_length);
7857 target_current = RSTRING_PTR(target);
7858 current_buffer = DATA_PTR(buffer_anchor);
7859 while (current_buffer) {
7860 memcpy(target_current, current_buffer->space, current_buffer->used);
7861 target_current += current_buffer->used;
7862 current_buffer = current_buffer->next;
7863 }
7864 }
 /* detach and free the chain now instead of leaving it to the anchor */
7865 current_buffer = DATA_PTR(buffer_anchor);
7866 DATA_PTR(buffer_anchor) = 0;
7867 mapping_buffer_free(current_buffer);
7868
7869 RB_GC_GUARD(buffer_anchor);
7870
7871 /* TODO: check about string terminator character */
7872 str_enc_copy_direct(target, source);
7873 /*ENC_CODERANGE_SET(mapped, cr);*/
7874
7875 return target;
7876}
7877
7878static VALUE
7879rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7880{
7881 const OnigUChar *source_current, *source_end;
7882 OnigUChar *target_current, *target_end;
7883 long old_length = RSTRING_LEN(source);
7884 int length_or_invalid;
7885
7886 if (old_length == 0) return Qnil;
7887
7888 source_current = (OnigUChar*)RSTRING_PTR(source);
7889 source_end = (OnigUChar*)RSTRING_END(source);
7890 if (source == target) {
7891 target_current = (OnigUChar*)source_current;
7892 target_end = (OnigUChar*)source_end;
7893 }
7894 else {
7895 target_current = (OnigUChar*)RSTRING_PTR(target);
7896 target_end = (OnigUChar*)RSTRING_END(target);
7897 }
7898
7899 length_or_invalid = onigenc_ascii_only_case_map(flags,
7900 &source_current, source_end,
7901 target_current, target_end, enc);
7902 if (length_or_invalid < 0)
7903 rb_raise(rb_eArgError, "input string invalid");
7904 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7905 fprintf(stderr, "problem with rb_str_ascii_casemap"
7906 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7907 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7908 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7909 }
7910
7911 str_enc_copy(target, source);
7912
7913 return target;
7914}
7915
7916static bool
7917upcase_single(VALUE str)
7918{
7919 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7920 bool modified = false;
7921
7922 while (s < send) {
7923 unsigned int c = *(unsigned char*)s;
7924
7925 if ('a' <= c && c <= 'z') {
7926 *s = 'A' + (c - 'a');
7927 modified = true;
7928 }
7929 s++;
7930 }
7931 return modified;
7932}
7933
7934/*
7935 * call-seq:
7936 * upcase!(mapping) -> self or nil
7937 *
7938 * Like String#upcase, except that:
7939 *
7940 * - Changes character casings in +self+ (not in a copy of +self+).
7941 * - Returns +self+ if any changes are made, +nil+ otherwise.
7942 *
7943 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7944 */
7945
7946static VALUE
7947rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7948{
7949 rb_encoding *enc;
7950 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7951
7952 flags = check_case_options(argc, argv, flags);
7953 str_modify_keep_cr(str);
7954 enc = str_true_enc(str);
7955 if (case_option_single_p(flags, enc, str)) {
7956 if (upcase_single(str))
7957 flags |= ONIGENC_CASE_MODIFIED;
7958 }
7959 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7960 rb_str_ascii_casemap(str, str, &flags, enc);
7961 else
7962 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7963
7964 if (ONIGENC_CASE_MODIFIED&flags) return str;
7965 return Qnil;
7966}
7967
7968
7969/*
7970 * call-seq:
7971 * upcase(mapping = :ascii) -> new_string
7972 *
7973 * :include: doc/string/upcase.rdoc
7974 */
7975
7976static VALUE
7977rb_str_upcase(int argc, VALUE *argv, VALUE str)
7978{
7979 rb_encoding *enc;
7980 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7981 VALUE ret;
7982
7983 flags = check_case_options(argc, argv, flags);
7984 enc = str_true_enc(str);
7985 if (case_option_single_p(flags, enc, str)) {
7986 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7987 str_enc_copy_direct(ret, str);
7988 upcase_single(ret);
7989 }
7990 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7991 ret = rb_str_new(0, RSTRING_LEN(str));
7992 rb_str_ascii_casemap(str, ret, &flags, enc);
7993 }
7994 else {
7995 ret = rb_str_casemap(str, &flags, enc);
7996 }
7997
7998 return ret;
7999}
8000
8001static bool
8002downcase_single(VALUE str)
8003{
8004 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8005 bool modified = false;
8006
8007 while (s < send) {
8008 unsigned int c = *(unsigned char*)s;
8009
8010 if ('A' <= c && c <= 'Z') {
8011 *s = 'a' + (c - 'A');
8012 modified = true;
8013 }
8014 s++;
8015 }
8016
8017 return modified;
8018}
8019
8020/*
8021 * call-seq:
8022 * downcase!(mapping) -> self or nil
8023 *
8024 * Like String#downcase, except that:
8025 *
8026 * - Changes character casings in +self+ (not in a copy of +self+).
8027 * - Returns +self+ if any changes are made, +nil+ otherwise.
8028 *
8029 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8030 */
8031
8032static VALUE
8033rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8034{
8035 rb_encoding *enc;
8036 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8037
8038 flags = check_case_options(argc, argv, flags);
8039 str_modify_keep_cr(str);
8040 enc = str_true_enc(str);
8041 if (case_option_single_p(flags, enc, str)) {
8042 if (downcase_single(str))
8043 flags |= ONIGENC_CASE_MODIFIED;
8044 }
8045 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8046 rb_str_ascii_casemap(str, str, &flags, enc);
8047 else
8048 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8049
8050 if (ONIGENC_CASE_MODIFIED&flags) return str;
8051 return Qnil;
8052}
8053
8054
8055/*
8056 * call-seq:
8057 * downcase(mapping = :ascii) -> new_string
8058 *
8059 * :include: doc/string/downcase.rdoc
8060 *
8061 */
8062
8063static VALUE
8064rb_str_downcase(int argc, VALUE *argv, VALUE str)
8065{
8066 rb_encoding *enc;
8067 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8068 VALUE ret;
8069
8070 flags = check_case_options(argc, argv, flags);
8071 enc = str_true_enc(str);
8072 if (case_option_single_p(flags, enc, str)) {
8073 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8074 str_enc_copy_direct(ret, str);
8075 downcase_single(ret);
8076 }
8077 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8078 ret = rb_str_new(0, RSTRING_LEN(str));
8079 rb_str_ascii_casemap(str, ret, &flags, enc);
8080 }
8081 else {
8082 ret = rb_str_casemap(str, &flags, enc);
8083 }
8084
8085 return ret;
8086}
8087
8088
8089/*
8090 * call-seq:
8091 * capitalize!(mapping = :ascii) -> self or nil
8092 *
8093 * Like String#capitalize, except that:
8094 *
8095 * - Changes character casings in +self+ (not in a copy of +self+).
8096 * - Returns +self+ if any changes are made, +nil+ otherwise.
8097 *
8098 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8099 */
8100
8101static VALUE
8102rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8103{
8104 rb_encoding *enc;
8105 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8106
8107 flags = check_case_options(argc, argv, flags);
8108 str_modify_keep_cr(str);
8109 enc = str_true_enc(str);
8110 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8111 if (flags&ONIGENC_CASE_ASCII_ONLY)
8112 rb_str_ascii_casemap(str, str, &flags, enc);
8113 else
8114 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8115
8116 if (ONIGENC_CASE_MODIFIED&flags) return str;
8117 return Qnil;
8118}
8119
8120
8121/*
8122 * call-seq:
8123 * capitalize(mapping = :ascii) -> new_string
8124 *
8125 * :include: doc/string/capitalize.rdoc
8126 *
8127 */
8128
8129static VALUE
8130rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8131{
8132 rb_encoding *enc;
8133 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8134 VALUE ret;
8135
8136 flags = check_case_options(argc, argv, flags);
8137 enc = str_true_enc(str);
8138 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8139 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8140 ret = rb_str_new(0, RSTRING_LEN(str));
8141 rb_str_ascii_casemap(str, ret, &flags, enc);
8142 }
8143 else {
8144 ret = rb_str_casemap(str, &flags, enc);
8145 }
8146 return ret;
8147}
8148
8149
8150/*
8151 * call-seq:
8152 * swapcase!(mapping) -> self or nil
8153 *
8154 * Like String#swapcase, except that:
8155 *
8156 * - Changes are made to +self+, not to a copy of +self+.
8157 * - Returns +self+ if any changes are made, +nil+ otherwise.
8158 *
8159 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8160 */
8161
8162static VALUE
8163rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8164{
8165 rb_encoding *enc;
8166 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8167
8168 flags = check_case_options(argc, argv, flags);
8169 str_modify_keep_cr(str);
8170 enc = str_true_enc(str);
8171 if (flags&ONIGENC_CASE_ASCII_ONLY)
8172 rb_str_ascii_casemap(str, str, &flags, enc);
8173 else
8174 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8175
8176 if (ONIGENC_CASE_MODIFIED&flags) return str;
8177 return Qnil;
8178}
8179
8180
8181/*
8182 * call-seq:
8183 * swapcase(mapping = :ascii) -> new_string
8184 *
8185 * :include: doc/string/swapcase.rdoc
8186 *
8187 */
8188
8189static VALUE
8190rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8191{
8192 rb_encoding *enc;
8193 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8194 VALUE ret;
8195
8196 flags = check_case_options(argc, argv, flags);
8197 enc = str_true_enc(str);
8198 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8199 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8200 ret = rb_str_new(0, RSTRING_LEN(str));
8201 rb_str_ascii_casemap(str, ret, &flags, enc);
8202 }
8203 else {
8204 ret = rb_str_casemap(str, &flags, enc);
8205 }
8206 return ret;
8207}
8208
/* Shorthand for a pointer to unsigned bytes. */
8209typedef unsigned char *USTR;
8210
/* Cursor over a tr-style selector string, advanced by trnext(). */
8211struct tr {
 /* non-zero while trnext() is stepping through a range such as a-z */
8212 int gen;
 /* now: codepoint most recently read or generated; max: upper end of the range being generated */
8213 unsigned int now, max;
 /* p: next unread byte of the selector; pend: one past its last byte */
8214 char *p, *pend;
8215};
8216
/*
 * Returns the next codepoint described by the selector in t, or
 * (unsigned int)-1 once the selector is exhausted.
 *
 * A backslash makes the following character literal.  "a-z" is expanded one
 * codepoint per call: the call that reads the range returns its lower end
 * and sets t->gen, later calls return the following codepoints up to and
 * including t->max.  Raises ArgumentError when a range's lower end is
 * greater than its upper end.
 */
8217static unsigned int
8218trnext(struct tr *t, rb_encoding *enc)
8219{
8220 int n;
8221
8222 for (;;) {
8223 nextpart:
8224 if (!t->gen) {
 /* not inside a range: read the next item from the selector */
8225 if (t->p == t->pend) return -1;
 /* skip a backslash unless it is the last character */
8226 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8227 t->p += n;
8228 }
8229 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8230 t->p += n;
 /* a '-' that is not the last character introduces a range */
8231 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8232 t->p += n;
8233 if (t->p < t->pend) {
8234 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8235 t->p += n;
8236 if (t->now > c) {
 /* only ASCII endpoints are echoed in the message */
8237 if (t->now < 0x80 && c < 0x80) {
8238 rb_raise(rb_eArgError,
8239 "invalid range \"%c-%c\" in string transliteration",
8240 t->now, c);
8241 }
8242 else {
8243 rb_raise(rb_eArgError, "invalid range in string transliteration");
8244 }
8245 continue; /* not reached */
8246 }
8247 else if (t->now < c) {
 /* start generating; equal endpoints need no range state */
8248 t->gen = 1;
8249 t->max = c;
8250 }
8251 }
8252 }
8253 return t->now;
8254 }
8255 else {
 /* inside a range: advance, skipping codepoints for which ONIGENC_CODE_TO_MBCLEN yields no positive length */
8256 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8257 if (t->now == t->max) {
8258 t->gen = 0;
8259 goto nextpart;
8260 }
8261 }
8262 if (t->now < t->max) {
8263 return t->now;
8264 }
8265 else {
 /* upper end reached: leave range mode and return it */
8266 t->gen = 0;
8267 return t->max;
8268 }
8269 }
8270 }
8271}
8272
/* Forward declaration; tr_trans below calls this when the replacement string is empty. */
8273static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8274
/*
 * Translates, in place, the characters of str selected by src into the
 * corresponding characters of repl.
 *
 * A leading '^' in src (when src is longer than one character) negates the
 * selection: every character NOT listed is replaced by the last character
 * of repl.  When sflag is non-zero, a run of characters that translate to
 * the same replacement is emitted only once.  An empty repl delegates to
 * rb_str_delete_bang.
 *
 * Returns str when a modification was recorded, Qnil otherwise (also for an
 * empty str).  Raises ArgumentError on an invalid byte sequence in str.
 */
8275static VALUE
8276tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8277{
 /* sentinel meaning "no translation" in trans[] and "end of selector" from trnext() */
8278 const unsigned int errc = -1;
 /* translation table for codepoints below 256; larger ones go into hash */
8279 unsigned int trans[256];
8280 rb_encoding *enc, *e1, *e2;
8281 struct tr trsrc, trrepl;
8282 int cflag = 0;
8283 unsigned int c, c0, last = 0;
8284 int modify = 0, i, l;
8285 unsigned char *s, *send;
8286 VALUE hash = 0;
8287 int singlebyte = single_byte_optimizable(str);
8288 int termlen;
8289 int cr;
8290
 /* downgrade the tracked code range from 7BIT to VALID once a non-ASCII codepoint is seen */
8291#define CHECK_IF_ASCII(c) \
8292 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8293 (cr = ENC_CODERANGE_VALID) : 0)
8294
8295 StringValue(src);
8296 StringValue(repl);
8297 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8298 if (RSTRING_LEN(repl) == 0) {
8299 return rb_str_delete_bang(1, &src, str);
8300 }
8301
8302 cr = ENC_CODERANGE(str);
 /* e1 is the encoding used to decode str; enc is the one used to decode src/repl and to encode the output */
8303 e1 = rb_enc_check(str, src);
8304 e2 = rb_enc_check(str, repl);
8305 if (e1 == e2) {
8306 enc = e1;
8307 }
8308 else {
8309 enc = rb_enc_check(src, repl);
8310 }
8311 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
 /* a leading '^' followed by at least one more character negates the selector */
8312 if (RSTRING_LEN(src) > 1 &&
8313 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8314 trsrc.p + l < trsrc.pend) {
8315 cflag = 1;
8316 trsrc.p += l;
8317 }
8318 trrepl.p = RSTRING_PTR(repl);
8319 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8320 trsrc.gen = trrepl.gen = 0;
8321 trsrc.now = trrepl.now = 0;
8322 trsrc.max = trrepl.max = 0;
8323
8324 if (cflag) {
 /* negated: start with "translate everything", then exempt the listed characters */
8325 for (i=0; i<256; i++) {
8326 trans[i] = 1;
8327 }
8328 while ((c = trnext(&trsrc, enc)) != errc) {
8329 if (c < 256) {
8330 trans[c] = errc;
8331 }
8332 else {
8333 if (!hash) hash = rb_hash_new();
8334 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8335 }
8336 }
8337 while ((c = trnext(&trrepl, enc)) != errc)
8338 /* retrieve last replacer */;
8339 last = trrepl.now;
8340 for (i=0; i<256; i++) {
8341 if (trans[i] != errc) {
8342 trans[i] = last;
8343 }
8344 }
8345 }
8346 else {
8347 unsigned int r;
8348
 /* positional: pair each selector character with the next replacement, reusing the last one when repl runs out */
8349 for (i=0; i<256; i++) {
8350 trans[i] = errc;
8351 }
8352 while ((c = trnext(&trsrc, enc)) != errc) {
8353 r = trnext(&trrepl, enc);
8354 if (r == errc) r = trrepl.now;
8355 if (c < 256) {
8356 trans[c] = r;
 /* a multibyte replacement rules out the in-place byte loop */
8357 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8358 }
8359 else {
8360 if (!hash) hash = rb_hash_new();
8361 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8362 }
8363 }
8364 }
8365
 /* restart tracking at 7BIT; CHECK_IF_ASCII raises it again when needed */
8366 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8367 cr = ENC_CODERANGE_7BIT;
8368 str_modify_keep_cr(str);
8369 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8370 termlen = rb_enc_mbminlen(enc);
8371 if (sflag) {
 /* sflag path: build the result in a new buffer, dropping repeats of the same replacement */
8372 int clen, tlen;
8373 long offset, max = RSTRING_LEN(str);
 /* replacement emitted for the previous character, or -1 if it was not translated */
8374 unsigned int save = -1;
8375 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8376
8377 while (s < send) {
8378 int may_modify = 0;
8379
8380 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8381 if (!MBCLEN_CHARFOUND_P(r)) {
8382 xfree(buf);
8383 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8384 }
8385 clen = MBCLEN_CHARFOUND_LEN(r);
8386 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8387
8388 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8389
8390 s += clen;
8391 if (c < 256) {
8392 c = trans[c];
8393 }
8394 else if (hash) {
8395 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8396 if (NIL_P(tmp)) {
8397 if (cflag) c = last;
8398 else c = errc;
8399 }
8400 else if (cflag) c = errc;
8401 else c = NUM2INT(tmp);
8402 }
8403 else {
 /* NOTE(review): the non-sflag branch below uses `cflag ? last : errc` here; this one ignores cflag -- confirm intended */
8404 c = errc;
8405 }
8406 if (c != (unsigned int)-1) {
8407 if (save == c) {
 /* same replacement as the previous character: drop it */
8408 CHECK_IF_ASCII(c);
8409 continue;
8410 }
8411 save = c;
8412 tlen = rb_enc_codelen(c, enc);
8413 modify = 1;
8414 }
8415 else {
 /* untranslated: copy the original codepoint */
8416 save = -1;
8417 c = c0;
8418 if (enc != e1) may_modify = 1;
8419 }
 /* grow the buffer when the next character would not fit */
8420 if ((offset = t - buf) + tlen > max) {
8421 size_t MAYBE_UNUSED(old) = max + termlen;
8422 max = offset + tlen + (send - s);
8423 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8424 t = buf + offset;
8425 }
8426 rb_enc_mbcput(c, t, enc);
8427 if (may_modify && memcmp(s, t, tlen) != 0) {
8428 modify = 1;
8429 }
8430 CHECK_IF_ASCII(c);
8431 t += tlen;
8432 }
 /* hand the new buffer over to str as its heap storage */
8433 if (!STR_EMBED_P(str)) {
8434 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8435 }
8436 TERM_FILL((char *)t, termlen);
8437 RSTRING(str)->as.heap.ptr = (char *)buf;
8438 STR_SET_LEN(str, t - buf);
8439 STR_SET_NOEMBED(str);
8440 RSTRING(str)->as.heap.aux.capa = max;
8441 }
8442 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
 /* single-byte path: rewrite the bytes of str in place through trans[] */
8443 while (s < send) {
8444 c = (unsigned char)*s;
8445 if (trans[c] != errc) {
8446 if (!cflag) {
8447 c = trans[c];
8448 *s = c;
8449 modify = 1;
8450 }
8451 else {
8452 *s = last;
8453 modify = 1;
8454 }
8455 }
8456 CHECK_IF_ASCII(c);
8457 s++;
8458 }
8459 }
8460 else {
 /* general multibyte path: build the result in a new buffer, starting at 1.2 times the input size */
8461 int clen, tlen;
8462 long offset, max = (long)((send - s) * 1.2);
8463 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8464
8465 while (s < send) {
8466 int may_modify = 0;
8467
8468 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8469 if (!MBCLEN_CHARFOUND_P(r)) {
8470 xfree(buf);
8471 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8472 }
8473 clen = MBCLEN_CHARFOUND_LEN(r);
8474 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8475
8476 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8477
8478 if (c < 256) {
8479 c = trans[c];
8480 }
8481 else if (hash) {
8482 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8483 if (NIL_P(tmp)) {
8484 if (cflag) c = last;
8485 else c = errc;
8486 }
8487 else if (cflag) c = errc;
8488 else c = NUM2INT(tmp);
8489 }
8490 else {
8491 c = cflag ? last : errc;
8492 }
8493 if (c != errc) {
8494 tlen = rb_enc_codelen(c, enc);
8495 modify = 1;
8496 }
8497 else {
 /* untranslated: copy the original codepoint */
8498 c = c0;
8499 if (enc != e1) may_modify = 1;
8500 }
 /* grow the buffer when the next character would not fit */
8501 if ((offset = t - buf) + tlen > max) {
8502 size_t MAYBE_UNUSED(old) = max + termlen;
8503 max = offset + tlen + (long)((send - s) * 1.2);
8504 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8505 t = buf + offset;
8506 }
8507 if (s != t) {
8508 rb_enc_mbcput(c, t, enc);
8509 if (may_modify && memcmp(s, t, tlen) != 0) {
8510 modify = 1;
8511 }
8512 }
8513 CHECK_IF_ASCII(c);
8514 s += clen;
8515 t += tlen;
8516 }
 /* hand the new buffer over to str as its heap storage */
8517 if (!STR_EMBED_P(str)) {
8518 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8519 }
8520 TERM_FILL((char *)t, termlen);
8521 RSTRING(str)->as.heap.ptr = (char *)buf;
8522 STR_SET_LEN(str, t - buf);
8523 STR_SET_NOEMBED(str);
8524 RSTRING(str)->as.heap.aux.capa = max;
8525 }
8526
 /* code range and encoding are updated only when a modification was recorded */
8527 if (modify) {
8528 if (cr != ENC_CODERANGE_BROKEN)
8529 ENC_CODERANGE_SET(str, cr);
8530 rb_enc_associate(str, enc);
8531 return str;
8532 }
8533 return Qnil;
8534}
8535
8536
8537/*
8538 * call-seq:
8539 * tr!(selector, replacements) -> self or nil
8540 *
8541 * Like String#tr, except:
8542 *
8543 * - Performs substitutions in +self+ (not in a copy of +self+).
8544 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8545 *
8546 * Related: {Modifying}[rdoc-ref:String@Modifying].
8547 */
8548
8549static VALUE
8550rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8551{
8552 return tr_trans(str, src, repl, 0);
8553}
8554
8555
8556/*
8557 * call-seq:
8558 * tr(selector, replacements) -> new_string
8559 *
8560 * Returns a copy of +self+ with each character specified by string +selector+
8561 * translated to the corresponding character in string +replacements+.
8562 * The correspondence is _positional_:
8563 *
8564 * - Each occurrence of the first character specified by +selector+
8565 * is translated to the first character in +replacements+.
8566 * - Each occurrence of the second character specified by +selector+
8567 * is translated to the second character in +replacements+.
8568 * - And so on.
8569 *
8570 * Example:
8571 *
8572 * 'hello'.tr('el', 'ip') #=> "hippo"
8573 *
8574 * If +replacements+ is shorter than +selector+,
8575 * it is implicitly padded with its own last character:
8576 *
8577 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8578 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8579 *
8580 * Arguments +selector+ and +replacements+ must be valid character selectors
8581 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8582 * and may use any of its valid forms, including negation, ranges, and escapes:
8583 *
8584 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8585 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8586 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8587 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8588 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8589 *
8590 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8591 */
8592
8593static VALUE
8594rb_str_tr(VALUE str, VALUE src, VALUE repl)
8595{
8596 str = str_duplicate(rb_cString, str);
8597 tr_trans(str, src, repl, 0);
8598 return str;
8599}
8600
8601#define TR_TABLE_MAX (UCHAR_MAX+1)
8602#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8603static void
8604tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8605 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8606{
8607 const unsigned int errc = -1;
8608 char buf[TR_TABLE_MAX];
8609 struct tr tr;
8610 unsigned int c;
8611 VALUE table = 0, ptable = 0;
8612 int i, l, cflag = 0;
8613
8614 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8615 tr.gen = tr.now = tr.max = 0;
8616
8617 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8618 cflag = 1;
8619 tr.p += l;
8620 }
8621 if (first) {
8622 for (i=0; i<TR_TABLE_MAX; i++) {
8623 stable[i] = 1;
8624 }
8625 stable[TR_TABLE_MAX] = cflag;
8626 }
8627 else if (stable[TR_TABLE_MAX] && !cflag) {
8628 stable[TR_TABLE_MAX] = 0;
8629 }
8630 for (i=0; i<TR_TABLE_MAX; i++) {
8631 buf[i] = cflag;
8632 }
8633
8634 while ((c = trnext(&tr, enc)) != errc) {
8635 if (c < TR_TABLE_MAX) {
8636 buf[(unsigned char)c] = !cflag;
8637 }
8638 else {
8639 VALUE key = UINT2NUM(c);
8640
8641 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8642 if (cflag) {
8643 ptable = *ctablep;
8644 table = ptable ? ptable : rb_hash_new();
8645 *ctablep = table;
8646 }
8647 else {
8648 table = rb_hash_new();
8649 ptable = *tablep;
8650 *tablep = table;
8651 }
8652 }
8653 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8654 rb_hash_aset(table, key, Qtrue);
8655 }
8656 }
8657 }
8658 for (i=0; i<TR_TABLE_MAX; i++) {
8659 stable[i] = stable[i] && buf[i];
8660 }
8661 if (!table && !cflag) {
8662 *tablep = 0;
8663 }
8664}
8665
8666
8667static int
8668tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8669{
8670 if (c < TR_TABLE_MAX) {
8671 return table[c] != 0;
8672 }
8673 else {
8674 VALUE v = UINT2NUM(c);
8675
8676 if (del) {
8677 if (!NIL_P(rb_hash_lookup(del, v)) &&
8678 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8679 return TRUE;
8680 }
8681 }
8682 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8683 return FALSE;
8684 }
8685 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8686 }
8687}
8688
8689/*
8690 * call-seq:
8691 * delete!(*selectors) -> self or nil
8692 *
8693 * Like String#delete, but modifies +self+ in place;
8694 * returns +self+ if any characters were deleted, +nil+ otherwise.
8695 *
8696 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8697 */
8698
8699static VALUE
8700rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8701{
8702 char squeez[TR_TABLE_SIZE];
8703 rb_encoding *enc = 0;
8704 char *s, *send, *t;
8705 VALUE del = 0, nodel = 0;
8706 int modify = 0;
8707 int i, ascompat, cr;
8708
8709 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8711 for (i=0; i<argc; i++) {
8712 VALUE s = argv[i];
8713
8714 StringValue(s);
8715 enc = rb_enc_check(str, s);
8716 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8717 }
8718
8719 str_modify_keep_cr(str);
8720 ascompat = rb_enc_asciicompat(enc);
8721 s = t = RSTRING_PTR(str);
8722 send = RSTRING_END(str);
8723 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8724 while (s < send) {
8725 unsigned int c;
8726 int clen;
8727
8728 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8729 if (squeez[c]) {
8730 modify = 1;
8731 }
8732 else {
8733 if (t != s) *t = c;
8734 t++;
8735 }
8736 s++;
8737 }
8738 else {
8739 c = rb_enc_codepoint_len(s, send, &clen, enc);
8740
8741 if (tr_find(c, squeez, del, nodel)) {
8742 modify = 1;
8743 }
8744 else {
8745 if (t != s) rb_enc_mbcput(c, t, enc);
8746 t += clen;
8748 }
8749 s += clen;
8750 }
8751 }
8752 TERM_FILL(t, TERM_LEN(str));
8753 STR_SET_LEN(str, t - RSTRING_PTR(str));
8754 ENC_CODERANGE_SET(str, cr);
8755
8756 if (modify) return str;
8757 return Qnil;
8758}
8759
8760
8761/*
8762 * call-seq:
8763 * delete(*selectors) -> new_string
8764 *
8765 * :include: doc/string/delete.rdoc
8766 *
8767 */
8768
8769static VALUE
8770rb_str_delete(int argc, VALUE *argv, VALUE str)
8771{
8772 str = str_duplicate(rb_cString, str);
8773 rb_str_delete_bang(argc, argv, str);
8774 return str;
8775}
8776
8777
8778/*
8779 * call-seq:
8780 * squeeze!(*selectors) -> self or nil
8781 *
8782 * Like String#squeeze, except that:
8783 *
8784 * - Characters are squeezed in +self+ (not in a copy of +self+).
8785 * - Returns +self+ if any changes are made, +nil+ otherwise.
8786 *
8787 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8788 */
8789
8790static VALUE
8791rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8792{
8793 char squeez[TR_TABLE_SIZE];
8794 rb_encoding *enc = 0;
8795 VALUE del = 0, nodel = 0;
8796 unsigned char *s, *send, *t;
8797 int i, modify = 0;
8798 int ascompat, singlebyte = single_byte_optimizable(str);
8799 unsigned int save;
8800
8801 if (argc == 0) {
8802 enc = STR_ENC_GET(str);
8803 }
8804 else {
8805 for (i=0; i<argc; i++) {
8806 VALUE s = argv[i];
8807
8808 StringValue(s);
8809 enc = rb_enc_check(str, s);
8810 if (singlebyte && !single_byte_optimizable(s))
8811 singlebyte = 0;
8812 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8813 }
8814 }
8815
8816 str_modify_keep_cr(str);
8817 s = t = (unsigned char *)RSTRING_PTR(str);
8818 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8819 send = (unsigned char *)RSTRING_END(str);
8820 save = -1;
8821 ascompat = rb_enc_asciicompat(enc);
8822
8823 if (singlebyte) {
8824 while (s < send) {
8825 unsigned int c = *s++;
8826 if (c != save || (argc > 0 && !squeez[c])) {
8827 *t++ = save = c;
8828 }
8829 }
8830 }
8831 else {
8832 while (s < send) {
8833 unsigned int c;
8834 int clen;
8835
8836 if (ascompat && (c = *s) < 0x80) {
8837 if (c != save || (argc > 0 && !squeez[c])) {
8838 *t++ = save = c;
8839 }
8840 s++;
8841 }
8842 else {
8843 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8844
8845 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8846 if (t != s) rb_enc_mbcput(c, t, enc);
8847 save = c;
8848 t += clen;
8849 }
8850 s += clen;
8851 }
8852 }
8853 }
8854
8855 TERM_FILL((char *)t, TERM_LEN(str));
8856 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8857 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8858 modify = 1;
8859 }
8860
8861 if (modify) return str;
8862 return Qnil;
8863}
8864
8865
8866/*
8867 * call-seq:
8868 * squeeze(*selectors) -> new_string
8869 *
8870 * :include: doc/string/squeeze.rdoc
8871 *
8872 */
8873
8874static VALUE
8875rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8876{
8877 str = str_duplicate(rb_cString, str);
8878 rb_str_squeeze_bang(argc, argv, str);
8879 return str;
8880}
8881
8882
8883/*
8884 * call-seq:
8885 * tr_s!(selector, replacements) -> self or nil
8886 *
8887 * Like String#tr_s, except:
8888 *
8889 * - Modifies +self+ in place (not a copy of +self+).
8890 * - Returns +self+ if any changes were made, +nil+ otherwise.
8891 *
8892 * Related: {Modifying}[rdoc-ref:String@Modifying].
8893 */
8894
8895static VALUE
8896rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8897{
8898 return tr_trans(str, src, repl, 1);
8899}
8900
8901
8902/*
8903 * call-seq:
8904 * tr_s(selector, replacements) -> new_string
8905 *
8906 * Like String#tr, except:
8907 *
8908 * - Also squeezes the modified portions of the translated string;
8909 * see String#squeeze.
8910 * - Returns the translated and squeezed string.
8911 *
8912 * Examples:
8913 *
8914 * 'hello'.tr_s('l', 'r') #=> "hero"
8915 * 'hello'.tr_s('el', '-') #=> "h-o"
8916 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8917 *
8918 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8919 *
8920 */
8921
8922static VALUE
8923rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8924{
8925 str = str_duplicate(rb_cString, str);
8926 tr_trans(str, src, repl, 1);
8927 return str;
8928}
8929
8930
8931/*
8932 * call-seq:
8933 * count(*selectors) -> integer
8934 *
8935 * :include: doc/string/count.rdoc
8936 */
8937
8938static VALUE
8939rb_str_count(int argc, VALUE *argv, VALUE str)
8940{
8941 char table[TR_TABLE_SIZE];
8942 rb_encoding *enc = 0;
8943 VALUE del = 0, nodel = 0, tstr;
8944 char *s, *send;
8945 int i;
8946 int ascompat;
8947 size_t n = 0;
8948
8950
8951 tstr = argv[0];
8952 StringValue(tstr);
8953 enc = rb_enc_check(str, tstr);
8954 if (argc == 1) {
8955 const char *ptstr;
8956 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8957 (ptstr = RSTRING_PTR(tstr),
8958 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8959 !is_broken_string(str)) {
8960 int clen;
8961 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8962
8963 s = RSTRING_PTR(str);
8964 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8965 send = RSTRING_END(str);
8966 while (s < send) {
8967 if (*(unsigned char*)s++ == c) n++;
8968 }
8969 return SIZET2NUM(n);
8970 }
8971 }
8972
8973 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8974 for (i=1; i<argc; i++) {
8975 tstr = argv[i];
8976 StringValue(tstr);
8977 enc = rb_enc_check(str, tstr);
8978 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8979 }
8980
8981 s = RSTRING_PTR(str);
8982 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8983 send = RSTRING_END(str);
8984 ascompat = rb_enc_asciicompat(enc);
8985 while (s < send) {
8986 unsigned int c;
8987
8988 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8989 if (table[c]) {
8990 n++;
8991 }
8992 s++;
8993 }
8994 else {
8995 int clen;
8996 c = rb_enc_codepoint_len(s, send, &clen, enc);
8997 if (tr_find(c, table, del, nodel)) {
8998 n++;
8999 }
9000 s += clen;
9001 }
9002 }
9003
9004 return SIZET2NUM(n);
9005}
9006
9007static VALUE
9008rb_fs_check(VALUE val)
9009{
9010 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9011 val = rb_check_string_type(val);
9012 if (NIL_P(val)) return 0;
9013 }
9014 return val;
9015}
9016
/* Byte-indexed table: 1 for the six ASCII whitespace bytes, 0 for every
 * other byte value (unlisted slots are zero-initialized). */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9037
9038static long
9039split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9040{
9041 if (empty_count >= 0 && len == 0) {
9042 return empty_count + 1;
9043 }
9044 if (empty_count > 0) {
9045 /* make different substrings */
9046 if (result) {
9047 do {
9048 rb_ary_push(result, str_new_empty_String(str));
9049 } while (--empty_count > 0);
9050 }
9051 else {
9052 do {
9053 rb_yield(str_new_empty_String(str));
9054 } while (--empty_count > 0);
9055 }
9056 }
9057 str = rb_str_subseq(str, beg, len);
9058 if (result) {
9059 rb_ary_push(result, str);
9060 }
9061 else {
9062 rb_yield(str);
9063 }
9064 return empty_count;
9065}
9066
/* Strategy chosen by String#split for a given separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into single characters */
} split_type_t;
9070
9071static split_type_t
9072literal_split_pattern(VALUE spat, split_type_t default_type)
9073{
9074 rb_encoding *enc = STR_ENC_GET(spat);
9075 const char *ptr;
9076 long len;
9077 RSTRING_GETMEM(spat, ptr, len);
9078 if (len == 0) {
9079 /* Special case - split into chars */
9080 return SPLIT_TYPE_CHARS;
9081 }
9082 else if (rb_enc_asciicompat(enc)) {
9083 if (len == 1 && ptr[0] == ' ') {
9084 return SPLIT_TYPE_AWK;
9085 }
9086 }
9087 else {
9088 int l;
9089 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9090 return SPLIT_TYPE_AWK;
9091 }
9092 }
9093 return default_type;
9094}
9095
9096/*
9097 * call-seq:
9098 * split(field_sep = $;, limit = 0) -> array_of_substrings
9099 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9100 *
9101 * :include: doc/string/split.rdoc
9102 *
9103 */
9104
9105static VALUE
9106rb_str_split_m(int argc, VALUE *argv, VALUE str)
9107{
9108 rb_encoding *enc;
9109 VALUE spat;
9110 VALUE limit;
9111 split_type_t split_type;
9112 long beg, end, i = 0, empty_count = -1;
9113 int lim = 0;
9114 VALUE result, tmp;
9115
9116 result = rb_block_given_p() ? Qfalse : Qnil;
9117 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9118 lim = NUM2INT(limit);
9119 if (lim <= 0) limit = Qnil;
9120 else if (lim == 1) {
9121 if (RSTRING_LEN(str) == 0)
9122 return result ? rb_ary_new2(0) : str;
9123 tmp = str_duplicate(rb_cString, str);
9124 if (!result) {
9125 rb_yield(tmp);
9126 return str;
9127 }
9128 return rb_ary_new3(1, tmp);
9129 }
9130 i = 1;
9131 }
9132 if (NIL_P(limit) && !lim) empty_count = 0;
9133
9134 enc = STR_ENC_GET(str);
9135 split_type = SPLIT_TYPE_REGEXP;
9136 if (!NIL_P(spat)) {
9137 spat = get_pat_quoted(spat, 0);
9138 }
9139 else if (NIL_P(spat = rb_fs)) {
9140 split_type = SPLIT_TYPE_AWK;
9141 }
9142 else if (!(spat = rb_fs_check(spat))) {
9143 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9144 }
9145 else {
9146 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9147 }
9148 if (split_type != SPLIT_TYPE_AWK) {
9149 switch (BUILTIN_TYPE(spat)) {
9150 case T_REGEXP:
9151 rb_reg_options(spat); /* check if uninitialized */
9152 tmp = RREGEXP_SRC(spat);
9153 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9154 if (split_type == SPLIT_TYPE_AWK) {
9155 spat = tmp;
9156 split_type = SPLIT_TYPE_STRING;
9157 }
9158 break;
9159
9160 case T_STRING:
9161 mustnot_broken(spat);
9162 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9163 break;
9164
9165 default:
9167 }
9168 }
9169
9170#define SPLIT_STR(beg, len) ( \
9171 empty_count = split_string(result, str, beg, len, empty_count), \
9172 str_mod_check(str, str_start, str_len))
9173
9174 beg = 0;
9175 char *ptr = RSTRING_PTR(str);
9176 char *const str_start = ptr;
9177 const long str_len = RSTRING_LEN(str);
9178 char *const eptr = str_start + str_len;
9179 if (split_type == SPLIT_TYPE_AWK) {
9180 char *bptr = ptr;
9181 int skip = 1;
9182 unsigned int c;
9183
9184 if (result) result = rb_ary_new();
9185 end = beg;
9186 if (is_ascii_string(str)) {
9187 while (ptr < eptr) {
9188 c = (unsigned char)*ptr++;
9189 if (skip) {
9190 if (ascii_isspace(c)) {
9191 beg = ptr - bptr;
9192 }
9193 else {
9194 end = ptr - bptr;
9195 skip = 0;
9196 if (!NIL_P(limit) && lim <= i) break;
9197 }
9198 }
9199 else if (ascii_isspace(c)) {
9200 SPLIT_STR(beg, end-beg);
9201 skip = 1;
9202 beg = ptr - bptr;
9203 if (!NIL_P(limit)) ++i;
9204 }
9205 else {
9206 end = ptr - bptr;
9207 }
9208 }
9209 }
9210 else {
9211 while (ptr < eptr) {
9212 int n;
9213
9214 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9215 ptr += n;
9216 if (skip) {
9217 if (rb_isspace(c)) {
9218 beg = ptr - bptr;
9219 }
9220 else {
9221 end = ptr - bptr;
9222 skip = 0;
9223 if (!NIL_P(limit) && lim <= i) break;
9224 }
9225 }
9226 else if (rb_isspace(c)) {
9227 SPLIT_STR(beg, end-beg);
9228 skip = 1;
9229 beg = ptr - bptr;
9230 if (!NIL_P(limit)) ++i;
9231 }
9232 else {
9233 end = ptr - bptr;
9234 }
9235 }
9236 }
9237 }
9238 else if (split_type == SPLIT_TYPE_STRING) {
9239 char *substr_start = ptr;
9240 char *sptr = RSTRING_PTR(spat);
9241 long slen = RSTRING_LEN(spat);
9242
9243 if (result) result = rb_ary_new();
9244 mustnot_broken(str);
9245 enc = rb_enc_check(str, spat);
9246 while (ptr < eptr &&
9247 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9248 /* Check we are at the start of a char */
9249 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9250 if (t != ptr + end) {
9251 ptr = t;
9252 continue;
9253 }
9254 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9255 str_mod_check(spat, sptr, slen);
9256 ptr += end + slen;
9257 substr_start = ptr;
9258 if (!NIL_P(limit) && lim <= ++i) break;
9259 }
9260 beg = ptr - str_start;
9261 }
9262 else if (split_type == SPLIT_TYPE_CHARS) {
9263 int n;
9264
9265 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9266 mustnot_broken(str);
9267 enc = rb_enc_get(str);
9268 while (ptr < eptr &&
9269 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9270 SPLIT_STR(ptr - str_start, n);
9271 ptr += n;
9272 if (!NIL_P(limit) && lim <= ++i) break;
9273 }
9274 beg = ptr - str_start;
9275 }
9276 else {
9277 if (result) result = rb_ary_new();
9278 long len = RSTRING_LEN(str);
9279 long start = beg;
9280 long idx;
9281 int last_null = 0;
9282 struct re_registers *regs;
9283 VALUE match = 0;
9284
9285 for (; rb_reg_search(spat, str, start, 0) >= 0;
9286 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9287 match = rb_backref_get();
9288 if (!result) rb_match_busy(match);
9289 regs = RMATCH_REGS(match);
9290 end = BEG(0);
9291 if (start == end && BEG(0) == END(0)) {
9292 if (!ptr) {
9293 SPLIT_STR(0, 0);
9294 break;
9295 }
9296 else if (last_null == 1) {
9297 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9298 beg = start;
9299 }
9300 else {
9301 if (start == len)
9302 start++;
9303 else
9304 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9305 last_null = 1;
9306 continue;
9307 }
9308 }
9309 else {
9310 SPLIT_STR(beg, end-beg);
9311 beg = start = END(0);
9312 }
9313 last_null = 0;
9314
9315 for (idx=1; idx < regs->num_regs; idx++) {
9316 if (BEG(idx) == -1) continue;
9317 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9318 }
9319 if (!NIL_P(limit) && lim <= ++i) break;
9320 }
9321 if (match) rb_match_unbusy(match);
9322 }
9323 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9324 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9325 }
9326
9327 return result ? result : str;
9328}
9329
9330VALUE
9331rb_str_split(VALUE str, const char *sep0)
9332{
9333 VALUE sep;
9334
9335 StringValue(str);
9336 sep = rb_str_new_cstr(sep0);
9337 return rb_str_split_m(1, &sep, str);
9338}
9339
9340#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9341
9342static inline int
9343enumerator_element(VALUE ary, VALUE e)
9344{
9345 if (ary) {
9346 rb_ary_push(ary, e);
9347 return 0;
9348 }
9349 else {
9350 rb_yield(e);
9351 return 1;
9352 }
9353}
9354
9355#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9356
9357static const char *
9358chomp_newline(const char *p, const char *e, rb_encoding *enc)
9359{
9360 const char *prev = rb_enc_prev_char(p, e, e, enc);
9361 if (rb_enc_is_newline(prev, e, enc)) {
9362 e = prev;
9363 prev = rb_enc_prev_char(p, e, e, enc);
9364 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9365 e = prev;
9366 }
9367 return e;
9368}
9369
9370static VALUE
9371get_rs(void)
9372{
9373 VALUE rs = rb_rs;
9374 if (!NIL_P(rs) &&
9375 (!RB_TYPE_P(rs, T_STRING) ||
9376 RSTRING_LEN(rs) != 1 ||
9377 RSTRING_PTR(rs)[0] != '\n')) {
9378 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9379 }
9380 return rs;
9381}
9382
9383#define rb_rs get_rs()
9384
9385static VALUE
9386rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9387{
9388 rb_encoding *enc;
9389 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9390 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9391 long pos, len, rslen;
9392 int rsnewline = 0;
9393
9394 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9395 rs = rb_rs;
9396 if (!NIL_P(opts)) {
9397 static ID keywords[1];
9398 if (!keywords[0]) {
9399 keywords[0] = rb_intern_const("chomp");
9400 }
9401 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9402 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9403 }
9404
9405 if (NIL_P(rs)) {
9406 if (!ENUM_ELEM(ary, str)) {
9407 return ary;
9408 }
9409 else {
9410 return orig;
9411 }
9412 }
9413
9414 if (!RSTRING_LEN(str)) goto end;
9415 str = rb_str_new_frozen(str);
9416 ptr = subptr = RSTRING_PTR(str);
9417 pend = RSTRING_END(str);
9418 len = RSTRING_LEN(str);
9419 StringValue(rs);
9420 rslen = RSTRING_LEN(rs);
9421
9422 if (rs == rb_default_rs)
9423 enc = rb_enc_get(str);
9424 else
9425 enc = rb_enc_check(str, rs);
9426
9427 if (rslen == 0) {
9428 /* paragraph mode */
9429 int n;
9430 const char *eol = NULL;
9431 subend = subptr;
9432 while (subend < pend) {
9433 long chomp_rslen = 0;
9434 do {
9435 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9436 n = 0;
9437 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9438 if (rb_enc_is_newline(subend + n, pend, enc)) {
9439 if (eol == subend) break;
9440 subend += rslen;
9441 if (subptr) {
9442 eol = subend;
9443 chomp_rslen = -rslen;
9444 }
9445 }
9446 else {
9447 if (!subptr) subptr = subend;
9448 subend += rslen;
9449 }
9450 rslen = 0;
9451 } while (subend < pend);
9452 if (!subptr) break;
9453 if (rslen == 0) chomp_rslen = 0;
9454 line = rb_str_subseq(str, subptr - ptr,
9455 subend - subptr + (chomp ? chomp_rslen : rslen));
9456 if (ENUM_ELEM(ary, line)) {
9457 str_mod_check(str, ptr, len);
9458 }
9459 subptr = eol = NULL;
9460 }
9461 goto end;
9462 }
9463 else {
9464 rsptr = RSTRING_PTR(rs);
9465 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9466 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9467 rsnewline = 1;
9468 }
9469 }
9470
9471 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9472 rs = rb_str_new(rsptr, rslen);
9473 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9474 rsptr = RSTRING_PTR(rs);
9475 rslen = RSTRING_LEN(rs);
9476 }
9477
9478 while (subptr < pend) {
9479 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9480 if (pos < 0) break;
9481 hit = subptr + pos;
9482 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9483 if (hit != adjusted) {
9484 subptr = adjusted;
9485 continue;
9486 }
9487 subend = hit += rslen;
9488 if (chomp) {
9489 if (rsnewline) {
9490 subend = chomp_newline(subptr, subend, enc);
9491 }
9492 else {
9493 subend -= rslen;
9494 }
9495 }
9496 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9497 if (ENUM_ELEM(ary, line)) {
9498 str_mod_check(str, ptr, len);
9499 }
9500 subptr = hit;
9501 }
9502
9503 if (subptr != pend) {
9504 if (chomp) {
9505 if (rsnewline) {
9506 pend = chomp_newline(subptr, pend, enc);
9507 }
9508 else if (pend - subptr >= rslen &&
9509 memcmp(pend - rslen, rsptr, rslen) == 0) {
9510 pend -= rslen;
9511 }
9512 }
9513 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9514 ENUM_ELEM(ary, line);
9515 RB_GC_GUARD(str);
9516 }
9517
9518 end:
9519 if (ary)
9520 return ary;
9521 else
9522 return orig;
9523}
9524
9525/*
9526 * call-seq:
9527 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9528 * each_line(record_separator = $/, chomp: false) -> enumerator
9529 *
9530 * :include: doc/string/each_line.rdoc
9531 *
9532 */
9533
9534static VALUE
9535rb_str_each_line(int argc, VALUE *argv, VALUE str)
9536{
9537 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9538 return rb_str_enumerate_lines(argc, argv, str, 0);
9539}
9540
9541/*
9542 * call-seq:
9543 * lines(record_separator = $/, chomp: false) -> array_of_strings
9544 *
9545 * Returns substrings ("lines") of +self+
9546 * according to the given arguments:
9547 *
9548 * s = <<~EOT
9549 * This is the first line.
9550 * This is line two.
9551 *
9552 * This is line four.
9553 * This is line five.
9554 * EOT
9555 *
9556 * With the default argument values:
9557 *
9558 * $/ # => "\n"
9559 * s.lines
9560 * # =>
9561 * ["This is the first line.\n",
9562 * "This is line two.\n",
9563 * "\n",
9564 * "This is line four.\n",
9565 * "This is line five.\n"]
9566 *
9567 * With a different +record_separator+:
9568 *
9569 * record_separator = ' is '
9570 * s.lines(record_separator)
9571 * # =>
9572 * ["This is ",
9573 * "the first line.\nThis is ",
9574 * "line two.\n\nThis is ",
9575 * "line four.\nThis is ",
9576 * "line five.\n"]
9577 *
9578 * With keyword argument +chomp+ as +true+,
9579 * removes the trailing newline from each line:
9580 *
9581 * s.lines(chomp: true)
9582 * # =>
9583 * ["This is the first line.",
9584 * "This is line two.",
9585 * "",
9586 * "This is line four.",
9587 * "This is line five."]
9588 *
9589 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9590 */
9591
9592static VALUE
9593rb_str_lines(int argc, VALUE *argv, VALUE str)
9594{
9595 VALUE ary = WANTARRAY("lines", 0);
9596 return rb_str_enumerate_lines(argc, argv, str, ary);
9597}
9598
9599static VALUE
9600rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9601{
9602 return LONG2FIX(RSTRING_LEN(str));
9603}
9604
9605static VALUE
9606rb_str_enumerate_bytes(VALUE str, VALUE ary)
9607{
9608 long i;
9609
9610 for (i=0; i<RSTRING_LEN(str); i++) {
9611 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9612 }
9613 if (ary)
9614 return ary;
9615 else
9616 return str;
9617}
9618
9619/*
9620 * call-seq:
9621 * each_byte {|byte| ... } -> self
9622 * each_byte -> enumerator
9623 *
9624 * :include: doc/string/each_byte.rdoc
9625 *
9626 */
9627
9628static VALUE
9629rb_str_each_byte(VALUE str)
9630{
9631 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9632 return rb_str_enumerate_bytes(str, 0);
9633}
9634
9635/*
9636 * call-seq:
9637 * bytes -> array_of_bytes
9638 *
9639 * :include: doc/string/bytes.rdoc
9640 *
9641 */
9642
9643static VALUE
9644rb_str_bytes(VALUE str)
9645{
9646 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9647 return rb_str_enumerate_bytes(str, ary);
9648}
9649
9650static VALUE
9651rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9652{
9653 return rb_str_length(str);
9654}
9655
9656static VALUE
9657rb_str_enumerate_chars(VALUE str, VALUE ary)
9658{
9659 VALUE orig = str;
9660 long i, len, n;
9661 const char *ptr;
9662 rb_encoding *enc;
9663
9664 str = rb_str_new_frozen(str);
9665 ptr = RSTRING_PTR(str);
9666 len = RSTRING_LEN(str);
9667 enc = rb_enc_get(str);
9668
9670 for (i = 0; i < len; i += n) {
9671 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9672 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9673 }
9674 }
9675 else {
9676 for (i = 0; i < len; i += n) {
9677 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9678 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9679 }
9680 }
9681 RB_GC_GUARD(str);
9682 if (ary)
9683 return ary;
9684 else
9685 return orig;
9686}
9687
9688/*
9689 * call-seq:
9690 * each_char {|char| ... } -> self
9691 * each_char -> enumerator
9692 *
9693 * :include: doc/string/each_char.rdoc
9694 *
9695 */
9696
9697static VALUE
9698rb_str_each_char(VALUE str)
9699{
9700 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9701 return rb_str_enumerate_chars(str, 0);
9702}
9703
9704/*
9705 * call-seq:
9706 * chars -> array_of_characters
9707 *
9708 * :include: doc/string/chars.rdoc
9709 *
9710 */
9711
9712static VALUE
9713rb_str_chars(VALUE str)
9714{
9715 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9716 return rb_str_enumerate_chars(str, ary);
9717}
9718
9719static VALUE
9720rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9721{
9722 VALUE orig = str;
9723 int n;
9724 unsigned int c;
9725 const char *ptr, *end;
9726 rb_encoding *enc;
9727
9728 if (single_byte_optimizable(str))
9729 return rb_str_enumerate_bytes(str, ary);
9730
9731 str = rb_str_new_frozen(str);
9732 ptr = RSTRING_PTR(str);
9733 end = RSTRING_END(str);
9734 enc = STR_ENC_GET(str);
9735
9736 while (ptr < end) {
9737 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9738 ENUM_ELEM(ary, UINT2NUM(c));
9739 ptr += n;
9740 }
9741 RB_GC_GUARD(str);
9742 if (ary)
9743 return ary;
9744 else
9745 return orig;
9746}
9747
9748/*
9749 * call-seq:
9750 * each_codepoint {|codepoint| ... } -> self
9751 * each_codepoint -> enumerator
9752 *
9753 * :include: doc/string/each_codepoint.rdoc
9754 *
9755 */
9756
9757static VALUE
9758rb_str_each_codepoint(VALUE str)
9759{
9760 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9761 return rb_str_enumerate_codepoints(str, 0);
9762}
9763
9764/*
9765 * call-seq:
9766 * codepoints -> array_of_integers
9767 *
9768 * :include: doc/string/codepoints.rdoc
9769 *
9770 */
9771
9772static VALUE
9773rb_str_codepoints(VALUE str)
9774{
9775 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9776 return rb_str_enumerate_codepoints(str, ary);
9777}
9778
9779static regex_t *
9780get_reg_grapheme_cluster(rb_encoding *enc)
9781{
9782 int encidx = rb_enc_to_index(enc);
9783
9784 const OnigUChar source_ascii[] = "\\X";
9785 const OnigUChar *source = source_ascii;
9786 size_t source_len = sizeof(source_ascii) - 1;
9787
9788 switch (encidx) {
9789#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9790#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9791#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9792#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9793#define CASE_UTF(e) \
9794 case ENCINDEX_UTF_##e: { \
9795 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9796 source = source_UTF_##e; \
9797 source_len = sizeof(source_UTF_##e); \
9798 break; \
9799 }
9800 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9801#undef CASE_UTF
9802#undef CHARS_16BE
9803#undef CHARS_16LE
9804#undef CHARS_32BE
9805#undef CHARS_32LE
9806 }
9807
9808 regex_t *reg_grapheme_cluster;
9809 OnigErrorInfo einfo;
9810 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9811 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9812 if (r) {
9813 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9814 onig_error_code_to_str(message, r, &einfo);
9815 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9816 }
9817
9818 return reg_grapheme_cluster;
9819}
9820
9821static regex_t *
9822get_cached_reg_grapheme_cluster(rb_encoding *enc)
9823{
9824 int encidx = rb_enc_to_index(enc);
9825 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9826
9827 if (encidx == rb_utf8_encindex()) {
9828 if (!reg_grapheme_cluster_utf8) {
9829 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9830 }
9831
9832 return reg_grapheme_cluster_utf8;
9833 }
9834
9835 return NULL;
9836}
9837
9838static VALUE
9839rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9840{
9841 size_t grapheme_cluster_count = 0;
9842 rb_encoding *enc = get_encoding(str);
9843 const char *ptr, *end;
9844
9845 if (!rb_enc_unicode_p(enc)) {
9846 return rb_str_length(str);
9847 }
9848
9849 bool cached_reg_grapheme_cluster = true;
9850 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9851 if (!reg_grapheme_cluster) {
9852 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9853 cached_reg_grapheme_cluster = false;
9854 }
9855
9856 ptr = RSTRING_PTR(str);
9857 end = RSTRING_END(str);
9858
9859 while (ptr < end) {
9860 OnigPosition len = onig_match(reg_grapheme_cluster,
9861 (const OnigUChar *)ptr, (const OnigUChar *)end,
9862 (const OnigUChar *)ptr, NULL, 0);
9863 if (len <= 0) break;
9864 grapheme_cluster_count++;
9865 ptr += len;
9866 }
9867
9868 if (!cached_reg_grapheme_cluster) {
9869 onig_free(reg_grapheme_cluster);
9870 }
9871
9872 return SIZET2NUM(grapheme_cluster_count);
9873}
9874
9875static VALUE
9876rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9877{
9878 VALUE orig = str;
9879 rb_encoding *enc = get_encoding(str);
9880 const char *ptr0, *ptr, *end;
9881
9882 if (!rb_enc_unicode_p(enc)) {
9883 return rb_str_enumerate_chars(str, ary);
9884 }
9885
9886 if (!ary) str = rb_str_new_frozen(str);
9887
9888 bool cached_reg_grapheme_cluster = true;
9889 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9890 if (!reg_grapheme_cluster) {
9891 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9892 cached_reg_grapheme_cluster = false;
9893 }
9894
9895 ptr0 = ptr = RSTRING_PTR(str);
9896 end = RSTRING_END(str);
9897
9898 while (ptr < end) {
9899 OnigPosition len = onig_match(reg_grapheme_cluster,
9900 (const OnigUChar *)ptr, (const OnigUChar *)end,
9901 (const OnigUChar *)ptr, NULL, 0);
9902 if (len <= 0) break;
9903 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9904 ptr += len;
9905 }
9906
9907 if (!cached_reg_grapheme_cluster) {
9908 onig_free(reg_grapheme_cluster);
9909 }
9910
9911 RB_GC_GUARD(str);
9912 if (ary)
9913 return ary;
9914 else
9915 return orig;
9916}
9917
9918/*
9919 * call-seq:
9920 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9921 * each_grapheme_cluster -> enumerator
9922 *
9923 * :include: doc/string/each_grapheme_cluster.rdoc
9924 *
9925 */
9926
9927static VALUE
9928rb_str_each_grapheme_cluster(VALUE str)
9929{
9930 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9931 return rb_str_enumerate_grapheme_clusters(str, 0);
9932}
9933
9934/*
9935 * call-seq:
9936 * grapheme_clusters -> array_of_grapheme_clusters
9937 *
9938 * :include: doc/string/grapheme_clusters.rdoc
9939 *
9940 */
9941
9942static VALUE
9943rb_str_grapheme_clusters(VALUE str)
9944{
9945 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9946 return rb_str_enumerate_grapheme_clusters(str, ary);
9947}
9948
9949static long
9950chopped_length(VALUE str)
9951{
9952 rb_encoding *enc = STR_ENC_GET(str);
9953 const char *p, *p2, *beg, *end;
9954
9955 beg = RSTRING_PTR(str);
9956 end = beg + RSTRING_LEN(str);
9957 if (beg >= end) return 0;
9958 p = rb_enc_prev_char(beg, end, end, enc);
9959 if (!p) return 0;
9960 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9961 p2 = rb_enc_prev_char(beg, p, end, enc);
9962 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9963 }
9964 return p - beg;
9965}
9966
9967/*
9968 * call-seq:
9969 * chop! -> self or nil
9970 *
9971 * Like String#chop, except that:
9972 *
9973 * - Removes trailing characters from +self+ (not from a copy of +self+).
9974 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9975 *
9976 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9977 */
9978
9979static VALUE
9980rb_str_chop_bang(VALUE str)
9981{
9982 str_modify_keep_cr(str);
9983 if (RSTRING_LEN(str) > 0) {
9984 long len;
9985 len = chopped_length(str);
9986 STR_SET_LEN(str, len);
9987 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9988 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9990 }
9991 return str;
9992 }
9993 return Qnil;
9994}
9995
9996
9997/*
9998 * call-seq:
9999 * chop -> new_string
10000 *
10001 * :include: doc/string/chop.rdoc
10002 *
10003 */
10004
10005static VALUE
10006rb_str_chop(VALUE str)
10007{
10008 return rb_str_subseq(str, 0, chopped_length(str));
10009}
10010
10011static long
10012smart_chomp(VALUE str, const char *e, const char *p)
10013{
10014 rb_encoding *enc = rb_enc_get(str);
10015 if (rb_enc_mbminlen(enc) > 1) {
10016 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10017 if (rb_enc_is_newline(pp, e, enc)) {
10018 e = pp;
10019 }
10020 pp = e - rb_enc_mbminlen(enc);
10021 if (pp >= p) {
10022 pp = rb_enc_left_char_head(p, pp, e, enc);
10023 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10024 e = pp;
10025 }
10026 }
10027 }
10028 else {
10029 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10030 case '\n':
10031 if (--e > p && *(e-1) == '\r') {
10032 --e;
10033 }
10034 break;
10035 case '\r':
10036 --e;
10037 break;
10038 }
10039 }
10040 return e - p;
10041}
10042
10043static long
10044chompped_length(VALUE str, VALUE rs)
10045{
10046 rb_encoding *enc;
10047 int newline;
10048 char *pp, *e, *rsptr;
10049 long rslen;
10050 char *const p = RSTRING_PTR(str);
10051 long len = RSTRING_LEN(str);
10052
10053 if (len == 0) return 0;
10054 e = p + len;
10055 if (rs == rb_default_rs) {
10056 return smart_chomp(str, e, p);
10057 }
10058
10059 enc = rb_enc_get(str);
10060 RSTRING_GETMEM(rs, rsptr, rslen);
10061 if (rslen == 0) {
10062 if (rb_enc_mbminlen(enc) > 1) {
10063 while (e > p) {
10064 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10065 if (!rb_enc_is_newline(pp, e, enc)) break;
10066 e = pp;
10067 pp -= rb_enc_mbminlen(enc);
10068 if (pp >= p) {
10069 pp = rb_enc_left_char_head(p, pp, e, enc);
10070 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10071 e = pp;
10072 }
10073 }
10074 }
10075 }
10076 else {
10077 while (e > p && *(e-1) == '\n') {
10078 --e;
10079 if (e > p && *(e-1) == '\r')
10080 --e;
10081 }
10082 }
10083 return e - p;
10084 }
10085 if (rslen > len) return len;
10086
10087 enc = rb_enc_get(rs);
10088 newline = rsptr[rslen-1];
10089 if (rslen == rb_enc_mbminlen(enc)) {
10090 if (rslen == 1) {
10091 if (newline == '\n')
10092 return smart_chomp(str, e, p);
10093 }
10094 else {
10095 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10096 return smart_chomp(str, e, p);
10097 }
10098 }
10099
10100 enc = rb_enc_check(str, rs);
10101 if (is_broken_string(rs)) {
10102 return len;
10103 }
10104 pp = e - rslen;
10105 if (p[len-1] == newline &&
10106 (rslen <= 1 ||
10107 memcmp(rsptr, pp, rslen) == 0)) {
10108 if (at_char_boundary(p, pp, e, enc))
10109 return len - rslen;
10110 RB_GC_GUARD(rs);
10111 }
10112 return len;
10113}
10114
10120static VALUE
10121chomp_rs(int argc, const VALUE *argv)
10122{
10123 rb_check_arity(argc, 0, 1);
10124 if (argc > 0) {
10125 VALUE rs = argv[0];
10126 if (!NIL_P(rs)) StringValue(rs);
10127 return rs;
10128 }
10129 else {
10130 return rb_rs;
10131 }
10132}
10133
10134VALUE
10135rb_str_chomp_string(VALUE str, VALUE rs)
10136{
10137 long olen = RSTRING_LEN(str);
10138 long len = chompped_length(str, rs);
10139 if (len >= olen) return Qnil;
10140 str_modify_keep_cr(str);
10141 STR_SET_LEN(str, len);
10142 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10143 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10145 }
10146 return str;
10147}
10148
10149/*
10150 * call-seq:
10151 * chomp!(line_sep = $/) -> self or nil
10152 *
10153 * Like String#chomp, except that:
10154 *
10155 * - Removes trailing characters from +self+ (not from a copy of +self+).
10156 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10157 *
10158 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10159 */
10160
10161static VALUE
10162rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10163{
10164 VALUE rs;
10165 str_modifiable(str);
10166 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10167 rs = chomp_rs(argc, argv);
10168 if (NIL_P(rs)) return Qnil;
10169 return rb_str_chomp_string(str, rs);
10170}
10171
10172
10173/*
10174 * call-seq:
10175 * chomp(line_sep = $/) -> new_string
10176 *
10177 * :include: doc/string/chomp.rdoc
10178 *
10179 */
10180
10181static VALUE
10182rb_str_chomp(int argc, VALUE *argv, VALUE str)
10183{
10184 VALUE rs = chomp_rs(argc, argv);
10185 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10186 return rb_str_subseq(str, 0, chompped_length(str, rs));
10187}
10188
10189static void
10190tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10191 VALUE str, int num_selectors, VALUE *selectors)
10192{
10193 int i;
10194
10195 for (i=0; i<num_selectors; i++) {
10196 VALUE selector = selectors[i];
10197 rb_encoding *enc;
10198
10199 StringValue(selector);
10200 enc = rb_enc_check(str, selector);
10201 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10202 }
10203}
10204
10205static long
10206lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10207{
10208 const char *const start = s;
10209
10210 if (!s || s >= e) return 0;
10211
10212 /* remove spaces at head */
10213 if (single_byte_optimizable(str)) {
10214 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10215 }
10216 else {
10217 while (s < e) {
10218 int n;
10219 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10220
10221 if (cc && !rb_isspace(cc)) break;
10222 s += n;
10223 }
10224 }
10225 return s - start;
10226}
10227
10228static long
10229lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10230 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10231{
10232 const char *const start = s;
10233
10234 if (!s || s >= e) return 0;
10235
10236 /* remove leading characters in the table */
10237 while (s < e) {
10238 int n;
10239 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10240
10241 if (!tr_find(cc, table, del, nodel)) break;
10242 s += n;
10243 }
10244 return s - start;
10245}
10246
10247/*
10248 * call-seq:
10249 * lstrip!(*selectors) -> self or nil
10250 *
10251 * Like String#lstrip, except that:
10252 *
10253 * - Performs stripping in +self+ (not in a copy of +self+).
10254 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10255 *
10256 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10257 */
10258
10259static VALUE
10260rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10261{
10262 rb_encoding *enc;
10263 char *start, *s;
10264 long olen, loffset;
10265
10266 str_modify_keep_cr(str);
10267 enc = STR_ENC_GET(str);
10268 RSTRING_GETMEM(str, start, olen);
10269 if (argc > 0) {
10270 char table[TR_TABLE_SIZE];
10271 VALUE del = 0, nodel = 0;
10272
10273 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10274 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10275 }
10276 else {
10277 loffset = lstrip_offset(str, start, start+olen, enc);
10278 }
10279
10280 if (loffset > 0) {
10281 long len = olen-loffset;
10282 s = start + loffset;
10283 memmove(start, s, len);
10284 STR_SET_LEN(str, len);
10285 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10286 return str;
10287 }
10288 return Qnil;
10289}
10290
10291
10292/*
10293 * call-seq:
10294 * lstrip(*selectors) -> new_string
10295 *
10296 * Returns a copy of +self+ with leading whitespace removed;
10297 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10298 *
10299 * whitespace = "\x00\t\n\v\f\r "
10300 * s = whitespace + 'abc' + whitespace
10301 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10302 * s.lstrip
10303 * # => "abc\u0000\t\n\v\f\r "
10304 *
10305 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10306 *
10307 * s = "---abc+++"
10308 * s.lstrip("-") # => "abc+++"
10309 *
10310 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10311 * and may use any of its valid forms, including negation, ranges, and escapes:
10312 *
10313 * "01234abc56789".lstrip("0-9") # "abc56789"
10314 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10315 *
10316 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10317 */
10318
10319static VALUE
10320rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10321{
10322 char *start;
10323 long len, loffset;
10324
10325 RSTRING_GETMEM(str, start, len);
10326 if (argc > 0) {
10327 char table[TR_TABLE_SIZE];
10328 VALUE del = 0, nodel = 0;
10329
10330 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10331 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10332 }
10333 else {
10334 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10335 }
10336 if (loffset <= 0) return str_duplicate(rb_cString, str);
10337 return rb_str_subseq(str, loffset, len - loffset);
10338}
10339
10340static long
10341rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10342{
10343 const char *t;
10344
10345 rb_str_check_dummy_enc(enc);
10347 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10348 }
10349 if (!s || s >= e) return 0;
10350 t = e;
10351
10352 /* remove trailing spaces or '\0's */
10353 if (single_byte_optimizable(str)) {
10354 unsigned char c;
10355 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10356 }
10357 else {
10358 char *tp;
10359
10360 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10361 unsigned int c = rb_enc_codepoint(tp, e, enc);
10362 if (c && !rb_isspace(c)) break;
10363 t = tp;
10364 }
10365 }
10366 return e - t;
10367}
10368
10369static long
10370rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10371 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10372{
10373 const char *t;
10374 char *tp;
10375
10376 rb_str_check_dummy_enc(enc);
10378 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10379 }
10380 if (!s || s >= e) return 0;
10381 t = e;
10382
10383 /* remove trailing characters in the table */
10384 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10385 unsigned int c = rb_enc_codepoint(tp, e, enc);
10386 if (!tr_find(c, table, del, nodel)) break;
10387 t = tp;
10388 }
10389
10390 return e - t;
10391}
10392
10393/*
10394 * call-seq:
10395 * rstrip!(*selectors) -> self or nil
10396 *
10397 * Like String#rstrip, except that:
10398 *
10399 * - Performs stripping in +self+ (not in a copy of +self+).
10400 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10401 *
10402 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10403 */
10404
10405static VALUE
10406rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10407{
10408 rb_encoding *enc;
10409 char *start;
10410 long olen, roffset;
10411
10412 str_modify_keep_cr(str);
10413 enc = STR_ENC_GET(str);
10414 RSTRING_GETMEM(str, start, olen);
10415 if (argc > 0) {
10416 char table[TR_TABLE_SIZE];
10417 VALUE del = 0, nodel = 0;
10418
10419 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10420 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10421 }
10422 else {
10423 roffset = rstrip_offset(str, start, start+olen, enc);
10424 }
10425 if (roffset > 0) {
10426 long len = olen - roffset;
10427
10428 STR_SET_LEN(str, len);
10429 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10430 return str;
10431 }
10432 return Qnil;
10433}
10434
10435
10436/*
10437 * call-seq:
10438 * rstrip(*selectors) -> new_string
10439 *
10440 * Returns a copy of +self+ with trailing whitespace removed;
10441 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10442 *
10443 * whitespace = "\x00\t\n\v\f\r "
10444 * s = whitespace + 'abc' + whitespace
10445 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10446 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10447 *
10448 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10449 *
10450 * s = "---abc+++"
10451 * s.rstrip("+") # => "---abc"
10452 *
10453 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10454 * and may use any of its valid forms, including negation, ranges, and escapes:
10455 *
10456 * "01234abc56789".rstrip("0-9") # "01234abc"
10457 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10458 *
10459 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10460 */
10461
10462static VALUE
10463rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10464{
10465 rb_encoding *enc;
10466 char *start;
10467 long olen, roffset;
10468
10469 enc = STR_ENC_GET(str);
10470 RSTRING_GETMEM(str, start, olen);
10471 if (argc > 0) {
10472 char table[TR_TABLE_SIZE];
10473 VALUE del = 0, nodel = 0;
10474
10475 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10476 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10477 }
10478 else {
10479 roffset = rstrip_offset(str, start, start+olen, enc);
10480 }
10481 if (roffset <= 0) return str_duplicate(rb_cString, str);
10482 return rb_str_subseq(str, 0, olen-roffset);
10483}
10484
10485
10486/*
10487 * call-seq:
10488 * strip!(*selectors) -> self or nil
10489 *
10490 * Like String#strip, except that:
10491 *
10492 * - Any modifications are made to +self+.
10493 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10494 *
10495 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10496 */
10497
10498static VALUE
10499rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10500{
10501 char *start;
10502 long olen, loffset, roffset;
10503 rb_encoding *enc;
10504
10505 str_modify_keep_cr(str);
10506 enc = STR_ENC_GET(str);
10507 RSTRING_GETMEM(str, start, olen);
10508
10509 if (argc > 0) {
10510 char table[TR_TABLE_SIZE];
10511 VALUE del = 0, nodel = 0;
10512
10513 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10514 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10515 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10516 }
10517 else {
10518 loffset = lstrip_offset(str, start, start+olen, enc);
10519 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10520 }
10521
10522 if (loffset > 0 || roffset > 0) {
10523 long len = olen-roffset;
10524 if (loffset > 0) {
10525 len -= loffset;
10526 memmove(start, start + loffset, len);
10527 }
10528 STR_SET_LEN(str, len);
10529 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10530 return str;
10531 }
10532 return Qnil;
10533}
10534
10535
10536/*
10537 * call-seq:
10538 * strip(*selectors) -> new_string
10539 *
10540 * Returns a copy of +self+ with leading and trailing whitespace removed;
10541 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10542 *
10543 * whitespace = "\x00\t\n\v\f\r "
10544 * s = whitespace + 'abc' + whitespace
10545 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10546 * s.strip # => "abc"
10547 *
10548 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10549 *
10550 * s = "---abc+++"
10551 * s.strip("-+") # => "abc"
10552 * s.strip("+-") # => "abc"
10553 *
10554 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10555 * and may use any of its valid forms, including negation, ranges, and escapes:
10556 *
10557 * "01234abc56789".strip("0-9") # "abc"
10558 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10559 *
10560 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10561 */
10562
10563static VALUE
10564rb_str_strip(int argc, VALUE *argv, VALUE str)
10565{
10566 char *start;
10567 long olen, loffset, roffset;
10568 rb_encoding *enc = STR_ENC_GET(str);
10569
10570 RSTRING_GETMEM(str, start, olen);
10571
10572 if (argc > 0) {
10573 char table[TR_TABLE_SIZE];
10574 VALUE del = 0, nodel = 0;
10575
10576 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10577 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10578 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10579 }
10580 else {
10581 loffset = lstrip_offset(str, start, start+olen, enc);
10582 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10583 }
10584
10585 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10586 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10587}
10588
10589static VALUE
10590scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10591{
10592 VALUE result = Qnil;
10593 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10594 if (pos >= 0) {
10595 VALUE match;
10596 struct re_registers *regs;
10597 if (BUILTIN_TYPE(pat) == T_STRING) {
10598 regs = NULL;
10599 end = pos + RSTRING_LEN(pat);
10600 }
10601 else {
10602 match = rb_backref_get();
10603 regs = RMATCH_REGS(match);
10604 pos = BEG(0);
10605 end = END(0);
10606 }
10607
10608 if (pos == end) {
10609 rb_encoding *enc = STR_ENC_GET(str);
10610 /*
10611 * Always consume at least one character of the input string
10612 */
10613 if (RSTRING_LEN(str) > end)
10614 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10615 RSTRING_END(str), enc);
10616 else
10617 *start = end + 1;
10618 }
10619 else {
10620 *start = end;
10621 }
10622
10623 if (!regs || regs->num_regs == 1) {
10624 result = rb_str_subseq(str, pos, end - pos);
10625 return result;
10626 }
10627 else {
10628 result = rb_ary_new2(regs->num_regs);
10629 for (int i = 1; i < regs->num_regs; i++) {
10630 VALUE s = Qnil;
10631 if (BEG(i) >= 0) {
10632 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10633 }
10634
10635 rb_ary_push(result, s);
10636 }
10637 }
10638
10639 RB_GC_GUARD(match);
10640 }
10641
10642 return result;
10643}
10644
10645
10646/*
10647 * call-seq:
10648 * scan(pattern) -> array_of_results
10649 * scan(pattern) {|result| ... } -> self
10650 *
10651 * :include: doc/string/scan.rdoc
10652 *
10653 */
10654
10655static VALUE
10656rb_str_scan(VALUE str, VALUE pat)
10657{
10658 VALUE result;
10659 long start = 0;
10660 long last = -1, prev = 0;
10661 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10662
10663 pat = get_pat_quoted(pat, 1);
10664 mustnot_broken(str);
10665 if (!rb_block_given_p()) {
10666 VALUE ary = rb_ary_new();
10667
10668 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10669 last = prev;
10670 prev = start;
10671 rb_ary_push(ary, result);
10672 }
10673 if (last >= 0) rb_pat_search(pat, str, last, 1);
10674 else rb_backref_set(Qnil);
10675 return ary;
10676 }
10677
10678 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10679 last = prev;
10680 prev = start;
10681 rb_yield(result);
10682 str_mod_check(str, p, len);
10683 }
10684 if (last >= 0) rb_pat_search(pat, str, last, 1);
10685 return str;
10686}
10687
10688
10689/*
10690 * call-seq:
10691 * hex -> integer
10692 *
10693 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10694 * returns its value as an integer.
10695 *
10696 * The leading substring is interpreted as hexadecimal when it begins with:
10697 *
10698 * - One or more characters representing hexadecimal digits
10699 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10700 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10701 *
10702 * 'f'.hex # => 15
10703 * '11'.hex # => 17
10704 * 'FFF'.hex # => 4095
10705 * 'fffg'.hex # => 4095
10706 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10707 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10708 * 'deadbeef'.hex # => 3735928559
10709 *
10710 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10711 *
10712 * '0xfff'.hex # => 4095
10713 * '0xfffg'.hex # => 4095
10714 *
10715 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10716 *
10717 * '-fff'.hex # => -4095
10718 * '-0xFFF'.hex # => -4095
10719 *
10720 * For any substring not described above, returns zero:
10721 *
10722 * 'xxx'.hex # => 0
10723 * ''.hex # => 0
10724 *
10725 * Note that, unlike #oct, this method interprets only hexadecimal,
10726 * and not binary, octal, or decimal notations:
10727 *
10728 * '0b111'.hex # => 45329
10729 * '0o777'.hex # => 0
10730 * '0d999'.hex # => 55705
10731 *
10732 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10733 */
10734
10735static VALUE
10736rb_str_hex(VALUE str)
10737{
10738 return rb_str_to_inum(str, 16, FALSE);
10739}
10740
10741
10742/*
10743 * call-seq:
10744 * oct -> integer
10745 *
10746 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10747 * returns their value as an integer.
10748 *
10749 * In brief:
10750 *
10751 * # Interpreted as octal.
10752 * '777'.oct # => 511
10753 * '777x'.oct # => 511
10754 * '0777'.oct # => 511
10755 * '0o777'.oct # => 511
10756 * '-777'.oct # => -511
10757 * # Not interpreted as octal.
10758 * '0b111'.oct # => 7 # Interpreted as binary.
10759 * '0d999'.oct # => 999 # Interpreted as decimal.
10760 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10761 *
10762 * The leading substring is interpreted as octal when it begins with:
10763 *
10764 * - One or more characters representing octal digits
10765 * (each in the range <tt>'0'..'7'</tt>);
10766 * the string to be interpreted ends at the first character that does not represent an octal digit:
10767 *
10768 * '7'.oct # => 7
10769 * '11'.oct # => 9
10770 * '777'.oct # => 511
10771 * '0777'.oct # => 511
10772 * '7778'.oct # => 511
10773 * '777x'.oct # => 511
10774 *
10775 * - <tt>'0o'</tt>, followed by one or more octal digits:
10776 *
10777 * '0o777'.oct # => 511
10778 * '0o7778'.oct # => 511
10779 *
10780 * The leading substring is _not_ interpreted as octal when it begins with:
10781 *
10782 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10783 * (each in the range <tt>'0'..'1'</tt>);
10784 * the string to be interpreted ends at the first character that does not represent a binary digit.
10785 * the string is interpreted as binary digits (base 2):
10786 *
10787 * '0b111'.oct # => 7
10788 * '0b1112'.oct # => 7
10789 *
10790 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10791 * (each in the range <tt>'0'..'9'</tt>);
10792 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10793 * the string is interpreted as decimal digits (base 10):
10794 *
10795 * '0d999'.oct # => 999
10796 * '0d999x'.oct # => 999
10797 *
10798 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10799 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10800 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10801 * the string is interpreted as hexadecimal digits (base 16):
10802 *
10803 * '0xfff'.oct # => 4095
10804 * '0xfffg'.oct # => 4095
10805 *
10806 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10807 *
10808 * '-777'.oct # => -511
10809 * '-0777'.oct # => -511
10810 * '-0b111'.oct # => -7
10811 * '-0xfff'.oct # => -4095
10812 *
10813 * For any substring not described above, returns zero:
10814 *
10815 * 'foo'.oct # => 0
10816 * ''.oct # => 0
10817 *
10818 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10819 */
10820
10821static VALUE
10822rb_str_oct(VALUE str)
10823{
10824 return rb_str_to_inum(str, -8, FALSE);
10825}
10826
10827#ifndef HAVE_CRYPT_R
10828# include "ruby/thread_native.h"
10829# include "ruby/atomic.h"
10830
10831static struct {
10832 rb_nativethread_lock_t lock;
10833} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10834#endif
10835
10836/*
10837 * call-seq:
10838 * crypt(salt_str) -> new_string
10839 *
10840 * Returns the string generated by calling <code>crypt(3)</code>
10841 * standard library function with <code>str</code> and
10842 * <code>salt_str</code>, in this order, as its arguments. Please do
10843 * not use this method any longer. It is legacy; provided only for
10844 * backward compatibility with ruby scripts in earlier days. It is
10845 * bad to use in contemporary programs for several reasons:
10846 *
10847 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10848 * run. The generated string lacks data portability.
10849 *
10850 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10851 * (i.e. silently ends up in unexpected results).
10852 *
10853 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10854 * thread safe.
10855 *
10856 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10857 * very very weak. According to its manpage, Linux's traditional
10858 * <code>crypt(3)</code> output has only 2**56 variations; too
10859 * easy to brute force today. And this is the default behaviour.
10860 *
10861 * * In order to make things robust some OSes implement so-called
10862 * "modular" usage. To go through, you have to do a complex
10863 * build-up of the <code>salt_str</code> parameter, by hand.
10864 * Failure in generation of a proper salt string tends not to
10865 * yield any errors; typos in parameters are normally not
10866 * detectable.
10867 *
10868 * * For instance, in the following example, the second invocation
10869 * of String#crypt is wrong; it has a typo in "round=" (lacks
10870 * "s"). However the call does not fail and something unexpected
10871 * is generated.
10872 *
10873 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10874 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10875 *
10876 * * Even in the "modular" mode, some hash functions are considered
10877 * archaic and no longer recommended at all; for instance module
10878 * <code>$1$</code> is officially abandoned by its author: see
10879 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10880 * instance module <code>$3$</code> is considered completely
10881 * broken: see the manpage of FreeBSD.
10882 *
10883 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10884 * written above, <code>crypt(3)</code> on Mac OS never fails.
10885 * This means even if you build up a proper salt string it
10886 * generates a traditional DES hash anyways, and there is no way
10887 * for you to be aware of.
10888 *
10889 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10890 *
10891 * If for some reason you cannot migrate to other secure contemporary
10892 * password hashing algorithms, install the string-crypt gem and
10893 * <code>require 'string/crypt'</code> to continue using it.
10894 */
10895
10896static VALUE
10897rb_str_crypt(VALUE str, VALUE salt)
10898{
10899#ifdef HAVE_CRYPT_R
10900 VALUE databuf;
10901 struct crypt_data *data;
10902# define CRYPT_END() ALLOCV_END(databuf)
10903#else
10904 char *tmp_buf;
10905 extern char *crypt(const char *, const char *);
10906# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10907#endif
10908 VALUE result;
10909 const char *s, *saltp;
10910 char *res;
10911#ifdef BROKEN_CRYPT
10912 char salt_8bit_clean[3];
10913#endif
10914
10915 StringValue(salt);
10916 mustnot_wchar(str);
10917 mustnot_wchar(salt);
10918 s = StringValueCStr(str);
10919 saltp = RSTRING_PTR(salt);
10920 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10921 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10922 }
10923
10924#ifdef BROKEN_CRYPT
10925 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10926 salt_8bit_clean[0] = saltp[0] & 0x7f;
10927 salt_8bit_clean[1] = saltp[1] & 0x7f;
10928 salt_8bit_clean[2] = '\0';
10929 saltp = salt_8bit_clean;
10930 }
10931#endif
10932#ifdef HAVE_CRYPT_R
10933 data = ALLOCV(databuf, sizeof(struct crypt_data));
10934# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10935 data->initialized = 0;
10936# endif
10937 res = crypt_r(s, saltp, data);
10938#else
10939 rb_nativethread_lock_lock(&crypt_mutex.lock);
10940 res = crypt(s, saltp);
10941#endif
10942 if (!res) {
10943 int err = errno;
10944 CRYPT_END();
10945 rb_syserr_fail(err, "crypt");
10946 }
10947#ifdef HAVE_CRYPT_R
10948 result = rb_str_new_cstr(res);
10949 CRYPT_END();
10950#else
10951 // We need to copy this buffer because it's static and we need to unlock the mutex
10952 // before allocating a new object (the string to be returned). If we allocate while
10953 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10954 // if other ractors are waiting on this lock.
10955 size_t res_size = strlen(res)+1;
10956 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10957 memcpy(tmp_buf, res, res_size);
10958 res = tmp_buf;
10959 CRYPT_END();
10960 result = rb_str_new_cstr(res);
10961#endif
10962 return result;
10963}
10964
10965
10966/*
10967 * call-seq:
10968 * ord -> integer
10969 *
10970 * :include: doc/string/ord.rdoc
10971 *
10972 */
10973
10974static VALUE
10975rb_str_ord(VALUE s)
10976{
10977 unsigned int c;
10978
10979 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10980 return UINT2NUM(c);
10981}
10982/*
10983 * call-seq:
10984 * sum(n = 16) -> integer
10985 *
10986 * :include: doc/string/sum.rdoc
10987 *
10988 */
10989
10990static VALUE
10991rb_str_sum(int argc, VALUE *argv, VALUE str)
10992{
10993 int bits = 16;
10994 char *ptr, *p, *pend;
10995 long len;
10996 VALUE sum = INT2FIX(0);
10997 unsigned long sum0 = 0;
10998
10999 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11000 bits = 0;
11001 }
11002 ptr = p = RSTRING_PTR(str);
11003 len = RSTRING_LEN(str);
11004 pend = p + len;
11005
11006 while (p < pend) {
11007 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11008 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11009 str_mod_check(str, ptr, len);
11010 sum0 = 0;
11011 }
11012 sum0 += (unsigned char)*p;
11013 p++;
11014 }
11015
11016 if (bits == 0) {
11017 if (sum0) {
11018 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11019 }
11020 }
11021 else {
11022 if (sum == INT2FIX(0)) {
11023 if (bits < (int)sizeof(long)*CHAR_BIT) {
11024 sum0 &= (((unsigned long)1)<<bits)-1;
11025 }
11026 sum = LONG2FIX(sum0);
11027 }
11028 else {
11029 VALUE mod;
11030
11031 if (sum0) {
11032 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11033 }
11034
11035 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11036 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11037 sum = rb_funcall(sum, '&', 1, mod);
11038 }
11039 }
11040 return sum;
11041}
11042
11043static VALUE
11044rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11045{
11046 rb_encoding *enc;
11047 VALUE w;
11048 long width, len, flen = 1, fclen = 1;
11049 VALUE res;
11050 char *p;
11051 const char *f = " ";
11052 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11053 VALUE pad;
11054 int singlebyte = 1, cr;
11055 int termlen;
11056
11057 rb_scan_args(argc, argv, "11", &w, &pad);
11058 enc = STR_ENC_GET(str);
11059 termlen = rb_enc_mbminlen(enc);
11060 width = NUM2LONG(w);
11061 if (argc == 2) {
11062 StringValue(pad);
11063 enc = rb_enc_check(str, pad);
11064 f = RSTRING_PTR(pad);
11065 flen = RSTRING_LEN(pad);
11066 fclen = str_strlen(pad, enc); /* rb_enc_check */
11067 singlebyte = single_byte_optimizable(pad);
11068 if (flen == 0 || fclen == 0) {
11069 rb_raise(rb_eArgError, "zero width padding");
11070 }
11071 }
11072 len = str_strlen(str, enc); /* rb_enc_check */
11073 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11074 n = width - len;
11075 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11076 rlen = n - llen;
11077 cr = ENC_CODERANGE(str);
11078 if (flen > 1) {
11079 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11080 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11081 }
11082 size = RSTRING_LEN(str);
11083 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11084 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11085 (len += llen2 + rlen2) >= LONG_MAX - size) {
11086 rb_raise(rb_eArgError, "argument too big");
11087 }
11088 len += size;
11089 res = str_enc_new(rb_cString, 0, len, enc);
11090 p = RSTRING_PTR(res);
11091 if (flen <= 1) {
11092 memset(p, *f, llen);
11093 p += llen;
11094 }
11095 else {
11096 while (llen >= fclen) {
11097 memcpy(p,f,flen);
11098 p += flen;
11099 llen -= fclen;
11100 }
11101 if (llen > 0) {
11102 memcpy(p, f, llen2);
11103 p += llen2;
11104 }
11105 }
11106 memcpy(p, RSTRING_PTR(str), size);
11107 p += size;
11108 if (flen <= 1) {
11109 memset(p, *f, rlen);
11110 p += rlen;
11111 }
11112 else {
11113 while (rlen >= fclen) {
11114 memcpy(p,f,flen);
11115 p += flen;
11116 rlen -= fclen;
11117 }
11118 if (rlen > 0) {
11119 memcpy(p, f, rlen2);
11120 p += rlen2;
11121 }
11122 }
11123 TERM_FILL(p, termlen);
11124 STR_SET_LEN(res, p-RSTRING_PTR(res));
11125
11126 if (argc == 2)
11127 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11128 if (cr != ENC_CODERANGE_BROKEN)
11129 ENC_CODERANGE_SET(res, cr);
11130
11131 RB_GC_GUARD(pad);
11132 return res;
11133}
11134
11135
11136/*
11137 * call-seq:
11138 * ljust(width, pad_string = ' ') -> new_string
11139 *
11140 * :include: doc/string/ljust.rdoc
11141 *
11142 */
11143
11144static VALUE
11145rb_str_ljust(int argc, VALUE *argv, VALUE str)
11146{
11147 return rb_str_justify(argc, argv, str, 'l');
11148}
11149
11150/*
11151 * call-seq:
11152 * rjust(width, pad_string = ' ') -> new_string
11153 *
11154 * :include: doc/string/rjust.rdoc
11155 *
11156 */
11157
11158static VALUE
11159rb_str_rjust(int argc, VALUE *argv, VALUE str)
11160{
11161 return rb_str_justify(argc, argv, str, 'r');
11162}
11163
11164
11165/*
11166 * call-seq:
11167 * center(size, pad_string = ' ') -> new_string
11168 *
11169 * :include: doc/string/center.rdoc
11170 *
11171 */
11172
11173static VALUE
11174rb_str_center(int argc, VALUE *argv, VALUE str)
11175{
11176 return rb_str_justify(argc, argv, str, 'c');
11177}
11178
11179/*
11180 * call-seq:
11181 * partition(pattern) -> [pre_match, first_match, post_match]
11182 *
11183 * :include: doc/string/partition.rdoc
11184 *
11185 */
11186
11187static VALUE
11188rb_str_partition(VALUE str, VALUE sep)
11189{
11190 long pos;
11191
11192 sep = get_pat_quoted(sep, 0);
11193 if (RB_TYPE_P(sep, T_REGEXP)) {
11194 if (rb_reg_search(sep, str, 0, 0) < 0) {
11195 goto failed;
11196 }
11197 VALUE match = rb_backref_get();
11198 struct re_registers *regs = RMATCH_REGS(match);
11199
11200 pos = BEG(0);
11201 sep = rb_str_subseq(str, pos, END(0) - pos);
11202 }
11203 else {
11204 pos = rb_str_index(str, sep, 0);
11205 if (pos < 0) goto failed;
11206 }
11207 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11208 sep,
11209 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11210 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11211
11212 failed:
11213 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11214}
11215
11216/*
11217 * call-seq:
11218 * rpartition(pattern) -> [pre_match, last_match, post_match]
11219 *
11220 * :include: doc/string/rpartition.rdoc
11221 *
11222 */
11223
11224static VALUE
11225rb_str_rpartition(VALUE str, VALUE sep)
11226{
11227 long pos = RSTRING_LEN(str);
11228
11229 sep = get_pat_quoted(sep, 0);
11230 if (RB_TYPE_P(sep, T_REGEXP)) {
11231 if (rb_reg_search(sep, str, pos, 1) < 0) {
11232 goto failed;
11233 }
11234 VALUE match = rb_backref_get();
11235 struct re_registers *regs = RMATCH_REGS(match);
11236
11237 pos = BEG(0);
11238 sep = rb_str_subseq(str, pos, END(0) - pos);
11239 }
11240 else {
11241 pos = rb_str_sublen(str, pos);
11242 pos = rb_str_rindex(str, sep, pos);
11243 if (pos < 0) {
11244 goto failed;
11245 }
11246 }
11247
11248 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11249 sep,
11250 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11251 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11252 failed:
11253 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11254}
11255
11256/*
11257 * call-seq:
11258 * start_with?(*patterns) -> true or false
11259 *
11260 * :include: doc/string/start_with_p.rdoc
11261 *
11262 */
11263
11264static VALUE
11265rb_str_start_with(int argc, VALUE *argv, VALUE str)
11266{
11267 int i;
11268
11269 for (i=0; i<argc; i++) {
11270 VALUE tmp = argv[i];
11271 if (RB_TYPE_P(tmp, T_REGEXP)) {
11272 if (rb_reg_start_with_p(tmp, str))
11273 return Qtrue;
11274 }
11275 else {
11276 const char *p, *s, *e;
11277 long slen, tlen;
11278 rb_encoding *enc;
11279
11280 StringValue(tmp);
11281 enc = rb_enc_check(str, tmp);
11282 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11283 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11284 p = RSTRING_PTR(str);
11285 e = p + slen;
11286 s = p + tlen;
11287 if (!at_char_right_boundary(p, s, e, enc))
11288 continue;
11289 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11290 return Qtrue;
11291 }
11292 }
11293 return Qfalse;
11294}
11295
11296/*
11297 * call-seq:
11298 * end_with?(*strings) -> true or false
11299 *
11300 * :include: doc/string/end_with_p.rdoc
11301 *
11302 */
11303
11304static VALUE
11305rb_str_end_with(int argc, VALUE *argv, VALUE str)
11306{
11307 int i;
11308
11309 for (i=0; i<argc; i++) {
11310 VALUE tmp = argv[i];
11311 const char *p, *s, *e;
11312 long slen, tlen;
11313 rb_encoding *enc;
11314
11315 StringValue(tmp);
11316 enc = rb_enc_check(str, tmp);
11317 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11318 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11319 p = RSTRING_PTR(str);
11320 e = p + slen;
11321 s = e - tlen;
11322 if (!at_char_boundary(p, s, e, enc))
11323 continue;
11324 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11325 return Qtrue;
11326 }
11327 return Qfalse;
11328}
11329
11339static long
11340deleted_prefix_length(VALUE str, VALUE prefix)
11341{
11342 const char *strptr, *prefixptr;
11343 long olen, prefixlen;
11344 rb_encoding *enc = rb_enc_get(str);
11345
11346 StringValue(prefix);
11347
11348 if (!is_broken_string(prefix) ||
11349 !rb_enc_asciicompat(enc) ||
11350 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11351 enc = rb_enc_check(str, prefix);
11352 }
11353
11354 /* return 0 if not start with prefix */
11355 prefixlen = RSTRING_LEN(prefix);
11356 if (prefixlen <= 0) return 0;
11357 olen = RSTRING_LEN(str);
11358 if (olen < prefixlen) return 0;
11359 strptr = RSTRING_PTR(str);
11360 prefixptr = RSTRING_PTR(prefix);
11361 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11362 if (is_broken_string(prefix)) {
11363 if (!is_broken_string(str)) {
11364 /* prefix in a valid string cannot be broken */
11365 return 0;
11366 }
11367 const char *strend = strptr + olen;
11368 const char *after_prefix = strptr + prefixlen;
11369 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11370 /* prefix does not end at char-boundary */
11371 return 0;
11372 }
11373 }
11374 /* prefix part in `str` also should be valid. */
11375
11376 return prefixlen;
11377}
11378
11379/*
11380 * call-seq:
11381 * delete_prefix!(prefix) -> self or nil
11382 *
11383 * Like String#delete_prefix, except that +self+ is modified in place;
11384 * returns +self+ if the prefix is removed, +nil+ otherwise.
11385 *
11386 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11387 */
11388
11389static VALUE
11390rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11391{
11392 long prefixlen;
11393 str_modify_keep_cr(str);
11394
11395 prefixlen = deleted_prefix_length(str, prefix);
11396 if (prefixlen <= 0) return Qnil;
11397
11398 return rb_str_drop_bytes(str, prefixlen);
11399}
11400
11401/*
11402 * call-seq:
11403 * delete_prefix(prefix) -> new_string
11404 *
11405 * :include: doc/string/delete_prefix.rdoc
11406 *
11407 */
11408
11409static VALUE
11410rb_str_delete_prefix(VALUE str, VALUE prefix)
11411{
11412 long prefixlen;
11413
11414 prefixlen = deleted_prefix_length(str, prefix);
11415 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11416
11417 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11418}
11419
11429static long
11430deleted_suffix_length(VALUE str, VALUE suffix)
11431{
11432 const char *strptr, *suffixptr;
11433 long olen, suffixlen;
11434 rb_encoding *enc;
11435
11436 StringValue(suffix);
11437 if (is_broken_string(suffix)) return 0;
11438 enc = rb_enc_check(str, suffix);
11439
11440 /* return 0 if not start with suffix */
11441 suffixlen = RSTRING_LEN(suffix);
11442 if (suffixlen <= 0) return 0;
11443 olen = RSTRING_LEN(str);
11444 if (olen < suffixlen) return 0;
11445 strptr = RSTRING_PTR(str);
11446 suffixptr = RSTRING_PTR(suffix);
11447 const char *strend = strptr + olen;
11448 const char *before_suffix = strend - suffixlen;
11449 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11450 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11451
11452 return suffixlen;
11453}
11454
11455/*
11456 * call-seq:
11457 * delete_suffix!(suffix) -> self or nil
11458 *
11459 * Like String#delete_suffix, except that +self+ is modified in place;
11460 * returns +self+ if the suffix is removed, +nil+ otherwise.
11461 *
11462 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11463 */
11464
11465static VALUE
11466rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11467{
11468 long olen, suffixlen, len;
11469 str_modifiable(str);
11470
11471 suffixlen = deleted_suffix_length(str, suffix);
11472 if (suffixlen <= 0) return Qnil;
11473
11474 olen = RSTRING_LEN(str);
11475 str_modify_keep_cr(str);
11476 len = olen - suffixlen;
11477 STR_SET_LEN(str, len);
11478 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11479 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11481 }
11482 return str;
11483}
11484
11485/*
11486 * call-seq:
11487 * delete_suffix(suffix) -> new_string
11488 *
11489 * :include: doc/string/delete_suffix.rdoc
11490 *
11491 */
11492
11493static VALUE
11494rb_str_delete_suffix(VALUE str, VALUE suffix)
11495{
11496 long suffixlen;
11497
11498 suffixlen = deleted_suffix_length(str, suffix);
11499 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11500
11501 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11502}
11503
11504void
11505rb_str_setter(VALUE val, ID id, VALUE *var)
11506{
11507 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11508 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11509 }
11510 *var = val;
11511}
11512
11513static void
11514nil_setter_warning(ID id)
11515{
11516 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11517}
11518
11519void
11520rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11521{
11522 rb_str_setter(val, id, var);
11523 if (!NIL_P(*var)) {
11524 nil_setter_warning(id);
11525 }
11526}
11527
11528static void
11529rb_fs_setter(VALUE val, ID id, VALUE *var)
11530{
11531 val = rb_fs_check(val);
11532 if (!val) {
11533 rb_raise(rb_eTypeError,
11534 "value of %"PRIsVALUE" must be String or Regexp",
11535 rb_id2str(id));
11536 }
11537 if (!NIL_P(val)) {
11538 nil_setter_warning(id);
11539 }
11540 *var = val;
11541}
11542
11543
11544/*
11545 * call-seq:
11546 * force_encoding(encoding) -> self
11547 *
11548 * :include: doc/string/force_encoding.rdoc
11549 *
11550 */
11551
11552static VALUE
11553rb_str_force_encoding(VALUE str, VALUE enc)
11554{
11555 str_modifiable(str);
11556
11557 rb_encoding *encoding = rb_to_encoding(enc);
11558 int idx = rb_enc_to_index(encoding);
11559
11560 // If the encoding is unchanged, we do nothing.
11561 if (ENCODING_GET(str) == idx) {
11562 return str;
11563 }
11564
11565 rb_enc_associate_index(str, idx);
11566
11567 // If the coderange was 7bit and the new encoding is ASCII-compatible
11568 // we can keep the coderange.
11569 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11570 return str;
11571 }
11572
11574 return str;
11575}
11576
11577/*
11578 * call-seq:
11579 * b -> new_string
11580 *
11581 * :include: doc/string/b.rdoc
11582 *
11583 */
11584
11585static VALUE
11586rb_str_b(VALUE str)
11587{
11588 VALUE str2;
11589 if (STR_EMBED_P(str)) {
11590 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11591 }
11592 else {
11593 str2 = str_alloc_heap(rb_cString);
11594 }
11595 str_replace_shared_without_enc(str2, str);
11596
11597 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11598 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11599 // If we know the receiver's code range then we know the result's code range.
11600 int cr = ENC_CODERANGE(str);
11601 switch (cr) {
11602 case ENC_CODERANGE_7BIT:
11604 break;
11608 break;
11609 default:
11610 ENC_CODERANGE_CLEAR(str2);
11611 break;
11612 }
11613 }
11614
11615 return str2;
11616}
11617
11618/*
11619 * call-seq:
11620 * valid_encoding? -> true or false
11621 *
11622 * :include: doc/string/valid_encoding_p.rdoc
11623 *
11624 */
11625
11626static VALUE
11627rb_str_valid_encoding_p(VALUE str)
11628{
11629 int cr = rb_enc_str_coderange(str);
11630
11631 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11632}
11633
11634/*
11635 * call-seq:
11636 * ascii_only? -> true or false
11637 *
11638 * Returns whether +self+ contains only ASCII characters:
11639 *
11640 * 'abc'.ascii_only? # => true
11641 * "abc\u{6666}".ascii_only? # => false
11642 *
11643 * Related: see {Querying}[rdoc-ref:String@Querying].
11644 */
11645
11646static VALUE
11647rb_str_is_ascii_only_p(VALUE str)
11648{
11649 int cr = rb_enc_str_coderange(str);
11650
11651 return RBOOL(cr == ENC_CODERANGE_7BIT);
11652}
11653
11654VALUE
11656{
11657 static const char ellipsis[] = "...";
11658 const long ellipsislen = sizeof(ellipsis) - 1;
11659 rb_encoding *const enc = rb_enc_get(str);
11660 const long blen = RSTRING_LEN(str);
11661 const char *const p = RSTRING_PTR(str), *e = p + blen;
11662 VALUE estr, ret = 0;
11663
11664 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11665 if (len * rb_enc_mbminlen(enc) >= blen ||
11666 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11667 ret = str;
11668 }
11669 else if (len <= ellipsislen ||
11670 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11671 if (rb_enc_asciicompat(enc)) {
11672 ret = rb_str_new(ellipsis, len);
11673 rb_enc_associate(ret, enc);
11674 }
11675 else {
11676 estr = rb_usascii_str_new(ellipsis, len);
11677 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11678 }
11679 }
11680 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11681 rb_str_cat(ret, ellipsis, ellipsislen);
11682 }
11683 else {
11684 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11685 rb_enc_from_encoding(enc), 0, Qnil);
11686 rb_str_append(ret, estr);
11687 }
11688 return ret;
11689}
11690
11691static VALUE
11692str_compat_and_valid(VALUE str, rb_encoding *enc)
11693{
11694 int cr;
11695 str = StringValue(str);
11696 cr = rb_enc_str_coderange(str);
11697 if (cr == ENC_CODERANGE_BROKEN) {
11698 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11699 }
11700 else {
11701 rb_encoding *e = STR_ENC_GET(str);
11702 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11703 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11704 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11705 }
11706 }
11707 return str;
11708}
11709
11710static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11711
11712VALUE
11714{
11715 rb_encoding *enc = STR_ENC_GET(str);
11716 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11717}
11718
11719VALUE
11720rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11721{
11722 int cr = ENC_CODERANGE_UNKNOWN;
11723 if (enc == STR_ENC_GET(str)) {
11724 /* cached coderange makes sense only when enc equals the
11725 * actual encoding of str */
11726 cr = ENC_CODERANGE(str);
11727 }
11728 return enc_str_scrub(enc, str, repl, cr);
11729}
11730
11731static VALUE
11732enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11733{
11734 int encidx;
11735 VALUE buf = Qnil;
11736 const char *rep, *p, *e, *p1, *sp;
11737 long replen = -1;
11738 long slen;
11739
11740 if (rb_block_given_p()) {
11741 if (!NIL_P(repl))
11742 rb_raise(rb_eArgError, "both of block and replacement given");
11743 replen = 0;
11744 }
11745
11746 if (ENC_CODERANGE_CLEAN_P(cr))
11747 return Qnil;
11748
11749 if (!NIL_P(repl)) {
11750 repl = str_compat_and_valid(repl, enc);
11751 }
11752
11753 if (rb_enc_dummy_p(enc)) {
11754 return Qnil;
11755 }
11756 encidx = rb_enc_to_index(enc);
11757
11758#define DEFAULT_REPLACE_CHAR(str) do { \
11759 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11760 rep = replace; replen = (int)sizeof(replace); \
11761 } while (0)
11762
11763 slen = RSTRING_LEN(str);
11764 p = RSTRING_PTR(str);
11765 e = RSTRING_END(str);
11766 p1 = p;
11767 sp = p;
11768
11769 if (rb_enc_asciicompat(enc)) {
11770 int rep7bit_p;
11771 if (!replen) {
11772 rep = NULL;
11773 rep7bit_p = FALSE;
11774 }
11775 else if (!NIL_P(repl)) {
11776 rep = RSTRING_PTR(repl);
11777 replen = RSTRING_LEN(repl);
11778 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11779 }
11780 else if (encidx == rb_utf8_encindex()) {
11781 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11782 rep7bit_p = FALSE;
11783 }
11784 else {
11785 DEFAULT_REPLACE_CHAR("?");
11786 rep7bit_p = TRUE;
11787 }
11788 cr = ENC_CODERANGE_7BIT;
11789
11790 p = search_nonascii(p, e);
11791 if (!p) {
11792 p = e;
11793 }
11794 while (p < e) {
11795 int ret = rb_enc_precise_mbclen(p, e, enc);
11796 if (MBCLEN_NEEDMORE_P(ret)) {
11797 break;
11798 }
11799 else if (MBCLEN_CHARFOUND_P(ret)) {
11801 p += MBCLEN_CHARFOUND_LEN(ret);
11802 }
11803 else if (MBCLEN_INVALID_P(ret)) {
11804 /*
11805 * p1~p: valid ascii/multibyte chars
11806 * p ~e: invalid bytes + unknown bytes
11807 */
11808 long clen = rb_enc_mbmaxlen(enc);
11809 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11810 if (p > p1) {
11811 rb_str_buf_cat(buf, p1, p - p1);
11812 }
11813
11814 if (e - p < clen) clen = e - p;
11815 if (clen <= 2) {
11816 clen = 1;
11817 }
11818 else {
11819 const char *q = p;
11820 clen--;
11821 for (; clen > 1; clen--) {
11822 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11823 if (MBCLEN_NEEDMORE_P(ret)) break;
11824 if (MBCLEN_INVALID_P(ret)) continue;
11826 }
11827 }
11828 if (rep) {
11829 rb_str_buf_cat(buf, rep, replen);
11830 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11831 }
11832 else {
11833 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11834 str_mod_check(str, sp, slen);
11835 repl = str_compat_and_valid(repl, enc);
11836 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11839 }
11840 p += clen;
11841 p1 = p;
11842 p = search_nonascii(p, e);
11843 if (!p) {
11844 p = e;
11845 break;
11846 }
11847 }
11848 else {
11850 }
11851 }
11852 if (NIL_P(buf)) {
11853 if (p == e) {
11854 ENC_CODERANGE_SET(str, cr);
11855 return Qnil;
11856 }
11857 buf = rb_str_buf_new(RSTRING_LEN(str));
11858 }
11859 if (p1 < p) {
11860 rb_str_buf_cat(buf, p1, p - p1);
11861 }
11862 if (p < e) {
11863 if (rep) {
11864 rb_str_buf_cat(buf, rep, replen);
11865 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11866 }
11867 else {
11868 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11869 str_mod_check(str, sp, slen);
11870 repl = str_compat_and_valid(repl, enc);
11871 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11874 }
11875 }
11876 }
11877 else {
11878 /* ASCII incompatible */
11879 long mbminlen = rb_enc_mbminlen(enc);
11880 if (!replen) {
11881 rep = NULL;
11882 }
11883 else if (!NIL_P(repl)) {
11884 rep = RSTRING_PTR(repl);
11885 replen = RSTRING_LEN(repl);
11886 }
11887 else if (encidx == ENCINDEX_UTF_16BE) {
11888 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11889 }
11890 else if (encidx == ENCINDEX_UTF_16LE) {
11891 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11892 }
11893 else if (encidx == ENCINDEX_UTF_32BE) {
11894 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11895 }
11896 else if (encidx == ENCINDEX_UTF_32LE) {
11897 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11898 }
11899 else {
11900 DEFAULT_REPLACE_CHAR("?");
11901 }
11902
11903 while (p < e) {
11904 int ret = rb_enc_precise_mbclen(p, e, enc);
11905 if (MBCLEN_NEEDMORE_P(ret)) {
11906 break;
11907 }
11908 else if (MBCLEN_CHARFOUND_P(ret)) {
11909 p += MBCLEN_CHARFOUND_LEN(ret);
11910 }
11911 else if (MBCLEN_INVALID_P(ret)) {
11912 const char *q = p;
11913 long clen = rb_enc_mbmaxlen(enc);
11914 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11915 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11916
11917 if (e - p < clen) clen = e - p;
11918 if (clen <= mbminlen * 2) {
11919 clen = mbminlen;
11920 }
11921 else {
11922 clen -= mbminlen;
11923 for (; clen > mbminlen; clen-=mbminlen) {
11924 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11925 if (MBCLEN_NEEDMORE_P(ret)) break;
11926 if (MBCLEN_INVALID_P(ret)) continue;
11928 }
11929 }
11930 if (rep) {
11931 rb_str_buf_cat(buf, rep, replen);
11932 }
11933 else {
11934 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11935 str_mod_check(str, sp, slen);
11936 repl = str_compat_and_valid(repl, enc);
11937 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11938 }
11939 p += clen;
11940 p1 = p;
11941 }
11942 else {
11944 }
11945 }
11946 if (NIL_P(buf)) {
11947 if (p == e) {
11949 return Qnil;
11950 }
11951 buf = rb_str_buf_new(RSTRING_LEN(str));
11952 }
11953 if (p1 < p) {
11954 rb_str_buf_cat(buf, p1, p - p1);
11955 }
11956 if (p < e) {
11957 if (rep) {
11958 rb_str_buf_cat(buf, rep, replen);
11959 }
11960 else {
11961 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11962 str_mod_check(str, sp, slen);
11963 repl = str_compat_and_valid(repl, enc);
11964 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11965 }
11966 }
11968 }
11969 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11970 return buf;
11971}
11972
11973/*
11974 * call-seq:
11975 * scrub(replacement_string = default_replacement_string) -> new_string
11976 * scrub{|sequence| ... } -> new_string
11977 *
11978 * :include: doc/string/scrub.rdoc
11979 *
11980 */
11981static VALUE
11982str_scrub(int argc, VALUE *argv, VALUE str)
11983{
11984 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11985 VALUE new = rb_str_scrub(str, repl);
11986 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11987}
11988
11989/*
11990 * call-seq:
11991 * scrub!(replacement_string = default_replacement_string) -> self
11992 * scrub!{|sequence| ... } -> self
11993 *
11994 * Like String#scrub, except that:
11995 *
11996 * - Any replacements are made in +self+.
11997 * - Returns +self+.
11998 *
11999 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12000 *
12001 */
12002static VALUE
12003str_scrub_bang(int argc, VALUE *argv, VALUE str)
12004{
12005 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12006 VALUE new = rb_str_scrub(str, repl);
12007 if (!NIL_P(new)) rb_str_replace(str, new);
12008 return str;
12009}
12010
12011static ID id_normalize;
12012static ID id_normalized_p;
12013static VALUE mUnicodeNormalize;
12014
12015static VALUE
12016unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12017{
12018 static int UnicodeNormalizeRequired = 0;
12019 VALUE argv2[2];
12020
12021 if (!UnicodeNormalizeRequired) {
12022 rb_require("unicode_normalize/normalize.rb");
12023 UnicodeNormalizeRequired = 1;
12024 }
12025 argv2[0] = str;
12026 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12027 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12028}
12029
12030/*
12031 * call-seq:
12032 * unicode_normalize(form = :nfc) -> string
12033 *
12034 * :include: doc/string/unicode_normalize.rdoc
12035 *
12036 */
12037static VALUE
12038rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12039{
12040 return unicode_normalize_common(argc, argv, str, id_normalize);
12041}
12042
12043/*
12044 * call-seq:
12045 * unicode_normalize!(form = :nfc) -> self
12046 *
12047 * Like String#unicode_normalize, except that the normalization
12048 * is performed on +self+ (not on a copy of +self+).
12049 *
12050 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12051 *
12052 */
12053static VALUE
12054rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12055{
12056 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12057}
12058
12059/* call-seq:
12060 * unicode_normalized?(form = :nfc) -> true or false
12061 *
12062 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12063 * see String#unicode_normalize.
12064 *
12065 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12066 *
12067 * Examples:
12068 *
12069 * "a\u0300".unicode_normalized? # => false
12070 * "a\u0300".unicode_normalized?(:nfd) # => true
12071 * "\u00E0".unicode_normalized? # => true
12072 * "\u00E0".unicode_normalized?(:nfd) # => false
12073 *
12074 *
12075 * Raises an exception if +self+ is not in a Unicode encoding:
12076 *
12077 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12078 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12079 *
12080 * Related: see {Querying}[rdoc-ref:String@Querying].
12081 */
12082static VALUE
12083rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12084{
12085 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12086}
12087
12088/**********************************************************************
12089 * Document-class: Symbol
12090 *
12091 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12092 *
12093 * You can create a +Symbol+ object explicitly with:
12094 *
12095 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12096 *
12097 * The same +Symbol+ object will be
12098 * created for a given name or string for the duration of a program's
12099 * execution, regardless of the context or meaning of that name. Thus
12100 * if <code>Fred</code> is a constant in one context, a method in
12101 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12102 * will be the same object in all three contexts.
12103 *
12104 * module One
12105 * class Fred
12106 * end
12107 * $f1 = :Fred
12108 * end
12109 * module Two
12110 * Fred = 1
12111 * $f2 = :Fred
12112 * end
12113 * def Fred()
12114 * end
12115 * $f3 = :Fred
12116 * $f1.object_id #=> 2514190
12117 * $f2.object_id #=> 2514190
12118 * $f3.object_id #=> 2514190
12119 *
12120 * Constant, method, and variable names are returned as symbols:
12121 *
12122 * module One
12123 * Two = 2
12124 * def three; 3 end
12125 * @four = 4
12126 * @@five = 5
12127 * $six = 6
12128 * end
12129 * seven = 7
12130 *
12131 * One.constants
12132 * # => [:Two]
12133 * One.instance_methods(true)
12134 * # => [:three]
12135 * One.instance_variables
12136 * # => [:@four]
12137 * One.class_variables
12138 * # => [:@@five]
12139 * global_variables.grep(/six/)
12140 * # => [:$six]
12141 * local_variables
12142 * # => [:seven]
12143 *
12144 * A +Symbol+ object differs from a String object in that
12145 * a +Symbol+ object represents an identifier, while a String object
12146 * represents text or data.
12147 *
12148 * == What's Here
12149 *
12150 * First, what's elsewhere. Class +Symbol+:
12151 *
12152 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12153 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12154 *
12155 * Here, class +Symbol+ provides methods that are useful for:
12156 *
12157 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12158 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12159 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12160 *
12161 * === Methods for Querying
12162 *
12163 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12164 * - #=~: Returns the index of the first substring in symbol that matches a
12165 * given Regexp or other object; returns +nil+ if no match is found.
12166 * - #[], #slice: Returns a substring of symbol
12167 * determined by a given index, start/length, range, regexp, or string.
12168 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12169 * - #encoding: Returns the Encoding object that represents the encoding
12170 * of symbol.
12171 * - #end_with?: Returns +true+ if symbol ends with
12172 * any of the given strings.
12173 * - #match: Returns a MatchData object if symbol
12174 * matches a given Regexp; +nil+ otherwise.
12175 * - #match?: Returns +true+ if symbol
12176 * matches a given Regexp; +false+ otherwise.
12177 * - #length, #size: Returns the number of characters in symbol.
12178 * - #start_with?: Returns +true+ if symbol starts with
12179 * any of the given strings.
12180 *
12181 * === Methods for Comparing
12182 *
12183 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12184 * or larger than a given symbol.
12185 * - #==, #===: Returns +true+ if a given symbol has the same content and
12186 * encoding.
12187 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12188 * is smaller than, equal to, or larger than a given symbol.
12189 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12190 * after Unicode case folding; +false+ otherwise.
12191 *
12192 * === Methods for Converting
12193 *
12194 * - #capitalize: Returns symbol with the first character upcased
12195 * and all other characters downcased.
12196 * - #downcase: Returns symbol with all characters downcased.
12197 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12198 * - #name: Returns the frozen string corresponding to symbol.
12199 * - #succ, #next: Returns the symbol that is the successor to symbol.
12200 * - #swapcase: Returns symbol with all upcase characters downcased
12201 * and all downcase characters upcased.
12202 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12203 * - #to_s, #id2name: Returns the string corresponding to +self+.
12204 * - #to_sym, #intern: Returns +self+.
12205 * - #upcase: Returns symbol with all characters upcased.
12206 *
12207 */
12208
12209
12210/*
12211 * call-seq:
12212 * symbol == object -> true or false
12213 *
12214 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12215 */
12216
12217#define sym_equal rb_obj_equal /* alias only; Init_String registers it as both Symbol#== and Symbol#=== */
12218
12219static int
12220sym_printable(const char *s, const char *send, rb_encoding *enc)
12221{
12222 while (s < send) {
12223 int n;
12224 int c = rb_enc_precise_mbclen(s, send, enc);
12225
12226 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12227 n = MBCLEN_CHARFOUND_LEN(c);
12228 c = rb_enc_mbc_to_codepoint(s, send, enc);
12229 if (!rb_enc_isprint(c, enc)) return FALSE;
12230 s += n;
12231 }
12232 return TRUE;
12233}
12234
12235int
12236rb_str_symname_p(VALUE sym)
12237{
12238 rb_encoding *enc;
12239 const char *ptr;
12240 long len;
12241 rb_encoding *resenc = rb_default_internal_encoding();
12242
12243 if (resenc == NULL) resenc = rb_default_external_encoding();
12244 enc = STR_ENC_GET(sym);
12245 ptr = RSTRING_PTR(sym);
12246 len = RSTRING_LEN(sym);
12247 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12248 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12249 return FALSE;
12250 }
12251 return TRUE;
12252}
12253
12254VALUE
12255rb_str_quote_unprintable(VALUE str)
12256{
12257 rb_encoding *enc;
12258 const char *ptr;
12259 long len;
12260 rb_encoding *resenc;
12261
12262 Check_Type(str, T_STRING);
12263 resenc = rb_default_internal_encoding();
12264 if (resenc == NULL) resenc = rb_default_external_encoding();
12265 enc = STR_ENC_GET(str);
12266 ptr = RSTRING_PTR(str);
12267 len = RSTRING_LEN(str);
12268 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12269 !sym_printable(ptr, ptr + len, enc)) {
12270 return rb_str_escape(str);
12271 }
12272 return str;
12273}
12274
12275VALUE
12276rb_id_quote_unprintable(ID id)
12277{
12278 VALUE str = rb_id2str(id);
12279 if (!rb_str_symname_p(str)) {
12280 return rb_str_escape(str);
12281 }
12282 return str;
12283}
12284
12285/*
12286 * call-seq:
12287 * inspect -> string
12288 *
12289 * Returns a string representation of +self+ (including the leading colon):
12290 *
12291 * :foo.inspect # => ":foo"
12292 *
12293 * Related: Symbol#to_s, Symbol#name.
12294 *
12295 */
12296
12297static VALUE
12298sym_inspect(VALUE sym)
12299{
12300 VALUE str = rb_sym2str(sym);
12301 const char *ptr;
12302 long len;
12303 char *dest;
12304
12305 if (!rb_str_symname_p(str)) {
12306 str = rb_str_inspect(str);
12307 len = RSTRING_LEN(str);
12308 rb_str_resize(str, len + 1);
12309 dest = RSTRING_PTR(str);
12310 memmove(dest + 1, dest, len);
12311 }
12312 else {
12313 rb_encoding *enc = STR_ENC_GET(str);
12314 VALUE orig_str = str;
12315
12316 len = RSTRING_LEN(orig_str);
12317 str = rb_enc_str_new(0, len + 1, enc);
12318
12319 // Get data pointer after allocation
12320 ptr = RSTRING_PTR(orig_str);
12321 dest = RSTRING_PTR(str);
12322 memcpy(dest + 1, ptr, len);
12323
12324 RB_GC_GUARD(orig_str);
12325 }
12326 dest[0] = ':';
12327
12329
12330 return str;
12331}
12332
12333VALUE
12335{
12336 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12337 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12338 return str;
12339}
12340
12341VALUE
12342rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12343{
12344 VALUE obj;
12345
12346 if (argc < 1) {
12347 rb_raise(rb_eArgError, "no receiver given");
12348 }
12349 obj = argv[0];
12350 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12351}
12352
12353/*
12354 * call-seq:
12355 * succ
12356 *
12357 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12358 *
12359 * :foo.succ # => :fop
12360 *
12361 * Related: String#succ.
12362 */
12363
12364static VALUE
12365sym_succ(VALUE sym)
12366{
12367 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12368}
12369
12370/*
12371 * call-seq:
12372 * symbol <=> object -> -1, 0, +1, or nil
12373 *
12374 * If +object+ is a symbol,
12375 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12376 *
12377 * :bar <=> :foo # => -1
12378 * :foo <=> :foo # => 0
12379 * :foo <=> :bar # => 1
12380 *
12381 * Otherwise, returns +nil+:
12382 *
12383 * :foo <=> 'bar' # => nil
12384 *
12385 * Related: String#<=>.
12386 */
12387
12388static VALUE
12389sym_cmp(VALUE sym, VALUE other)
12390{
12391 if (!SYMBOL_P(other)) {
12392 return Qnil;
12393 }
12394 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12395}
12396
12397/*
12398 * call-seq:
12399 * casecmp(object) -> -1, 0, 1, or nil
12400 *
12401 * :include: doc/symbol/casecmp.rdoc
12402 *
12403 */
12404
12405static VALUE
12406sym_casecmp(VALUE sym, VALUE other)
12407{
12408 if (!SYMBOL_P(other)) {
12409 return Qnil;
12410 }
12411 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12412}
12413
12414/*
12415 * call-seq:
12416 * casecmp?(object) -> true, false, or nil
12417 *
12418 * :include: doc/symbol/casecmp_p.rdoc
12419 *
12420 */
12421
12422static VALUE
12423sym_casecmp_p(VALUE sym, VALUE other)
12424{
12425 if (!SYMBOL_P(other)) {
12426 return Qnil;
12427 }
12428 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12429}
12430
12431/*
12432 * call-seq:
12433 * symbol =~ object -> integer or nil
12434 *
12435 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12436 * including possible updates to global variables;
12437 * see String#=~.
12438 *
12439 */
12440
12441static VALUE
12442sym_match(VALUE sym, VALUE other)
12443{
12444 return rb_str_match(rb_sym2str(sym), other);
12445}
12446
12447/*
12448 * call-seq:
12449 * match(pattern, offset = 0) -> matchdata or nil
12450 * match(pattern, offset = 0) {|matchdata| } -> object
12451 *
12452 * Equivalent to <tt>self.to_s.match</tt>,
12453 * including possible updates to global variables;
12454 * see String#match.
12455 *
12456 */
12457
12458static VALUE
12459sym_match_m(int argc, VALUE *argv, VALUE sym)
12460{
12461 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12462}
12463
12464/*
12465 * call-seq:
12466 * match?(pattern, offset) -> true or false
12467 *
12468 * Equivalent to <tt>sym.to_s.match?</tt>;
12469 * see String#match?.
12470 *
12471 */
12472
12473static VALUE
12474sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12475{
12476 return rb_str_match_m_p(argc, argv, sym);
12477}
12478
12479/*
12480 * call-seq:
12481 * symbol[index] -> string or nil
12482 * symbol[start, length] -> string or nil
12483 * symbol[range] -> string or nil
12484 * symbol[regexp, capture = 0] -> string or nil
12485 * symbol[substring] -> string or nil
12486 *
12487 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12488 *
12489 */
12490
12491static VALUE
12492sym_aref(int argc, VALUE *argv, VALUE sym)
12493{
12494 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12495}
12496
12497/*
12498 * call-seq:
12499 * length -> integer
12500 *
12501 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12502 */
12503
12504static VALUE
12505sym_length(VALUE sym)
12506{
12507 return rb_str_length(rb_sym2str(sym));
12508}
12509
12510/*
12511 * call-seq:
12512 * empty? -> true or false
12513 *
12514 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12515 *
12516 */
12517
12518static VALUE
12519sym_empty(VALUE sym)
12520{
12521 return rb_str_empty(rb_sym2str(sym));
12522}
12523
12524/*
12525 * call-seq:
12526 * upcase(mapping) -> symbol
12527 *
12528 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12529 *
12530 * See String#upcase.
12531 *
12532 */
12533
12534static VALUE
12535sym_upcase(int argc, VALUE *argv, VALUE sym)
12536{
12537 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12538}
12539
12540/*
12541 * call-seq:
12542 * downcase(mapping) -> symbol
12543 *
12544 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12545 *
12546 * See String#downcase.
12547 *
12548 * Related: Symbol#upcase.
12549 *
12550 */
12551
12552static VALUE
12553sym_downcase(int argc, VALUE *argv, VALUE sym)
12554{
12555 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12556}
12557
12558/*
12559 * call-seq:
12560 * capitalize(mapping) -> symbol
12561 *
12562 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12563 *
12564 * See String#capitalize.
12565 *
12566 */
12567
12568static VALUE
12569sym_capitalize(int argc, VALUE *argv, VALUE sym)
12570{
12571 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12572}
12573
12574/*
12575 * call-seq:
12576 * swapcase(mapping) -> symbol
12577 *
12578 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12579 *
12580 * See String#swapcase.
12581 *
12582 */
12583
12584static VALUE
12585sym_swapcase(int argc, VALUE *argv, VALUE sym)
12586{
12587 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12588}
12589
12590/*
12591 * call-seq:
12592 * start_with?(*string_or_regexp) -> true or false
12593 *
12594 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12595 *
12596 */
12597
12598static VALUE
12599sym_start_with(int argc, VALUE *argv, VALUE sym)
12600{
12601 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12602}
12603
12604/*
12605 * call-seq:
12606 * end_with?(*strings) -> true or false
12607 *
12608 *
12609 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12610 *
12611 */
12612
12613static VALUE
12614sym_end_with(int argc, VALUE *argv, VALUE sym)
12615{
12616 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12617}
12618
12619/*
12620 * call-seq:
12621 * encoding -> encoding
12622 *
12623 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12624 *
12625 */
12626
12627static VALUE
12628sym_encoding(VALUE sym)
12629{
12630 return rb_obj_encoding(rb_sym2str(sym));
12631}
12632
12633static VALUE
12634string_for_symbol(VALUE name)
12635{
12636 if (!RB_TYPE_P(name, T_STRING)) {
12637 VALUE tmp = rb_check_string_type(name);
12638 if (NIL_P(tmp)) {
12639 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12640 name);
12641 }
12642 name = tmp;
12643 }
12644 return name;
12645}
12646
12647ID
12649{
12650 if (SYMBOL_P(name)) {
12651 return SYM2ID(name);
12652 }
12653 name = string_for_symbol(name);
12654 return rb_intern_str(name);
12655}
12656
12657VALUE
12659{
12660 if (SYMBOL_P(name)) {
12661 return name;
12662 }
12663 name = string_for_symbol(name);
12664 return rb_str_intern(name);
12665}
12666
12667/*
12668 * call-seq:
12669 * Symbol.all_symbols -> array_of_symbols
12670 *
12671 * Returns an array of all symbols currently in Ruby's symbol table:
12672 *
12673 * Symbol.all_symbols.size # => 9334
12674 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12675 *
12676 */
12677
12678static VALUE
12679sym_all_symbols(VALUE _)
12680{
12681 return rb_sym_all_symbols();
12682}
12683
12684VALUE
12685rb_str_to_interned_str(VALUE str)
12686{
12687 return rb_fstring(str);
12688}
12689
12690VALUE
12691rb_interned_str(const char *ptr, long len)
12692{
12693 struct RString fake_str = {RBASIC_INIT};
12694 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12695}
12696
12697VALUE
12699{
12700 return rb_interned_str(ptr, strlen(ptr));
12701}
12702
12703VALUE
12704rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12705{
12706 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12707 rb_enc_autoload(enc);
12708 }
12709
12710 struct RString fake_str = {RBASIC_INIT};
12711 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12712}
12713
12714VALUE
12715rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12716{
12717 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12718 rb_enc_autoload(enc);
12719 }
12720
12721 struct RString fake_str = {RBASIC_INIT};
12722 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12723 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12724 return str;
12725}
12726
12727VALUE
12729{
12730 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12731}
12732
#if USE_YJIT || USE_ZJIT
/*
 * Appends codepoint to str.  Only compiled when USE_YJIT or USE_ZJIT is set.
 *
 * Fast path: when str's inlined encoding index is the ASCII-8BIT one and the
 * code is in the range 0..0xfe, a single byte is appended with
 * rb_str_buf_cat_byte().  Everything else (including 0xff) goes through
 * rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(is_binary)) {
        ssize_t code = RB_NUM2SSIZE(codepoint);
        int single_byte = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12749
12750static int
12751fstring_set_class_i(VALUE *str, void *data)
12752{
12753 RBASIC_SET_CLASS(*str, rb_cString);
12754
12755 return ST_CONTINUE;
12756}
12757
/*
 * Init_String: defines the String and Symbol classes and the UnicodeNormalize
 * module, and registers their C-implemented methods.
 *
 * The last argument of each rb_define_method()/rb_define_singleton_method()
 * call is the arity; handlers registered with -1 are the ones declared as
 * (int argc, VALUE *argv, VALUE self) in this file.
 *
 * NOTE(review): the embedded listing numbers skip in several places (for
 * example 12765, 12773-12774 and 12933-12935), so some statements of the
 * original function appear to be absent from this excerpt -- confirm
 * against the upstream file before relying on this list being complete.
 */
12758void
12759Init_String(void)
12760{
12761 rb_cString = rb_define_class("String", rb_cObject);
12762
    /* give every entry already in fstring_table_obj the new String class (see fstring_set_class_i) */
12763 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12764
    /* String: allocation, construction, comparison and element access */
12766 rb_define_alloc_func(rb_cString, empty_str_alloc);
12767 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12768 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12769 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12771 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12772 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12775 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12776 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12777 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12778 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12781 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12782 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12783 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12784 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12787 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12788 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12789 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12790 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12791 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    /* "succ!" and "next!" share one implementation */
12793 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12795 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12796 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12797 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12798 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12799 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12800 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12801 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12802 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12803 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12804 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12805 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12806 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12807 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12808 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12810 rb_define_method(rb_cString, "+@", str_uplus, 0);
12811 rb_define_method(rb_cString, "-@", str_uminus, 0);
12812 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
    /* "dedup" is an alias of "-@" */
12813 rb_define_alias(rb_cString, "dedup", "-@");
12814
    /* String: conversions; "to_s" and "to_str" share one implementation */
12815 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12816 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12817 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12818 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12821 rb_define_method(rb_cString, "undump", str_undump, 0);
12822
    /* cache symbols in file-level variables; presumably the option names accepted by the case-mapping methods below -- TODO confirm */
12823 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12824 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12825 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12826 sym_fold = ID2SYM(rb_intern_const("fold"));
12827
12828 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12829 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12830 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12831 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12832
12833 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12834 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12835 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12836 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12837
12838 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12839 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12840 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12841 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12842 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12843 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12844 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12845 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12846 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12847 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12848 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12849 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12851 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12852 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    /* "intern" and "to_sym" share one implementation */
12853 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12854 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12855 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12856
12857 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12858 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12859 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12860
12861 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12862
12863 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12864 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12865 rb_define_method(rb_cString, "center", rb_str_center, -1);
12866
    /* non-bang substitution/trimming methods, followed by their "!" counterparts */
12867 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12868 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12869 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12870 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12871 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12872 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12873 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12874 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12875 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12876
12877 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12878 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12879 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12880 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12881 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12882 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12883 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12884 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12885 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12886
12887 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12888 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12889 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12890 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12891 rb_define_method(rb_cString, "count", rb_str_count, -1);
12892
12893 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12894 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12895 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12896 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12897
12898 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12899 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12900 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12901 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12902 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12903
12904 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12905
    /* "slice" shares its implementation with "[]" registered above */
12906 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12907 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12908
12909 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12910 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12911
12912 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12913 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12914 rb_define_method(rb_cString, "b", rb_str_b, 0);
12915 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12916 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12917
12918 /* define UnicodeNormalize module here so that we don't have to look it up */
12919 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12920 id_normalize = rb_intern_const("normalize");
12921 id_normalized_p = rb_intern_const("normalized?");
12922
12923 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12924 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12925 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12926
    /* rb_fs starts as nil, backs both "$;" and "$-F" (writes go through rb_fs_setter), and is registered with the GC */
12927 rb_fs = Qnil;
12928 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12929 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12930 rb_gc_register_address(&rb_fs);
12931
    /* Symbol: most methods are the sym_* wrappers defined earlier in this file */
12932 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12936 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12937
    /* sym_equal is a macro alias for rb_obj_equal */
12938 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12939 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12940 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12941 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12942 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12943 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12944
12945 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12946 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12947 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12948 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12949
12950 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12951 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12952 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12953 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12954 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12955 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12956 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12957
12958 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12959 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12960 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12961 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12962
12963 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12964 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12965
12966 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12967}
12968
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1795
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1588
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1701
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2947
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2767
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3237
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1007
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3026
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2191
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2209
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3605
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3289
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1340
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:945
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1205
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3026
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1224
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12704
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2332
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3730
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1153
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1445
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1346
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:964
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12728
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2027
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2033
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1936
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1750
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1510
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2485
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3795
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1421
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12334
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2558
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1397
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1744
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3054
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5331
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4158
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3151
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11655
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1786
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1187
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:999
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1516
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1994
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4144
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3563
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2421
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2012
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6538
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3159
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12698
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1427
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3761
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3101
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4265
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3385
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7217
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2788
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12691
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4212
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4032
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4187
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3737
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3276
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5815
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11713
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1700
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2948
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3248
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3367
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1199
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2742
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7324
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1409
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1716
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2435
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5733
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9331
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1193
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1848
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2013
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2092
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3382
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1630
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12658
ID rb_to_id(VALUE str)
Definition string.c:12648
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1439
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2925
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2807
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1433
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2820
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1777
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:461
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:208
Definition string.c:8211
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113