Ruby 4.0.0dev (2025-12-21 revision 3fee7dd90d19790950f476614ae53a95b7730592)
string.c (3fee7dd90d19790950f476614ae53a95b7730592)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthand for the `no`-th element of `regs->beg` / `regs->end`.
 * Both macros expand to an access through a variable named `regs`, which
 * must therefore be in scope wherever they are used. */
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents is
95 * on a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): RUBY_MAX_CHAR_LEN is not used in the visible part of the
 * file; presumably the maximum byte length of one character -- confirm. */
131#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits; their meaning is described in the
 * "Flags of RString" comment. */
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Mark `str` as not embedded and clear every sharing-related flag
 * (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Mark `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` in the length field of `str`; nothing else is touched
 * (no terminator is written, no capacity check is made). */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset() is only called for terminators longer than one byte.
 * Both arguments are evaluated exactly once. */
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * required by the string's own encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the request still fits in the
 *   slot; otherwise a heap buffer is allocated, the current contents are
 *   copied into it and the string is switched to the non-embedded form.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared (asserted).
 *
 * `capacity` and `termlen` are parenthesized everywhere they are used, so
 * callers may pass arbitrary expressions (e.g. a conditional expression). */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Make `str` share the buffer owned by `shared_str`.
 * Does nothing when `str` is a fake string (STR_FAKESTR).  Otherwise it
 * asserts that the data pointer of `str` lies within the buffer of
 * `shared_str`, records `shared_str` in `str` via RB_OBJ_WRITE, flags
 * `str` as STR_SHARED and `shared_str` as STR_SHARED_ROOT, and flags
 * a root whose class is 0 as STR_BORROWED. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
211 } \
212} while (0)
213
/* Heap buffer pointer of a non-embedded string. */
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator,
 * because `capa` itself does not count the terminator. */
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
216/* TODO: include the terminator size in capa. */
217
/* Encoding object of `str` (thin alias for get_encoding()). */
218#define STR_ENC_GET(str) get_encoding(str)
219
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING
 * disabled (the default) this is true only when beg + len == end;
 * with it enabled it is always 1. */
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
222#endif
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#else
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
227#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/* Returns true when the GC can allocate an object large enough to embed
 * `len` content bytes plus `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);

    return rb_gc_size_allocatable_p(object_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/* Print a diagnostic to stderr saying that the function named `func` is
 * about to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
/* Defined when a `long` is as wide as a pointer.  In that configuration
 * register_fstring() stashes the precomputed hash of a fake string in its
 * `capa` field and fstring_concurrent_set_hash() reads it back. */
#if SIZEOF_LONG == SIZEOF_VOIDP
#define PRECOMPUTED_FAKESTR_HASH 1
#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options passed from register_fstring() to the table's create callback. */
struct fstr_create_arg {
    bool copy;                  /* read by the create callback for fake strings: copy the bytes vs. keep pointing at them */
    bool force_precompute_hash; /* read by the create callback when copying a fake string */
};
487
/* Create callback of the fstring table: produce the string object that
 * will be stored in the table for the key `str`.  `data` points to a
 * struct fstr_create_arg.  The returned string is frozen, flagged
 * RSTRING_FSTR and marked shareable.
 * NOTE(review): the embedded listing numbers skip 536, 549, 550 and 553,
 * so some statements of the original function are not visible here. */
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
 /* Fake strings must be turned into real objects before being stored. */
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
 /* capa reserves room for the hash stored after the terminator */
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
 /* Embedded copy with the hash stored behind the contents. */
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
 /* Plain copy; the hash stashed in capa is stored only when
  * the new string happens to have room for it. */
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
 /* No copy requested: build a string that points at the same bytes. */
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
 /* Real string: the table entry must be frozen (not merely chilled),
  * unshared and a bare String. */
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
 /* Restore the coderange captured from the key on entry. */
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
617 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
618
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
620
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
622
623 FL_UNSET(obj, RSTRING_FSTR);
624}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
/* Return a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when every byte is 7-bit.
 *
 * The scan has up to three phases: a byte-wise prologue that brings `p`
 * to word alignment (only when unaligned word access is not allowed),
 * a word-at-a-time loop that tests a whole uintptr_t against
 * NONASCII_MASK, and a byte-wise epilogue for the remaining tail.
 * The `case` labels in both switches fall through on purpose. */
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const char *s, *t;
720
 /* NONASCII_MASK has the high bit of every byte of a uintptr_t set. */
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
 /* Prologue: advance p to the next word boundary, then check the
  * `l` skipped bytes in ascending address order. */
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (value)
764#endif
765 s = aligned_ptr(p);
 /* t bounds the loop so that a full word starting at s never reads
  * past e. */
766 t = (e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
768 for (;s < t; s += sizeof(uintptr_t)) {
769 uintptr_t word;
 /* memcpy instead of a pointer cast to load the word */
770 memcpy(&word, s, sizeof(word));
771 if (word & NONASCII_MASK) {
 /* Convert the bit index of the first flagged byte into a byte
  * offset (>>3); which end counts as "first" depends on the
  * byte order. */
772#ifdef WORDS_BIGENDIAN
773 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
774#else
775 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
776#endif
777 }
778 }
779 p = (const char *)s;
780 }
781
 /* Epilogue: fewer than SIZEOF_VOIDP bytes remain; check them in
  * ascending address order, indexed backwards from e. */
782 switch (e - p) {
783 default: UNREACHABLE;
784#if SIZEOF_VOIDP > 4
785 case 7: if (e[-7]&0x80) return e-7;
786 case 6: if (e[-6]&0x80) return e-6;
787 case 5: if (e[-5]&0x80) return e-5;
788 case 4: if (e[-4]&0x80) return e-4;
789#endif
790 case 3: if (e[-3]&0x80) return e-3;
791 case 2: if (e[-2]&0x80) return e-2;
792 case 1: if (e[-1]&0x80) return e-1;
793 case 0: return NULL;
794 }
795}
796
797static int
798coderange_scan(const char *p, long len, rb_encoding *enc)
799{
800 const char *e = p + len;
801
802 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
803 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
804 p = search_nonascii(p, e);
806 }
807
808 if (rb_enc_asciicompat(enc)) {
809 p = search_nonascii(p, e);
810 if (!p) return ENC_CODERANGE_7BIT;
811 for (;;) {
812 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p += MBCLEN_CHARFOUND_LEN(ret);
815 if (p == e) break;
816 p = search_nonascii(p, e);
817 if (!p) break;
818 }
819 }
820 else {
821 while (p < e) {
822 int ret = rb_enc_precise_mbclen(p, e, enc);
824 p += MBCLEN_CHARFOUND_LEN(ret);
825 }
826 }
827 return ENC_CODERANGE_VALID;
828}
829
830long
831rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
832{
833 const char *p = s;
834
835 if (*cr == ENC_CODERANGE_BROKEN)
836 return e - s;
837
838 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
839 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
840 if (*cr == ENC_CODERANGE_VALID) return e - s;
841 p = search_nonascii(p, e);
843 return e - s;
844 }
845 else if (rb_enc_asciicompat(enc)) {
846 p = search_nonascii(p, e);
847 if (!p) {
848 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
849 return e - s;
850 }
851 for (;;) {
852 int ret = rb_enc_precise_mbclen(p, e, enc);
853 if (!MBCLEN_CHARFOUND_P(ret)) {
855 return p - s;
856 }
857 p += MBCLEN_CHARFOUND_LEN(ret);
858 if (p == e) break;
859 p = search_nonascii(p, e);
860 if (!p) break;
861 }
862 }
863 else {
864 while (p < e) {
865 int ret = rb_enc_precise_mbclen(p, e, enc);
866 if (!MBCLEN_CHARFOUND_P(ret)) {
868 return p - s;
869 }
870 p += MBCLEN_CHARFOUND_LEN(ret);
871 }
872 }
874 return e - s;
875}
876
877static inline void
878str_enc_copy(VALUE str1, VALUE str2)
879{
880 rb_enc_set_index(str1, ENCODING_GET(str2));
881}
882
883/* Like str_enc_copy, but does not check frozen status of str1.
884 * You should use this only if you're certain that str1 is not frozen. */
885static inline void
886str_enc_copy_direct(VALUE str1, VALUE str2)
887{
888 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
889 if (inlined_encoding == ENCODING_INLINE_MAX) {
890 rb_enc_set_index(str1, rb_enc_get_index(str2));
891 }
892 else {
893 ENCODING_SET_INLINED(str1, inlined_encoding);
894 }
895}
896
897static void
898rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
899{
900 /* this function is designed for copying encoding and coderange
901 * from src to new string "dest" which is made from the part of src.
902 */
903 str_enc_copy(dest, src);
904 if (RSTRING_LEN(dest) == 0) {
905 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
907 else
909 return;
910 }
911 switch (ENC_CODERANGE(src)) {
914 break;
916 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
917 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
919 else
921 break;
922 default:
923 break;
924 }
925}
926
927static void
928rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
929{
930 str_enc_copy(dest, src);
932}
933
934static int
935enc_coderange_scan(VALUE str, rb_encoding *enc)
936{
937 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
938}
939
940int
941rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
942{
943 return enc_coderange_scan(str, enc);
944}
945
946int
948{
949 int cr = ENC_CODERANGE(str);
950
951 if (cr == ENC_CODERANGE_UNKNOWN) {
952 cr = enc_coderange_scan(str, get_encoding(str));
953 ENC_CODERANGE_SET(str, cr);
954 }
955 return cr;
956}
957
958static inline bool
959rb_enc_str_asciicompat(VALUE str)
960{
961 int encindex = ENCODING_GET_INLINED(str);
962 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
963}
964
965int
967{
968 switch(ENC_CODERANGE(str)) {
970 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
972 return true;
973 default:
974 return false;
975 }
976}
977
978static inline void
979str_mod_check(VALUE s, const char *p, long len)
980{
981 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
982 rb_raise(rb_eRuntimeError, "string modified");
983 }
984}
985
986static size_t
987str_capacity(VALUE str, const int termlen)
988{
989 if (STR_EMBED_P(str)) {
990 return str_embed_capa(str) - termlen;
991 }
992 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
993 return RSTRING(str)->len;
994 }
995 else {
996 return RSTRING(str)->as.heap.aux.capa;
997 }
998}
999
1000size_t
1002{
1003 return str_capacity(str, TERM_LEN(str));
1004}
1005
1006static inline void
1007must_not_null(const char *ptr)
1008{
1009 if (!ptr) {
1010 rb_raise(rb_eArgError, "NULL pointer given");
1011 }
1012}
1013
1014static inline VALUE
1015str_alloc_embed(VALUE klass, size_t capa)
1016{
1017 size_t size = rb_str_embed_size(capa, 0);
1018 RUBY_ASSERT(size > 0);
1019 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1020
1021 NEWOBJ_OF(str, struct RString, klass,
1023
1024 str->len = 0;
1025 str->as.embed.ary[0] = 0;
1026
1027 return (VALUE)str;
1028}
1029
1030static inline VALUE
1031str_alloc_heap(VALUE klass)
1032{
1033 NEWOBJ_OF(str, struct RString, klass,
1034 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1035
1036 str->len = 0;
1037 str->as.heap.aux.capa = 0;
1038 str->as.heap.ptr = NULL;
1039
1040 return (VALUE)str;
1041}
1042
1043static inline VALUE
1044empty_str_alloc(VALUE klass)
1045{
1046 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1047 VALUE str = str_alloc_embed(klass, 0);
1048 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1050 return str;
1051}
1052
1053static VALUE
1054str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1055{
1056 VALUE str;
1057
1058 if (len < 0) {
1059 rb_raise(rb_eArgError, "negative string size (or size too big)");
1060 }
1061
1062 if (enc == NULL) {
1063 enc = rb_ascii8bit_encoding();
1064 }
1065
1066 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1067
1068 int termlen = rb_enc_mbminlen(enc);
1069
1070 if (STR_EMBEDDABLE_P(len, termlen)) {
1071 str = str_alloc_embed(klass, len + termlen);
1072 if (len == 0) {
1073 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1074 }
1075 }
1076 else {
1077 str = str_alloc_heap(klass);
1078 RSTRING(str)->as.heap.aux.capa = len;
1079 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1080 * integer overflow. If we can STATIC_ASSERT that, the following
1081 * mul_add_mul can be reverted to a simple ALLOC_N. */
1082 RSTRING(str)->as.heap.ptr =
1083 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1084 }
1085
1086 rb_enc_raw_set(str, enc);
1087
1088 if (ptr) {
1089 memcpy(RSTRING_PTR(str), ptr, len);
1090 }
1091 else {
1092 memset(RSTRING_PTR(str), 0, len);
1093 }
1094
1095 STR_SET_LEN(str, len);
1096 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1097 return str;
1098}
1099
1100static VALUE
1101str_new(VALUE klass, const char *ptr, long len)
1102{
1103 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1104}
1105
1106VALUE
1107rb_str_new(const char *ptr, long len)
1108{
1109 return str_new(rb_cString, ptr, len);
1110}
1111
1112VALUE
1113rb_usascii_str_new(const char *ptr, long len)
1114{
1115 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1116}
1117
1118VALUE
1119rb_utf8_str_new(const char *ptr, long len)
1120{
1121 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1122}
1123
1124VALUE
1125rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1126{
1127 return str_enc_new(rb_cString, ptr, len, enc);
1128}
1129
1130VALUE
1132{
1133 must_not_null(ptr);
1134 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1135 * memory regions, and that cannot be detected by the MSAN. Just
1136 * trust the programmer that the argument passed here is a sane C
1137 * string. */
1138 __msan_unpoison_string(ptr);
1139 return rb_str_new(ptr, strlen(ptr));
1140}
1141
1142VALUE
1144{
1145 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1146}
1147
1148VALUE
1150{
1151 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1152}
1153
1154VALUE
1156{
1157 must_not_null(ptr);
1158 if (rb_enc_mbminlen(enc) != 1) {
1159 rb_raise(rb_eArgError, "wchar encoding given");
1160 }
1161 return rb_enc_str_new(ptr, strlen(ptr), enc);
1162}
1163
1164static VALUE
1165str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1166{
1167 VALUE str;
1168
1169 if (len < 0) {
1170 rb_raise(rb_eArgError, "negative string size (or size too big)");
1171 }
1172
1173 if (!ptr) {
1174 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1175 }
1176 else {
1177 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1178 str = str_alloc_heap(klass);
1179 RSTRING(str)->len = len;
1180 RSTRING(str)->as.heap.ptr = (char *)ptr;
1181 RSTRING(str)->as.heap.aux.capa = len;
1182 RBASIC(str)->flags |= STR_NOFREE;
1183 rb_enc_associate_index(str, encindex);
1184 }
1185 return str;
1186}
1187
1188VALUE
1189rb_str_new_static(const char *ptr, long len)
1190{
1191 return str_new_static(rb_cString, ptr, len, 0);
1192}
1193
1194VALUE
1196{
1197 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1198}
1199
1200VALUE
1202{
1203 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1204}
1205
1206VALUE
1208{
1209 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1210}
1211
1212static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1213 rb_encoding *from, rb_encoding *to,
1214 int ecflags, VALUE ecopts);
1215
1216static inline bool
1217is_enc_ascii_string(VALUE str, rb_encoding *enc)
1218{
1219 int encidx = rb_enc_to_index(enc);
1220 if (rb_enc_get_index(str) == encidx)
1221 return is_ascii_string(str);
1222 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1223}
1224
1225VALUE
1226rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1227{
1228 long len;
1229 const char *ptr;
1230 VALUE newstr;
1231
1232 if (!to) return str;
1233 if (!from) from = rb_enc_get(str);
1234 if (from == to) return str;
1235 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1236 rb_is_ascii8bit_enc(to)) {
1237 if (STR_ENC_GET(str) != to) {
1238 str = rb_str_dup(str);
1239 rb_enc_associate(str, to);
1240 }
1241 return str;
1242 }
1243
1244 RSTRING_GETMEM(str, ptr, len);
1245 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1246 from, to, ecflags, ecopts);
1247 if (NIL_P(newstr)) {
1248 /* some error, return original */
1249 return str;
1250 }
1251 return newstr;
1252}
1253
1254VALUE
1255rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1256 rb_encoding *from, int ecflags, VALUE ecopts)
1257{
1258 long olen;
1259
1260 olen = RSTRING_LEN(newstr);
1261 if (ofs < -olen || olen < ofs)
1262 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1263 if (ofs < 0) ofs += olen;
1264 if (!from) {
1265 STR_SET_LEN(newstr, ofs);
1266 return rb_str_cat(newstr, ptr, len);
1267 }
1268
1269 rb_str_modify(newstr);
1270 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1271 rb_enc_get(newstr),
1272 ecflags, ecopts);
1273}
1274
1275VALUE
1276rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1277{
1278 STR_SET_LEN(str, 0);
1279 rb_enc_associate(str, enc);
1280 rb_str_cat(str, ptr, len);
1281 return str;
1282}
1283
1284static VALUE
1285str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1286 rb_encoding *from, rb_encoding *to,
1287 int ecflags, VALUE ecopts)
1288{
1289 rb_econv_t *ec;
1291 long olen;
1292 VALUE econv_wrapper;
1293 const unsigned char *start, *sp;
1294 unsigned char *dest, *dp;
1295 size_t converted_output = (size_t)ofs;
1296
1297 olen = rb_str_capacity(newstr);
1298
1299 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1300 RBASIC_CLEAR_CLASS(econv_wrapper);
1301 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1302 if (!ec) return Qnil;
1303 DATA_PTR(econv_wrapper) = ec;
1304
1305 sp = (unsigned char*)ptr;
1306 start = sp;
1307 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1308 (dp = dest + converted_output),
1309 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1311 /* destination buffer short */
1312 size_t converted_input = sp - start;
1313 size_t rest = len - converted_input;
1314 converted_output = dp - dest;
1315 rb_str_set_len(newstr, converted_output);
1316 if (converted_input && converted_output &&
1317 rest < (LONG_MAX / converted_output)) {
1318 rest = (rest * converted_output) / converted_input;
1319 }
1320 else {
1321 rest = olen;
1322 }
1323 olen += rest < 2 ? 2 : rest;
1324 rb_str_resize(newstr, olen);
1325 }
1326 DATA_PTR(econv_wrapper) = 0;
1327 RB_GC_GUARD(econv_wrapper);
1328 rb_econv_close(ec);
1329 switch (ret) {
1330 case econv_finished:
1331 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1332 rb_str_set_len(newstr, len);
1333 rb_enc_associate(newstr, to);
1334 return newstr;
1335
1336 default:
1337 return Qnil;
1338 }
1339}
1340
1341VALUE
1343{
1344 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1345}
1346
1347VALUE
1349{
1350 rb_encoding *ienc;
1351 VALUE str;
1352 const int eidx = rb_enc_to_index(eenc);
1353
1354 if (!ptr) {
1355 return rb_enc_str_new(ptr, len, eenc);
1356 }
1357
1358 /* ASCII-8BIT case, no conversion */
1359 if ((eidx == rb_ascii8bit_encindex()) ||
1360 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1361 return rb_str_new(ptr, len);
1362 }
1363 /* no default_internal or same encoding, no conversion */
1364 ienc = rb_default_internal_encoding();
1365 if (!ienc || eenc == ienc) {
1366 return rb_enc_str_new(ptr, len, eenc);
1367 }
1368 /* ASCII compatible, and ASCII only string, no conversion in
1369 * default_internal */
1370 if ((eidx == rb_ascii8bit_encindex()) ||
1371 (eidx == rb_usascii_encindex()) ||
1372 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1373 return rb_enc_str_new(ptr, len, ienc);
1374 }
1375 /* convert from the given encoding to default_internal */
1376 str = rb_enc_str_new(NULL, 0, ienc);
1377 /* when the conversion failed for some reason, just ignore the
1378 * default_internal and result in the given encoding as-is. */
1379 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1380 rb_str_initialize(str, ptr, len, eenc);
1381 }
1382 return str;
1383}
1384
1385VALUE
1386rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1387{
1388 int eidx = rb_enc_to_index(eenc);
1389 if (eidx == rb_usascii_encindex() &&
1390 !is_ascii_string(str)) {
1391 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1392 return str;
1393 }
1394 rb_enc_associate_index(str, eidx);
1395 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1396}
1397
1398VALUE
1399rb_external_str_new(const char *ptr, long len)
1400{
1401 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1402}
1403
1404VALUE
1406{
1407 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1408}
1409
1410VALUE
1411rb_locale_str_new(const char *ptr, long len)
1412{
1413 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1414}
1415
1416VALUE
1418{
1419 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1420}
1421
1422VALUE
1424{
1425 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1426}
1427
1428VALUE
1430{
1431 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1432}
1433
1434VALUE
1436{
1437 return rb_str_export_to_enc(str, rb_default_external_encoding());
1438}
1439
1440VALUE
1442{
1443 return rb_str_export_to_enc(str, rb_locale_encoding());
1444}
1445
1446VALUE
1448{
1449 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1450}
1451
1452static VALUE
1453str_replace_shared_without_enc(VALUE str2, VALUE str)
1454{
1455 const int termlen = TERM_LEN(str);
1456 char *ptr;
1457 long len;
1458
1459 RSTRING_GETMEM(str, ptr, len);
1460 if (str_embed_capa(str2) >= len + termlen) {
1461 char *ptr2 = RSTRING(str2)->as.embed.ary;
1462 STR_SET_EMBED(str2);
1463 memcpy(ptr2, RSTRING_PTR(str), len);
1464 TERM_FILL(ptr2+len, termlen);
1465 }
1466 else {
1467 VALUE root;
1468 if (STR_SHARED_P(str)) {
1469 root = RSTRING(str)->as.heap.aux.shared;
1470 RSTRING_GETMEM(str, ptr, len);
1471 }
1472 else {
1473 root = rb_str_new_frozen(str);
1474 RSTRING_GETMEM(root, ptr, len);
1475 }
1476 RUBY_ASSERT(OBJ_FROZEN(root));
1477
1478 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1479 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1480 rb_fatal("about to free a possible shared root");
1481 }
1482 char *ptr2 = STR_HEAP_PTR(str2);
1483 if (ptr2 != ptr) {
1484 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1485 }
1486 }
1487 FL_SET(str2, STR_NOEMBED);
1488 RSTRING(str2)->as.heap.ptr = ptr;
1489 STR_SET_SHARED(str2, root);
1490 }
1491
1492 STR_SET_LEN(str2, len);
1493
1494 return str2;
1495}
1496
1497static VALUE
1498str_replace_shared(VALUE str2, VALUE str)
1499{
1500 str_replace_shared_without_enc(str2, str);
1501 rb_enc_cr_str_exact_copy(str2, str);
1502 return str2;
1503}
1504
1505static VALUE
1506str_new_shared(VALUE klass, VALUE str)
1507{
1508 return str_replace_shared(str_alloc_heap(klass), str);
1509}
1510
1511VALUE
1513{
1514 return str_new_shared(rb_obj_class(str), str);
1515}
1516
1517VALUE
1519{
1520 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1521 return str_new_frozen(rb_obj_class(orig), orig);
1522}
1523
1524static VALUE
1525rb_str_new_frozen_String(VALUE orig)
1526{
1527 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1528 return str_new_frozen(rb_cString, orig);
1529}
1530
1531
1532VALUE
1533rb_str_frozen_bare_string(VALUE orig)
1534{
1535 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1536 return str_new_frozen(rb_cString, orig);
1537}
1538
1539VALUE
1540rb_str_tmp_frozen_acquire(VALUE orig)
1541{
1542 if (OBJ_FROZEN_RAW(orig)) return orig;
1543 return str_new_frozen_buffer(0, orig, FALSE);
1544}
1545
1546VALUE
1547rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1548{
1549 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1550 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1551
1552 VALUE str = str_alloc_heap(0);
1553 OBJ_FREEZE(str);
1554 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1555 FL_SET(str, STR_SHARED_ROOT);
1556
1557 size_t capa = str_capacity(orig, TERM_LEN(orig));
1558
1559 /* If the string is embedded then we want to create a copy that is heap
1560 * allocated. If the string is shared then the shared root must be
1561 * embedded, so we want to create a copy. If the string is a shared root
1562 * then it must be embedded, so we want to create a copy. */
1563 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1564 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1565 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1566 }
1567 else {
1568 /* orig must be heap allocated and not shared, so we can safely transfer
1569 * the pointer to str. */
1570 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1571 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1572 RBASIC(orig)->flags &= ~STR_NOFREE;
1573 STR_SET_SHARED(orig, str);
1574 if (RB_OBJ_SHAREABLE_P(orig)) {
1575 RB_OBJ_SET_SHAREABLE(str);
1576 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1577 }
1578 }
1579
1580 RSTRING(str)->len = RSTRING(orig)->len;
1581 RSTRING(str)->as.heap.aux.capa = capa;
1582
1583 return str;
1584}
1585
1586void
1587rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1588{
1589 if (RBASIC_CLASS(tmp) != 0)
1590 return;
1591
1592 if (STR_EMBED_P(tmp)) {
1594 }
1595 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1596 !OBJ_FROZEN_RAW(orig)) {
1597 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1598
1599 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1600 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1601 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1602
1603 /* Unshare orig since the root (tmp) only has this one child. */
1604 FL_UNSET_RAW(orig, STR_SHARED);
1605 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1606 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1608
1609 /* Make tmp embedded and empty so it is safe for sweeping. */
1610 STR_SET_EMBED(tmp);
1611 STR_SET_LEN(tmp, 0);
1612 }
1613 }
1614}
1615
1616static VALUE
1617str_new_frozen(VALUE klass, VALUE orig)
1618{
1619 return str_new_frozen_buffer(klass, orig, TRUE);
1620}
1621
1622static VALUE
1623heap_str_make_shared(VALUE klass, VALUE orig)
1624{
1625 RUBY_ASSERT(!STR_EMBED_P(orig));
1626 RUBY_ASSERT(!STR_SHARED_P(orig));
1628
1629 VALUE str = str_alloc_heap(klass);
1630 STR_SET_LEN(str, RSTRING_LEN(orig));
1631 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1632 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1633 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1634 RBASIC(orig)->flags &= ~STR_NOFREE;
1635 STR_SET_SHARED(orig, str);
1636 if (klass == 0)
1637 FL_UNSET_RAW(str, STR_BORROWED);
1638 return str;
1639}
1640
1641static VALUE
1642str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1643{
1644 VALUE str;
1645
1646 long len = RSTRING_LEN(orig);
1647 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1648 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1649
1650 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1651 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1652 RUBY_ASSERT(STR_EMBED_P(str));
1653 }
1654 else {
1655 if (FL_TEST_RAW(orig, STR_SHARED)) {
1656 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1657 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1658 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1659 RUBY_ASSERT(ofs >= 0);
1660 RUBY_ASSERT(rest >= 0);
1661 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1663
1664 if ((ofs > 0) || (rest > 0) ||
1665 (klass != RBASIC(shared)->klass) ||
1666 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1667 str = str_new_shared(klass, shared);
1668 RUBY_ASSERT(!STR_EMBED_P(str));
1669 RSTRING(str)->as.heap.ptr += ofs;
1670 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1671 }
1672 else {
1673 if (RBASIC_CLASS(shared) == 0)
1674 FL_SET_RAW(shared, STR_BORROWED);
1675 return shared;
1676 }
1677 }
1678 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1679 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1680 STR_SET_EMBED(str);
1681 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1682 STR_SET_LEN(str, RSTRING_LEN(orig));
1683 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1684 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1685 }
1686 else {
1687 if (RB_OBJ_SHAREABLE_P(orig)) {
1688 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1689 }
1690 else {
1691 str = heap_str_make_shared(klass, orig);
1692 }
1693 }
1694 }
1695
1696 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1697 OBJ_FREEZE(str);
1698 return str;
1699}
1700
1701VALUE
1702rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1703{
1704 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1705}
1706
1707static VALUE
1708str_new_empty_String(VALUE str)
1709{
1710 VALUE v = rb_str_new(0, 0);
1711 rb_enc_copy(v, str);
1712 return v;
1713}
1714
1715#define STR_BUF_MIN_SIZE 63
1716
1717VALUE
1719{
1720 if (STR_EMBEDDABLE_P(capa, 1)) {
1721 return str_alloc_embed(rb_cString, capa + 1);
1722 }
1723
1724 VALUE str = str_alloc_heap(rb_cString);
1725
1726 RSTRING(str)->as.heap.aux.capa = capa;
1727 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1728 RSTRING(str)->as.heap.ptr[0] = '\0';
1729
1730 return str;
1731}
1732
1733VALUE
1735{
1736 VALUE str;
1737 long len = strlen(ptr);
1738
1739 str = rb_str_buf_new(len);
1740 rb_str_buf_cat(str, ptr, len);
1741
1742 return str;
1743}
1744
1745VALUE
1747{
1748 return str_new(0, 0, len);
1749}
1750
1751void
1753{
1754 if (STR_EMBED_P(str)) {
1755 RB_DEBUG_COUNTER_INC(obj_str_embed);
1756 }
1757 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1758 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1760 }
1761 else {
1762 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1763 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1764 }
1765}
1766
1767size_t
1768rb_str_memsize(VALUE str)
1769{
1770 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1771 return STR_HEAP_SIZE(str);
1772 }
1773 else {
1774 return 0;
1775 }
1776}
1777
1778VALUE
1780{
1781 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1782}
1783
1784static inline void str_discard(VALUE str);
1785static void str_shared_replace(VALUE str, VALUE str2);
1786
1787void
1789{
1790 if (str != str2) str_shared_replace(str, str2);
1791}
1792
1793static void
1794str_shared_replace(VALUE str, VALUE str2)
1795{
1796 rb_encoding *enc;
1797 int cr;
1798 int termlen;
1799
1800 RUBY_ASSERT(str2 != str);
1801 enc = STR_ENC_GET(str2);
1802 cr = ENC_CODERANGE(str2);
1803 str_discard(str);
1804 termlen = rb_enc_mbminlen(enc);
1805
1806 STR_SET_LEN(str, RSTRING_LEN(str2));
1807
1808 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1809 STR_SET_EMBED(str);
1810 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1811 rb_enc_associate(str, enc);
1812 ENC_CODERANGE_SET(str, cr);
1813 }
1814 else {
1815 if (STR_EMBED_P(str2)) {
1816 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1817 long len = RSTRING_LEN(str2);
1818 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1819
1820 char *new_ptr = ALLOC_N(char, len + termlen);
1821 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1822 RSTRING(str2)->as.heap.ptr = new_ptr;
1823 STR_SET_LEN(str2, len);
1824 RSTRING(str2)->as.heap.aux.capa = len;
1825 STR_SET_NOEMBED(str2);
1826 }
1827
1828 STR_SET_NOEMBED(str);
1829 FL_UNSET(str, STR_SHARED);
1830 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1831
1832 if (FL_TEST(str2, STR_SHARED)) {
1833 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1834 STR_SET_SHARED(str, shared);
1835 }
1836 else {
1837 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1838 }
1839
1840 /* abandon str2 */
1841 STR_SET_EMBED(str2);
1842 RSTRING_PTR(str2)[0] = 0;
1843 STR_SET_LEN(str2, 0);
1844 rb_enc_associate(str, enc);
1845 ENC_CODERANGE_SET(str, cr);
1846 }
1847}
1848
1849VALUE
1851{
1852 VALUE str;
1853
1854 if (RB_TYPE_P(obj, T_STRING)) {
1855 return obj;
1856 }
1857 str = rb_funcall(obj, idTo_s, 0);
1858 return rb_obj_as_string_result(str, obj);
1859}
1860
1861VALUE
1862rb_obj_as_string_result(VALUE str, VALUE obj)
1863{
1864 if (!RB_TYPE_P(str, T_STRING))
1865 return rb_any_to_s(obj);
1866 return str;
1867}
1868
1869static VALUE
1870str_replace(VALUE str, VALUE str2)
1871{
1872 long len;
1873
1874 len = RSTRING_LEN(str2);
1875 if (STR_SHARED_P(str2)) {
1876 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1878 STR_SET_NOEMBED(str);
1879 STR_SET_LEN(str, len);
1880 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1881 STR_SET_SHARED(str, shared);
1882 rb_enc_cr_str_exact_copy(str, str2);
1883 }
1884 else {
1885 str_replace_shared(str, str2);
1886 }
1887
1888 return str;
1889}
1890
1891static inline VALUE
1892ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1893{
1894 size_t size = rb_str_embed_size(capa, 0);
1895 RUBY_ASSERT(size > 0);
1896 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1897
1898 NEWOBJ_OF(str, struct RString, klass,
1900
1901 str->len = 0;
1902
1903 return (VALUE)str;
1904}
1905
1906static inline VALUE
1907ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1908{
1909 NEWOBJ_OF(str, struct RString, klass,
1910 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1911
1912 str->as.heap.aux.capa = 0;
1913 str->as.heap.ptr = NULL;
1914
1915 return (VALUE)str;
1916}
1917
1918static inline VALUE
1919str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1920{
1921 int encidx = 0;
1922 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1923 encidx = rb_enc_get_index(str);
1924 flags &= ~ENCODING_MASK;
1925 }
1926 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1927 if (encidx) rb_enc_associate_index(dup, encidx);
1928 return dup;
1929}
1930
1931static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1932
1933static inline VALUE
1934str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1935{
1936 VALUE flags = FL_TEST_RAW(str, flag_mask);
1937 long len = RSTRING_LEN(str);
1938
1939 RUBY_ASSERT(STR_EMBED_P(dup));
1940 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1941 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1942 STR_SET_LEN(dup, RSTRING_LEN(str));
1943 return str_duplicate_setup_encoding(str, dup, flags);
1944}
1945
1946static inline VALUE
1947str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1948{
1949 VALUE flags = FL_TEST_RAW(str, flag_mask);
1950 VALUE root = str;
1951 if (FL_TEST_RAW(str, STR_SHARED)) {
1952 root = RSTRING(str)->as.heap.aux.shared;
1953 }
1954 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1955 root = str = str_new_frozen(klass, str);
1956 flags = FL_TEST_RAW(str, flag_mask);
1957 }
1958 RUBY_ASSERT(!STR_SHARED_P(root));
1960
1961 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1962 FL_SET(root, STR_SHARED_ROOT);
1963 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1964 flags |= RSTRING_NOEMBED | STR_SHARED;
1965
1966 STR_SET_LEN(dup, RSTRING_LEN(str));
1967 return str_duplicate_setup_encoding(str, dup, flags);
1968}
1969
1970static inline VALUE
1971str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1972{
1973 if (STR_EMBED_P(str)) {
1974 return str_duplicate_setup_embed(klass, str, dup);
1975 }
1976 else {
1977 return str_duplicate_setup_heap(klass, str, dup);
1978 }
1979}
1980
1981static inline VALUE
1982str_duplicate(VALUE klass, VALUE str)
1983{
1984 VALUE dup;
1985 if (STR_EMBED_P(str)) {
1986 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 }
1988 else {
1989 dup = str_alloc_heap(klass);
1990 }
1991
1992 return str_duplicate_setup(klass, str, dup);
1993}
1994
1995VALUE
1997{
1998 return str_duplicate(rb_obj_class(str), str);
1999}
2000
2001/* :nodoc: */
2002VALUE
2003rb_str_dup_m(VALUE str)
2004{
2005 if (LIKELY(BARE_STRING_P(str))) {
2006 return str_duplicate(rb_cString, str);
2007 }
2008 else {
2009 return rb_obj_dup(str);
2010 }
2011}
2012
2013VALUE
2015{
2016 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2017 return str_duplicate(rb_cString, str);
2018}
2019
2020VALUE
2021rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2022{
2023 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2024 VALUE new_str, klass = rb_cString;
2025
2026 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2027 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2028 str_duplicate_setup_embed(klass, str, new_str);
2029 }
2030 else {
2031 new_str = ec_str_alloc_heap(ec, klass);
2032 str_duplicate_setup_heap(klass, str, new_str);
2033 }
2034 if (chilled) {
2035 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2036 }
2037 return new_str;
2038}
2039
2040VALUE
2041rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2042{
2043 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2044 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2045 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2046 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2047 return rb_str_freeze(str);
2048}
2049
2050/*
2051 * The documentation block below uses an include (instead of inline text)
2052 * because the included text has non-ASCII characters (which are not allowed in a C file).
2053 */
2054
2055/*
2056 *
2057 * call-seq:
2058 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2059 *
2060 * :include: doc/string/new.rdoc
2061 *
2062 */
2063
2064static VALUE
2065rb_str_init(int argc, VALUE *argv, VALUE str)
2066{
2067 static ID keyword_ids[2];
2068 VALUE orig, opt, venc, vcapa;
2069 VALUE kwargs[2];
2070 rb_encoding *enc = 0;
2071 int n;
2072
2073 if (!keyword_ids[0]) {
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1], "capacity");
2076 }
2077
2078 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2079 if (!NIL_P(opt)) {
2080 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2081 venc = kwargs[0];
2082 vcapa = kwargs[1];
2083 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2084 enc = rb_to_encoding(venc);
2085 }
2086 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2087 long capa = NUM2LONG(vcapa);
2088 long len = 0;
2089 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2090
2091 if (capa < STR_BUF_MIN_SIZE) {
2092 capa = STR_BUF_MIN_SIZE;
2093 }
2094 if (n == 1) {
2095 StringValue(orig);
2096 len = RSTRING_LEN(orig);
2097 if (capa < len) {
2098 capa = len;
2099 }
2100 if (orig == str) n = 0;
2101 }
2102 str_modifiable(str);
2103 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2104 /* make noembed always */
2105 const size_t size = (size_t)capa + termlen;
2106 const char *const old_ptr = RSTRING_PTR(str);
2107 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2108 char *new_ptr = ALLOC_N(char, size);
2109 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2110 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2111 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2112 RSTRING(str)->as.heap.ptr = new_ptr;
2113 }
2114 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2115 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2116 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2117 }
2118 STR_SET_LEN(str, len);
2119 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2120 if (n == 1) {
2121 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2122 rb_enc_cr_str_exact_copy(str, orig);
2123 }
2124 FL_SET(str, STR_NOEMBED);
2125 RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127 else if (n == 1) {
2128 rb_str_replace(str, orig);
2129 }
2130 if (enc) {
2131 rb_enc_associate(str, enc);
2133 }
2134 }
2135 else if (n == 1) {
2136 rb_str_replace(str, orig);
2137 }
2138 return str;
2139}
2140
2141/* :nodoc: */
2142static VALUE
2143rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2144{
2145 if (klass != rb_cString) {
2146 return rb_class_new_instance_pass_kw(argc, argv, klass);
2147 }
2148
2149 static ID keyword_ids[2];
2150 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2151 VALUE kwargs[2];
2152 rb_encoding *enc = NULL;
2153
2154 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2155 if (NIL_P(opt)) {
2156 return rb_class_new_instance_pass_kw(argc, argv, klass);
2157 }
2158
2159 keyword_ids[0] = rb_id_encoding();
2160 CONST_ID(keyword_ids[1], "capacity");
2161 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2162 encoding = kwargs[0];
2163 capacity = kwargs[1];
2164
2165 if (n == 1) {
2166 orig = StringValue(orig);
2167 }
2168 else {
2169 orig = Qnil;
2170 }
2171
2172 if (UNDEF_P(encoding)) {
2173 if (!NIL_P(orig)) {
2174 encoding = rb_obj_encoding(orig);
2175 }
2176 }
2177
2178 if (!UNDEF_P(encoding)) {
2179 enc = rb_to_encoding(encoding);
2180 }
2181
2182 // If capacity is nil, we're basically just duping `orig`.
2183 if (UNDEF_P(capacity)) {
2184 if (NIL_P(orig)) {
2185 VALUE empty_str = str_new(klass, "", 0);
2186 if (enc) {
2187 rb_enc_associate(empty_str, enc);
2188 }
2189 return empty_str;
2190 }
2191 VALUE copy = str_duplicate(klass, orig);
2192 rb_enc_associate(copy, enc);
2193 ENC_CODERANGE_CLEAR(copy);
2194 return copy;
2195 }
2196
2197 long capa = 0;
2198 capa = NUM2LONG(capacity);
2199 if (capa < 0) {
2200 capa = 0;
2201 }
2202
2203 if (!NIL_P(orig)) {
2204 long orig_capa = rb_str_capacity(orig);
2205 if (orig_capa > capa) {
2206 capa = orig_capa;
2207 }
2208 }
2209
2210 VALUE str = str_enc_new(klass, NULL, capa, enc);
2211 STR_SET_LEN(str, 0);
2212 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2213
2214 if (!NIL_P(orig)) {
2215 rb_str_buf_append(str, orig);
2216 }
2217
2218 return str;
2219}
2220
#ifdef NONASCII_MASK
/* A byte is a UTF-8 lead byte unless its top two bits are 10. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    uintptr_t lead_bits = ((word>>6) | (~word>>7)) & (NONASCII_MASK >> 7);

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead_bits);
#else
    /* Fold the per-byte 0/1 markers down into the lowest byte. */
    lead_bits += (lead_bits>>8);
    lead_bits += (lead_bits>>16);
# if SIZEOF_VOIDP == 8
    lead_bits += (lead_bits>>32);
# endif
    return (lead_bits&0xF);
#endif
}
#endif
2260
2261static inline long
2262enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2263{
2264 long c;
2265 const char *q;
2266
2267 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2268 long diff = (long)(e - p);
2269 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270 }
2271#ifdef NONASCII_MASK
2272 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2273 uintptr_t len = 0;
2274 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2275 const uintptr_t *s, *t;
2276 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2277 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2278 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2279 while (p < (const char *)s) {
2280 if (is_utf8_lead_byte(*p)) len++;
2281 p++;
2282 }
2283 while (s < t) {
2284 len += count_utf8_lead_bytes_with_word(s);
2285 s++;
2286 }
2287 p = (const char *)s;
2288 }
2289 while (p < e) {
2290 if (is_utf8_lead_byte(*p)) len++;
2291 p++;
2292 }
2293 return (long)len;
2294 }
2295#endif
2296 else if (rb_enc_asciicompat(enc)) {
2297 c = 0;
2298 if (ENC_CODERANGE_CLEAN_P(cr)) {
2299 while (p < e) {
2300 if (ISASCII(*p)) {
2301 q = search_nonascii(p, e);
2302 if (!q)
2303 return c + (e - p);
2304 c += q - p;
2305 p = q;
2306 }
2307 p += rb_enc_fast_mbclen(p, e, enc);
2308 c++;
2309 }
2310 }
2311 else {
2312 while (p < e) {
2313 if (ISASCII(*p)) {
2314 q = search_nonascii(p, e);
2315 if (!q)
2316 return c + (e - p);
2317 c += q - p;
2318 p = q;
2319 }
2320 p += rb_enc_mbclen(p, e, enc);
2321 c++;
2322 }
2323 }
2324 return c;
2325 }
2326
2327 for (c=0; p<e; c++) {
2328 p += rb_enc_mbclen(p, e, enc);
2329 }
2330 return c;
2331}
2332
2333long
2334rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2335{
2336 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2337}
2338
2339/* To get strlen with cr
2340 * Note that given cr is not used.
2341 */
2342long
2343rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2344{
2345 long c;
2346 const char *q;
2347 int ret;
2348
2349 *cr = 0;
2350 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2351 long diff = (long)(e - p);
2352 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2353 }
2354 else if (rb_enc_asciicompat(enc)) {
2355 c = 0;
2356 while (p < e) {
2357 if (ISASCII(*p)) {
2358 q = search_nonascii(p, e);
2359 if (!q) {
2360 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2361 return c + (e - p);
2362 }
2363 c += q - p;
2364 p = q;
2365 }
2366 ret = rb_enc_precise_mbclen(p, e, enc);
2367 if (MBCLEN_CHARFOUND_P(ret)) {
2368 *cr |= ENC_CODERANGE_VALID;
2369 p += MBCLEN_CHARFOUND_LEN(ret);
2370 }
2371 else {
2373 p++;
2374 }
2375 c++;
2376 }
2377 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2378 return c;
2379 }
2380
2381 for (c=0; p<e; c++) {
2382 ret = rb_enc_precise_mbclen(p, e, enc);
2383 if (MBCLEN_CHARFOUND_P(ret)) {
2384 *cr |= ENC_CODERANGE_VALID;
2385 p += MBCLEN_CHARFOUND_LEN(ret);
2386 }
2387 else {
2389 if (p + rb_enc_mbminlen(enc) <= e)
2390 p += rb_enc_mbminlen(enc);
2391 else
2392 p = e;
2393 }
2394 }
2395 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2396 return c;
2397}
2398
2399/* enc must be str's enc or rb_enc_check(str, str2) */
2400static long
2401str_strlen(VALUE str, rb_encoding *enc)
2402{
2403 const char *p, *e;
2404 int cr;
2405
2406 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2407 if (!enc) enc = STR_ENC_GET(str);
2408 p = RSTRING_PTR(str);
2409 e = RSTRING_END(str);
2410 cr = ENC_CODERANGE(str);
2411
2412 if (cr == ENC_CODERANGE_UNKNOWN) {
2413 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2414 if (cr) ENC_CODERANGE_SET(str, cr);
2415 return n;
2416 }
2417 else {
2418 return enc_strlen(p, e, enc, cr);
2419 }
2420}
2421
2422long
2424{
2425 return str_strlen(str, NULL);
2426}
2427
2428/*
2429 * call-seq:
2430 * length -> integer
2431 *
2432 * :include: doc/string/length.rdoc
2433 *
2434 */
2435
2436VALUE
2438{
2439 return LONG2NUM(str_strlen(str, NULL));
2440}
2441
2442/*
2443 * call-seq:
2444 * bytesize -> integer
2445 *
2446 * :include: doc/string/bytesize.rdoc
2447 *
2448 */
2449
2450VALUE
2451rb_str_bytesize(VALUE str)
2452{
2453 return LONG2NUM(RSTRING_LEN(str));
2454}
2455
2456/*
2457 * call-seq:
2458 * empty? -> true or false
2459 *
2460 * Returns whether the length of +self+ is zero:
2461 *
2462 * 'hello'.empty? # => false
2463 * ' '.empty? # => false
2464 * ''.empty? # => true
2465 *
2466 * Related: see {Querying}[rdoc-ref:String@Querying].
2467 */
2468
2469static VALUE
2470rb_str_empty(VALUE str)
2471{
2472 return RBOOL(RSTRING_LEN(str) == 0);
2473}
2474
2475/*
2476 * call-seq:
2477 * self + other_string -> new_string
2478 *
2479 * Returns a new string containing +other_string+ concatenated to +self+:
2480 *
2481 * 'Hello from ' + self.to_s # => "Hello from main"
2482 *
2483 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2484 */
2485
2486VALUE
2488{
2489 VALUE str3;
2490 rb_encoding *enc;
2491 char *ptr1, *ptr2, *ptr3;
2492 long len1, len2;
2493 int termlen;
2494
2495 StringValue(str2);
2496 enc = rb_enc_check_str(str1, str2);
2497 RSTRING_GETMEM(str1, ptr1, len1);
2498 RSTRING_GETMEM(str2, ptr2, len2);
2499 termlen = rb_enc_mbminlen(enc);
2500 if (len1 > LONG_MAX - len2) {
2501 rb_raise(rb_eArgError, "string size too big");
2502 }
2503 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2504 ptr3 = RSTRING_PTR(str3);
2505 memcpy(ptr3, ptr1, len1);
2506 memcpy(ptr3+len1, ptr2, len2);
2507 TERM_FILL(&ptr3[len1+len2], termlen);
2508
2509 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2511 RB_GC_GUARD(str1);
2512 RB_GC_GUARD(str2);
2513 return str3;
2514}
2515
2516/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2517VALUE
2518rb_str_opt_plus(VALUE str1, VALUE str2)
2519{
2522 long len1, len2;
2523 MAYBE_UNUSED(char) *ptr1, *ptr2;
2524 RSTRING_GETMEM(str1, ptr1, len1);
2525 RSTRING_GETMEM(str2, ptr2, len2);
2526 int enc1 = rb_enc_get_index(str1);
2527 int enc2 = rb_enc_get_index(str2);
2528
2529 if (enc1 < 0) {
2530 return Qundef;
2531 }
2532 else if (enc2 < 0) {
2533 return Qundef;
2534 }
2535 else if (enc1 != enc2) {
2536 return Qundef;
2537 }
2538 else if (len1 > LONG_MAX - len2) {
2539 return Qundef;
2540 }
2541 else {
2542 return rb_str_plus(str1, str2);
2543 }
2544
2545}
2546
2547/*
2548 * call-seq:
2549 * self * n -> new_string
2550 *
2551 * Returns a new string containing +n+ copies of +self+:
2552 *
2553 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2554 * 'No!' * 0 # => ""
2555 *
2556 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2557 */
2558
2559VALUE
2561{
2562 VALUE str2;
2563 long n, len;
2564 char *ptr2;
2565 int termlen;
2566
2567 if (times == INT2FIX(1)) {
2568 return str_duplicate(rb_cString, str);
2569 }
2570 if (times == INT2FIX(0)) {
2571 str2 = str_alloc_embed(rb_cString, 0);
2572 rb_enc_copy(str2, str);
2573 return str2;
2574 }
2575 len = NUM2LONG(times);
2576 if (len < 0) {
2577 rb_raise(rb_eArgError, "negative argument");
2578 }
2579 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2580 if (STR_EMBEDDABLE_P(len, 1)) {
2581 str2 = str_alloc_embed(rb_cString, len + 1);
2582 memset(RSTRING_PTR(str2), 0, len + 1);
2583 }
2584 else {
2585 str2 = str_alloc_heap(rb_cString);
2586 RSTRING(str2)->as.heap.aux.capa = len;
2587 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2588 }
2589 STR_SET_LEN(str2, len);
2590 rb_enc_copy(str2, str);
2591 return str2;
2592 }
2593 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2594 rb_raise(rb_eArgError, "argument too big");
2595 }
2596
2597 len *= RSTRING_LEN(str);
2598 termlen = TERM_LEN(str);
2599 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2600 ptr2 = RSTRING_PTR(str2);
2601 if (len) {
2602 n = RSTRING_LEN(str);
2603 memcpy(ptr2, RSTRING_PTR(str), n);
2604 while (n <= len/2) {
2605 memcpy(ptr2 + n, ptr2, n);
2606 n *= 2;
2607 }
2608 memcpy(ptr2 + n, ptr2, len-n);
2609 }
2610 STR_SET_LEN(str2, len);
2611 TERM_FILL(&ptr2[len], termlen);
2612 rb_enc_cr_str_copy_for_substr(str2, str);
2613
2614 return str2;
2615}
2616
2617/*
2618 * call-seq:
2619 * self % object -> new_string
2620 *
2621 * Returns the result of formatting +object+ into the format specifications
2622 * contained in +self+
2623 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2624 *
2625 * '%05d' % 123 # => "00123"
2626 *
2627 * If +self+ contains multiple format specifications,
2628 * +object+ must be an array or hash containing the objects to be formatted:
2629 *
2630 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2631 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2632 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2633 *
2634 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2635 */
2636
2637static VALUE
2638rb_str_format_m(VALUE str, VALUE arg)
2639{
2640 VALUE tmp = rb_check_array_type(arg);
2641
2642 if (!NIL_P(tmp)) {
2643 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2644 }
2645 return rb_str_format(1, &arg, str);
2646}
2647
2648static inline void
2649rb_check_lockedtmp(VALUE str)
2650{
2651 if (FL_TEST(str, STR_TMPLOCK)) {
2652 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2653 }
2654}
2655
2656// If none of these flags are set, we know we have an modifiable string.
2657// If any is set, we need to do more detailed checks.
2658#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2659static inline void
2660str_modifiable(VALUE str)
2661{
2662 RUBY_ASSERT(ruby_thread_has_gvl_p());
2663
2664 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2665 if (CHILLED_STRING_P(str)) {
2666 CHILLED_STRING_MUTATED(str);
2667 }
2668 rb_check_lockedtmp(str);
2669 rb_check_frozen(str);
2670 }
2671}
2672
2673static inline int
2674str_dependent_p(VALUE str)
2675{
2676 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2677 return FALSE;
2678 }
2679 else {
2680 return TRUE;
2681 }
2682}
2683
2684// If none of these flags are set, we know we have an independent string.
2685// If any is set, we need to do more detailed checks.
2686#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2687static inline int
2688str_independent(VALUE str)
2689{
2690 RUBY_ASSERT(ruby_thread_has_gvl_p());
2691
2692 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2693 str_modifiable(str);
2694 return !str_dependent_p(str);
2695 }
2696 return TRUE;
2697}
2698
2699static void
2700str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2701{
2702 RUBY_ASSERT(ruby_thread_has_gvl_p());
2703
2704 char *ptr;
2705 char *oldptr;
2706 long capa = len + expand;
2707
2708 if (len > capa) len = capa;
2709
2710 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2711 ptr = RSTRING(str)->as.heap.ptr;
2712 STR_SET_EMBED(str);
2713 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2714 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2715 STR_SET_LEN(str, len);
2716 return;
2717 }
2718
2719 ptr = ALLOC_N(char, (size_t)capa + termlen);
2720 oldptr = RSTRING_PTR(str);
2721 if (oldptr) {
2722 memcpy(ptr, oldptr, len);
2723 }
2724 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2725 xfree(oldptr);
2726 }
2727 STR_SET_NOEMBED(str);
2728 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2729 TERM_FILL(ptr + len, termlen);
2730 RSTRING(str)->as.heap.ptr = ptr;
2731 STR_SET_LEN(str, len);
2732 RSTRING(str)->as.heap.aux.capa = capa;
2733}
2734
2735void
2736rb_str_modify(VALUE str)
2737{
2738 if (!str_independent(str))
2739 str_make_independent(str);
2741}
2742
2743void
2745{
2746 RUBY_ASSERT(ruby_thread_has_gvl_p());
2747
2748 int termlen = TERM_LEN(str);
2749 long len = RSTRING_LEN(str);
2750
2751 if (expand < 0) {
2752 rb_raise(rb_eArgError, "negative expanding string size");
2753 }
2754 if (expand >= LONG_MAX - len) {
2755 rb_raise(rb_eArgError, "string size too big");
2756 }
2757
2758 if (!str_independent(str)) {
2759 str_make_independent_expand(str, len, expand, termlen);
2760 }
2761 else if (expand > 0) {
2762 RESIZE_CAPA_TERM(str, len + expand, termlen);
2763 }
2765}
2766
2767/* As rb_str_modify(), but don't clear coderange */
2768static void
2769str_modify_keep_cr(VALUE str)
2770{
2771 if (!str_independent(str))
2772 str_make_independent(str);
2774 /* Force re-scan later */
2776}
2777
2778static inline void
2779str_discard(VALUE str)
2780{
2781 str_modifiable(str);
2782 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2783 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2784 RSTRING(str)->as.heap.ptr = 0;
2785 STR_SET_LEN(str, 0);
2786 }
2787}
2788
2789void
2791{
2792 int encindex = rb_enc_get_index(str);
2793
2794 if (RB_UNLIKELY(encindex == -1)) {
2795 rb_raise(rb_eTypeError, "not encoding capable object");
2796 }
2797
2798 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2799 return;
2800 }
2801
2802 rb_encoding *enc = rb_enc_from_index(encindex);
2803 if (!rb_enc_asciicompat(enc)) {
2804 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2805 }
2806}
2807
2808VALUE
2810{
2811 RUBY_ASSERT(ruby_thread_has_gvl_p());
2812
2813 VALUE s = *ptr;
2814 if (!RB_TYPE_P(s, T_STRING)) {
2815 s = rb_str_to_str(s);
2816 *ptr = s;
2817 }
2818 return s;
2819}
2820
2821char *
2823{
2824 VALUE str = rb_string_value(ptr);
2825 return RSTRING_PTR(str);
2826}
2827
/* Returns 1 when the first n bytes at s are all zero (trivially so for
 * n <= 0), 0 as soon as a non-zero byte is seen. */
static int
zero_filled(const char *s, int n)
{
    for (int i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2836
2837static const char *
2838str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2839{
2840 const char *e = s + len;
2841
2842 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2843 if (zero_filled(s, minlen)) return s;
2844 }
2845 return 0;
2846}
2847
2848static char *
2849str_fill_term(VALUE str, char *s, long len, int termlen)
2850{
2851 /* This function assumes that (capa + termlen) bytes of memory
2852 * is allocated, like many other functions in this file.
2853 */
2854 if (str_dependent_p(str)) {
2855 if (!zero_filled(s + len, termlen))
2856 str_make_independent_expand(str, len, 0L, termlen);
2857 }
2858 else {
2859 TERM_FILL(s + len, termlen);
2860 return s;
2861 }
2862 return RSTRING_PTR(str);
2863}
2864
2865void
2866rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2867{
2868 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2869 long len = RSTRING_LEN(str);
2870
2871 RUBY_ASSERT(capa >= len);
2872 if (capa - len < termlen) {
2873 rb_check_lockedtmp(str);
2874 str_make_independent_expand(str, len, 0L, termlen);
2875 }
2876 else if (str_dependent_p(str)) {
2877 if (termlen > oldtermlen)
2878 str_make_independent_expand(str, len, 0L, termlen);
2879 }
2880 else {
2881 if (!STR_EMBED_P(str)) {
2882 /* modify capa instead of realloc */
2883 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2884 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2885 }
2886 if (termlen > oldtermlen) {
2887 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2888 }
2889 }
2890
2891 return;
2892}
2893
2894static char *
2895str_null_check(VALUE str, int *w)
2896{
2897 char *s = RSTRING_PTR(str);
2898 long len = RSTRING_LEN(str);
2899 rb_encoding *enc = rb_enc_get(str);
2900 const int minlen = rb_enc_mbminlen(enc);
2901
2902 if (minlen > 1) {
2903 *w = 1;
2904 if (str_null_char(s, len, minlen, enc)) {
2905 return NULL;
2906 }
2907 return str_fill_term(str, s, len, minlen);
2908 }
2909 *w = 0;
2910 if (!s || memchr(s, 0, len)) {
2911 return NULL;
2912 }
2913 if (s[len]) {
2914 s = str_fill_term(str, s, len, minlen);
2915 }
2916 return s;
2917}
2918
2919char *
2920rb_str_to_cstr(VALUE str)
2921{
2922 int w;
2923 return str_null_check(str, &w);
2924}
2925
2926char *
2928{
2929 VALUE str = rb_string_value(ptr);
2930 int w;
2931 char *s = str_null_check(str, &w);
2932 if (!s) {
2933 if (w) {
2934 rb_raise(rb_eArgError, "string contains null char");
2935 }
2936 rb_raise(rb_eArgError, "string contains null byte");
2937 }
2938 return s;
2939}
2940
2941char *
2942rb_str_fill_terminator(VALUE str, const int newminlen)
2943{
2944 char *s = RSTRING_PTR(str);
2945 long len = RSTRING_LEN(str);
2946 return str_fill_term(str, s, len, newminlen);
2947}
2948
2949VALUE
2951{
2952 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2953 return str;
2954}
2955
2956/*
2957 * call-seq:
2958 * String.try_convert(object) -> object, new_string, or nil
2959 *
2960 * Attempts to convert the given +object+ to a string.
2961 *
2962 * If +object+ is already a string, returns +object+, unmodified.
2963 *
2964 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2965 * calls <tt>object.to_str</tt> and returns the result.
2966 *
2967 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2968 *
2969 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2970 */
2971static VALUE
2972rb_str_s_try_convert(VALUE dummy, VALUE str)
2973{
2974 return rb_check_string_type(str);
2975}
2976
2977static char*
2978str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2979{
2980 long nth = *nthp;
2981 if (rb_enc_mbmaxlen(enc) == 1) {
2982 p += nth;
2983 }
2984 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2985 p += nth * rb_enc_mbmaxlen(enc);
2986 }
2987 else if (rb_enc_asciicompat(enc)) {
2988 const char *p2, *e2;
2989 int n;
2990
2991 while (p < e && 0 < nth) {
2992 e2 = p + nth;
2993 if (e < e2) {
2994 *nthp = nth;
2995 return (char *)e;
2996 }
2997 if (ISASCII(*p)) {
2998 p2 = search_nonascii(p, e2);
2999 if (!p2) {
3000 nth -= e2 - p;
3001 *nthp = nth;
3002 return (char *)e2;
3003 }
3004 nth -= p2 - p;
3005 p = p2;
3006 }
3007 n = rb_enc_mbclen(p, e, enc);
3008 p += n;
3009 nth--;
3010 }
3011 *nthp = nth;
3012 if (nth != 0) {
3013 return (char *)e;
3014 }
3015 return (char *)p;
3016 }
3017 else {
3018 while (p < e && nth--) {
3019 p += rb_enc_mbclen(p, e, enc);
3020 }
3021 }
3022 if (p > e) p = e;
3023 *nthp = nth;
3024 return (char*)p;
3025}
3026
3027char*
3028rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3029{
3030 return str_nth_len(p, e, &nth, enc);
3031}
3032
3033static char*
3034str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3035{
3036 if (singlebyte)
3037 p += nth;
3038 else {
3039 p = str_nth_len(p, e, &nth, enc);
3040 }
3041 if (!p) return 0;
3042 if (p > e) p = e;
3043 return (char *)p;
3044}
3045
3046/* char offset to byte offset */
3047static long
3048str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3049{
3050 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3051 if (!pp) return e - p;
3052 return pp - p;
3053}
3054
3055long
3056rb_str_offset(VALUE str, long pos)
3057{
3058 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3059 STR_ENC_GET(str), single_byte_optimizable(str));
3060}
3061
3062#ifdef NONASCII_MASK
3063static char *
3064str_utf8_nth(const char *p, const char *e, long *nthp)
3065{
3066 long nth = *nthp;
3067 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3068 const uintptr_t *s, *t;
3069 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3070 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3071 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3072 while (p < (const char *)s) {
3073 if (is_utf8_lead_byte(*p)) nth--;
3074 p++;
3075 }
3076 do {
3077 nth -= count_utf8_lead_bytes_with_word(s);
3078 s++;
3079 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3080 p = (char *)s;
3081 }
3082 while (p < e) {
3083 if (is_utf8_lead_byte(*p)) {
3084 if (nth == 0) break;
3085 nth--;
3086 }
3087 p++;
3088 }
3089 *nthp = nth;
3090 return (char *)p;
3091}
3092
3093static long
3094str_utf8_offset(const char *p, const char *e, long nth)
3095{
3096 const char *pp = str_utf8_nth(p, e, &nth);
3097 return pp - p;
3098}
3099#endif
3100
3101/* byte offset to char offset */
3102long
3103rb_str_sublen(VALUE str, long pos)
3104{
3105 if (single_byte_optimizable(str) || pos < 0)
3106 return pos;
3107 else {
3108 char *p = RSTRING_PTR(str);
3109 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3110 }
3111}
3112
3113static VALUE
3114str_subseq(VALUE str, long beg, long len)
3115{
3116 VALUE str2;
3117
3118 RUBY_ASSERT(beg >= 0);
3119 RUBY_ASSERT(len >= 0);
3120 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3121
3122 const int termlen = TERM_LEN(str);
3123 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3124 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3125 RB_GC_GUARD(str);
3126 return str2;
3127 }
3128
3129 str2 = str_alloc_heap(rb_cString);
3130 if (str_embed_capa(str2) >= len + termlen) {
3131 char *ptr2 = RSTRING(str2)->as.embed.ary;
3132 STR_SET_EMBED(str2);
3133 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3134 TERM_FILL(ptr2+len, termlen);
3135
3136 STR_SET_LEN(str2, len);
3137 RB_GC_GUARD(str);
3138 }
3139 else {
3140 str_replace_shared(str2, str);
3141 RUBY_ASSERT(!STR_EMBED_P(str2));
3142 ENC_CODERANGE_CLEAR(str2);
3143 RSTRING(str2)->as.heap.ptr += beg;
3144 if (RSTRING_LEN(str2) > len) {
3145 STR_SET_LEN(str2, len);
3146 }
3147 }
3148
3149 return str2;
3150}
3151
3152VALUE
3153rb_str_subseq(VALUE str, long beg, long len)
3154{
3155 VALUE str2 = str_subseq(str, beg, len);
3156 rb_enc_cr_str_copy_for_substr(str2, str);
3157 return str2;
3158}
3159
3160char *
3161rb_str_subpos(VALUE str, long beg, long *lenp)
3162{
3163 long len = *lenp;
3164 long slen = -1L;
3165 const long blen = RSTRING_LEN(str);
3166 rb_encoding *enc = STR_ENC_GET(str);
3167 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3168
3169 if (len < 0) return 0;
3170 if (beg < 0 && -beg < 0) return 0;
3171 if (!blen) {
3172 len = 0;
3173 }
3174 if (single_byte_optimizable(str)) {
3175 if (beg > blen) return 0;
3176 if (beg < 0) {
3177 beg += blen;
3178 if (beg < 0) return 0;
3179 }
3180 if (len > blen - beg)
3181 len = blen - beg;
3182 if (len < 0) return 0;
3183 p = s + beg;
3184 goto end;
3185 }
3186 if (beg < 0) {
3187 if (len > -beg) len = -beg;
3188 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3189 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3190 beg = -beg;
3191 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3192 p = e;
3193 if (!p) return 0;
3194 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3195 if (!p) return 0;
3196 len = e - p;
3197 goto end;
3198 }
3199 else {
3200 slen = str_strlen(str, enc);
3201 beg += slen;
3202 if (beg < 0) return 0;
3203 p = s + beg;
3204 if (len == 0) goto end;
3205 }
3206 }
3207 else if (beg > 0 && beg > blen) {
3208 return 0;
3209 }
3210 if (len == 0) {
3211 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3212 p = s + beg;
3213 }
3214#ifdef NONASCII_MASK
3215 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3216 enc == rb_utf8_encoding()) {
3217 p = str_utf8_nth(s, e, &beg);
3218 if (beg > 0) return 0;
3219 len = str_utf8_offset(p, e, len);
3220 }
3221#endif
3222 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3223 int char_sz = rb_enc_mbmaxlen(enc);
3224
3225 p = s + beg * char_sz;
3226 if (p > e) {
3227 return 0;
3228 }
3229 else if (len * char_sz > e - p)
3230 len = e - p;
3231 else
3232 len *= char_sz;
3233 }
3234 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3235 if (beg > 0) return 0;
3236 len = 0;
3237 }
3238 else {
3239 len = str_offset(p, e, len, enc, 0);
3240 }
3241 end:
3242 *lenp = len;
3243 RB_GC_GUARD(str);
3244 return p;
3245}
3246
3247static VALUE str_substr(VALUE str, long beg, long len, int empty);
3248
3249VALUE
3250rb_str_substr(VALUE str, long beg, long len)
3251{
3252 return str_substr(str, beg, len, TRUE);
3253}
3254
3255VALUE
3256rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3257{
3258 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3259}
3260
3261static VALUE
3262str_substr(VALUE str, long beg, long len, int empty)
3263{
3264 char *p = rb_str_subpos(str, beg, &len);
3265
3266 if (!p) return Qnil;
3267 if (!len && !empty) return Qnil;
3268
3269 beg = p - RSTRING_PTR(str);
3270
3271 VALUE str2 = str_subseq(str, beg, len);
3272 rb_enc_cr_str_copy_for_substr(str2, str);
3273 return str2;
3274}
3275
3276/* :nodoc: */
3277VALUE
3279{
3280 if (CHILLED_STRING_P(str)) {
3281 FL_UNSET_RAW(str, STR_CHILLED);
3282 }
3283
3284 if (OBJ_FROZEN(str)) return str;
3285 rb_str_resize(str, RSTRING_LEN(str));
3286 return rb_obj_freeze(str);
3287}
3288
3289/*
3290 * call-seq:
3291 * +string -> new_string or self
3292 *
3293 * Returns +self+ if +self+ is not frozen and can be mutated
3294 * without warning issuance.
3295 *
3296 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3297 *
3298 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3299 */
3300static VALUE
3301str_uplus(VALUE str)
3302{
3303 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3304 return rb_str_dup(str);
3305 }
3306 else {
3307 return str;
3308 }
3309}
3310
3311/*
3312 * call-seq:
3313 * -self -> frozen_string
3314 *
3315 * Returns a frozen string equal to +self+.
3316 *
3317 * The returned string is +self+ if and only if all of the following are true:
3318 *
3319 * - +self+ is already frozen.
3320 * - +self+ is an instance of \String (rather than of a subclass of \String)
3321 * - +self+ has no instance variables set on it.
3322 *
3323 * Otherwise, the returned string is a frozen copy of +self+.
3324 *
3325 * Returning +self+, when possible, saves duplicating +self+;
3326 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3327 *
3328 * It may also save duplicating other, already-existing, strings:
3329 *
3330 * s0 = 'foo'
3331 * s1 = 'foo'
3332 * s0.object_id == s1.object_id # => false
3333 * (-s0).object_id == (-s1).object_id # => true
3334 *
3335 * Note that method #-@ is convenient for defining a constant:
3336 *
3337 * FileName = -'config/database.yml'
3338 *
3339 * While its alias #dedup is better suited for chaining:
3340 *
3341 * 'foo'.dedup.gsub!('o')
3342 *
3343 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3344 */
3345static VALUE
3346str_uminus(VALUE str)
3347{
3348 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3349 str = rb_str_dup(str);
3350 }
3351 return rb_fstring(str);
3352}
3353
3354RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3355#define rb_str_dup_frozen rb_str_new_frozen
3356
3357VALUE
3359{
3360 rb_check_frozen(str);
3361 if (FL_TEST(str, STR_TMPLOCK)) {
3362 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3363 }
3364 FL_SET(str, STR_TMPLOCK);
3365 return str;
3366}
3367
3368VALUE
3370{
3371 rb_check_frozen(str);
3372 if (!FL_TEST(str, STR_TMPLOCK)) {
3373 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3374 }
3375 FL_UNSET(str, STR_TMPLOCK);
3376 return str;
3377}
3378
3379VALUE
3380rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3381{
3382 rb_str_locktmp(str);
3383 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3384}
3385
3386void
3388{
3389 RUBY_ASSERT(ruby_thread_has_gvl_p());
3390
3391 long capa;
3392 const int termlen = TERM_LEN(str);
3393
3394 str_modifiable(str);
3395 if (STR_SHARED_P(str)) {
3396 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3397 }
3398 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3399 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3400 }
3401
3402 int cr = ENC_CODERANGE(str);
3403 if (len == 0) {
3404 /* Empty string does not contain non-ASCII */
3406 }
3407 else if (cr == ENC_CODERANGE_UNKNOWN) {
3408 /* Leave unknown. */
3409 }
3410 else if (len > RSTRING_LEN(str)) {
3411 if (ENC_CODERANGE_CLEAN_P(cr)) {
3412 /* Update the coderange regarding the extended part. */
3413 const char *const prev_end = RSTRING_END(str);
3414 const char *const new_end = RSTRING_PTR(str) + len;
3415 rb_encoding *enc = rb_enc_get(str);
3416 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3417 ENC_CODERANGE_SET(str, cr);
3418 }
3419 else if (cr == ENC_CODERANGE_BROKEN) {
3420 /* May be valid now, by appended part. */
3422 }
3423 }
3424 else if (len < RSTRING_LEN(str)) {
3425 if (cr != ENC_CODERANGE_7BIT) {
3426 /* ASCII-only string is keeping after truncated. Valid
3427 * and broken may be invalid or valid, leave unknown. */
3429 }
3430 }
3431
3432 STR_SET_LEN(str, len);
3433 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3434}
3435
3436VALUE
3437rb_str_resize(VALUE str, long len)
3438{
3439 if (len < 0) {
3440 rb_raise(rb_eArgError, "negative string size (or size too big)");
3441 }
3442
3443 int independent = str_independent(str);
3444 long slen = RSTRING_LEN(str);
3445 const int termlen = TERM_LEN(str);
3446
3447 if (slen > len || (termlen != 1 && slen < len)) {
3449 }
3450
3451 {
3452 long capa;
3453 if (STR_EMBED_P(str)) {
3454 if (len == slen) return str;
3455 if (str_embed_capa(str) >= len + termlen) {
3456 STR_SET_LEN(str, len);
3457 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3458 return str;
3459 }
3460 str_make_independent_expand(str, slen, len - slen, termlen);
3461 }
3462 else if (str_embed_capa(str) >= len + termlen) {
3463 char *ptr = STR_HEAP_PTR(str);
3464 STR_SET_EMBED(str);
3465 if (slen > len) slen = len;
3466 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3467 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3468 STR_SET_LEN(str, len);
3469 if (independent) ruby_xfree(ptr);
3470 return str;
3471 }
3472 else if (!independent) {
3473 if (len == slen) return str;
3474 str_make_independent_expand(str, slen, len - slen, termlen);
3475 }
3476 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3477 (capa - len) > (len < 1024 ? len : 1024)) {
3478 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3479 (size_t)len + termlen, STR_HEAP_SIZE(str));
3480 RSTRING(str)->as.heap.aux.capa = len;
3481 }
3482 else if (len == slen) return str;
3483 STR_SET_LEN(str, len);
3484 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3485 }
3486 return str;
3487}
3488
3489static void
3490str_ensure_available_capa(VALUE str, long len)
3491{
3492 str_modify_keep_cr(str);
3493
3494 const int termlen = TERM_LEN(str);
3495 long olen = RSTRING_LEN(str);
3496
3497 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3498 rb_raise(rb_eArgError, "string sizes too big");
3499 }
3500
3501 long total = olen + len;
3502 long capa = str_capacity(str, termlen);
3503
3504 if (capa < total) {
3505 if (total >= LONG_MAX / 2) {
3506 capa = total;
3507 }
3508 while (total > capa) {
3509 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3510 }
3511 RESIZE_CAPA_TERM(str, capa, termlen);
3512 }
3513}
3514
3515static VALUE
3516str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3517{
3518 if (keep_cr) {
3519 str_modify_keep_cr(str);
3520 }
3521 else {
3522 rb_str_modify(str);
3523 }
3524 if (len == 0) return 0;
3525
3526 long total, olen, off = -1;
3527 char *sptr;
3528 const int termlen = TERM_LEN(str);
3529
3530 RSTRING_GETMEM(str, sptr, olen);
3531 if (ptr >= sptr && ptr <= sptr + olen) {
3532 off = ptr - sptr;
3533 }
3534
3535 long capa = str_capacity(str, termlen);
3536
3537 if (olen > LONG_MAX - len) {
3538 rb_raise(rb_eArgError, "string sizes too big");
3539 }
3540 total = olen + len;
3541 if (capa < total) {
3542 if (total >= LONG_MAX / 2) {
3543 capa = total;
3544 }
3545 while (total > capa) {
3546 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3547 }
3548 RESIZE_CAPA_TERM(str, capa, termlen);
3549 sptr = RSTRING_PTR(str);
3550 }
3551 if (off != -1) {
3552 ptr = sptr + off;
3553 }
3554 memcpy(sptr + olen, ptr, len);
3555 STR_SET_LEN(str, total);
3556 TERM_FILL(sptr + total, termlen); /* sentinel */
3557
3558 return str;
3559}
3560
3561#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3562#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3563
3564VALUE
3565rb_str_cat(VALUE str, const char *ptr, long len)
3566{
3567 if (len == 0) return str;
3568 if (len < 0) {
3569 rb_raise(rb_eArgError, "negative string size (or size too big)");
3570 }
3571 return str_buf_cat(str, ptr, len);
3572}
3573
3574VALUE
3575rb_str_cat_cstr(VALUE str, const char *ptr)
3576{
3577 must_not_null(ptr);
3578 return rb_str_buf_cat(str, ptr, strlen(ptr));
3579}
3580
3581static void
3582rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3583{
3584 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3585
3586 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3587 if (UNLIKELY(!str_independent(str))) {
3588 str_make_independent(str);
3589 }
3590
3591 long string_length = -1;
3592 const int null_terminator_length = 1;
3593 char *sptr;
3594 RSTRING_GETMEM(str, sptr, string_length);
3595
3596 // Ensure the resulting string wouldn't be too long.
3597 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3598 rb_raise(rb_eArgError, "string sizes too big");
3599 }
3600
3601 long string_capacity = str_capacity(str, null_terminator_length);
3602
3603 // Get the code range before any modifications since those might clear the code range.
3604 int cr = ENC_CODERANGE(str);
3605
3606 // Check if the string has spare string_capacity to write the new byte.
3607 if (LIKELY(string_capacity >= string_length + 1)) {
3608 // In fast path we can write the new byte and note the string's new length.
3609 sptr[string_length] = byte;
3610 STR_SET_LEN(str, string_length + 1);
3611 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3612 }
3613 else {
3614 // If there's not enough string_capacity, make a call into the general string concatenation function.
3615 str_buf_cat(str, (char *)&byte, 1);
3616 }
3617
3618 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3619 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3620 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3621 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3622 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3623 if (ISASCII(byte)) {
3625 }
3626 else {
3628
3629 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3630 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3631 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3632 }
3633 }
3634 }
3635}
3636
3637RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3638RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3639RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3640
3641static VALUE
3642rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3643 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3644{
3645 int str_encindex = ENCODING_GET(str);
3646 int res_encindex;
3647 int str_cr, res_cr;
3648 rb_encoding *str_enc, *ptr_enc;
3649
3650 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3651
3652 if (str_encindex == ptr_encindex) {
3653 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3654 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3655 }
3656 }
3657 else {
3658 str_enc = rb_enc_from_index(str_encindex);
3659 ptr_enc = rb_enc_from_index(ptr_encindex);
3660 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3661 if (len == 0)
3662 return str;
3663 if (RSTRING_LEN(str) == 0) {
3664 rb_str_buf_cat(str, ptr, len);
3665 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3666 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3667 return str;
3668 }
3669 goto incompatible;
3670 }
3671 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3672 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3673 }
3674 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3675 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3676 str_cr = rb_enc_str_coderange(str);
3677 }
3678 }
3679 }
3680 if (ptr_cr_ret)
3681 *ptr_cr_ret = ptr_cr;
3682
3683 if (str_encindex != ptr_encindex &&
3684 str_cr != ENC_CODERANGE_7BIT &&
3685 ptr_cr != ENC_CODERANGE_7BIT) {
3686 str_enc = rb_enc_from_index(str_encindex);
3687 ptr_enc = rb_enc_from_index(ptr_encindex);
3688 goto incompatible;
3689 }
3690
3691 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3692 res_encindex = str_encindex;
3693 res_cr = ENC_CODERANGE_UNKNOWN;
3694 }
3695 else if (str_cr == ENC_CODERANGE_7BIT) {
3696 if (ptr_cr == ENC_CODERANGE_7BIT) {
3697 res_encindex = str_encindex;
3698 res_cr = ENC_CODERANGE_7BIT;
3699 }
3700 else {
3701 res_encindex = ptr_encindex;
3702 res_cr = ptr_cr;
3703 }
3704 }
3705 else if (str_cr == ENC_CODERANGE_VALID) {
3706 res_encindex = str_encindex;
3707 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3708 res_cr = str_cr;
3709 else
3710 res_cr = ptr_cr;
3711 }
3712 else { /* str_cr == ENC_CODERANGE_BROKEN */
3713 res_encindex = str_encindex;
3714 res_cr = str_cr;
3715 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3716 }
3717
3718 if (len < 0) {
3719 rb_raise(rb_eArgError, "negative string size (or size too big)");
3720 }
3721 str_buf_cat(str, ptr, len);
3722 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3723 return str;
3724
3725 incompatible:
3726 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3727 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3729}
3730
3731VALUE
3732rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3733{
3734 return rb_enc_cr_str_buf_cat(str, ptr, len,
3735 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3736}
3737
3738VALUE
3740{
3741 /* ptr must reference NUL terminated ASCII string. */
3742 int encindex = ENCODING_GET(str);
3743 rb_encoding *enc = rb_enc_from_index(encindex);
3744 if (rb_enc_asciicompat(enc)) {
3745 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3746 encindex, ENC_CODERANGE_7BIT, 0);
3747 }
3748 else {
3749 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3750 while (*ptr) {
3751 unsigned int c = (unsigned char)*ptr;
3752 int len = rb_enc_codelen(c, enc);
3753 rb_enc_mbcput(c, buf, enc);
3754 rb_enc_cr_str_buf_cat(str, buf, len,
3755 encindex, ENC_CODERANGE_VALID, 0);
3756 ptr++;
3757 }
3758 return str;
3759 }
3760}
3761
3762VALUE
3764{
3765 int str2_cr = rb_enc_str_coderange(str2);
3766
3767 if (str_enc_fastpath(str)) {
3768 switch (str2_cr) {
3769 case ENC_CODERANGE_7BIT:
3770 // If RHS is 7bit we can do simple concatenation
3771 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3772 RB_GC_GUARD(str2);
3773 return str;
3775 // If RHS is valid, we can do simple concatenation if encodings are the same
3776 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3777 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3778 int str_cr = ENC_CODERANGE(str);
3779 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3780 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3781 }
3782 RB_GC_GUARD(str2);
3783 return str;
3784 }
3785 }
3786 }
3787
3788 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3789 ENCODING_GET(str2), str2_cr, &str2_cr);
3790
3791 ENC_CODERANGE_SET(str2, str2_cr);
3792
3793 return str;
3794}
3795
3796VALUE
3798{
3799 StringValue(str2);
3800 return rb_str_buf_append(str, str2);
3801}
3802
3803VALUE
3804rb_str_concat_literals(size_t num, const VALUE *strary)
3805{
3806 VALUE str;
3807 size_t i, s = 0;
3808 unsigned long len = 1;
3809
3810 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3811 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3812
3813 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3814 str = rb_str_buf_new(len);
3815 str_enc_copy_direct(str, strary[0]);
3816
3817 for (i = s; i < num; ++i) {
3818 const VALUE v = strary[i];
3819 int encidx = ENCODING_GET(v);
3820
3821 rb_str_buf_append(str, v);
3822 if (encidx != ENCINDEX_US_ASCII) {
3823 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3824 rb_enc_set_index(str, encidx);
3825 }
3826 }
3827 return str;
3828}
3829
3830/*
3831 * call-seq:
3832 * concat(*objects) -> string
3833 *
3834 * :include: doc/string/concat.rdoc
3835 */
3836static VALUE
3837rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3838{
3839 str_modifiable(str);
3840
3841 if (argc == 1) {
3842 return rb_str_concat(str, argv[0]);
3843 }
3844 else if (argc > 1) {
3845 int i;
3846 VALUE arg_str = rb_str_tmp_new(0);
3847 rb_enc_copy(arg_str, str);
3848 for (i = 0; i < argc; i++) {
3849 rb_str_concat(arg_str, argv[i]);
3850 }
3851 rb_str_buf_append(str, arg_str);
3852 }
3853
3854 return str;
3855}
3856
3857/*
3858 * call-seq:
3859 * append_as_bytes(*objects) -> self
3860 *
3861 * Concatenates each object in +objects+ into +self+; returns +self+;
3862 * performs no encoding validation or conversion:
3863 *
3864 * s = 'foo'
3865 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3866 * s.valid_encoding? # => false
3867 * s.append_as_bytes("\xAC 12")
3868 * s.valid_encoding? # => true
3869 *
3870 * When a given object is an integer,
3871 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3873 * appends only the low-order byte (similar to String#setbyte):
3874 *
3875 * s = ""
3876 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3877 * s.bytesize # => 2
3878 *
3879 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3880 */
3881
3882VALUE
3883rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3884{
3885 long needed_capacity = 0;
3886 volatile VALUE t0;
3887 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3888
3889 for (int index = 0; index < argc; index++) {
3890 VALUE obj = argv[index];
3891 enum ruby_value_type type = types[index] = rb_type(obj);
3892 switch (type) {
3893 case T_FIXNUM:
3894 case T_BIGNUM:
3895 needed_capacity++;
3896 break;
3897 case T_STRING:
3898 needed_capacity += RSTRING_LEN(obj);
3899 break;
3900 default:
3901 rb_raise(
3903 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3904 rb_obj_class(obj)
3905 );
3906 break;
3907 }
3908 }
3909
3910 str_ensure_available_capa(str, needed_capacity);
3911 char *sptr = RSTRING_END(str);
3912
3913 for (int index = 0; index < argc; index++) {
3914 VALUE obj = argv[index];
3915 enum ruby_value_type type = types[index];
3916 switch (type) {
3917 case T_FIXNUM:
3918 case T_BIGNUM: {
3919 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3920 char byte = (char)(NUM2INT(obj) & 0xFF);
3921 *sptr = byte;
3922 sptr++;
3923 break;
3924 }
3925 case T_STRING: {
3926 const char *ptr;
3927 long len;
3928 RSTRING_GETMEM(obj, ptr, len);
3929 memcpy(sptr, ptr, len);
3930 sptr += len;
3931 break;
3932 }
3933 default:
3934 rb_bug("append_as_bytes arguments should have been validated");
3935 }
3936 }
3937
3938 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3939 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3940
3941 int cr = ENC_CODERANGE(str);
3942 switch (cr) {
3943 case ENC_CODERANGE_7BIT: {
3944 for (int index = 0; index < argc; index++) {
3945 VALUE obj = argv[index];
3946 enum ruby_value_type type = types[index];
3947 switch (type) {
3948 case T_FIXNUM:
3949 case T_BIGNUM: {
3950 if (!ISASCII(NUM2INT(obj))) {
3951 goto clear_cr;
3952 }
3953 break;
3954 }
3955 case T_STRING: {
3956 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3957 goto clear_cr;
3958 }
3959 break;
3960 }
3961 default:
3962 rb_bug("append_as_bytes arguments should have been validated");
3963 }
3964 }
3965 break;
3966 }
3968 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3969 goto keep_cr;
3970 }
3971 else {
3972 goto clear_cr;
3973 }
3974 break;
3975 default:
3976 goto clear_cr;
3977 break;
3978 }
3979
3980 RB_GC_GUARD(t0);
3981
3982 clear_cr:
3983 // If no fast path was hit, we clear the coderange.
3984 // append_as_bytes is predominantly meant to be used in
3985 // buffering situation, hence it's likely the coderange
3986 // will never be scanned, so it's not worth spending time
3987 // precomputing the coderange except for simple and common
3988 // situations.
3990 keep_cr:
3991 return str;
3992}
3993
3994/*
3995 * call-seq:
3996 * self << object -> self
3997 *
3998 * Appends a string representation of +object+ to +self+;
3999 * returns +self+.
4000 *
4001 * If +object+ is a string, appends it to +self+:
4002 *
4003 * s = 'foo'
4004 * s << 'bar' # => "foobar"
4005 * s # => "foobar"
4006 *
4007 * If +object+ is an integer,
4008 * its value is considered a codepoint;
4009 * converts the value to a character before concatenating:
4010 *
4011 * s = 'foo'
4012 * s << 33 # => "foo!"
4013 *
 *  Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4015 * and the encoding of +self+ is Encoding::US_ASCII,
4016 * changes the encoding to Encoding::ASCII_8BIT:
4017 *
4018 * s = 'foo'.encode(Encoding::US_ASCII)
4019 * s.encoding # => #<Encoding:US-ASCII>
4020 * s << 0xff # => "foo\xFF"
4021 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4022 *
4023 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4024 *
4025 * s = 'foo'
4026 * s.encoding # => <Encoding:UTF-8>
4027 * s << 0x00110000 # 1114112 out of char range (RangeError)
4028 * s = 'foo'.encode(Encoding::EUC_JP)
4029 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4030 *
4031 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4032 */
4033VALUE
4035{
4036 unsigned int code;
4037 rb_encoding *enc = STR_ENC_GET(str1);
4038 int encidx;
4039
4040 if (RB_INTEGER_TYPE_P(str2)) {
4041 if (rb_num_to_uint(str2, &code) == 0) {
4042 }
4043 else if (FIXNUM_P(str2)) {
4044 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4045 }
4046 else {
4047 rb_raise(rb_eRangeError, "bignum out of char range");
4048 }
4049 }
4050 else {
4051 return rb_str_append(str1, str2);
4052 }
4053
4054 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4055
4056 if (encidx >= 0) {
4057 rb_str_buf_cat_byte(str1, (unsigned char)code);
4058 }
4059 else {
4060 long pos = RSTRING_LEN(str1);
4061 int cr = ENC_CODERANGE(str1);
4062 int len;
4063 char *buf;
4064
4065 switch (len = rb_enc_codelen(code, enc)) {
4066 case ONIGERR_INVALID_CODE_POINT_VALUE:
4067 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4068 break;
4069 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4070 case 0:
4071 rb_raise(rb_eRangeError, "%u out of char range", code);
4072 break;
4073 }
4074 buf = ALLOCA_N(char, len + 1);
4075 rb_enc_mbcput(code, buf, enc);
4076 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4077 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4078 }
4079 rb_str_resize(str1, pos+len);
4080 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4081 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4083 }
4084 else if (cr == ENC_CODERANGE_BROKEN) {
4086 }
4087 ENC_CODERANGE_SET(str1, cr);
4088 }
4089 return str1;
4090}
4091
4092int
4093rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4094{
4095 int encidx = rb_enc_to_index(enc);
4096
4097 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4098 /* US-ASCII automatically extended to ASCII-8BIT */
4099 if (code > 0xFF) {
4100 rb_raise(rb_eRangeError, "%u out of char range", code);
4101 }
4102 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4103 return ENCINDEX_ASCII_8BIT;
4104 }
4105 return encidx;
4106 }
4107 else {
4108 return -1;
4109 }
4110}
4111
4112/*
4113 * call-seq:
4114 * prepend(*other_strings) -> new_string
4115 *
4116 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4117 *
4118 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4119 *
4120 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4121 *
4122 */
4123
4124static VALUE
4125rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4126{
4127 str_modifiable(str);
4128
4129 if (argc == 1) {
4130 rb_str_update(str, 0L, 0L, argv[0]);
4131 }
4132 else if (argc > 1) {
4133 int i;
4134 VALUE arg_str = rb_str_tmp_new(0);
4135 rb_enc_copy(arg_str, str);
4136 for (i = 0; i < argc; i++) {
4137 rb_str_append(arg_str, argv[i]);
4138 }
4139 rb_str_update(str, 0L, 0L, arg_str);
4140 }
4141
4142 return str;
4143}
4144
4145st_index_t
4147{
4148 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4149 st_index_t precomputed_hash;
4150 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4151
4152 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4153 return precomputed_hash;
4154 }
4155
4156 return str_do_hash(str);
4157}
4158
4159int
4161{
4162 long len1, len2;
4163 const char *ptr1, *ptr2;
4164 RSTRING_GETMEM(str1, ptr1, len1);
4165 RSTRING_GETMEM(str2, ptr2, len2);
4166 return (len1 != len2 ||
4167 !rb_str_comparable(str1, str2) ||
4168 memcmp(ptr1, ptr2, len1) != 0);
4169}
4170
4171/*
4172 * call-seq:
4173 * hash -> integer
4174 *
4175 * :include: doc/string/hash.rdoc
4176 *
4177 */
4178
4179static VALUE
4180rb_str_hash_m(VALUE str)
4181{
4182 st_index_t hval = rb_str_hash(str);
4183 return ST2FIX(hval);
4184}
4185
/* Smaller of two values; note that each argument may be evaluated twice. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4187
4188int
4190{
4191 int idx1, idx2;
4192 int rc1, rc2;
4193
4194 if (RSTRING_LEN(str1) == 0) return TRUE;
4195 if (RSTRING_LEN(str2) == 0) return TRUE;
4196 idx1 = ENCODING_GET(str1);
4197 idx2 = ENCODING_GET(str2);
4198 if (idx1 == idx2) return TRUE;
4199 rc1 = rb_enc_str_coderange(str1);
4200 rc2 = rb_enc_str_coderange(str2);
4201 if (rc1 == ENC_CODERANGE_7BIT) {
4202 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4203 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4204 return TRUE;
4205 }
4206 if (rc2 == ENC_CODERANGE_7BIT) {
4207 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4208 return TRUE;
4209 }
4210 return FALSE;
4211}
4212
4213int
4215{
4216 long len1, len2;
4217 const char *ptr1, *ptr2;
4218 int retval;
4219
4220 if (str1 == str2) return 0;
4221 RSTRING_GETMEM(str1, ptr1, len1);
4222 RSTRING_GETMEM(str2, ptr2, len2);
4223 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4224 if (len1 == len2) {
4225 if (!rb_str_comparable(str1, str2)) {
4226 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4227 return 1;
4228 return -1;
4229 }
4230 return 0;
4231 }
4232 if (len1 > len2) return 1;
4233 return -1;
4234 }
4235 if (retval > 0) return 1;
4236 return -1;
4237}
4238
4239/*
4240 * call-seq:
4241 * self == object -> true or false
4242 *
4243 * Returns whether +object+ is equal to +self+.
4244 *
4245 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4246 *
4247 * s = 'foo'
4248 * s == 'foo' # => true
4249 * s == 'food' # => false
4250 * s == 'FOO' # => false
4251 *
4252 * Returns +false+ if the two strings' encodings are not compatible:
4253 *
4254 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4255 *
4256 * When +object+ is not a string:
4257 *
4258 * - If +object+ responds to method <tt>to_str</tt>,
4259 * <tt>object == self</tt> is called and its return value is returned.
4260 * - If +object+ does not respond to <tt>to_str</tt>,
4261 * +false+ is returned.
4262 *
4263 * Related: {Comparing}[rdoc-ref:String@Comparing].
4264 */
4265
4266VALUE
4268{
4269 if (str1 == str2) return Qtrue;
4270 if (!RB_TYPE_P(str2, T_STRING)) {
4271 if (!rb_respond_to(str2, idTo_str)) {
4272 return Qfalse;
4273 }
4274 return rb_equal(str2, str1);
4275 }
4276 return rb_str_eql_internal(str1, str2);
4277}
4278
4279/*
4280 * call-seq:
4281 * eql?(object) -> true or false
4282 *
4283 * :include: doc/string/eql_p.rdoc
4284 *
4285 */
4286
4287VALUE
4288rb_str_eql(VALUE str1, VALUE str2)
4289{
4290 if (str1 == str2) return Qtrue;
4291 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4292 return rb_str_eql_internal(str1, str2);
4293}
4294
4295/*
4296 * call-seq:
4297 * self <=> other -> -1, 0, 1, or nil
4298 *
4299 * Compares +self+ and +other+,
4300 * evaluating their _contents_, not their _lengths_.
4301 *
4302 * Returns:
4303 *
4304 * - +-1+, if +self+ is smaller.
4305 * - +0+, if the two are equal.
4306 * - +1+, if +self+ is larger.
4307 * - +nil+, if the two are incomparable.
4308 *
4309 * Examples:
4310 *
4311 * 'a' <=> 'b' # => -1
4312 * 'a' <=> 'ab' # => -1
4313 * 'a' <=> 'a' # => 0
4314 * 'b' <=> 'a' # => 1
4315 * 'ab' <=> 'a' # => 1
4316 * 'a' <=> :a # => nil
4317 *
4318 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4319 */
4320
4321static VALUE
4322rb_str_cmp_m(VALUE str1, VALUE str2)
4323{
4324 int result;
4325 VALUE s = rb_check_string_type(str2);
4326 if (NIL_P(s)) {
4327 return rb_invcmp(str1, str2);
4328 }
4329 result = rb_str_cmp(str1, s);
4330 return INT2FIX(result);
4331}
4332
4333static VALUE str_casecmp(VALUE str1, VALUE str2);
4334static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4335
4336/*
4337 * call-seq:
4338 * casecmp(other_string) -> -1, 0, 1, or nil
4339 *
4340 * Ignoring case, compares +self+ and +other_string+; returns:
4341 *
4342 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4343 * - 0 if the two are equal.
4344 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4345 * - +nil+ if the two are incomparable.
4346 *
4347 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4348 *
4349 * Examples:
4350 *
4351 * 'foo'.casecmp('goo') # => -1
4352 * 'goo'.casecmp('foo') # => 1
4353 * 'foo'.casecmp('food') # => -1
4354 * 'food'.casecmp('foo') # => 1
4355 * 'FOO'.casecmp('foo') # => 0
4356 * 'foo'.casecmp('FOO') # => 0
4357 * 'foo'.casecmp(1) # => nil
4358 *
4359 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4360 */
4361
4362static VALUE
4363rb_str_casecmp(VALUE str1, VALUE str2)
4364{
4365 VALUE s = rb_check_string_type(str2);
4366 if (NIL_P(s)) {
4367 return Qnil;
4368 }
4369 return str_casecmp(str1, s);
4370}
4371
4372static VALUE
4373str_casecmp(VALUE str1, VALUE str2)
4374{
4375 long len;
4376 rb_encoding *enc;
4377 const char *p1, *p1end, *p2, *p2end;
4378
4379 enc = rb_enc_compatible(str1, str2);
4380 if (!enc) {
4381 return Qnil;
4382 }
4383
4384 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4385 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4386 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4387 while (p1 < p1end && p2 < p2end) {
4388 if (*p1 != *p2) {
4389 unsigned int c1 = TOLOWER(*p1 & 0xff);
4390 unsigned int c2 = TOLOWER(*p2 & 0xff);
4391 if (c1 != c2)
4392 return INT2FIX(c1 < c2 ? -1 : 1);
4393 }
4394 p1++;
4395 p2++;
4396 }
4397 }
4398 else {
4399 while (p1 < p1end && p2 < p2end) {
4400 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4401 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4402
4403 if (0 <= c1 && 0 <= c2) {
4404 c1 = TOLOWER(c1);
4405 c2 = TOLOWER(c2);
4406 if (c1 != c2)
4407 return INT2FIX(c1 < c2 ? -1 : 1);
4408 }
4409 else {
4410 int r;
4411 l1 = rb_enc_mbclen(p1, p1end, enc);
4412 l2 = rb_enc_mbclen(p2, p2end, enc);
4413 len = l1 < l2 ? l1 : l2;
4414 r = memcmp(p1, p2, len);
4415 if (r != 0)
4416 return INT2FIX(r < 0 ? -1 : 1);
4417 if (l1 != l2)
4418 return INT2FIX(l1 < l2 ? -1 : 1);
4419 }
4420 p1 += l1;
4421 p2 += l2;
4422 }
4423 }
4424 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4425 if (p1 == p1end) return INT2FIX(-1);
4426 return INT2FIX(1);
4427}
4428
4429/*
4430 * call-seq:
4431 * casecmp?(other_string) -> true, false, or nil
4432 *
4433 * Returns +true+ if +self+ and +other_string+ are equal after
4434 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4435 *
4436 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4437 *
4438 * Examples:
4439 *
4440 * 'foo'.casecmp?('goo') # => false
4441 * 'goo'.casecmp?('foo') # => false
4442 * 'foo'.casecmp?('food') # => false
4443 * 'food'.casecmp?('foo') # => false
4444 * 'FOO'.casecmp?('foo') # => true
4445 * 'foo'.casecmp?('FOO') # => true
4446 * 'foo'.casecmp?(1) # => nil
4447 *
4448 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4449 */
4450
4451static VALUE
4452rb_str_casecmp_p(VALUE str1, VALUE str2)
4453{
4454 VALUE s = rb_check_string_type(str2);
4455 if (NIL_P(s)) {
4456 return Qnil;
4457 }
4458 return str_casecmp_p(str1, s);
4459}
4460
4461static VALUE
4462str_casecmp_p(VALUE str1, VALUE str2)
4463{
4464 rb_encoding *enc;
4465 VALUE folded_str1, folded_str2;
4466 VALUE fold_opt = sym_fold;
4467
4468 enc = rb_enc_compatible(str1, str2);
4469 if (!enc) {
4470 return Qnil;
4471 }
4472
4473 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4474 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4475
4476 return rb_str_eql(folded_str1, folded_str2);
4477}
4478
4479static long
4480strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4481 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4482{
4483 const char *search_start = str_ptr;
4484 long pos, search_len = str_len - offset;
4485
4486 for (;;) {
4487 const char *t;
4488 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4489 if (pos < 0) return pos;
4490 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4491 if (t == search_start + pos) break;
4492 search_len -= t - search_start;
4493 if (search_len <= 0) return -1;
4494 offset += t - search_start;
4495 search_start = t;
4496 }
4497 return pos + offset;
4498}
4499
/*
 * Both return the found index in bytes.  The last argument of
 * rb_strseq_index selects how +offset+ is interpreted: 0 for a character
 * offset, 1 for a byte offset.
 */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4503
4504static long
4505rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4506{
4507 const char *str_ptr, *str_ptr_end, *sub_ptr;
4508 long str_len, sub_len;
4509 rb_encoding *enc;
4510
4511 enc = rb_enc_check(str, sub);
4512 if (is_broken_string(sub)) return -1;
4513
4514 str_ptr = RSTRING_PTR(str);
4515 str_ptr_end = RSTRING_END(str);
4516 str_len = RSTRING_LEN(str);
4517 sub_ptr = RSTRING_PTR(sub);
4518 sub_len = RSTRING_LEN(sub);
4519
4520 if (str_len < sub_len) return -1;
4521
4522 if (offset != 0) {
4523 long str_len_char, sub_len_char;
4524 int single_byte = single_byte_optimizable(str);
4525 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4526 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4527 if (offset < 0) {
4528 offset += str_len_char;
4529 if (offset < 0) return -1;
4530 }
4531 if (str_len_char - offset < sub_len_char) return -1;
4532 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4533 str_ptr += offset;
4534 }
4535 if (sub_len == 0) return offset;
4536
4537 /* need proceed one character at a time */
4538 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4539}
4540
4541
4542/*
4543 * call-seq:
4544 * index(pattern, offset = 0) -> integer or nil
4545 *
4546 * :include: doc/string/index.rdoc
4547 *
4548 */
4549
4550static VALUE
4551rb_str_index_m(int argc, VALUE *argv, VALUE str)
4552{
4553 VALUE sub;
4554 VALUE initpos;
4555 rb_encoding *enc = STR_ENC_GET(str);
4556 long pos;
4557
4558 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4559 long slen = str_strlen(str, enc); /* str's enc */
4560 pos = NUM2LONG(initpos);
4561 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4562 if (RB_TYPE_P(sub, T_REGEXP)) {
4564 }
4565 return Qnil;
4566 }
4567 }
4568 else {
4569 pos = 0;
4570 }
4571
4572 if (RB_TYPE_P(sub, T_REGEXP)) {
4573 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4574 enc, single_byte_optimizable(str));
4575
4576 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4577 VALUE match = rb_backref_get();
4578 struct re_registers *regs = RMATCH_REGS(match);
4579 pos = rb_str_sublen(str, BEG(0));
4580 return LONG2NUM(pos);
4581 }
4582 }
4583 else {
4584 StringValue(sub);
4585 pos = rb_str_index(str, sub, pos);
4586 if (pos >= 0) {
4587 pos = rb_str_sublen(str, pos);
4588 return LONG2NUM(pos);
4589 }
4590 }
4591 return Qnil;
4592}
4593
4594/* Ensure that the given pos is a valid character boundary.
4595 * Note that in this function, "character" means a code point
4596 * (Unicode scalar value), not a grapheme cluster.
4597 */
4598static void
4599str_ensure_byte_pos(VALUE str, long pos)
4600{
4601 if (!single_byte_optimizable(str)) {
4602 const char *s = RSTRING_PTR(str);
4603 const char *e = RSTRING_END(str);
4604 const char *p = s + pos;
4605 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4606 rb_raise(rb_eIndexError,
4607 "offset %ld does not land on character boundary", pos);
4608 }
4609 }
4610}
4611
4612/*
4613 * call-seq:
4614 * byteindex(object, offset = 0) -> integer or nil
4615 *
4616 * Returns the 0-based integer index of a substring of +self+
4617 * specified by +object+ (a string or Regexp) and +offset+,
4618 * or +nil+ if there is no such substring;
4619 * the returned index is the count of _bytes_ (not characters).
4620 *
4621 * When +object+ is a string,
4622 * returns the index of the first found substring equal to +object+:
4623 *
4624 * s = 'foo' # => "foo"
4625 * s.size # => 3 # Three 1-byte characters.
4626 * s.bytesize # => 3 # Three bytes.
4627 * s.byteindex('f') # => 0
4628 * s.byteindex('o') # => 1
4629 * s.byteindex('oo') # => 1
4630 * s.byteindex('ooo') # => nil
4631 *
4632 * When +object+ is a Regexp,
4633 * returns the index of the first found substring matching +object+;
4634 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4635 *
4636 * s = 'foo'
4637 * s.byteindex(/f/) # => 0
4638 * $~ # => #<MatchData "f">
4639 * s.byteindex(/o/) # => 1
4640 * s.byteindex(/oo/) # => 1
4641 * s.byteindex(/ooo/) # => nil
4642 * $~ # => nil
4643 *
4644 * \Integer argument +offset+, if given, specifies the 0-based index
4645 * of the byte where searching is to begin.
4646 *
4647 * When +offset+ is non-negative,
4648 * searching begins at byte position +offset+:
4649 *
4650 * s = 'foo'
4651 * s.byteindex('o', 1) # => 1
4652 * s.byteindex('o', 2) # => 2
4653 * s.byteindex('o', 3) # => nil
4654 *
4655 * When +offset+ is negative, counts backward from the end of +self+:
4656 *
4657 * s = 'foo'
4658 * s.byteindex('o', -1) # => 2
4659 * s.byteindex('o', -2) # => 1
4660 * s.byteindex('o', -3) # => 1
4661 * s.byteindex('o', -4) # => nil
4662 *
4663 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4664 *
4665 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4666 * s.size # => 2 # Two 3-byte characters.
4667 * s.bytesize # => 6 # Six bytes.
4668 * s.byteindex("\uFFFF") # => 0
4669 * s.byteindex("\uFFFF", 1) # Raises IndexError
4670 * s.byteindex("\uFFFF", 2) # Raises IndexError
4671 * s.byteindex("\uFFFF", 3) # => 3
4672 * s.byteindex("\uFFFF", 4) # Raises IndexError
4673 * s.byteindex("\uFFFF", 5) # Raises IndexError
4674 * s.byteindex("\uFFFF", 6) # => nil
4675 *
4676 * Related: see {Querying}[rdoc-ref:String@Querying].
4677 */
4678
4679static VALUE
4680rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4681{
4682 VALUE sub;
4683 VALUE initpos;
4684 long pos;
4685
4686 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4687 long slen = RSTRING_LEN(str);
4688 pos = NUM2LONG(initpos);
4689 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4690 if (RB_TYPE_P(sub, T_REGEXP)) {
4692 }
4693 return Qnil;
4694 }
4695 }
4696 else {
4697 pos = 0;
4698 }
4699
4700 str_ensure_byte_pos(str, pos);
4701
4702 if (RB_TYPE_P(sub, T_REGEXP)) {
4703 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4704 VALUE match = rb_backref_get();
4705 struct re_registers *regs = RMATCH_REGS(match);
4706 pos = BEG(0);
4707 return LONG2NUM(pos);
4708 }
4709 }
4710 else {
4711 StringValue(sub);
4712 pos = rb_str_byteindex(str, sub, pos);
4713 if (pos >= 0) return LONG2NUM(pos);
4714 }
4715 return Qnil;
4716}
4717
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform provides no memrchr().
 *
 * Scans the +search_len+ bytes starting at +search_str+ backwards and
 * returns a pointer to the last byte equal to +chr+, or NULL when no byte
 * matches.  As with memchr(), +chr+ is converted to unsigned char before
 * comparing, so a byte >= 0x80 passed as a negative char is still found.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
4730
4731static long
4732str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4733{
4734 char *hit, *adjusted;
4735 int c;
4736 long slen, searchlen;
4737 char *sbeg, *e, *t;
4738
4739 sbeg = RSTRING_PTR(str);
4740 slen = RSTRING_LEN(sub);
4741 if (slen == 0) return s - sbeg;
4742 e = RSTRING_END(str);
4743 t = RSTRING_PTR(sub);
4744 c = *t & 0xff;
4745 searchlen = s - sbeg + 1;
4746
4747 if (memcmp(s, t, slen) == 0) {
4748 return s - sbeg;
4749 }
4750
4751 do {
4752 hit = memrchr(sbeg, c, searchlen);
4753 if (!hit) break;
4754 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4755 if (hit != adjusted) {
4756 searchlen = adjusted - sbeg;
4757 continue;
4758 }
4759 if (memcmp(hit, t, slen) == 0)
4760 return hit - sbeg;
4761 searchlen = adjusted - sbeg;
4762 } while (searchlen > 0);
4763
4764 return -1;
4765}
4766
4767/* found index in byte */
4768static long
4769rb_str_rindex(VALUE str, VALUE sub, long pos)
4770{
4771 long len, slen;
4772 char *sbeg, *s;
4773 rb_encoding *enc;
4774 int singlebyte;
4775
4776 enc = rb_enc_check(str, sub);
4777 if (is_broken_string(sub)) return -1;
4778 singlebyte = single_byte_optimizable(str);
4779 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4780 slen = str_strlen(sub, enc); /* rb_enc_check */
4781
4782 /* substring longer than string */
4783 if (len < slen) return -1;
4784 if (len - pos < slen) pos = len - slen;
4785 if (len == 0) return pos;
4786
4787 sbeg = RSTRING_PTR(str);
4788
4789 if (pos == 0) {
4790 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4791 return 0;
4792 else
4793 return -1;
4794 }
4795
4796 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4797 return str_rindex(str, sub, s, enc);
4798}
4799
4800/*
4801 * call-seq:
4802 * rindex(pattern, offset = self.length) -> integer or nil
4803 *
4804 * :include:doc/string/rindex.rdoc
4805 *
4806 */
4807
4808static VALUE
4809rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4810{
4811 VALUE sub;
4812 VALUE initpos;
4813 rb_encoding *enc = STR_ENC_GET(str);
4814 long pos, len = str_strlen(str, enc); /* str's enc */
4815
4816 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4817 pos = NUM2LONG(initpos);
4818 if (pos < 0 && (pos += len) < 0) {
4819 if (RB_TYPE_P(sub, T_REGEXP)) {
4821 }
4822 return Qnil;
4823 }
4824 if (pos > len) pos = len;
4825 }
4826 else {
4827 pos = len;
4828 }
4829
4830 if (RB_TYPE_P(sub, T_REGEXP)) {
4831 /* enc = rb_enc_check(str, sub); */
4832 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4833 enc, single_byte_optimizable(str));
4834
4835 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4836 VALUE match = rb_backref_get();
4837 struct re_registers *regs = RMATCH_REGS(match);
4838 pos = rb_str_sublen(str, BEG(0));
4839 return LONG2NUM(pos);
4840 }
4841 }
4842 else {
4843 StringValue(sub);
4844 pos = rb_str_rindex(str, sub, pos);
4845 if (pos >= 0) {
4846 pos = rb_str_sublen(str, pos);
4847 return LONG2NUM(pos);
4848 }
4849 }
4850 return Qnil;
4851}
4852
4853static long
4854rb_str_byterindex(VALUE str, VALUE sub, long pos)
4855{
4856 long len, slen;
4857 char *sbeg, *s;
4858 rb_encoding *enc;
4859
4860 enc = rb_enc_check(str, sub);
4861 if (is_broken_string(sub)) return -1;
4862 len = RSTRING_LEN(str);
4863 slen = RSTRING_LEN(sub);
4864
4865 /* substring longer than string */
4866 if (len < slen) return -1;
4867 if (len - pos < slen) pos = len - slen;
4868 if (len == 0) return pos;
4869
4870 sbeg = RSTRING_PTR(str);
4871
4872 if (pos == 0) {
4873 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4874 return 0;
4875 else
4876 return -1;
4877 }
4878
4879 s = sbeg + pos;
4880 return str_rindex(str, sub, s, enc);
4881}
4882
4883/*
4884 * call-seq:
4885 * byterindex(object, offset = self.bytesize) -> integer or nil
4886 *
4887 * Returns the 0-based integer index of a substring of +self+
4888 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4889 * or +nil+ if there is no such substring;
4890 * the returned index is the count of _bytes_ (not characters).
4891 *
4892 * When +object+ is a string,
4893 * returns the index of the _last_ found substring equal to +object+:
4894 *
4895 * s = 'foo' # => "foo"
4896 * s.size # => 3 # Three 1-byte characters.
4897 * s.bytesize # => 3 # Three bytes.
4898 * s.byterindex('f') # => 0
4899 * s.byterindex('o') # => 2
4900 * s.byterindex('oo') # => 1
4901 * s.byterindex('ooo') # => nil
4902 *
4903 * When +object+ is a Regexp,
4904 * returns the index of the last found substring matching +object+;
4905 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4906 *
4907 * s = 'foo'
4908 * s.byterindex(/f/) # => 0
4909 * $~ # => #<MatchData "f">
4910 * s.byterindex(/o/) # => 2
4911 * s.byterindex(/oo/) # => 1
4912 * s.byterindex(/ooo/) # => nil
4913 * $~ # => nil
4914 *
4915 * The last match means starting at the possible last position,
4916 * not the last of the longest matches:
4917 *
4918 * s = 'foo'
4919 * s.byterindex(/o+/) # => 2
4920 * $~ #=> #<MatchData "o">
4921 *
4922 * To get the last longest match, use a negative lookbehind:
4923 *
4924 * s = 'foo'
4925 * s.byterindex(/(?<!o)o+/) # => 1
4926 * $~ # => #<MatchData "oo">
4927 *
4928 * Or use method #byteindex with negative lookahead:
4929 *
4930 * s = 'foo'
4931 * s.byteindex(/o+(?!.*o)/) # => 1
4932 * $~ #=> #<MatchData "oo">
4933 *
4934 * \Integer argument +offset+, if given, specifies the 0-based index
4935 * of the byte where searching is to end.
4936 *
4937 * When +offset+ is non-negative,
4938 * searching ends at byte position +offset+:
4939 *
4940 * s = 'foo'
4941 * s.byterindex('o', 0) # => nil
4942 * s.byterindex('o', 1) # => 1
4943 * s.byterindex('o', 2) # => 2
4944 * s.byterindex('o', 3) # => 2
4945 *
4946 * When +offset+ is negative, counts backward from the end of +self+:
4947 *
4948 * s = 'foo'
4949 * s.byterindex('o', -1) # => 2
4950 * s.byterindex('o', -2) # => 1
4951 * s.byterindex('o', -3) # => nil
4952 *
4953 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4954 *
4955 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4956 * s.size # => 2 # Two 3-byte characters.
4957 * s.bytesize # => 6 # Six bytes.
4958 * s.byterindex("\uFFFF") # => 3
4959 * s.byterindex("\uFFFF", 1) # Raises IndexError
4960 * s.byterindex("\uFFFF", 2) # Raises IndexError
4961 * s.byterindex("\uFFFF", 3) # => 3
4962 * s.byterindex("\uFFFF", 4) # Raises IndexError
4963 * s.byterindex("\uFFFF", 5) # Raises IndexError
4964 * s.byterindex("\uFFFF", 6) # => nil
4965 *
4966 * Related: see {Querying}[rdoc-ref:String@Querying].
4967 */
4968
4969static VALUE
4970rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4971{
4972 VALUE sub;
4973 VALUE initpos;
4974 long pos, len = RSTRING_LEN(str);
4975
4976 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4977 pos = NUM2LONG(initpos);
4978 if (pos < 0 && (pos += len) < 0) {
4979 if (RB_TYPE_P(sub, T_REGEXP)) {
4981 }
4982 return Qnil;
4983 }
4984 if (pos > len) pos = len;
4985 }
4986 else {
4987 pos = len;
4988 }
4989
4990 str_ensure_byte_pos(str, pos);
4991
4992 if (RB_TYPE_P(sub, T_REGEXP)) {
4993 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4994 VALUE match = rb_backref_get();
4995 struct re_registers *regs = RMATCH_REGS(match);
4996 pos = BEG(0);
4997 return LONG2NUM(pos);
4998 }
4999 }
5000 else {
5001 StringValue(sub);
5002 pos = rb_str_byterindex(str, sub, pos);
5003 if (pos >= 0) return LONG2NUM(pos);
5004 }
5005 return Qnil;
5006}
5007
5008/*
5009 * call-seq:
5010 * self =~ object -> integer or nil
5011 *
5012 * When +object+ is a Regexp, returns the index of the first substring in +self+
5013 * matched by +object+,
5014 * or +nil+ if no match is found;
5015 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5016 *
5017 * 'foo' =~ /f/ # => 0
5018 * $~ # => #<MatchData "f">
5019 * 'foo' =~ /o/ # => 1
5020 * $~ # => #<MatchData "o">
5021 * 'foo' =~ /x/ # => nil
5022 * $~ # => nil
5023 *
5024 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5025 * (see Regexp#=~):
5026 *
5027 * number = nil
5028 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5029 * number # => nil # Not assigned.
5030 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5031 * number # => "9" # Assigned.
5032 *
5033 * If +object+ is not a Regexp, returns the value
5034 * returned by <tt>object =~ self</tt>.
5035 *
5036 * Related: see {Querying}[rdoc-ref:String@Querying].
5037 */
5038
5039static VALUE
5040rb_str_match(VALUE x, VALUE y)
5041{
5042 switch (OBJ_BUILTIN_TYPE(y)) {
5043 case T_STRING:
5044 rb_raise(rb_eTypeError, "type mismatch: String given");
5045
5046 case T_REGEXP:
5047 return rb_reg_match(y, x);
5048
5049 default:
5050 return rb_funcall(y, idEqTilde, 1, x);
5051 }
5052}
5053
5054
5055static VALUE get_pat(VALUE);
5056
5057
5058/*
5059 * call-seq:
5060 * match(pattern, offset = 0) -> matchdata or nil
5061 * match(pattern, offset = 0) {|matchdata| ... } -> object
5062 *
5063 * Creates a MatchData object based on +self+ and the given arguments;
5064 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5065 *
5066 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5067 *
5068 * regexp = Regexp.new(pattern)
5069 *
5070 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5071 * (see Regexp#match):
5072 *
5073 * matchdata = regexp.match(self[offset..])
5074 *
5075 * With no block given, returns the computed +matchdata+ or +nil+:
5076 *
5077 * 'foo'.match('f') # => #<MatchData "f">
5078 * 'foo'.match('o') # => #<MatchData "o">
5079 * 'foo'.match('x') # => nil
5080 * 'foo'.match('f', 1) # => nil
5081 * 'foo'.match('o', 1) # => #<MatchData "o">
5082 *
5083 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5084 * returns the block's return value:
5085 *
5086 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5087 *
5088 * With a block given and +nil+ +matchdata+, does not call the block:
5089 *
5090 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5091 *
5092 * Related: see {Querying}[rdoc-ref:String@Querying].
5093 */
5094
5095static VALUE
5096rb_str_match_m(int argc, VALUE *argv, VALUE str)
5097{
5098 VALUE re, result;
5099 if (argc < 1)
5100 rb_check_arity(argc, 1, 2);
5101 re = argv[0];
5102 argv[0] = str;
5103 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5104 if (!NIL_P(result) && rb_block_given_p()) {
5105 return rb_yield(result);
5106 }
5107 return result;
5108}
5109
5110/*
5111 * call-seq:
5112 * match?(pattern, offset = 0) -> true or false
5113 *
5114 * Returns whether a match is found for +self+ and the given arguments;
5115 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5116 *
5117 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5118 *
5119 * regexp = Regexp.new(pattern)
5120 *
5121 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5122 * +false+ otherwise:
5123 *
5124 * 'foo'.match?(/o/) # => true
5125 * 'foo'.match?('o') # => true
5126 * 'foo'.match?(/x/) # => false
5127 * 'foo'.match?('f', 1) # => false
5128 * 'foo'.match?('o', 1) # => true
5129 *
5130 * Related: see {Querying}[rdoc-ref:String@Querying].
5131 */
5132
5133static VALUE
5134rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5135{
5136 VALUE re;
5137 rb_check_arity(argc, 1, 2);
5138 re = get_pat(argv[0]);
5139 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5140}
5141
/* Outcome of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no usable neighbor: invalid or unrepresentable character */
    NEIGHBOR_FOUND,     /* the character was replaced by its neighbor in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the available range */
};
5147
5148static enum neighbor_char
5149enc_succ_char(char *p, long len, rb_encoding *enc)
5150{
5151 long i;
5152 int l;
5153
5154 if (rb_enc_mbminlen(enc) > 1) {
5155 /* wchar, trivial case */
5156 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5157 if (!MBCLEN_CHARFOUND_P(r)) {
5158 return NEIGHBOR_NOT_CHAR;
5159 }
5160 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5161 l = rb_enc_code_to_mbclen(c, enc);
5162 if (!l) return NEIGHBOR_NOT_CHAR;
5163 if (l != len) return NEIGHBOR_WRAPPED;
5164 rb_enc_mbcput(c, p, enc);
5165 r = rb_enc_precise_mbclen(p, p + len, enc);
5166 if (!MBCLEN_CHARFOUND_P(r)) {
5167 return NEIGHBOR_NOT_CHAR;
5168 }
5169 return NEIGHBOR_FOUND;
5170 }
5171 while (1) {
5172 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5173 p[i] = '\0';
5174 if (i < 0)
5175 return NEIGHBOR_WRAPPED;
5176 ++((unsigned char*)p)[i];
5177 l = rb_enc_precise_mbclen(p, p+len, enc);
5178 if (MBCLEN_CHARFOUND_P(l)) {
5179 l = MBCLEN_CHARFOUND_LEN(l);
5180 if (l == len) {
5181 return NEIGHBOR_FOUND;
5182 }
5183 else {
5184 memset(p+l, 0xff, len-l);
5185 }
5186 }
5187 if (MBCLEN_INVALID_P(l) && i < len-1) {
5188 long len2;
5189 int l2;
5190 for (len2 = len-1; 0 < len2; len2--) {
5191 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5192 if (!MBCLEN_INVALID_P(l2))
5193 break;
5194 }
5195 memset(p+len2+1, 0xff, len-(len2+1));
5196 }
5197 }
5198}
5199
5200static enum neighbor_char
5201enc_pred_char(char *p, long len, rb_encoding *enc)
5202{
5203 long i;
5204 int l;
5205 if (rb_enc_mbminlen(enc) > 1) {
5206 /* wchar, trivial case */
5207 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5208 if (!MBCLEN_CHARFOUND_P(r)) {
5209 return NEIGHBOR_NOT_CHAR;
5210 }
5211 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5212 if (!c) return NEIGHBOR_NOT_CHAR;
5213 --c;
5214 l = rb_enc_code_to_mbclen(c, enc);
5215 if (!l) return NEIGHBOR_NOT_CHAR;
5216 if (l != len) return NEIGHBOR_WRAPPED;
5217 rb_enc_mbcput(c, p, enc);
5218 r = rb_enc_precise_mbclen(p, p + len, enc);
5219 if (!MBCLEN_CHARFOUND_P(r)) {
5220 return NEIGHBOR_NOT_CHAR;
5221 }
5222 return NEIGHBOR_FOUND;
5223 }
5224 while (1) {
5225 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5226 p[i] = '\xff';
5227 if (i < 0)
5228 return NEIGHBOR_WRAPPED;
5229 --((unsigned char*)p)[i];
5230 l = rb_enc_precise_mbclen(p, p+len, enc);
5231 if (MBCLEN_CHARFOUND_P(l)) {
5232 l = MBCLEN_CHARFOUND_LEN(l);
5233 if (l == len) {
5234 return NEIGHBOR_FOUND;
5235 }
5236 else {
5237 memset(p+l, 0, len-l);
5238 }
5239 }
5240 if (MBCLEN_INVALID_P(l) && i < len-1) {
5241 long len2;
5242 int l2;
5243 for (len2 = len-1; 0 < len2; len2--) {
5244 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5245 if (!MBCLEN_INVALID_P(l2))
5246 break;
5247 }
5248 memset(p+len2+1, 0, len-(len2+1));
5249 }
5250 }
5251}
5252
5253/*
5254 overwrite +p+ by succeeding letter in +enc+ and returns
5255 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5256 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5257 assuming each ranges are successive, and mbclen
5258 never change in each ranges.
5259 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5260 character.
5261 */
5262static enum neighbor_char
5263enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5264{
5265 enum neighbor_char ret;
5266 unsigned int c;
5267 int ctype;
5268 int range;
5269 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5270
5271 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5272 int try;
5273 const int max_gaps = 1;
5274
5275 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5276 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5277 ctype = ONIGENC_CTYPE_DIGIT;
5278 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5279 ctype = ONIGENC_CTYPE_ALPHA;
5280 else
5281 return NEIGHBOR_NOT_CHAR;
5282
5283 MEMCPY(save, p, char, len);
5284 for (try = 0; try <= max_gaps; ++try) {
5285 ret = enc_succ_char(p, len, enc);
5286 if (ret == NEIGHBOR_FOUND) {
5287 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5288 if (rb_enc_isctype(c, ctype, enc))
5289 return NEIGHBOR_FOUND;
5290 }
5291 }
5292 MEMCPY(p, save, char, len);
5293 range = 1;
5294 while (1) {
5295 MEMCPY(save, p, char, len);
5296 ret = enc_pred_char(p, len, enc);
5297 if (ret == NEIGHBOR_FOUND) {
5298 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5299 if (!rb_enc_isctype(c, ctype, enc)) {
5300 MEMCPY(p, save, char, len);
5301 break;
5302 }
5303 }
5304 else {
5305 MEMCPY(p, save, char, len);
5306 break;
5307 }
5308 range++;
5309 }
5310 if (range == 1) {
5311 return NEIGHBOR_NOT_CHAR;
5312 }
5313
5314 if (ctype != ONIGENC_CTYPE_DIGIT) {
5315 MEMCPY(carry, p, char, len);
5316 return NEIGHBOR_WRAPPED;
5317 }
5318
5319 MEMCPY(carry, p, char, len);
5320 enc_succ_char(carry, len, enc);
5321 return NEIGHBOR_WRAPPED;
5322}
5323
5324
5325static VALUE str_succ(VALUE str);
5326
5327/*
5328 * call-seq:
5329 * succ -> new_str
5330 *
5331 * :include: doc/string/succ.rdoc
5332 *
5333 */
5334
5335VALUE
5337{
5338 VALUE str;
5339 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5340 rb_enc_cr_str_copy_for_substr(str, orig);
5341 return str_succ(str);
5342}
5343
5344static VALUE
5345str_succ(VALUE str)
5346{
5347 rb_encoding *enc;
5348 char *sbeg, *s, *e, *last_alnum = 0;
5349 int found_alnum = 0;
5350 long l, slen;
5351 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5352 long carry_pos = 0, carry_len = 1;
5353 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5354
5355 slen = RSTRING_LEN(str);
5356 if (slen == 0) return str;
5357
5358 enc = STR_ENC_GET(str);
5359 sbeg = RSTRING_PTR(str);
5360 s = e = sbeg + slen;
5361
5362 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5363 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5364 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5365 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5366 break;
5367 }
5368 }
5369 l = rb_enc_precise_mbclen(s, e, enc);
5370 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5371 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5372 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5373 switch (neighbor) {
5374 case NEIGHBOR_NOT_CHAR:
5375 continue;
5376 case NEIGHBOR_FOUND:
5377 return str;
5378 case NEIGHBOR_WRAPPED:
5379 last_alnum = s;
5380 break;
5381 }
5382 found_alnum = 1;
5383 carry_pos = s - sbeg;
5384 carry_len = l;
5385 }
5386 if (!found_alnum) { /* str contains no alnum */
5387 s = e;
5388 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5389 enum neighbor_char neighbor;
5390 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5391 l = rb_enc_precise_mbclen(s, e, enc);
5392 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5393 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5394 MEMCPY(tmp, s, char, l);
5395 neighbor = enc_succ_char(tmp, l, enc);
5396 switch (neighbor) {
5397 case NEIGHBOR_FOUND:
5398 MEMCPY(s, tmp, char, l);
5399 return str;
5400 break;
5401 case NEIGHBOR_WRAPPED:
5402 MEMCPY(s, tmp, char, l);
5403 break;
5404 case NEIGHBOR_NOT_CHAR:
5405 break;
5406 }
5407 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5408 /* wrapped to \0...\0. search next valid char. */
5409 enc_succ_char(s, l, enc);
5410 }
5411 if (!rb_enc_asciicompat(enc)) {
5412 MEMCPY(carry, s, char, l);
5413 carry_len = l;
5414 }
5415 carry_pos = s - sbeg;
5416 }
5418 }
5419 RESIZE_CAPA(str, slen + carry_len);
5420 sbeg = RSTRING_PTR(str);
5421 s = sbeg + carry_pos;
5422 memmove(s + carry_len, s, slen - carry_pos);
5423 memmove(s, carry, carry_len);
5424 slen += carry_len;
5425 STR_SET_LEN(str, slen);
5426 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5428 return str;
5429}
5430
5431
5432/*
5433 * call-seq:
5434 * succ! -> self
5435 *
5436 * Like String#succ, but modifies +self+ in place; returns +self+.
5437 *
5438 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5439 */
5440
5441static VALUE
5442rb_str_succ_bang(VALUE str)
5443{
5444 rb_str_modify(str);
5445 str_succ(str);
5446 return str;
5447}
5448
/* Returns 1 when each of the first +len+ bytes of +s+ satisfies ISDIGIT
 * (also when +len+ is not positive), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5458
5459static int
5460str_upto_i(VALUE str, VALUE arg)
5461{
5462 rb_yield(str);
5463 return 0;
5464}
5465
5466/*
5467 * call-seq:
5468 * upto(other_string, exclusive = false) {|string| ... } -> self
5469 * upto(other_string, exclusive = false) -> new_enumerator
5470 *
5471 * :include: doc/string/upto.rdoc
5472 *
5473 */
5474
5475static VALUE
5476rb_str_upto(int argc, VALUE *argv, VALUE beg)
5477{
5478 VALUE end, exclusive;
5479
5480 rb_scan_args(argc, argv, "11", &end, &exclusive);
5481 RETURN_ENUMERATOR(beg, argc, argv);
5482 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5483}
5484
5485VALUE
5486rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5487{
5488 VALUE current, after_end;
5489 ID succ;
5490 int n, ascii;
5491 rb_encoding *enc;
5492
5493 CONST_ID(succ, "succ");
5494 StringValue(end);
5495 enc = rb_enc_check(beg, end);
5496 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5497 /* single character */
5498 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5499 char c = RSTRING_PTR(beg)[0];
5500 char e = RSTRING_PTR(end)[0];
5501
5502 if (c > e || (excl && c == e)) return beg;
5503 for (;;) {
5504 VALUE str = rb_enc_str_new(&c, 1, enc);
5506 if ((*each)(str, arg)) break;
5507 if (!excl && c == e) break;
5508 c++;
5509 if (excl && c == e) break;
5510 }
5511 return beg;
5512 }
5513 /* both edges are all digits */
5514 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5515 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5516 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5517 VALUE b, e;
5518 int width;
5519
5520 width = RSTRING_LENINT(beg);
5521 b = rb_str_to_inum(beg, 10, FALSE);
5522 e = rb_str_to_inum(end, 10, FALSE);
5523 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5524 long bi = FIX2LONG(b);
5525 long ei = FIX2LONG(e);
5526 rb_encoding *usascii = rb_usascii_encoding();
5527
5528 while (bi <= ei) {
5529 if (excl && bi == ei) break;
5530 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5531 bi++;
5532 }
5533 }
5534 else {
5535 ID op = excl ? '<' : idLE;
5536 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5537
5538 args[0] = INT2FIX(width);
5539 while (rb_funcall(b, op, 1, e)) {
5540 args[1] = b;
5541 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5542 b = rb_funcallv(b, succ, 0, 0);
5543 }
5544 }
5545 return beg;
5546 }
5547 /* normal case */
5548 n = rb_str_cmp(beg, end);
5549 if (n > 0 || (excl && n == 0)) return beg;
5550
5551 after_end = rb_funcallv(end, succ, 0, 0);
5552 current = str_duplicate(rb_cString, beg);
5553 while (!rb_str_equal(current, after_end)) {
5554 VALUE next = Qnil;
5555 if (excl || !rb_str_equal(current, end))
5556 next = rb_funcallv(current, succ, 0, 0);
5557 if ((*each)(current, arg)) break;
5558 if (NIL_P(next)) break;
5559 current = next;
5560 StringValue(current);
5561 if (excl && rb_str_equal(current, end)) break;
5562 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5563 break;
5564 }
5565
5566 return beg;
5567}
5568
5569VALUE
5570rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5571{
5572 VALUE current;
5573 ID succ;
5574
5575 CONST_ID(succ, "succ");
5576 /* both edges are all digits */
5577 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5578 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5579 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5580 int width = RSTRING_LENINT(beg);
5581 b = rb_str_to_inum(beg, 10, FALSE);
5582 if (FIXNUM_P(b)) {
5583 long bi = FIX2LONG(b);
5584 rb_encoding *usascii = rb_usascii_encoding();
5585
5586 while (FIXABLE(bi)) {
5587 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5588 bi++;
5589 }
5590 b = LONG2NUM(bi);
5591 }
5592 args[0] = INT2FIX(width);
5593 while (1) {
5594 args[1] = b;
5595 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5596 b = rb_funcallv(b, succ, 0, 0);
5597 }
5598 }
5599 /* normal case */
5600 current = str_duplicate(rb_cString, beg);
5601 while (1) {
5602 VALUE next = rb_funcallv(current, succ, 0, 0);
5603 if ((*each)(current, arg)) break;
5604 current = next;
5605 StringValue(current);
5606 if (RSTRING_LEN(current) == 0)
5607 break;
5608 }
5609
5610 return beg;
5611}
5612
5613static int
5614include_range_i(VALUE str, VALUE arg)
5615{
5616 VALUE *argp = (VALUE *)arg;
5617 if (!rb_equal(str, *argp)) return 0;
5618 *argp = Qnil;
5619 return 1;
5620}
5621
5622VALUE
5623rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5624{
5625 beg = rb_str_new_frozen(beg);
5626 StringValue(end);
5627 end = rb_str_new_frozen(end);
5628 if (NIL_P(val)) return Qfalse;
5629 val = rb_check_string_type(val);
5630 if (NIL_P(val)) return Qfalse;
5631 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5632 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5633 rb_enc_asciicompat(STR_ENC_GET(val))) {
5634 const char *bp = RSTRING_PTR(beg);
5635 const char *ep = RSTRING_PTR(end);
5636 const char *vp = RSTRING_PTR(val);
5637 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5638 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5639 return Qfalse;
5640 else {
5641 char b = *bp;
5642 char e = *ep;
5643 char v = *vp;
5644
5645 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5646 if (b <= v && v < e) return Qtrue;
5647 return RBOOL(!RTEST(exclusive) && v == e);
5648 }
5649 }
5650 }
5651#if 0
5652 /* both edges are all digits */
5653 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5654 all_digits_p(bp, RSTRING_LEN(beg)) &&
5655 all_digits_p(ep, RSTRING_LEN(end))) {
5656 /* TODO */
5657 }
5658#endif
5659 }
5660 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5661
5662 return RBOOL(NIL_P(val));
5663}
5664
5665static VALUE
5666rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5667{
5668 if (rb_reg_search(re, str, 0, 0) >= 0) {
5669 VALUE match = rb_backref_get();
5670 int nth = rb_reg_backref_number(match, backref);
5671 return rb_reg_nth_match(nth, match);
5672 }
5673 return Qnil;
5674}
5675
5676static VALUE
5677rb_str_aref(VALUE str, VALUE indx)
5678{
5679 long idx;
5680
5681 if (FIXNUM_P(indx)) {
5682 idx = FIX2LONG(indx);
5683 }
5684 else if (RB_TYPE_P(indx, T_REGEXP)) {
5685 return rb_str_subpat(str, indx, INT2FIX(0));
5686 }
5687 else if (RB_TYPE_P(indx, T_STRING)) {
5688 if (rb_str_index(str, indx, 0) != -1)
5689 return str_duplicate(rb_cString, indx);
5690 return Qnil;
5691 }
5692 else {
5693 /* check if indx is Range */
5694 long beg, len = str_strlen(str, NULL);
5695 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5696 case Qfalse:
5697 break;
5698 case Qnil:
5699 return Qnil;
5700 default:
5701 return rb_str_substr(str, beg, len);
5702 }
5703 idx = NUM2LONG(indx);
5704 }
5705
5706 return str_substr(str, idx, 1, FALSE);
5707}
5708
5709
5710/*
5711 * call-seq:
5712 * self[index] -> new_string or nil
5713 * self[start, length] -> new_string or nil
5714 * self[range] -> new_string or nil
5715 * self[regexp, capture = 0] -> new_string or nil
5716 * self[substring] -> new_string or nil
5717 *
5718 * :include: doc/string/aref.rdoc
5719 *
5720 */
5721
5722static VALUE
5723rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5724{
5725 if (argc == 2) {
5726 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5727 return rb_str_subpat(str, argv[0], argv[1]);
5728 }
5729 else {
5730 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5731 }
5732 }
5733 rb_check_arity(argc, 1, 2);
5734 return rb_str_aref(str, argv[0]);
5735}
5736
5737VALUE
5739{
5740 char *ptr = RSTRING_PTR(str);
5741 long olen = RSTRING_LEN(str), nlen;
5742
5743 str_modifiable(str);
5744 if (len > olen) len = olen;
5745 nlen = olen - len;
5746 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5747 char *oldptr = ptr;
5748 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5749 STR_SET_EMBED(str);
5750 ptr = RSTRING(str)->as.embed.ary;
5751 memmove(ptr, oldptr + len, nlen);
5752 if (fl == STR_NOEMBED) xfree(oldptr);
5753 }
5754 else {
5755 if (!STR_SHARED_P(str)) {
5756 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5757 rb_enc_cr_str_exact_copy(shared, str);
5758 OBJ_FREEZE(shared);
5759 }
5760 ptr = RSTRING(str)->as.heap.ptr += len;
5761 }
5762 STR_SET_LEN(str, nlen);
5763
5764 if (!SHARABLE_MIDDLE_SUBSTRING) {
5765 TERM_FILL(ptr + nlen, TERM_LEN(str));
5766 }
5768 return str;
5769}
5770
5771static void
5772rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5773{
5774 char *sptr;
5775 long slen;
5776 int cr;
5777
5778 if (beg == 0 && vlen == 0) {
5779 rb_str_drop_bytes(str, len);
5780 return;
5781 }
5782
5783 str_modify_keep_cr(str);
5784 RSTRING_GETMEM(str, sptr, slen);
5785 if (len < vlen) {
5786 /* expand string */
5787 RESIZE_CAPA(str, slen + vlen - len);
5788 sptr = RSTRING_PTR(str);
5789 }
5790
5792 cr = rb_enc_str_coderange(val);
5793 else
5795
5796 if (vlen != len) {
5797 memmove(sptr + beg + vlen,
5798 sptr + beg + len,
5799 slen - (beg + len));
5800 }
5801 if (vlen < beg && len < 0) {
5802 MEMZERO(sptr + slen, char, -len);
5803 }
5804 if (vlen > 0) {
5805 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5806 }
5807 slen += vlen - len;
5808 STR_SET_LEN(str, slen);
5809 TERM_FILL(&sptr[slen], TERM_LEN(str));
5810 ENC_CODERANGE_SET(str, cr);
5811}
5812
5813static inline void
5814rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5815{
5816 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5817}
5818
5819void
5820rb_str_update(VALUE str, long beg, long len, VALUE val)
5821{
5822 long slen;
5823 char *p, *e;
5824 rb_encoding *enc;
5825 int singlebyte = single_byte_optimizable(str);
5826 int cr;
5827
5828 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5829
5830 StringValue(val);
5831 enc = rb_enc_check(str, val);
5832 slen = str_strlen(str, enc); /* rb_enc_check */
5833
5834 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5835 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5836 }
5837 if (beg < 0) {
5838 beg += slen;
5839 }
5840 RUBY_ASSERT(beg >= 0);
5841 RUBY_ASSERT(beg <= slen);
5842
5843 if (len > slen - beg) {
5844 len = slen - beg;
5845 }
5846 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5847 if (!p) p = RSTRING_END(str);
5848 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5849 if (!e) e = RSTRING_END(str);
5850 /* error check */
5851 beg = p - RSTRING_PTR(str); /* physical position */
5852 len = e - p; /* physical length */
5853 rb_str_update_0(str, beg, len, val);
5854 rb_enc_associate(str, enc);
5856 if (cr != ENC_CODERANGE_BROKEN)
5857 ENC_CODERANGE_SET(str, cr);
5858}
5859
5860static void
5861rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5862{
5863 int nth;
5864 VALUE match;
5865 long start, end, len;
5866 rb_encoding *enc;
5867 struct re_registers *regs;
5868
5869 if (rb_reg_search(re, str, 0, 0) < 0) {
5870 rb_raise(rb_eIndexError, "regexp not matched");
5871 }
5872 match = rb_backref_get();
5873 nth = rb_reg_backref_number(match, backref);
5874 regs = RMATCH_REGS(match);
5875 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5876 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5877 }
5878 if (nth < 0) {
5879 nth += regs->num_regs;
5880 }
5881
5882 start = BEG(nth);
5883 if (start == -1) {
5884 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5885 }
5886 end = END(nth);
5887 len = end - start;
5888 StringValue(val);
5889 enc = rb_enc_check_str(str, val);
5890 rb_str_update_0(str, start, len, val);
5891 rb_enc_associate(str, enc);
5892}
5893
5894static VALUE
5895rb_str_aset(VALUE str, VALUE indx, VALUE val)
5896{
5897 long idx, beg;
5898
5899 switch (TYPE(indx)) {
5900 case T_REGEXP:
5901 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5902 return val;
5903
5904 case T_STRING:
5905 beg = rb_str_index(str, indx, 0);
5906 if (beg < 0) {
5907 rb_raise(rb_eIndexError, "string not matched");
5908 }
5909 beg = rb_str_sublen(str, beg);
5910 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5911 return val;
5912
5913 default:
5914 /* check if indx is Range */
5915 {
5916 long beg, len;
5917 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5918 rb_str_update(str, beg, len, val);
5919 return val;
5920 }
5921 }
5922 /* FALLTHROUGH */
5923
5924 case T_FIXNUM:
5925 idx = NUM2LONG(indx);
5926 rb_str_update(str, idx, 1, val);
5927 return val;
5928 }
5929}
5930
5931/*
5932 * call-seq:
5933 * self[index] = other_string -> new_string
5934 * self[start, length] = other_string -> new_string
5935 * self[range] = other_string -> new_string
5936 * self[regexp, capture = 0] = other_string -> new_string
5937 * self[substring] = other_string -> new_string
5938 *
5939 * :include: doc/string/aset.rdoc
5940 *
5941 */
5942
5943static VALUE
5944rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5945{
5946 if (argc == 3) {
5947 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5948 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5949 }
5950 else {
5951 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5952 }
5953 return argv[2];
5954 }
5955 rb_check_arity(argc, 2, 3);
5956 return rb_str_aset(str, argv[0], argv[1]);
5957}
5958
5959/*
5960 * call-seq:
5961 * insert(offset, other_string) -> self
5962 *
5963 * :include: doc/string/insert.rdoc
5964 *
5965 */
5966
5967static VALUE
5968rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5969{
5970 long pos = NUM2LONG(idx);
5971
5972 if (pos == -1) {
5973 return rb_str_append(str, str2);
5974 }
5975 else if (pos < 0) {
5976 pos++;
5977 }
5978 rb_str_update(str, pos, 0, str2);
5979 return str;
5980}
5981
5982
5983/*
5984 * call-seq:
5985 * slice!(index) -> new_string or nil
5986 * slice!(start, length) -> new_string or nil
5987 * slice!(range) -> new_string or nil
5988 * slice!(regexp, capture = 0) -> new_string or nil
5989 * slice!(substring) -> new_string or nil
5990 *
5991 * Like String#[] (and its alias String#slice), except that:
5992 *
5993 * - Performs substitutions in +self+ (not in a copy of +self+).
5994 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5995 *
5996 * A few examples:
5997 *
5998 * s = 'hello'
5999 * s.slice!('e') # => "e"
6000 * s # => "hllo"
6001 * s.slice!('e') # => nil
6002 * s # => "hllo"
6003 *
6004 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6005 */
6006
6007static VALUE
6008rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6009{
 /*
  * Implementation of String#slice!.  Every argument form below ends up
  * with a byte offset `beg` and a byte length `len` describing the region
  * to remove, then jumps into a shared tail:
  *
  *   num_index: beg/len are passed through rb_str_subpos() and beg is
  *              recomputed as a byte offset from the returned pointer
  *   subseq:    copy the region into `result`
  *   squash:    `result` is already set; delete the region from str
  *
  * Returns the removed text, or Qnil when nothing was selected.
  */
6010 VALUE result = Qnil;
6011 VALUE indx;
6012 long beg, len = 1;
6013 char *p;
6014
6015 rb_check_arity(argc, 1, 2);
6016 str_modify_keep_cr(str);
6017 indx = argv[0];
6018 if (RB_TYPE_P(indx, T_REGEXP)) {
 /* slice!(regexp, capture = 0): remove the text of one match group. */
6019 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6020 VALUE match = rb_backref_get();
6021 struct re_registers *regs = RMATCH_REGS(match);
6022 int nth = 0;
6023 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
 /* A negative group number counts back from regs->num_regs. */
6024 if ((nth += regs->num_regs) <= 0) return Qnil;
6025 }
6026 else if (nth >= regs->num_regs) return Qnil;
6027 beg = BEG(nth);
6028 len = END(nth) - beg;
6029 goto subseq;
6030 }
6031 else if (argc == 2) {
 /* slice!(start, length) */
6032 beg = NUM2LONG(indx);
6033 len = NUM2LONG(argv[1]);
6034 goto num_index;
6035 }
6036 else if (FIXNUM_P(indx)) {
 /* slice!(index): len keeps its initial value of 1; an empty result means "out of range". */
6037 beg = FIX2LONG(indx);
6038 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6039 if (!len) return Qnil;
6040 beg = p - RSTRING_PTR(str);
6041 goto subseq;
6042 }
6043 else if (RB_TYPE_P(indx, T_STRING)) {
 /* slice!(substring): the result is a copy of the argument itself, so skip subseq. */
6044 beg = rb_str_index(str, indx, 0);
6045 if (beg == -1) return Qnil;
6046 len = RSTRING_LEN(indx);
6047 result = str_duplicate(rb_cString, indx);
6048 goto squash;
6049 }
6050 else {
 /* Either a Range, or some other object that is converted with NUM2LONG(). */
6051 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6052 case Qnil:
6053 return Qnil;
6054 case Qfalse:
 /* Not a Range: handled like the single-index form above. */
6055 beg = NUM2LONG(indx);
6056 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6057 if (!len) return Qnil;
6058 beg = p - RSTRING_PTR(str);
6059 goto subseq;
6060 default:
6061 goto num_index;
6062 }
6063 }
6064
6065 num_index:
6066 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6067 beg = p - RSTRING_PTR(str);
6068
6069 subseq:
 /* Copy the selected bytes before they are removed from str. */
6070 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6071 rb_enc_cr_str_copy_for_substr(result, str);
6072
6073 squash:
6074 if (len > 0) {
6075 if (beg == 0) {
 /* Removing a prefix is delegated to rb_str_drop_bytes(). */
6076 rb_str_drop_bytes(str, len);
6077 }
6078 else {
 /* Close the gap by moving the tail down, then shrink and re-terminate. */
6079 char *sptr = RSTRING_PTR(str);
6080 long slen = RSTRING_LEN(str);
6081 if (beg + len > slen) /* pathological check */
6082 len = slen - beg;
6083 memmove(sptr + beg,
6084 sptr + beg + len,
6085 slen - (beg + len));
6086 slen -= len;
6087 STR_SET_LEN(str, slen);
6088 TERM_FILL(&sptr[slen], TERM_LEN(str));
6089 }
6090 }
6091 return result;
6092}
6093
6094static VALUE
6095get_pat(VALUE pat)
6096{
6097 VALUE val;
6098
6099 switch (OBJ_BUILTIN_TYPE(pat)) {
6100 case T_REGEXP:
6101 return pat;
6102
6103 case T_STRING:
6104 break;
6105
6106 default:
6107 val = rb_check_string_type(pat);
6108 if (NIL_P(val)) {
6109 Check_Type(pat, T_REGEXP);
6110 }
6111 pat = val;
6112 }
6113
6114 return rb_reg_regcomp(pat);
6115}
6116
6117static VALUE
6118get_pat_quoted(VALUE pat, int check)
6119{
6120 VALUE val;
6121
6122 switch (OBJ_BUILTIN_TYPE(pat)) {
6123 case T_REGEXP:
6124 return pat;
6125
6126 case T_STRING:
6127 break;
6128
6129 default:
6130 val = rb_check_string_type(pat);
6131 if (NIL_P(val)) {
6132 Check_Type(pat, T_REGEXP);
6133 }
6134 pat = val;
6135 }
6136 if (check && is_broken_string(pat)) {
6137 rb_exc_raise(rb_reg_check_preprocess(pat));
6138 }
6139 return pat;
6140}
6141
6142static long
6143rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6144{
6145 if (BUILTIN_TYPE(pat) == T_STRING) {
6146 pos = rb_str_byteindex(str, pat, pos);
6147 if (set_backref_str) {
6148 if (pos >= 0) {
6149 str = rb_str_new_frozen_String(str);
6150 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6151 if (match) {
6152 *match = match_data;
6153 }
6154 }
6155 else {
6157 }
6158 }
6159 return pos;
6160 }
6161 else {
6162 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6163 }
6164}
6165
6166static long
6167rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6168{
 /* Same as rb_pat_search0() for callers that do not need the match data (match == NULL). */
6169 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6170}
6171
6172
6173/*
6174 * call-seq:
6175 * sub!(pattern, replacement) -> self or nil
6176 * sub!(pattern) {|match| ... } -> self or nil
6177 *
6178 * Like String#sub, except that:
6179 *
6180 * - Changes are made to +self+, not to a copy of +self+.
6181 * - Returns +self+ if any changes are made, +nil+ otherwise.
6182 *
6183 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6184 */
6185
6186static VALUE
6187rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6188{
6189 VALUE pat, repl, hash = Qnil;
6190 int iter = 0;
6191 long plen;
6192 int min_arity = rb_block_given_p() ? 1 : 2;
6193 long beg;
6194
6195 rb_check_arity(argc, min_arity, 2);
6196 if (argc == 1) {
6197 iter = 1;
6198 }
6199 else {
6200 repl = argv[1];
6201 hash = rb_check_hash_type(argv[1]);
6202 if (NIL_P(hash)) {
6203 StringValue(repl);
6204 }
6205 }
6206
6207 pat = get_pat_quoted(argv[0], 1);
6208
6209 str_modifiable(str);
6210 beg = rb_pat_search(pat, str, 0, 1);
6211 if (beg >= 0) {
6212 rb_encoding *enc;
6213 int cr = ENC_CODERANGE(str);
6214 long beg0, end0;
6215 VALUE match, match0 = Qnil;
6216 struct re_registers *regs;
6217 char *p, *rp;
6218 long len, rlen;
6219
6220 match = rb_backref_get();
6221 regs = RMATCH_REGS(match);
6222 if (RB_TYPE_P(pat, T_STRING)) {
6223 beg0 = beg;
6224 end0 = beg0 + RSTRING_LEN(pat);
6225 match0 = pat;
6226 }
6227 else {
6228 beg0 = BEG(0);
6229 end0 = END(0);
6230 if (iter) match0 = rb_reg_nth_match(0, match);
6231 }
6232
6233 if (iter || !NIL_P(hash)) {
6234 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6235
6236 if (iter) {
6237 repl = rb_obj_as_string(rb_yield(match0));
6238 }
6239 else {
6240 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6241 repl = rb_obj_as_string(repl);
6242 }
6243 str_mod_check(str, p, len);
6244 rb_check_frozen(str);
6245 }
6246 else {
6247 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6248 }
6249
6250 enc = rb_enc_compatible(str, repl);
6251 if (!enc) {
6252 rb_encoding *str_enc = STR_ENC_GET(str);
6253 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6254 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6255 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6256 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6257 rb_enc_inspect_name(str_enc),
6258 rb_enc_inspect_name(STR_ENC_GET(repl)));
6259 }
6260 enc = STR_ENC_GET(repl);
6261 }
6262 rb_str_modify(str);
6263 rb_enc_associate(str, enc);
6265 int cr2 = ENC_CODERANGE(repl);
6266 if (cr2 == ENC_CODERANGE_BROKEN ||
6267 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6269 else
6270 cr = cr2;
6271 }
6272 plen = end0 - beg0;
6273 rlen = RSTRING_LEN(repl);
6274 len = RSTRING_LEN(str);
6275 if (rlen > plen) {
6276 RESIZE_CAPA(str, len + rlen - plen);
6277 }
6278 p = RSTRING_PTR(str);
6279 if (rlen != plen) {
6280 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6281 }
6282 rp = RSTRING_PTR(repl);
6283 memmove(p + beg0, rp, rlen);
6284 len += rlen - plen;
6285 STR_SET_LEN(str, len);
6286 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6287 ENC_CODERANGE_SET(str, cr);
6288
6289 RB_GC_GUARD(match);
6290
6291 return str;
6292 }
6293 return Qnil;
6294}
6295
6296
6297/*
6298 * call-seq:
6299 * sub(pattern, replacement) -> new_string
6300 * sub(pattern) {|match| ... } -> new_string
6301 *
6302 * :include: doc/string/sub.rdoc
6303 */
6304
6305static VALUE
6306rb_str_sub(int argc, VALUE *argv, VALUE str)
6307{
6308 str = str_duplicate(rb_cString, str);
6309 rb_str_sub_bang(argc, argv, str);
6310 return str;
6311}
6312
6313static VALUE
6314str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6315{
 /*
  * Shared body of the gsub entry points.  The substituted text is built
  * in a fresh buffer `dest`; with a non-zero `bang` the contents of dest
  * replace those of str (str_shared_replace) and str is returned,
  * otherwise dest itself is returned.  When the pattern never matches,
  * returns Qnil (bang) or a duplicate of str.
  *
  * mode selects where the replacement comes from:
  *   STR      - replacement string
  *   ITER     - the block's return value
  *   FAST_MAP - hash lookup, chosen when rb_hash_default_unredefined(hash)
  *              holds and the hash has no RHASH_PROC_DEFAULT flag
  *   MAP      - hash lookup, any other hash
  */
6316 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6317 long beg, beg0, end0;
6318 long offset, blen, slen, len, last;
6319 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6320 char *sp, *cp;
 /* Tri-state: -1 until the first rb_reg_regsub() call below decides it. */
6321 int need_backref_str = -1;
6322 rb_encoding *str_enc;
6323
6324 switch (argc) {
6325 case 1:
6326 RETURN_ENUMERATOR(str, argc, argv);
6327 mode = ITER;
6328 break;
6329 case 2:
6330 repl = argv[1];
6331 hash = rb_check_hash_type(argv[1]);
6332 if (NIL_P(hash)) {
6333 StringValue(repl);
6334 }
6335 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6336 mode = FAST_MAP;
6337 }
6338 else {
6339 mode = MAP;
6340 }
6341 break;
6342 default:
6343 rb_error_arity(argc, 1, 2);
6344 }
6345
6346 pat = get_pat_quoted(argv[0], 1);
6347 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6348
6349 if (beg < 0) {
6350 if (bang) return Qnil; /* no match, no substitution */
6351 return str_duplicate(rb_cString, str);
6352 }
6353
6354 offset = 0;
6355 blen = RSTRING_LEN(str) + 30; /* len + margin */
6356 dest = rb_str_buf_new(blen);
 /* sp/slen snapshot the buffer so str_mod_check() can detect changes made by user code. */
6357 sp = RSTRING_PTR(str);
6358 slen = RSTRING_LEN(str);
6359 cp = sp;
6360 str_enc = STR_ENC_GET(str);
6361 rb_enc_associate(dest, str_enc);
6362 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6363
 /* One iteration per match: compute the replacement, copy the unmatched gap, append. */
6364 do {
6365 struct re_registers *regs = RMATCH_REGS(match);
6366 if (RB_TYPE_P(pat, T_STRING)) {
6367 beg0 = beg;
6368 end0 = beg0 + RSTRING_LEN(pat);
6369 match0 = pat;
6370 }
6371 else {
6372 beg0 = BEG(0);
6373 end0 = END(0);
6374 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6375 }
6376
6377 if (mode != STR) {
6378 if (mode == ITER) {
6379 val = rb_obj_as_string(rb_yield(match0));
6380 }
6381 else {
6382 struct RString fake_str = {RBASIC_INIT};
6383 VALUE key;
6384 if (mode == FAST_MAP) {
6385 // It is safe to use a fake_str here because we established that it won't escape,
6386 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6387 // default proc.
6388 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6389 }
6390 else {
6391 key = rb_str_subseq(str, beg0, end0 - beg0);
6392 }
6393 val = rb_hash_aref(hash, key);
6394 val = rb_obj_as_string(val);
6395 }
6396 str_mod_check(str, sp, slen);
6397 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6398 rb_raise(rb_eRuntimeError, "block should not cheat");
6399 }
6400 }
6401 else if (need_backref_str) {
6402 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
 /* First pass only: if regsub returned repl itself, later passes take the `val = repl` branch. */
6403 if (need_backref_str < 0) {
6404 need_backref_str = val != repl;
6405 }
6406 }
6407 else {
6408 val = repl;
6409 }
6410
6411 len = beg0 - offset; /* copy pre-match substr */
6412 if (len) {
6413 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6414 }
6415
6416 rb_str_buf_append(dest, val);
6417
 /* `last` remembers where this match was searched from; it is used for the final search below. */
6418 last = offset;
6419 offset = end0;
6420 if (beg0 == end0) {
6421 /*
6422 * Always consume at least one character of the input string
6423 * in order to prevent infinite loops.
6424 */
6425 if (RSTRING_LEN(str) <= end0) break;
6426 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6427 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6428 offset = end0 + len;
6429 }
6430 cp = RSTRING_PTR(str) + offset;
6431 if (offset > RSTRING_LEN(str)) break;
6432
6433 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6434 if (mode != FAST_MAP && mode != STR) {
6435 match = Qnil;
6436 }
6437 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6438
6439 RB_GC_GUARD(match);
6440 } while (beg >= 0);
6441
 /* Copy whatever follows the last match. */
6442 if (RSTRING_LEN(str) > offset) {
6443 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6444 }
 /* NOTE(review): repeats the search from `last` with set_backref_str = 1, presumably so the backref describes the final match -- confirm. */
6445 rb_pat_search0(pat, str, last, 1, &match);
6446 if (bang) {
6447 str_shared_replace(str, dest);
6448 }
6449 else {
6450 str = dest;
6451 }
6452
6453 return str;
6454}
6455
6456
6457/*
6458 * call-seq:
6459 * gsub!(pattern, replacement) -> self or nil
6460 * gsub!(pattern) {|match| ... } -> self or nil
6461 * gsub!(pattern) -> an_enumerator
6462 *
6463 * Like String#gsub, except that:
6464 *
6465 * - Performs substitutions in +self+ (not in a copy of +self+).
6466 * - Returns +self+ if any substitutions were made, +nil+ otherwise.
6467 *
6468 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6469 */
6470
6471static VALUE
6472rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6473{
 /* String#gsub!: run str_modify_keep_cr() on the receiver first, then run str_gsub() with bang = 1. */
6474 str_modify_keep_cr(str);
6475 return str_gsub(argc, argv, str, 1);
6476}
6477
6478
6479/*
6480 * call-seq:
6481 * gsub(pattern, replacement) -> new_string
6482 * gsub(pattern) {|match| ... } -> new_string
6483 * gsub(pattern) -> enumerator
6484 *
6485 * Returns a copy of +self+ with zero or more substrings replaced.
6486 *
6487 * Argument +pattern+ may be a string or a Regexp;
6488 * argument +replacement+ may be a string or a Hash.
6489 * Varying types for the argument values makes this method very versatile.
6490 *
6491 * Below are some simple examples;
6492 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6493 *
6494 * With arguments +pattern+ and string +replacement+ given,
6495 * replaces each matching substring with the given +replacement+ string:
6496 *
6497 * s = 'abracadabra'
6498 * s.gsub('ab', 'AB') # => "ABracadABra"
6499 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6500 *
6501 * With arguments +pattern+ and hash +replacement+ given,
6502 * replaces each matching substring with a value from the given +replacement+ hash,
6503 * or removes it:
6504 *
6505 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6506 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6507 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6508 *
6509 * With argument +pattern+ and a block given,
6510 * calls the block with each matching substring;
6511 * replaces that substring with the block's return value:
6512 *
6513 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6514 * # => "ABrACADABrA"
6515 *
6516 * With argument +pattern+ and no block given,
6517 * returns a new Enumerator.
6518 *
6519 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6520 */
6521
6522static VALUE
6523rb_str_gsub(int argc, VALUE *argv, VALUE str)
6524{
 /* String#gsub: str_gsub() with bang = 0, i.e. the result is a new string. */
6525 return str_gsub(argc, argv, str, 0);
6526}
6527
6528
6529/*
6530 * call-seq:
6531 * replace(other_string) -> self
6532 *
6533 * Replaces the contents of +self+ with the contents of +other_string+;
6534 * returns +self+:
6535 *
6536 * s = 'foo' # => "foo"
6537 * s.replace('bar') # => "bar"
6538 *
6539 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6540 */
6541
6542VALUE
6544{
6545 str_modifiable(str);
6546 if (str == str2) return str;
6547
6548 StringValue(str2);
6549 str_discard(str);
6550 return str_replace(str, str2);
6551}
6552
6553/*
6554 * call-seq:
6555 * clear -> self
6556 *
6557 * Removes the contents of +self+:
6558 *
6559 * s = 'foo'
6560 * s.clear # => ""
6561 * s # => ""
6562 *
6563 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6564 */
6565
6566static VALUE
6567rb_str_clear(VALUE str)
6568{
6569 str_discard(str);
6570 STR_SET_EMBED(str);
6571 STR_SET_LEN(str, 0);
6572 RSTRING_PTR(str)[0] = 0;
6573 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6575 else
6577 return str;
6578}
6579
6580/*
6581 * call-seq:
6582 * chr -> string
6583 *
6584 * :include: doc/string/chr.rdoc
6585 *
6586 */
6587
6588static VALUE
6589rb_str_chr(VALUE str)
6590{
 /* String#chr: the substring starting at offset 0 with length 1. */
6591 return rb_str_substr(str, 0, 1);
6592}
6593
6594/*
6595 * call-seq:
6596 * getbyte(index) -> integer or nil
6597 *
6598 * :include: doc/string/getbyte.rdoc
6599 *
6600 */
6601VALUE
6602rb_str_getbyte(VALUE str, VALUE index)
6603{
6604 long pos = NUM2LONG(index);
6605
6606 if (pos < 0)
6607 pos += RSTRING_LEN(str);
6608 if (pos < 0 || RSTRING_LEN(str) <= pos)
6609 return Qnil;
6610
6611 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6612}
6613
6614/*
6615 * call-seq:
6616 * setbyte(index, integer) -> integer
6617 *
6618 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6619 * returns +integer+:
6620 *
6621 * s = 'xyzzy'
6622 * s.setbyte(2, 129) # => 129
6623 * s # => "xy\x81zy"
6624 *
6625 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6626 */
6627VALUE
6628rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6629{
6630 long pos = NUM2LONG(index);
6631 long len = RSTRING_LEN(str);
6632 char *ptr, *head, *left = 0;
6633 rb_encoding *enc;
6634 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6635
6636 if (pos < -len || len <= pos)
6637 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6638 if (pos < 0)
6639 pos += len;
6640
6641 VALUE v = rb_to_int(value);
6642 VALUE w = rb_int_and(v, INT2FIX(0xff));
6643 char byte = (char)(NUM2INT(w) & 0xFF);
6644
6645 if (!str_independent(str))
6646 str_make_independent(str);
6647 enc = STR_ENC_GET(str);
6648 head = RSTRING_PTR(str);
6649 ptr = &head[pos];
6650 if (!STR_EMBED_P(str)) {
6651 cr = ENC_CODERANGE(str);
6652 switch (cr) {
6653 case ENC_CODERANGE_7BIT:
6654 left = ptr;
6655 *ptr = byte;
6656 if (ISASCII(byte)) goto end;
6657 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6658 if (!MBCLEN_CHARFOUND_P(nlen))
6660 else
6662 goto end;
6664 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6665 width = rb_enc_precise_mbclen(left, head+len, enc);
6666 *ptr = byte;
6667 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6668 if (!MBCLEN_CHARFOUND_P(nlen))
6670 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6672 goto end;
6673 }
6674 }
6676 *ptr = byte;
6677
6678 end:
6679 return value;
6680}
6681
6682static VALUE
6683str_byte_substr(VALUE str, long beg, long len, int empty)
6684{
6685 long n = RSTRING_LEN(str);
6686
6687 if (beg > n || len < 0) return Qnil;
6688 if (beg < 0) {
6689 beg += n;
6690 if (beg < 0) return Qnil;
6691 }
6692 if (len > n - beg)
6693 len = n - beg;
6694 if (len <= 0) {
6695 if (!empty) return Qnil;
6696 len = 0;
6697 }
6698
6699 VALUE str2 = str_subseq(str, beg, len);
6700
6701 str_enc_copy_direct(str2, str);
6702
6703 if (RSTRING_LEN(str2) == 0) {
6704 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6706 else
6708 }
6709 else {
6710 switch (ENC_CODERANGE(str)) {
6711 case ENC_CODERANGE_7BIT:
6713 break;
6714 default:
6716 break;
6717 }
6718 }
6719
6720 return str2;
6721}
6722
6723VALUE
6724rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6725{
 /* VALUE-argument front end for str_byte_substr(); empty results are allowed (empty = TRUE). */
6726 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6727}
6728
6729static VALUE
6730str_byte_aref(VALUE str, VALUE indx)
6731{
6732 long idx;
6733 if (FIXNUM_P(indx)) {
6734 idx = FIX2LONG(indx);
6735 }
6736 else {
6737 /* check if indx is Range */
6738 long beg, len = RSTRING_LEN(str);
6739
6740 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6741 case Qfalse:
6742 break;
6743 case Qnil:
6744 return Qnil;
6745 default:
6746 return str_byte_substr(str, beg, len, TRUE);
6747 }
6748
6749 idx = NUM2LONG(indx);
6750 }
6751 return str_byte_substr(str, idx, 1, FALSE);
6752}
6753
6754/*
6755 * call-seq:
6756 * byteslice(offset, length = 1) -> string or nil
6757 * byteslice(range) -> string or nil
6758 *
6759 * :include: doc/string/byteslice.rdoc
6760 */
6761
6762static VALUE
6763rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6764{
6765 if (argc == 2) {
6766 long beg = NUM2LONG(argv[0]);
6767 long len = NUM2LONG(argv[1]);
6768 return str_byte_substr(str, beg, len, TRUE);
6769 }
6770 rb_check_arity(argc, 1, 2);
6771 return str_byte_aref(str, argv[0]);
6772}
6773
6774static void
6775str_check_beg_len(VALUE str, long *beg, long *len)
6776{
6777 long end, slen = RSTRING_LEN(str);
6778
6779 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6780 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6781 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6782 }
6783 if (*beg < 0) {
6784 *beg += slen;
6785 }
6786 RUBY_ASSERT(*beg >= 0);
6787 RUBY_ASSERT(*beg <= slen);
6788
6789 if (*len > slen - *beg) {
6790 *len = slen - *beg;
6791 }
6792 end = *beg + *len;
6793 str_ensure_byte_pos(str, *beg);
6794 str_ensure_byte_pos(str, end);
6795}
6796
6797/*
6798 * call-seq:
6799 * bytesplice(offset, length, str) -> self
6800 * bytesplice(offset, length, str, str_offset, str_length) -> self
6801 * bytesplice(range, str) -> self
6802 * bytesplice(range, str, str_range) -> self
6803 *
6804 * :include: doc/string/bytesplice.rdoc
6805 */
6806
6807static VALUE
6808rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6809{
6810 long beg, len, vbeg, vlen;
6811 VALUE val;
6812 int cr;
6813
6814 rb_check_arity(argc, 2, 5);
6815 if (!(argc == 2 || argc == 3 || argc == 5)) {
6816 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6817 }
6818 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6819 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6820 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6821 rb_builtin_class_name(argv[0]));
6822 }
6823 val = argv[1];
6824 StringValue(val);
6825 if (argc == 2) {
6826 /* bytesplice(range, str) */
6827 vbeg = 0;
6828 vlen = RSTRING_LEN(val);
6829 }
6830 else {
6831 /* bytesplice(range, str, str_range) */
6832 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6833 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6834 rb_builtin_class_name(argv[2]));
6835 }
6836 }
6837 }
6838 else {
6839 beg = NUM2LONG(argv[0]);
6840 len = NUM2LONG(argv[1]);
6841 val = argv[2];
6842 StringValue(val);
6843 if (argc == 3) {
6844 /* bytesplice(index, length, str) */
6845 vbeg = 0;
6846 vlen = RSTRING_LEN(val);
6847 }
6848 else {
6849 /* bytesplice(index, length, str, str_index, str_length) */
6850 vbeg = NUM2LONG(argv[3]);
6851 vlen = NUM2LONG(argv[4]);
6852 }
6853 }
6854 str_check_beg_len(str, &beg, &len);
6855 str_check_beg_len(val, &vbeg, &vlen);
6856 str_modify_keep_cr(str);
6857
6858 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6859 rb_enc_associate(str, rb_enc_check(str, val));
6860 }
6861
6862 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6864 if (cr != ENC_CODERANGE_BROKEN)
6865 ENC_CODERANGE_SET(str, cr);
6866 return str;
6867}
6868
6869/*
6870 * call-seq:
6871 * reverse -> new_string
6872 *
6873 * Returns a new string with the characters from +self+ in reverse order.
6874 *
6875 * 'drawer'.reverse # => "reward"
6876 * 'reviled'.reverse # => "deliver"
6877 * 'stressed'.reverse # => "desserts"
6878 * 'semordnilaps'.reverse # => "spalindromes"
6879 *
6880 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6881 */
6882
6883static VALUE
6884rb_str_reverse(VALUE str)
6885{
6886 rb_encoding *enc;
6887 VALUE rev;
6888 char *s, *e, *p;
6889 int cr;
6890
6891 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6892 enc = STR_ENC_GET(str);
6893 rev = rb_str_new(0, RSTRING_LEN(str));
6894 s = RSTRING_PTR(str); e = RSTRING_END(str);
6895 p = RSTRING_END(rev);
6896 cr = ENC_CODERANGE(str);
6897
6898 if (RSTRING_LEN(str) > 1) {
6899 if (single_byte_optimizable(str)) {
6900 while (s < e) {
6901 *--p = *s++;
6902 }
6903 }
6904 else if (cr == ENC_CODERANGE_VALID) {
6905 while (s < e) {
6906 int clen = rb_enc_fast_mbclen(s, e, enc);
6907
6908 p -= clen;
6909 memcpy(p, s, clen);
6910 s += clen;
6911 }
6912 }
6913 else {
6914 cr = rb_enc_asciicompat(enc) ?
6916 while (s < e) {
6917 int clen = rb_enc_mbclen(s, e, enc);
6918
6919 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6920 p -= clen;
6921 memcpy(p, s, clen);
6922 s += clen;
6923 }
6924 }
6925 }
6926 STR_SET_LEN(rev, RSTRING_LEN(str));
6927 str_enc_copy_direct(rev, str);
6928 ENC_CODERANGE_SET(rev, cr);
6929
6930 return rev;
6931}
6932
6933
6934/*
6935 * call-seq:
6936 * reverse! -> self
6937 *
6938 * Returns +self+ with its characters reversed:
6939 *
6940 * 'drawer'.reverse! # => "reward"
6941 * 'reviled'.reverse! # => "deliver"
6942 * 'stressed'.reverse! # => "desserts"
6943 * 'semordnilaps'.reverse! # => "spalindromes"
6944 *
6945 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6946 */
6947
6948static VALUE
6949rb_str_reverse_bang(VALUE str)
6950{
6951 if (RSTRING_LEN(str) > 1) {
6952 if (single_byte_optimizable(str)) {
6953 char *s, *e, c;
6954
6955 str_modify_keep_cr(str);
6956 s = RSTRING_PTR(str);
6957 e = RSTRING_END(str) - 1;
6958 while (s < e) {
6959 c = *s;
6960 *s++ = *e;
6961 *e-- = c;
6962 }
6963 }
6964 else {
6965 str_shared_replace(str, rb_str_reverse(str));
6966 }
6967 }
6968 else {
6969 str_modify_keep_cr(str);
6970 }
6971 return str;
6972}
6973
6974
6975/*
6976 * call-seq:
6977 * include?(other_string) -> true or false
6978 *
6979 * Returns whether +self+ contains +other_string+:
6980 *
6981 * s = 'bar'
6982 * s.include?('ba') # => true
6983 * s.include?('ar') # => true
6984 * s.include?('bar') # => true
6985 * s.include?('a') # => true
6986 * s.include?('') # => true
6987 * s.include?('foo') # => false
6988 *
6989 * Related: see {Querying}[rdoc-ref:String@Querying].
6990 */
6991
6992VALUE
6993rb_str_include(VALUE str, VALUE arg)
6994{
6995 long i;
6996
6997 StringValue(arg);
6998 i = rb_str_index(str, arg, 0);
6999
7000 return RBOOL(i != -1);
7001}
7002
7003
7004/*
7005 * call-seq:
7006 * to_i(base = 10) -> integer
7007 *
7008 * Returns the result of interpreting leading characters in +self+
7009 * as an integer in the given +base+;
7010 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7011 *
7012 * '123456'.to_i # => 123456
7013 * '123def'.to_i(16) # => 1195503
7014 *
7015 * With +base+ zero given, +self+ may contain leading characters
7016 * to specify the actual base:
7017 *
7018 * '123def'.to_i(0) # => 123
7019 * '0123def'.to_i(0) # => 83
7020 * '0b123def'.to_i(0) # => 1
7021 * '0o123def'.to_i(0) # => 83
7022 * '0d123def'.to_i(0) # => 123
7023 * '0x123def'.to_i(0) # => 1195503
7024 *
7025 * Characters past a leading valid number (in the given +base+) are ignored:
7026 *
7027 * '12.345'.to_i # => 12
7028 * '12345'.to_i(2) # => 1
7029 *
7030 * Returns zero if there is no leading valid number:
7031 *
7032 * 'abcdef'.to_i # => 0
7033 * '2'.to_i(2) # => 0
7034 *
7035 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7036 */
7037
7038static VALUE
7039rb_str_to_i(int argc, VALUE *argv, VALUE str)
7040{
7041 int base = 10;
7042
7043 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7044 rb_raise(rb_eArgError, "invalid radix %d", base);
7045 }
7046 return rb_str_to_inum(str, base, FALSE);
7047}
7048
7049
7050/*
7051 * call-seq:
7052 * to_f -> float
7053 *
7054 * Returns the result of interpreting leading characters in +self+ as a Float:
7055 *
7056 * '3.14159'.to_f # => 3.14159
7057 * '1.234e-2'.to_f # => 0.01234
7058 *
7059 * Characters past a leading valid number are ignored:
7060 *
7061 * '3.14 (pi to two places)'.to_f # => 3.14
7062 *
7063 * Returns zero if there is no leading valid number:
7064 *
7065 * 'abcdef'.to_f # => 0.0
7066 *
7067 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7068 */
7069
7070static VALUE
7071rb_str_to_f(VALUE str)
7072{
 /* String#to_f: wrap the double from rb_str_to_dbl() (second argument FALSE) in a Float. */
7073 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7074}
7075
7076
7077/*
7078 * call-seq:
7079 * to_s -> self or new_string
7080 *
7081 * Returns +self+ if +self+ is a +String+,
7082 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7083 *
7084 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7085 */
7086
7087static VALUE
7088rb_str_to_s(VALUE str)
7089{
7090 if (rb_obj_class(str) != rb_cString) {
7091 return str_duplicate(rb_cString, str);
7092 }
7093 return str;
7094}
7095
7096#if 0
/*
 * Compiled out by the surrounding #if 0.
 * Encodes code point c with rb_enc_mbcput() into a stack buffer and
 * appends its rb_enc_codelen(c, enc) bytes to str.
 */
7097static void
7098str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
7099{
7100 char s[RUBY_MAX_CHAR_LEN];
7101 int n = rb_enc_codelen(c, enc);
7102
7103 rb_enc_mbcput(c, s, enc);
7104 rb_enc_str_buf_cat(str, s, n, enc);
7105}
7106#endif
7107
7108#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7109
7110int
7111rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7112{
7113 char buf[CHAR_ESC_LEN + 1];
7114 int l;
7115
7116#if SIZEOF_INT > 4
7117 c &= 0xffffffff;
7118#endif
7119 if (unicode_p) {
7120 if (c < 0x7F && ISPRINT(c)) {
7121 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7122 }
7123 else if (c < 0x10000) {
7124 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7125 }
7126 else {
7127 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7128 }
7129 }
7130 else {
7131 if (c < 0x100) {
7132 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7133 }
7134 else {
7135 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7136 }
7137 }
7138 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7139 rb_str_buf_cat(result, buf, l);
7140 return l;
7141}
7142
/*
 * Returns the backslash escape used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no such
 * escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].text;
        }
    }
    return NULL;
}
7160
7161VALUE
7162rb_str_escape(VALUE str)
7163{
 /*
  * Builds a new string (US-ASCII, 7bit code range) in which bytes that do
  * not form a character, characters known to ruby_escaped_char(), and
  * characters that are not printable ASCII are written as escapes.
  *
  * `prev` marks the start of the pending run of characters that can be
  * copied verbatim; the run is flushed right before each escape is
  * appended and once more after the loop.
  */
7164 int encidx = ENCODING_GET(str);
7165 rb_encoding *enc = rb_enc_from_index(encidx);
7166 const char *p = RSTRING_PTR(str);
7167 const char *pend = RSTRING_END(str);
7168 const char *prev = p;
7169 char buf[CHAR_ESC_LEN + 1];
7170 VALUE result = rb_str_buf_new(0);
7171 int unicode_p = rb_enc_unicode_p(enc);
7172 int asciicompat = rb_enc_asciicompat(enc);
7173
7174 while (p < pend) {
7175 unsigned int c;
7176 const char *cc;
7177 int n = rb_enc_precise_mbclen(p, pend, enc);
7178 if (!MBCLEN_CHARFOUND_P(n)) {
 /* No character here: emit up to rb_enc_mbminlen(enc) raw bytes as \xNN. */
7179 if (p > prev) str_buf_cat(result, prev, p - prev);
7180 n = rb_enc_mbminlen(enc);
7181 if (pend < p + n)
7182 n = (int)(pend - p);
7183 while (n--) {
7184 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7185 str_buf_cat(result, buf, strlen(buf));
7186 prev = ++p;
7187 }
7188 continue;
7189 }
7190 n = MBCLEN_CHARFOUND_LEN(n);
7191 c = rb_enc_mbc_to_codepoint(p, pend, enc);
 /* p now points past the character; p - n is where it started. */
7192 p += n;
7193 cc = ruby_escaped_char(c);
7194 if (cc) {
7195 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7196 str_buf_cat(result, cc, strlen(cc));
7197 prev = p;
7198 }
7199 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
 /* Printable ASCII: leave it in the pending verbatim run. */
7200 }
7201 else {
7202 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7203 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7204 prev = p;
7205 }
7206 }
7207 if (p > prev) str_buf_cat(result, prev, p - prev);
7208 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7209
7210 return result;
7211}
7212
7213/*
7214 * call-seq:
7215 * inspect -> string
7216 *
7217 * :include: doc/string/inspect.rdoc
7218 *
7219 */
7220
7221VALUE
7223{
7224 int encidx = ENCODING_GET(str);
7225 rb_encoding *enc = rb_enc_from_index(encidx);
7226 const char *p, *pend, *prev;
7227 char buf[CHAR_ESC_LEN + 1];
7228 VALUE result = rb_str_buf_new(0);
7229 rb_encoding *resenc = rb_default_internal_encoding();
7230 int unicode_p = rb_enc_unicode_p(enc);
7231 int asciicompat = rb_enc_asciicompat(enc);
7232
7233 if (resenc == NULL) resenc = rb_default_external_encoding();
7234 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7235 rb_enc_associate(result, resenc);
7236 str_buf_cat2(result, "\"");
7237
7238 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7239 prev = p;
7240 while (p < pend) {
7241 unsigned int c, cc;
7242 int n;
7243
7244 n = rb_enc_precise_mbclen(p, pend, enc);
7245 if (!MBCLEN_CHARFOUND_P(n)) {
7246 if (p > prev) str_buf_cat(result, prev, p - prev);
7247 n = rb_enc_mbminlen(enc);
7248 if (pend < p + n)
7249 n = (int)(pend - p);
7250 while (n--) {
7251 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7252 str_buf_cat(result, buf, strlen(buf));
7253 prev = ++p;
7254 }
7255 continue;
7256 }
7257 n = MBCLEN_CHARFOUND_LEN(n);
7258 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7259 p += n;
7260 if ((asciicompat || unicode_p) &&
7261 (c == '"'|| c == '\\' ||
7262 (c == '#' &&
7263 p < pend &&
7264 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7265 (cc = rb_enc_codepoint(p,pend,enc),
7266 (cc == '$' || cc == '@' || cc == '{'))))) {
7267 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7268 str_buf_cat2(result, "\\");
7269 if (asciicompat || enc == resenc) {
7270 prev = p - n;
7271 continue;
7272 }
7273 }
7274 switch (c) {
7275 case '\n': cc = 'n'; break;
7276 case '\r': cc = 'r'; break;
7277 case '\t': cc = 't'; break;
7278 case '\f': cc = 'f'; break;
7279 case '\013': cc = 'v'; break;
7280 case '\010': cc = 'b'; break;
7281 case '\007': cc = 'a'; break;
7282 case 033: cc = 'e'; break;
7283 default: cc = 0; break;
7284 }
7285 if (cc) {
7286 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7287 buf[0] = '\\';
7288 buf[1] = (char)cc;
7289 str_buf_cat(result, buf, 2);
7290 prev = p;
7291 continue;
7292 }
7293 /* The special casing of 0x85 (NEXT_LINE) here is because
7294 * Oniguruma historically treats it as printable, but it
7295 * doesn't match the print POSIX bracket class or character
7296 * property in regexps.
7297 *
7298 * See Ruby Bug #16842 for details:
7299 * https://bugs.ruby-lang.org/issues/16842
7300 */
7301 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7302 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7303 continue;
7304 }
7305 else {
7306 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7307 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7308 prev = p;
7309 continue;
7310 }
7311 }
7312 if (p > prev) str_buf_cat(result, prev, p - prev);
7313 str_buf_cat2(result, "\"");
7314
7315 return result;
7316}
7317
/* True when p lies before e and *p is '$', '@' or '{'; note that p is evaluated more than once. */
7318#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7319
7320/*
7321 * call-seq:
7322 * dump -> new_string
7323 *
7324 * :include: doc/string/dump.rdoc
7325 *
7326 */
7327
7328VALUE
7330{
7331 int encidx = rb_enc_get_index(str);
7332 rb_encoding *enc = rb_enc_from_index(encidx);
7333 long len;
7334 const char *p, *pend;
7335 char *q, *qend;
7336 VALUE result;
7337 int u8 = (encidx == rb_utf8_encindex());
7338 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7339
7340 len = 2; /* "" */
7341 if (!rb_enc_asciicompat(enc)) {
7342 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7343 len += strlen(enc->name);
7344 }
7345
7346 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7347 while (p < pend) {
7348 int clen;
7349 unsigned char c = *p++;
7350
7351 switch (c) {
7352 case '"': case '\\':
7353 case '\n': case '\r':
7354 case '\t': case '\f':
7355 case '\013': case '\010': case '\007': case '\033':
7356 clen = 2;
7357 break;
7358
7359 case '#':
7360 clen = IS_EVSTR(p, pend) ? 2 : 1;
7361 break;
7362
7363 default:
7364 if (ISPRINT(c)) {
7365 clen = 1;
7366 }
7367 else {
7368 if (u8 && c > 0x7F) { /* \u notation */
7369 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7370 if (MBCLEN_CHARFOUND_P(n)) {
7371 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7372 if (cc <= 0xFFFF)
7373 clen = 6; /* \uXXXX */
7374 else if (cc <= 0xFFFFF)
7375 clen = 9; /* \u{XXXXX} */
7376 else
7377 clen = 10; /* \u{XXXXXX} */
7378 p += MBCLEN_CHARFOUND_LEN(n)-1;
7379 break;
7380 }
7381 }
7382 clen = 4; /* \xNN */
7383 }
7384 break;
7385 }
7386
7387 if (clen > LONG_MAX - len) {
7388 rb_raise(rb_eRuntimeError, "string size too big");
7389 }
7390 len += clen;
7391 }
7392
7393 result = rb_str_new(0, len);
7394 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7395 q = RSTRING_PTR(result); qend = q + len + 1;
7396
7397 *q++ = '"';
7398 while (p < pend) {
7399 unsigned char c = *p++;
7400
7401 if (c == '"' || c == '\\') {
7402 *q++ = '\\';
7403 *q++ = c;
7404 }
7405 else if (c == '#') {
7406 if (IS_EVSTR(p, pend)) *q++ = '\\';
7407 *q++ = '#';
7408 }
7409 else if (c == '\n') {
7410 *q++ = '\\';
7411 *q++ = 'n';
7412 }
7413 else if (c == '\r') {
7414 *q++ = '\\';
7415 *q++ = 'r';
7416 }
7417 else if (c == '\t') {
7418 *q++ = '\\';
7419 *q++ = 't';
7420 }
7421 else if (c == '\f') {
7422 *q++ = '\\';
7423 *q++ = 'f';
7424 }
7425 else if (c == '\013') {
7426 *q++ = '\\';
7427 *q++ = 'v';
7428 }
7429 else if (c == '\010') {
7430 *q++ = '\\';
7431 *q++ = 'b';
7432 }
7433 else if (c == '\007') {
7434 *q++ = '\\';
7435 *q++ = 'a';
7436 }
7437 else if (c == '\033') {
7438 *q++ = '\\';
7439 *q++ = 'e';
7440 }
7441 else if (ISPRINT(c)) {
7442 *q++ = c;
7443 }
7444 else {
7445 *q++ = '\\';
7446 if (u8) {
7447 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7448 if (MBCLEN_CHARFOUND_P(n)) {
7449 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7450 p += n;
7451 if (cc <= 0xFFFF)
7452 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7453 else
7454 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7455 q += strlen(q);
7456 continue;
7457 }
7458 }
7459 snprintf(q, qend-q, "x%02X", c);
7460 q += 3;
7461 }
7462 }
7463 *q++ = '"';
7464 *q = '\0';
7465 if (!rb_enc_asciicompat(enc)) {
7466 snprintf(q, qend-q, nonascii_suffix, enc->name);
7467 encidx = rb_ascii8bit_encindex();
7468 }
7469 /* result from dump is ASCII */
7470 rb_enc_associate_index(result, encidx);
7472 return result;
7473}
7474
/*
 * Maps the letter following a backslash in a dumped string to the control
 * character it denotes ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns -1 for any other letter.  The caller in this file only passes the
 * eight letters handled below, so -1 is a defensive value that keeps the
 * function from falling off its end without returning.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
7498
7499static void
7500undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7501{
7502 const char *s = *ss;
7503 unsigned int c;
7504 int codelen;
7505 size_t hexlen;
7506 unsigned char buf[6];
7507 static rb_encoding *enc_utf8 = NULL;
7508
7509 switch (*s) {
7510 case '\\':
7511 case '"':
7512 case '#':
7513 rb_str_cat(undumped, s, 1); /* cat itself */
7514 s++;
7515 break;
7516 case 'n':
7517 case 'r':
7518 case 't':
7519 case 'f':
7520 case 'v':
7521 case 'b':
7522 case 'a':
7523 case 'e':
7524 *buf = unescape_ascii(*s);
7525 rb_str_cat(undumped, (char *)buf, 1);
7526 s++;
7527 break;
7528 case 'u':
7529 if (*binary) {
7530 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7531 }
7532 *utf8 = true;
7533 if (++s >= s_end) {
7534 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7535 }
7536 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7537 if (*penc != enc_utf8) {
7538 *penc = enc_utf8;
7539 rb_enc_associate(undumped, enc_utf8);
7540 }
7541 if (*s == '{') { /* handle \u{...} form */
7542 s++;
7543 for (;;) {
7544 if (s >= s_end) {
7545 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7546 }
7547 if (*s == '}') {
7548 s++;
7549 break;
7550 }
7551 if (ISSPACE(*s)) {
7552 s++;
7553 continue;
7554 }
7555 c = scan_hex(s, s_end-s, &hexlen);
7556 if (hexlen == 0 || hexlen > 6) {
7557 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7558 }
7559 if (c > 0x10ffff) {
7560 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7561 }
7562 if (0xd800 <= c && c <= 0xdfff) {
7563 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7564 }
7565 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7566 rb_str_cat(undumped, (char *)buf, codelen);
7567 s += hexlen;
7568 }
7569 }
7570 else { /* handle \uXXXX form */
7571 c = scan_hex(s, 4, &hexlen);
7572 if (hexlen != 4) {
7573 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7574 }
7575 if (0xd800 <= c && c <= 0xdfff) {
7576 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7577 }
7578 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7579 rb_str_cat(undumped, (char *)buf, codelen);
7580 s += hexlen;
7581 }
7582 break;
7583 case 'x':
7584 if (++s >= s_end) {
7585 rb_raise(rb_eRuntimeError, "invalid hex escape");
7586 }
7587 *buf = scan_hex(s, 2, &hexlen);
7588 if (hexlen != 2) {
7589 rb_raise(rb_eRuntimeError, "invalid hex escape");
7590 }
7591 if (!ISASCII(*buf)) {
7592 if (*utf8) {
7593 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7594 }
7595 *binary = true;
7596 }
7597 rb_str_cat(undumped, (char *)buf, 1);
7598 s += hexlen;
7599 break;
7600 default:
7601 rb_str_cat(undumped, s-1, 2);
7602 s++;
7603 }
7604
7605 *ss = s;
7606}
7607
7608static VALUE rb_str_is_ascii_only_p(VALUE str);
7609
7610/*
7611 * call-seq:
7612 * undump -> new_string
7613 *
7614 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7615 *
7616 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7617 */
7618
7619static VALUE
7620str_undump(VALUE str)
7621{
7622 const char *s = RSTRING_PTR(str);
7623 const char *s_end = RSTRING_END(str);
7624 rb_encoding *enc = rb_enc_get(str);
7625 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7626 bool utf8 = false;
7627 bool binary = false;
7628 int w;
7629
7631 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7632 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7633 }
7634 if (!str_null_check(str, &w)) {
7635 rb_raise(rb_eRuntimeError, "string contains null byte");
7636 }
7637 if (RSTRING_LEN(str) < 2) goto invalid_format;
7638 if (*s != '"') goto invalid_format;
7639
7640 /* strip '"' at the start */
7641 s++;
7642
7643 for (;;) {
7644 if (s >= s_end) {
7645 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7646 }
7647
7648 if (*s == '"') {
7649 /* epilogue */
7650 s++;
7651 if (s == s_end) {
7652 /* ascii compatible dumped string */
7653 break;
7654 }
7655 else {
7656 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7657 static const char dup_suffix[] = ".dup";
7658 const char *encname;
7659 int encidx;
7660 ptrdiff_t size;
7661
7662 /* check separately for strings dumped by older versions */
7663 size = sizeof(dup_suffix) - 1;
7664 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7665
7666 size = sizeof(force_encoding_suffix) - 1;
7667 if (s_end - s <= size) goto invalid_format;
7668 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7669 s += size;
7670
7671 if (utf8) {
7672 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7673 }
7674
7675 encname = s;
7676 s = memchr(s, '"', s_end-s);
7677 size = s - encname;
7678 if (!s) goto invalid_format;
7679 if (s_end - s != 2) goto invalid_format;
7680 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7681
7682 encidx = rb_enc_find_index2(encname, (long)size);
7683 if (encidx < 0) {
7684 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7685 }
7686 rb_enc_associate_index(undumped, encidx);
7687 }
7688 break;
7689 }
7690
7691 if (*s == '\\') {
7692 s++;
7693 if (s >= s_end) {
7694 rb_raise(rb_eRuntimeError, "invalid escape");
7695 }
7696 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7697 }
7698 else {
7699 rb_str_cat(undumped, s++, 1);
7700 }
7701 }
7702
7703 RB_GC_GUARD(str);
7704
7705 return undumped;
7706invalid_format:
7707 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7708}
7709
7710static void
7711rb_str_check_dummy_enc(rb_encoding *enc)
7712{
7713 if (rb_enc_dummy_p(enc)) {
7714 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7715 rb_enc_name(enc));
7716 }
7717}
7718
7719static rb_encoding *
7720str_true_enc(VALUE str)
7721{
7722 rb_encoding *enc = STR_ENC_GET(str);
7723 rb_str_check_dummy_enc(enc);
7724 return enc;
7725}
7726
7727static OnigCaseFoldType
7728check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7729{
7730 if (argc==0)
7731 return flags;
7732 if (argc>2)
7733 rb_raise(rb_eArgError, "too many options");
7734 if (argv[0]==sym_turkic) {
7735 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7736 if (argc==2) {
7737 if (argv[1]==sym_lithuanian)
7738 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7739 else
7740 rb_raise(rb_eArgError, "invalid second option");
7741 }
7742 }
7743 else if (argv[0]==sym_lithuanian) {
7744 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7745 if (argc==2) {
7746 if (argv[1]==sym_turkic)
7747 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7748 else
7749 rb_raise(rb_eArgError, "invalid second option");
7750 }
7751 }
7752 else if (argc>1)
7753 rb_raise(rb_eArgError, "too many options");
7754 else if (argv[0]==sym_ascii)
7755 flags |= ONIGENC_CASE_ASCII_ONLY;
7756 else if (argv[0]==sym_fold) {
7757 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7758 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7759 else
7760 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7761 }
7762 else
7763 rb_raise(rb_eArgError, "invalid option");
7764 return flags;
7765}
7766
7767static inline bool
7768case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7769{
7770 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7771 return true;
7772 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7773}
7774
/* Extra bytes added to every case-mapping buffer.  16 should be long enough
 * to absorb any kind of single character length increase; the value
 * actually used is 20. */
7776#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7777#ifndef CASEMAP_DEBUG
7778# define CASEMAP_DEBUG 0
7779#endif
7780
7781struct mapping_buffer;
7782typedef struct mapping_buffer {
7783 size_t capa;
7784 size_t used;
7785 struct mapping_buffer *next;
7786 OnigUChar space[FLEX_ARY_LEN];
7788
7789static void
7790mapping_buffer_free(void *p)
7791{
7792 mapping_buffer *previous_buffer;
7793 mapping_buffer *current_buffer = p;
7794 while (current_buffer) {
7795 previous_buffer = current_buffer;
7796 current_buffer = current_buffer->next;
7797 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7798 }
7799}
7800
7801static const rb_data_type_t mapping_buffer_type = {
7802 "mapping_buffer",
7803 {0, mapping_buffer_free,},
7804 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7805};
7806
7807static VALUE
7808rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7809{
7810 VALUE target;
7811
7812 const OnigUChar *source_current, *source_end;
7813 int target_length = 0;
7814 VALUE buffer_anchor;
7815 mapping_buffer *current_buffer = 0;
7816 mapping_buffer **pre_buffer;
7817 size_t buffer_count = 0;
7818 int buffer_length_or_invalid;
7819
7820 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7821
7822 source_current = (OnigUChar*)RSTRING_PTR(source);
7823 source_end = (OnigUChar*)RSTRING_END(source);
7824
7825 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7826 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7827 while (source_current < source_end) {
7828 /* increase multiplier using buffer count to converge quickly */
7829 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7830 if (CASEMAP_DEBUG) {
7831 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7832 }
7833 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7834 *pre_buffer = current_buffer;
7835 pre_buffer = &current_buffer->next;
7836 current_buffer->next = NULL;
7837 current_buffer->capa = capa;
7838 buffer_length_or_invalid = enc->case_map(flags,
7839 &source_current, source_end,
7840 current_buffer->space,
7841 current_buffer->space+current_buffer->capa,
7842 enc);
7843 if (buffer_length_or_invalid < 0) {
7844 current_buffer = DATA_PTR(buffer_anchor);
7845 DATA_PTR(buffer_anchor) = 0;
7846 mapping_buffer_free(current_buffer);
7847 rb_raise(rb_eArgError, "input string invalid");
7848 }
7849 target_length += current_buffer->used = buffer_length_or_invalid;
7850 }
7851 if (CASEMAP_DEBUG) {
7852 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7853 }
7854
7855 if (buffer_count==1) {
7856 target = rb_str_new((const char*)current_buffer->space, target_length);
7857 }
7858 else {
7859 char *target_current;
7860
7861 target = rb_str_new(0, target_length);
7862 target_current = RSTRING_PTR(target);
7863 current_buffer = DATA_PTR(buffer_anchor);
7864 while (current_buffer) {
7865 memcpy(target_current, current_buffer->space, current_buffer->used);
7866 target_current += current_buffer->used;
7867 current_buffer = current_buffer->next;
7868 }
7869 }
7870 current_buffer = DATA_PTR(buffer_anchor);
7871 DATA_PTR(buffer_anchor) = 0;
7872 mapping_buffer_free(current_buffer);
7873
7874 RB_GC_GUARD(buffer_anchor);
7875
7876 /* TODO: check about string terminator character */
7877 str_enc_copy_direct(target, source);
7878 /*ENC_CODERANGE_SET(mapped, cr);*/
7879
7880 return target;
7881}
7882
7883static VALUE
7884rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7885{
7886 const OnigUChar *source_current, *source_end;
7887 OnigUChar *target_current, *target_end;
7888 long old_length = RSTRING_LEN(source);
7889 int length_or_invalid;
7890
7891 if (old_length == 0) return Qnil;
7892
7893 source_current = (OnigUChar*)RSTRING_PTR(source);
7894 source_end = (OnigUChar*)RSTRING_END(source);
7895 if (source == target) {
7896 target_current = (OnigUChar*)source_current;
7897 target_end = (OnigUChar*)source_end;
7898 }
7899 else {
7900 target_current = (OnigUChar*)RSTRING_PTR(target);
7901 target_end = (OnigUChar*)RSTRING_END(target);
7902 }
7903
7904 length_or_invalid = onigenc_ascii_only_case_map(flags,
7905 &source_current, source_end,
7906 target_current, target_end, enc);
7907 if (length_or_invalid < 0)
7908 rb_raise(rb_eArgError, "input string invalid");
7909 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7910 fprintf(stderr, "problem with rb_str_ascii_casemap"
7911 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7912 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7913 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7914 }
7915
7916 str_enc_copy(target, source);
7917
7918 return target;
7919}
7920
7921static bool
7922upcase_single(VALUE str)
7923{
7924 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7925 bool modified = false;
7926
7927 while (s < send) {
7928 unsigned int c = *(unsigned char*)s;
7929
7930 if ('a' <= c && c <= 'z') {
7931 *s = 'A' + (c - 'a');
7932 modified = true;
7933 }
7934 s++;
7935 }
7936 return modified;
7937}
7938
7939/*
7940 * call-seq:
7941 * upcase!(mapping) -> self or nil
7942 *
7943 * Like String#upcase, except that:
7944 *
7945 * - Changes character casings in +self+ (not in a copy of +self+).
7946 * - Returns +self+ if any changes are made, +nil+ otherwise.
7947 *
7948 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7949 */
7950
7951static VALUE
7952rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7953{
7954 rb_encoding *enc;
7955 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7956
7957 flags = check_case_options(argc, argv, flags);
7958 str_modify_keep_cr(str);
7959 enc = str_true_enc(str);
7960 if (case_option_single_p(flags, enc, str)) {
7961 if (upcase_single(str))
7962 flags |= ONIGENC_CASE_MODIFIED;
7963 }
7964 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7965 rb_str_ascii_casemap(str, str, &flags, enc);
7966 else
7967 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7968
7969 if (ONIGENC_CASE_MODIFIED&flags) return str;
7970 return Qnil;
7971}
7972
7973
7974/*
7975 * call-seq:
7976 * upcase(mapping = :ascii) -> new_string
7977 *
7978 * :include: doc/string/upcase.rdoc
7979 */
7980
7981static VALUE
7982rb_str_upcase(int argc, VALUE *argv, VALUE str)
7983{
7984 rb_encoding *enc;
7985 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7986 VALUE ret;
7987
7988 flags = check_case_options(argc, argv, flags);
7989 enc = str_true_enc(str);
7990 if (case_option_single_p(flags, enc, str)) {
7991 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7992 str_enc_copy_direct(ret, str);
7993 upcase_single(ret);
7994 }
7995 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7996 ret = rb_str_new(0, RSTRING_LEN(str));
7997 rb_str_ascii_casemap(str, ret, &flags, enc);
7998 }
7999 else {
8000 ret = rb_str_casemap(str, &flags, enc);
8001 }
8002
8003 return ret;
8004}
8005
8006static bool
8007downcase_single(VALUE str)
8008{
8009 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8010 bool modified = false;
8011
8012 while (s < send) {
8013 unsigned int c = *(unsigned char*)s;
8014
8015 if ('A' <= c && c <= 'Z') {
8016 *s = 'a' + (c - 'A');
8017 modified = true;
8018 }
8019 s++;
8020 }
8021
8022 return modified;
8023}
8024
8025/*
8026 * call-seq:
8027 * downcase!(mapping) -> self or nil
8028 *
8029 * Like String#downcase, except that:
8030 *
8031 * - Changes character casings in +self+ (not in a copy of +self+).
8032 * - Returns +self+ if any changes are made, +nil+ otherwise.
8033 *
8034 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8035 */
8036
8037static VALUE
8038rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8039{
8040 rb_encoding *enc;
8041 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8042
8043 flags = check_case_options(argc, argv, flags);
8044 str_modify_keep_cr(str);
8045 enc = str_true_enc(str);
8046 if (case_option_single_p(flags, enc, str)) {
8047 if (downcase_single(str))
8048 flags |= ONIGENC_CASE_MODIFIED;
8049 }
8050 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8051 rb_str_ascii_casemap(str, str, &flags, enc);
8052 else
8053 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8054
8055 if (ONIGENC_CASE_MODIFIED&flags) return str;
8056 return Qnil;
8057}
8058
8059
8060/*
8061 * call-seq:
8062 * downcase(mapping = :ascii) -> new_string
8063 *
8064 * :include: doc/string/downcase.rdoc
8065 *
8066 */
8067
8068static VALUE
8069rb_str_downcase(int argc, VALUE *argv, VALUE str)
8070{
8071 rb_encoding *enc;
8072 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8073 VALUE ret;
8074
8075 flags = check_case_options(argc, argv, flags);
8076 enc = str_true_enc(str);
8077 if (case_option_single_p(flags, enc, str)) {
8078 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8079 str_enc_copy_direct(ret, str);
8080 downcase_single(ret);
8081 }
8082 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8083 ret = rb_str_new(0, RSTRING_LEN(str));
8084 rb_str_ascii_casemap(str, ret, &flags, enc);
8085 }
8086 else {
8087 ret = rb_str_casemap(str, &flags, enc);
8088 }
8089
8090 return ret;
8091}
8092
8093
8094/*
8095 * call-seq:
8096 * capitalize!(mapping = :ascii) -> self or nil
8097 *
8098 * Like String#capitalize, except that:
8099 *
8100 * - Changes character casings in +self+ (not in a copy of +self+).
8101 * - Returns +self+ if any changes are made, +nil+ otherwise.
8102 *
8103 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8104 */
8105
8106static VALUE
8107rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8108{
8109 rb_encoding *enc;
8110 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8111
8112 flags = check_case_options(argc, argv, flags);
8113 str_modify_keep_cr(str);
8114 enc = str_true_enc(str);
8115 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8116 if (flags&ONIGENC_CASE_ASCII_ONLY)
8117 rb_str_ascii_casemap(str, str, &flags, enc);
8118 else
8119 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8120
8121 if (ONIGENC_CASE_MODIFIED&flags) return str;
8122 return Qnil;
8123}
8124
8125
8126/*
8127 * call-seq:
8128 * capitalize(mapping = :ascii) -> new_string
8129 *
8130 * :include: doc/string/capitalize.rdoc
8131 *
8132 */
8133
8134static VALUE
8135rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8136{
8137 rb_encoding *enc;
8138 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8139 VALUE ret;
8140
8141 flags = check_case_options(argc, argv, flags);
8142 enc = str_true_enc(str);
8143 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8144 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8145 ret = rb_str_new(0, RSTRING_LEN(str));
8146 rb_str_ascii_casemap(str, ret, &flags, enc);
8147 }
8148 else {
8149 ret = rb_str_casemap(str, &flags, enc);
8150 }
8151 return ret;
8152}
8153
8154
8155/*
8156 * call-seq:
8157 * swapcase!(mapping) -> self or nil
8158 *
8159 * Like String#swapcase, except that:
8160 *
8161 * - Changes are made to +self+, not to copy of +self+.
8162 * - Returns +self+ if any changes are made, +nil+ otherwise.
8163 *
8164 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8165 */
8166
8167static VALUE
8168rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8169{
8170 rb_encoding *enc;
8171 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8172
8173 flags = check_case_options(argc, argv, flags);
8174 str_modify_keep_cr(str);
8175 enc = str_true_enc(str);
8176 if (flags&ONIGENC_CASE_ASCII_ONLY)
8177 rb_str_ascii_casemap(str, str, &flags, enc);
8178 else
8179 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8180
8181 if (ONIGENC_CASE_MODIFIED&flags) return str;
8182 return Qnil;
8183}
8184
8185
8186/*
8187 * call-seq:
8188 * swapcase(mapping = :ascii) -> new_string
8189 *
8190 * :include: doc/string/swapcase.rdoc
8191 *
8192 */
8193
8194static VALUE
8195rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8196{
8197 rb_encoding *enc;
8198 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8199 VALUE ret;
8200
8201 flags = check_case_options(argc, argv, flags);
8202 enc = str_true_enc(str);
8203 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8204 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8205 ret = rb_str_new(0, RSTRING_LEN(str));
8206 rb_str_ascii_casemap(str, ret, &flags, enc);
8207 }
8208 else {
8209 ret = rb_str_casemap(str, &flags, enc);
8210 }
8211 return ret;
8212}
8213
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range "a-z" is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
8221
8222static unsigned int
8223trnext(struct tr *t, rb_encoding *enc)
8224{
8225 int n;
8226
8227 for (;;) {
8228 nextpart:
8229 if (!t->gen) {
8230 if (t->p == t->pend) return -1;
8231 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8232 t->p += n;
8233 }
8234 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8235 t->p += n;
8236 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8237 t->p += n;
8238 if (t->p < t->pend) {
8239 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8240 t->p += n;
8241 if (t->now > c) {
8242 if (t->now < 0x80 && c < 0x80) {
8243 rb_raise(rb_eArgError,
8244 "invalid range \"%c-%c\" in string transliteration",
8245 t->now, c);
8246 }
8247 else {
8248 rb_raise(rb_eArgError, "invalid range in string transliteration");
8249 }
8250 continue; /* not reached */
8251 }
8252 else if (t->now < c) {
8253 t->gen = 1;
8254 t->max = c;
8255 }
8256 }
8257 }
8258 return t->now;
8259 }
8260 else {
8261 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8262 if (t->now == t->max) {
8263 t->gen = 0;
8264 goto nextpart;
8265 }
8266 }
8267 if (t->now < t->max) {
8268 return t->now;
8269 }
8270 else {
8271 t->gen = 0;
8272 return t->max;
8273 }
8274 }
8275 }
8276}
8277
8278static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8279
8280static VALUE
8281tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8282{
8283 const unsigned int errc = -1;
8284 unsigned int trans[256];
8285 rb_encoding *enc, *e1, *e2;
8286 struct tr trsrc, trrepl;
8287 int cflag = 0;
8288 unsigned int c, c0, last = 0;
8289 int modify = 0, i, l;
8290 unsigned char *s, *send;
8291 VALUE hash = 0;
8292 int singlebyte = single_byte_optimizable(str);
8293 int termlen;
8294 int cr;
8295
8296#define CHECK_IF_ASCII(c) \
8297 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8298 (cr = ENC_CODERANGE_VALID) : 0)
8299
8300 StringValue(src);
8301 StringValue(repl);
8302 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8303 if (RSTRING_LEN(repl) == 0) {
8304 return rb_str_delete_bang(1, &src, str);
8305 }
8306
8307 cr = ENC_CODERANGE(str);
8308 e1 = rb_enc_check(str, src);
8309 e2 = rb_enc_check(str, repl);
8310 if (e1 == e2) {
8311 enc = e1;
8312 }
8313 else {
8314 enc = rb_enc_check(src, repl);
8315 }
8316 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8317 if (RSTRING_LEN(src) > 1 &&
8318 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8319 trsrc.p + l < trsrc.pend) {
8320 cflag = 1;
8321 trsrc.p += l;
8322 }
8323 trrepl.p = RSTRING_PTR(repl);
8324 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8325 trsrc.gen = trrepl.gen = 0;
8326 trsrc.now = trrepl.now = 0;
8327 trsrc.max = trrepl.max = 0;
8328
8329 if (cflag) {
8330 for (i=0; i<256; i++) {
8331 trans[i] = 1;
8332 }
8333 while ((c = trnext(&trsrc, enc)) != errc) {
8334 if (c < 256) {
8335 trans[c] = errc;
8336 }
8337 else {
8338 if (!hash) hash = rb_hash_new();
8339 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8340 }
8341 }
8342 while ((c = trnext(&trrepl, enc)) != errc)
8343 /* retrieve last replacer */;
8344 last = trrepl.now;
8345 for (i=0; i<256; i++) {
8346 if (trans[i] != errc) {
8347 trans[i] = last;
8348 }
8349 }
8350 }
8351 else {
8352 unsigned int r;
8353
8354 for (i=0; i<256; i++) {
8355 trans[i] = errc;
8356 }
8357 while ((c = trnext(&trsrc, enc)) != errc) {
8358 r = trnext(&trrepl, enc);
8359 if (r == errc) r = trrepl.now;
8360 if (c < 256) {
8361 trans[c] = r;
8362 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8363 }
8364 else {
8365 if (!hash) hash = rb_hash_new();
8366 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8367 }
8368 }
8369 }
8370
8371 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8372 cr = ENC_CODERANGE_7BIT;
8373 str_modify_keep_cr(str);
8374 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8375 termlen = rb_enc_mbminlen(enc);
8376 if (sflag) {
8377 int clen, tlen;
8378 long offset, max = RSTRING_LEN(str);
8379 unsigned int save = -1;
8380 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8381
8382 while (s < send) {
8383 int may_modify = 0;
8384
8385 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8386 if (!MBCLEN_CHARFOUND_P(r)) {
8387 xfree(buf);
8388 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8389 }
8390 clen = MBCLEN_CHARFOUND_LEN(r);
8391 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8392
8393 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8394
8395 s += clen;
8396 if (c < 256) {
8397 c = trans[c];
8398 }
8399 else if (hash) {
8400 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8401 if (NIL_P(tmp)) {
8402 if (cflag) c = last;
8403 else c = errc;
8404 }
8405 else if (cflag) c = errc;
8406 else c = NUM2INT(tmp);
8407 }
8408 else {
8409 c = errc;
8410 }
8411 if (c != (unsigned int)-1) {
8412 if (save == c) {
8413 CHECK_IF_ASCII(c);
8414 continue;
8415 }
8416 save = c;
8417 tlen = rb_enc_codelen(c, enc);
8418 modify = 1;
8419 }
8420 else {
8421 save = -1;
8422 c = c0;
8423 if (enc != e1) may_modify = 1;
8424 }
8425 if ((offset = t - buf) + tlen > max) {
8426 size_t MAYBE_UNUSED(old) = max + termlen;
8427 max = offset + tlen + (send - s);
8428 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8429 t = buf + offset;
8430 }
8431 rb_enc_mbcput(c, t, enc);
8432 if (may_modify && memcmp(s, t, tlen) != 0) {
8433 modify = 1;
8434 }
8435 CHECK_IF_ASCII(c);
8436 t += tlen;
8437 }
8438 if (!STR_EMBED_P(str)) {
8439 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8440 }
8441 TERM_FILL((char *)t, termlen);
8442 RSTRING(str)->as.heap.ptr = (char *)buf;
8443 STR_SET_LEN(str, t - buf);
8444 STR_SET_NOEMBED(str);
8445 RSTRING(str)->as.heap.aux.capa = max;
8446 }
8447 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8448 while (s < send) {
8449 c = (unsigned char)*s;
8450 if (trans[c] != errc) {
8451 if (!cflag) {
8452 c = trans[c];
8453 *s = c;
8454 modify = 1;
8455 }
8456 else {
8457 *s = last;
8458 modify = 1;
8459 }
8460 }
8461 CHECK_IF_ASCII(c);
8462 s++;
8463 }
8464 }
8465 else {
8466 int clen, tlen;
8467 long offset, max = (long)((send - s) * 1.2);
8468 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8469
8470 while (s < send) {
8471 int may_modify = 0;
8472
8473 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8474 if (!MBCLEN_CHARFOUND_P(r)) {
8475 xfree(buf);
8476 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8477 }
8478 clen = MBCLEN_CHARFOUND_LEN(r);
8479 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8480
8481 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8482
8483 if (c < 256) {
8484 c = trans[c];
8485 }
8486 else if (hash) {
8487 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8488 if (NIL_P(tmp)) {
8489 if (cflag) c = last;
8490 else c = errc;
8491 }
8492 else if (cflag) c = errc;
8493 else c = NUM2INT(tmp);
8494 }
8495 else {
8496 c = cflag ? last : errc;
8497 }
8498 if (c != errc) {
8499 tlen = rb_enc_codelen(c, enc);
8500 modify = 1;
8501 }
8502 else {
8503 c = c0;
8504 if (enc != e1) may_modify = 1;
8505 }
8506 if ((offset = t - buf) + tlen > max) {
8507 size_t MAYBE_UNUSED(old) = max + termlen;
8508 max = offset + tlen + (long)((send - s) * 1.2);
8509 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8510 t = buf + offset;
8511 }
8512 if (s != t) {
8513 rb_enc_mbcput(c, t, enc);
8514 if (may_modify && memcmp(s, t, tlen) != 0) {
8515 modify = 1;
8516 }
8517 }
8518 CHECK_IF_ASCII(c);
8519 s += clen;
8520 t += tlen;
8521 }
8522 if (!STR_EMBED_P(str)) {
8523 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8524 }
8525 TERM_FILL((char *)t, termlen);
8526 RSTRING(str)->as.heap.ptr = (char *)buf;
8527 STR_SET_LEN(str, t - buf);
8528 STR_SET_NOEMBED(str);
8529 RSTRING(str)->as.heap.aux.capa = max;
8530 }
8531
8532 if (modify) {
8533 if (cr != ENC_CODERANGE_BROKEN)
8534 ENC_CODERANGE_SET(str, cr);
8535 rb_enc_associate(str, enc);
8536 return str;
8537 }
8538 return Qnil;
8539}
8540
8541
8542/*
8543 * call-seq:
8544 * tr!(selector, replacements) -> self or nil
8545 *
8546 * Like String#tr, except:
8547 *
8548 * - Performs substitutions in +self+ (not in a copy of +self+).
8549 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8550 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8552 */
8553
8554static VALUE
8555rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8556{
8557 return tr_trans(str, src, repl, 0);
8558}
8559
8560
8561/*
8562 * call-seq:
8563 * tr(selector, replacements) -> new_string
8564 *
8565 * Returns a copy of +self+ with each character specified by string +selector+
8566 * translated to the corresponding character in string +replacements+.
8567 * The correspondence is _positional_:
8568 *
8569 * - Each occurrence of the first character specified by +selector+
8570 * is translated to the first character in +replacements+.
8571 * - Each occurrence of the second character specified by +selector+
8572 * is translated to the second character in +replacements+.
8573 * - And so on.
8574 *
8575 * Example:
8576 *
8577 * 'hello'.tr('el', 'ip') #=> "hippo"
8578 *
8579 * If +replacements+ is shorter than +selector+,
8580 * it is implicitly padded with its own last character:
8581 *
8582 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8583 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8584 *
8585 * Arguments +selector+ and +replacements+ must be valid character selectors
8586 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8587 * and may use any of its valid forms, including negation, ranges, and escapes:
8588 *
8589 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8590 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8591 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8592 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8593 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8594 *
8595 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8596 */
8597
8598static VALUE
8599rb_str_tr(VALUE str, VALUE src, VALUE repl)
8600{
8601 str = str_duplicate(rb_cString, str);
8602 tr_trans(str, src, repl, 0);
8603 return str;
8604}
8605
8606#define TR_TABLE_MAX (UCHAR_MAX+1)
8607#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8608static void
8609tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8610 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8611{
8612 const unsigned int errc = -1;
8613 char buf[TR_TABLE_MAX];
8614 struct tr tr;
8615 unsigned int c;
8616 VALUE table = 0, ptable = 0;
8617 int i, l, cflag = 0;
8618
8619 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8620 tr.gen = tr.now = tr.max = 0;
8621
8622 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8623 cflag = 1;
8624 tr.p += l;
8625 }
8626 if (first) {
8627 for (i=0; i<TR_TABLE_MAX; i++) {
8628 stable[i] = 1;
8629 }
8630 stable[TR_TABLE_MAX] = cflag;
8631 }
8632 else if (stable[TR_TABLE_MAX] && !cflag) {
8633 stable[TR_TABLE_MAX] = 0;
8634 }
8635 for (i=0; i<TR_TABLE_MAX; i++) {
8636 buf[i] = cflag;
8637 }
8638
8639 while ((c = trnext(&tr, enc)) != errc) {
8640 if (c < TR_TABLE_MAX) {
8641 buf[(unsigned char)c] = !cflag;
8642 }
8643 else {
8644 VALUE key = UINT2NUM(c);
8645
8646 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8647 if (cflag) {
8648 ptable = *ctablep;
8649 table = ptable ? ptable : rb_hash_new();
8650 *ctablep = table;
8651 }
8652 else {
8653 table = rb_hash_new();
8654 ptable = *tablep;
8655 *tablep = table;
8656 }
8657 }
8658 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8659 rb_hash_aset(table, key, Qtrue);
8660 }
8661 }
8662 }
8663 for (i=0; i<TR_TABLE_MAX; i++) {
8664 stable[i] = stable[i] && buf[i];
8665 }
8666 if (!table && !cflag) {
8667 *tablep = 0;
8668 }
8669}
8670
8671
8672static int
8673tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8674{
8675 if (c < TR_TABLE_MAX) {
8676 return table[c] != 0;
8677 }
8678 else {
8679 VALUE v = UINT2NUM(c);
8680
8681 if (del) {
8682 if (!NIL_P(rb_hash_lookup(del, v)) &&
8683 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8684 return TRUE;
8685 }
8686 }
8687 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8688 return FALSE;
8689 }
8690 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8691 }
8692}
8693
8694/*
8695 * call-seq:
8696 * delete!(*selectors) -> self or nil
8697 *
8698 * Like String#delete, but modifies +self+ in place;
8699 * returns +self+ if any characters were deleted, +nil+ otherwise.
8700 *
8701 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8702 */
8703
8704static VALUE
8705rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8706{
8707 char squeez[TR_TABLE_SIZE];
8708 rb_encoding *enc = 0;
8709 char *s, *send, *t;
8710 VALUE del = 0, nodel = 0;
8711 int modify = 0;
8712 int i, ascompat, cr;
8713
8714 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8716 for (i=0; i<argc; i++) {
8717 VALUE s = argv[i];
8718
8719 StringValue(s);
8720 enc = rb_enc_check(str, s);
8721 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8722 }
8723
8724 str_modify_keep_cr(str);
8725 ascompat = rb_enc_asciicompat(enc);
8726 s = t = RSTRING_PTR(str);
8727 send = RSTRING_END(str);
8728 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8729 while (s < send) {
8730 unsigned int c;
8731 int clen;
8732
8733 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8734 if (squeez[c]) {
8735 modify = 1;
8736 }
8737 else {
8738 if (t != s) *t = c;
8739 t++;
8740 }
8741 s++;
8742 }
8743 else {
8744 c = rb_enc_codepoint_len(s, send, &clen, enc);
8745
8746 if (tr_find(c, squeez, del, nodel)) {
8747 modify = 1;
8748 }
8749 else {
8750 if (t != s) rb_enc_mbcput(c, t, enc);
8751 t += clen;
8753 }
8754 s += clen;
8755 }
8756 }
8757 TERM_FILL(t, TERM_LEN(str));
8758 STR_SET_LEN(str, t - RSTRING_PTR(str));
8759 ENC_CODERANGE_SET(str, cr);
8760
8761 if (modify) return str;
8762 return Qnil;
8763}
8764
8765
8766/*
8767 * call-seq:
8768 * delete(*selectors) -> new_string
8769 *
8770 * :include: doc/string/delete.rdoc
8771 *
8772 */
8773
8774static VALUE
8775rb_str_delete(int argc, VALUE *argv, VALUE str)
8776{
8777 str = str_duplicate(rb_cString, str);
8778 rb_str_delete_bang(argc, argv, str);
8779 return str;
8780}
8781
8782
8783/*
8784 * call-seq:
8785 * squeeze!(*selectors) -> self or nil
8786 *
8787 * Like String#squeeze, except that:
8788 *
8789 * - Characters are squeezed in +self+ (not in a copy of +self+).
8790 * - Returns +self+ if any changes are made, +nil+ otherwise.
8791 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8793 */
8794
8795static VALUE
8796rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8797{
8798 char squeez[TR_TABLE_SIZE];
8799 rb_encoding *enc = 0;
8800 VALUE del = 0, nodel = 0;
8801 unsigned char *s, *send, *t;
8802 int i, modify = 0;
8803 int ascompat, singlebyte = single_byte_optimizable(str);
8804 unsigned int save;
8805
8806 if (argc == 0) {
8807 enc = STR_ENC_GET(str);
8808 }
8809 else {
8810 for (i=0; i<argc; i++) {
8811 VALUE s = argv[i];
8812
8813 StringValue(s);
8814 enc = rb_enc_check(str, s);
8815 if (singlebyte && !single_byte_optimizable(s))
8816 singlebyte = 0;
8817 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8818 }
8819 }
8820
8821 str_modify_keep_cr(str);
8822 s = t = (unsigned char *)RSTRING_PTR(str);
8823 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8824 send = (unsigned char *)RSTRING_END(str);
8825 save = -1;
8826 ascompat = rb_enc_asciicompat(enc);
8827
8828 if (singlebyte) {
8829 while (s < send) {
8830 unsigned int c = *s++;
8831 if (c != save || (argc > 0 && !squeez[c])) {
8832 *t++ = save = c;
8833 }
8834 }
8835 }
8836 else {
8837 while (s < send) {
8838 unsigned int c;
8839 int clen;
8840
8841 if (ascompat && (c = *s) < 0x80) {
8842 if (c != save || (argc > 0 && !squeez[c])) {
8843 *t++ = save = c;
8844 }
8845 s++;
8846 }
8847 else {
8848 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8849
8850 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8851 if (t != s) rb_enc_mbcput(c, t, enc);
8852 save = c;
8853 t += clen;
8854 }
8855 s += clen;
8856 }
8857 }
8858 }
8859
8860 TERM_FILL((char *)t, TERM_LEN(str));
8861 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8862 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8863 modify = 1;
8864 }
8865
8866 if (modify) return str;
8867 return Qnil;
8868}
8869
8870
8871/*
8872 * call-seq:
8873 * squeeze(*selectors) -> new_string
8874 *
8875 * :include: doc/string/squeeze.rdoc
8876 *
8877 */
8878
8879static VALUE
8880rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8881{
8882 str = str_duplicate(rb_cString, str);
8883 rb_str_squeeze_bang(argc, argv, str);
8884 return str;
8885}
8886
8887
8888/*
8889 * call-seq:
8890 * tr_s!(selector, replacements) -> self or nil
8891 *
8892 * Like String#tr_s, except:
8893 *
8894 * - Modifies +self+ in place (not a copy of +self+).
8895 * - Returns +self+ if any changes were made, +nil+ otherwise.
8896 *
 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8898 */
8899
8900static VALUE
8901rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8902{
8903 return tr_trans(str, src, repl, 1);
8904}
8905
8906
8907/*
8908 * call-seq:
8909 * tr_s(selector, replacements) -> new_string
8910 *
8911 * Like String#tr, except:
8912 *
8913 * - Also squeezes the modified portions of the translated string;
8914 * see String#squeeze.
8915 * - Returns the translated and squeezed string.
8916 *
8917 * Examples:
8918 *
8919 * 'hello'.tr_s('l', 'r') #=> "hero"
8920 * 'hello'.tr_s('el', '-') #=> "h-o"
8921 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8922 *
8923 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8924 *
8925 */
8926
8927static VALUE
8928rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8929{
8930 str = str_duplicate(rb_cString, str);
8931 tr_trans(str, src, repl, 1);
8932 return str;
8933}
8934
8935
8936/*
8937 * call-seq:
8938 * count(*selectors) -> integer
8939 *
8940 * :include: doc/string/count.rdoc
8941 */
8942
8943static VALUE
8944rb_str_count(int argc, VALUE *argv, VALUE str)
8945{
8946 char table[TR_TABLE_SIZE];
8947 rb_encoding *enc = 0;
8948 VALUE del = 0, nodel = 0, tstr;
8949 char *s, *send;
8950 int i;
8951 int ascompat;
8952 size_t n = 0;
8953
8955
8956 tstr = argv[0];
8957 StringValue(tstr);
8958 enc = rb_enc_check(str, tstr);
8959 if (argc == 1) {
8960 const char *ptstr;
8961 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8962 (ptstr = RSTRING_PTR(tstr),
8963 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8964 !is_broken_string(str)) {
8965 int clen;
8966 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8967
8968 s = RSTRING_PTR(str);
8969 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8970 send = RSTRING_END(str);
8971 while (s < send) {
8972 if (*(unsigned char*)s++ == c) n++;
8973 }
8974 return SIZET2NUM(n);
8975 }
8976 }
8977
8978 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8979 for (i=1; i<argc; i++) {
8980 tstr = argv[i];
8981 StringValue(tstr);
8982 enc = rb_enc_check(str, tstr);
8983 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8984 }
8985
8986 s = RSTRING_PTR(str);
8987 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8988 send = RSTRING_END(str);
8989 ascompat = rb_enc_asciicompat(enc);
8990 while (s < send) {
8991 unsigned int c;
8992
8993 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8994 if (table[c]) {
8995 n++;
8996 }
8997 s++;
8998 }
8999 else {
9000 int clen;
9001 c = rb_enc_codepoint_len(s, send, &clen, enc);
9002 if (tr_find(c, table, del, nodel)) {
9003 n++;
9004 }
9005 s += clen;
9006 }
9007 }
9008
9009 return SIZET2NUM(n);
9010}
9011
9012static VALUE
9013rb_fs_check(VALUE val)
9014{
9015 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9016 val = rb_check_string_type(val);
9017 if (NIL_P(val)) return 0;
9018 }
9019 return val;
9020}
9021
/*
 * Byte-indexed whitespace table: 1 for the six ASCII whitespace bytes,
 * 0 (by default initialization) for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab   */
    [0x0a] = 1, /* line feed        */
    [0x0b] = 1, /* vertical tab     */
    [0x0c] = 1, /* form feed        */
    [0x0d] = 1, /* carriage return  */
    [0x20] = 1, /* space            */
};

/* The argument is narrowed to unsigned char before indexing. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9042
9043static long
9044split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9045{
9046 if (empty_count >= 0 && len == 0) {
9047 return empty_count + 1;
9048 }
9049 if (empty_count > 0) {
9050 /* make different substrings */
9051 if (result) {
9052 do {
9053 rb_ary_push(result, str_new_empty_String(str));
9054 } while (--empty_count > 0);
9055 }
9056 else {
9057 do {
9058 rb_yield(str_new_empty_String(str));
9059 } while (--empty_count > 0);
9060 }
9061 }
9062 str = rb_str_subseq(str, beg, len);
9063 if (result) {
9064 rb_ary_push(result, str);
9065 }
9066 else {
9067 rb_yield(str);
9068 }
9069 return empty_count;
9070}
9071
/* Strategy chosen by rb_str_split_m for cutting the receiver. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string separator */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
9075
9076static split_type_t
9077literal_split_pattern(VALUE spat, split_type_t default_type)
9078{
9079 rb_encoding *enc = STR_ENC_GET(spat);
9080 const char *ptr;
9081 long len;
9082 RSTRING_GETMEM(spat, ptr, len);
9083 if (len == 0) {
9084 /* Special case - split into chars */
9085 return SPLIT_TYPE_CHARS;
9086 }
9087 else if (rb_enc_asciicompat(enc)) {
9088 if (len == 1 && ptr[0] == ' ') {
9089 return SPLIT_TYPE_AWK;
9090 }
9091 }
9092 else {
9093 int l;
9094 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9095 return SPLIT_TYPE_AWK;
9096 }
9097 }
9098 return default_type;
9099}
9100
9101/*
9102 * call-seq:
9103 * split(field_sep = $;, limit = 0) -> array_of_substrings
9104 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9105 *
9106 * :include: doc/string/split.rdoc
9107 *
9108 */
9109
9110static VALUE
9111rb_str_split_m(int argc, VALUE *argv, VALUE str)
9112{
9113 rb_encoding *enc;
9114 VALUE spat;
9115 VALUE limit;
9116 split_type_t split_type;
9117 long beg, end, i = 0, empty_count = -1;
9118 int lim = 0;
9119 VALUE result, tmp;
9120
9121 result = rb_block_given_p() ? Qfalse : Qnil;
9122 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9123 lim = NUM2INT(limit);
9124 if (lim <= 0) limit = Qnil;
9125 else if (lim == 1) {
9126 if (RSTRING_LEN(str) == 0)
9127 return result ? rb_ary_new2(0) : str;
9128 tmp = str_duplicate(rb_cString, str);
9129 if (!result) {
9130 rb_yield(tmp);
9131 return str;
9132 }
9133 return rb_ary_new3(1, tmp);
9134 }
9135 i = 1;
9136 }
9137 if (NIL_P(limit) && !lim) empty_count = 0;
9138
9139 enc = STR_ENC_GET(str);
9140 split_type = SPLIT_TYPE_REGEXP;
9141 if (!NIL_P(spat)) {
9142 spat = get_pat_quoted(spat, 0);
9143 }
9144 else if (NIL_P(spat = rb_fs)) {
9145 split_type = SPLIT_TYPE_AWK;
9146 }
9147 else if (!(spat = rb_fs_check(spat))) {
9148 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9149 }
9150 else {
9151 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9152 }
9153 if (split_type != SPLIT_TYPE_AWK) {
9154 switch (BUILTIN_TYPE(spat)) {
9155 case T_REGEXP:
9156 rb_reg_options(spat); /* check if uninitialized */
9157 tmp = RREGEXP_SRC(spat);
9158 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9159 if (split_type == SPLIT_TYPE_AWK) {
9160 spat = tmp;
9161 split_type = SPLIT_TYPE_STRING;
9162 }
9163 break;
9164
9165 case T_STRING:
9166 mustnot_broken(spat);
9167 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9168 break;
9169
9170 default:
9172 }
9173 }
9174
9175#define SPLIT_STR(beg, len) ( \
9176 empty_count = split_string(result, str, beg, len, empty_count), \
9177 str_mod_check(str, str_start, str_len))
9178
9179 beg = 0;
9180 char *ptr = RSTRING_PTR(str);
9181 char *const str_start = ptr;
9182 const long str_len = RSTRING_LEN(str);
9183 char *const eptr = str_start + str_len;
9184 if (split_type == SPLIT_TYPE_AWK) {
9185 char *bptr = ptr;
9186 int skip = 1;
9187 unsigned int c;
9188
9189 if (result) result = rb_ary_new();
9190 end = beg;
9191 if (is_ascii_string(str)) {
9192 while (ptr < eptr) {
9193 c = (unsigned char)*ptr++;
9194 if (skip) {
9195 if (ascii_isspace(c)) {
9196 beg = ptr - bptr;
9197 }
9198 else {
9199 end = ptr - bptr;
9200 skip = 0;
9201 if (!NIL_P(limit) && lim <= i) break;
9202 }
9203 }
9204 else if (ascii_isspace(c)) {
9205 SPLIT_STR(beg, end-beg);
9206 skip = 1;
9207 beg = ptr - bptr;
9208 if (!NIL_P(limit)) ++i;
9209 }
9210 else {
9211 end = ptr - bptr;
9212 }
9213 }
9214 }
9215 else {
9216 while (ptr < eptr) {
9217 int n;
9218
9219 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9220 ptr += n;
9221 if (skip) {
9222 if (rb_isspace(c)) {
9223 beg = ptr - bptr;
9224 }
9225 else {
9226 end = ptr - bptr;
9227 skip = 0;
9228 if (!NIL_P(limit) && lim <= i) break;
9229 }
9230 }
9231 else if (rb_isspace(c)) {
9232 SPLIT_STR(beg, end-beg);
9233 skip = 1;
9234 beg = ptr - bptr;
9235 if (!NIL_P(limit)) ++i;
9236 }
9237 else {
9238 end = ptr - bptr;
9239 }
9240 }
9241 }
9242 }
9243 else if (split_type == SPLIT_TYPE_STRING) {
9244 char *substr_start = ptr;
9245 char *sptr = RSTRING_PTR(spat);
9246 long slen = RSTRING_LEN(spat);
9247
9248 if (result) result = rb_ary_new();
9249 mustnot_broken(str);
9250 enc = rb_enc_check(str, spat);
9251 while (ptr < eptr &&
9252 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9253 /* Check we are at the start of a char */
9254 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9255 if (t != ptr + end) {
9256 ptr = t;
9257 continue;
9258 }
9259 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9260 str_mod_check(spat, sptr, slen);
9261 ptr += end + slen;
9262 substr_start = ptr;
9263 if (!NIL_P(limit) && lim <= ++i) break;
9264 }
9265 beg = ptr - str_start;
9266 }
9267 else if (split_type == SPLIT_TYPE_CHARS) {
9268 int n;
9269
9270 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9271 mustnot_broken(str);
9272 enc = rb_enc_get(str);
9273 while (ptr < eptr &&
9274 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9275 SPLIT_STR(ptr - str_start, n);
9276 ptr += n;
9277 if (!NIL_P(limit) && lim <= ++i) break;
9278 }
9279 beg = ptr - str_start;
9280 }
9281 else {
9282 if (result) result = rb_ary_new();
9283 long len = RSTRING_LEN(str);
9284 long start = beg;
9285 long idx;
9286 int last_null = 0;
9287 struct re_registers *regs;
9288 VALUE match = 0;
9289
9290 for (; rb_reg_search(spat, str, start, 0) >= 0;
9291 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9292 match = rb_backref_get();
9293 if (!result) rb_match_busy(match);
9294 regs = RMATCH_REGS(match);
9295 end = BEG(0);
9296 if (start == end && BEG(0) == END(0)) {
9297 if (!ptr) {
9298 SPLIT_STR(0, 0);
9299 break;
9300 }
9301 else if (last_null == 1) {
9302 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9303 beg = start;
9304 }
9305 else {
9306 if (start == len)
9307 start++;
9308 else
9309 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9310 last_null = 1;
9311 continue;
9312 }
9313 }
9314 else {
9315 SPLIT_STR(beg, end-beg);
9316 beg = start = END(0);
9317 }
9318 last_null = 0;
9319
9320 for (idx=1; idx < regs->num_regs; idx++) {
9321 if (BEG(idx) == -1) continue;
9322 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9323 }
9324 if (!NIL_P(limit) && lim <= ++i) break;
9325 }
9326 if (match) rb_match_unbusy(match);
9327 }
9328 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9329 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9330 }
9331
9332 return result ? result : str;
9333}
9334
9335VALUE
9336rb_str_split(VALUE str, const char *sep0)
9337{
9338 VALUE sep;
9339
9340 StringValue(str);
9341 sep = rb_str_new_cstr(sep0);
9342 return rb_str_split_m(1, &sep, str);
9343}
9344
9345#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9346
9347static inline int
9348enumerator_element(VALUE ary, VALUE e)
9349{
9350 if (ary) {
9351 rb_ary_push(ary, e);
9352 return 0;
9353 }
9354 else {
9355 rb_yield(e);
9356 return 1;
9357 }
9358}
9359
9360#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9361
9362static const char *
9363chomp_newline(const char *p, const char *e, rb_encoding *enc)
9364{
9365 const char *prev = rb_enc_prev_char(p, e, e, enc);
9366 if (rb_enc_is_newline(prev, e, enc)) {
9367 e = prev;
9368 prev = rb_enc_prev_char(p, e, e, enc);
9369 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9370 e = prev;
9371 }
9372 return e;
9373}
9374
9375static VALUE
9376get_rs(void)
9377{
9378 VALUE rs = rb_rs;
9379 if (!NIL_P(rs) &&
9380 (!RB_TYPE_P(rs, T_STRING) ||
9381 RSTRING_LEN(rs) != 1 ||
9382 RSTRING_PTR(rs)[0] != '\n')) {
9383 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9384 }
9385 return rs;
9386}
9387
9388#define rb_rs get_rs()
9389
9390static VALUE
9391rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9392{
9393 rb_encoding *enc;
9394 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9395 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9396 long pos, len, rslen;
9397 int rsnewline = 0;
9398
9399 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9400 rs = rb_rs;
9401 if (!NIL_P(opts)) {
9402 static ID keywords[1];
9403 if (!keywords[0]) {
9404 keywords[0] = rb_intern_const("chomp");
9405 }
9406 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9407 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9408 }
9409
9410 if (NIL_P(rs)) {
9411 if (!ENUM_ELEM(ary, str)) {
9412 return ary;
9413 }
9414 else {
9415 return orig;
9416 }
9417 }
9418
9419 if (!RSTRING_LEN(str)) goto end;
9420 str = rb_str_new_frozen(str);
9421 ptr = subptr = RSTRING_PTR(str);
9422 pend = RSTRING_END(str);
9423 len = RSTRING_LEN(str);
9424 StringValue(rs);
9425 rslen = RSTRING_LEN(rs);
9426
9427 if (rs == rb_default_rs)
9428 enc = rb_enc_get(str);
9429 else
9430 enc = rb_enc_check(str, rs);
9431
9432 if (rslen == 0) {
9433 /* paragraph mode */
9434 int n;
9435 const char *eol = NULL;
9436 subend = subptr;
9437 while (subend < pend) {
9438 long chomp_rslen = 0;
9439 do {
9440 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9441 n = 0;
9442 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9443 if (rb_enc_is_newline(subend + n, pend, enc)) {
9444 if (eol == subend) break;
9445 subend += rslen;
9446 if (subptr) {
9447 eol = subend;
9448 chomp_rslen = -rslen;
9449 }
9450 }
9451 else {
9452 if (!subptr) subptr = subend;
9453 subend += rslen;
9454 }
9455 rslen = 0;
9456 } while (subend < pend);
9457 if (!subptr) break;
9458 if (rslen == 0) chomp_rslen = 0;
9459 line = rb_str_subseq(str, subptr - ptr,
9460 subend - subptr + (chomp ? chomp_rslen : rslen));
9461 if (ENUM_ELEM(ary, line)) {
9462 str_mod_check(str, ptr, len);
9463 }
9464 subptr = eol = NULL;
9465 }
9466 goto end;
9467 }
9468 else {
9469 rsptr = RSTRING_PTR(rs);
9470 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9471 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9472 rsnewline = 1;
9473 }
9474 }
9475
9476 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9477 rs = rb_str_new(rsptr, rslen);
9478 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9479 rsptr = RSTRING_PTR(rs);
9480 rslen = RSTRING_LEN(rs);
9481 }
9482
9483 while (subptr < pend) {
9484 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9485 if (pos < 0) break;
9486 hit = subptr + pos;
9487 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9488 if (hit != adjusted) {
9489 subptr = adjusted;
9490 continue;
9491 }
9492 subend = hit += rslen;
9493 if (chomp) {
9494 if (rsnewline) {
9495 subend = chomp_newline(subptr, subend, enc);
9496 }
9497 else {
9498 subend -= rslen;
9499 }
9500 }
9501 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9502 if (ENUM_ELEM(ary, line)) {
9503 str_mod_check(str, ptr, len);
9504 }
9505 subptr = hit;
9506 }
9507
9508 if (subptr != pend) {
9509 if (chomp) {
9510 if (rsnewline) {
9511 pend = chomp_newline(subptr, pend, enc);
9512 }
9513 else if (pend - subptr >= rslen &&
9514 memcmp(pend - rslen, rsptr, rslen) == 0) {
9515 pend -= rslen;
9516 }
9517 }
9518 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9519 ENUM_ELEM(ary, line);
9520 RB_GC_GUARD(str);
9521 }
9522
9523 end:
9524 if (ary)
9525 return ary;
9526 else
9527 return orig;
9528}
9529
9530/*
9531 * call-seq:
9532 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9533 * each_line(record_separator = $/, chomp: false) -> enumerator
9534 *
9535 * :include: doc/string/each_line.rdoc
9536 *
9537 */
9538
9539static VALUE
9540rb_str_each_line(int argc, VALUE *argv, VALUE str)
9541{
9542 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9543 return rb_str_enumerate_lines(argc, argv, str, 0);
9544}
9545
9546/*
9547 * call-seq:
9548 * lines(record_separator = $/, chomp: false) -> array_of_strings
9549 *
9550 * Returns substrings ("lines") of +self+
9551 * according to the given arguments:
9552 *
9553 * s = <<~EOT
9554 * This is the first line.
9555 * This is line two.
9556 *
9557 * This is line four.
9558 * This is line five.
9559 * EOT
9560 *
9561 * With the default argument values:
9562 *
9563 * $/ # => "\n"
9564 * s.lines
9565 * # =>
9566 * ["This is the first line.\n",
9567 * "This is line two.\n",
9568 * "\n",
9569 * "This is line four.\n",
9570 * "This is line five.\n"]
9571 *
9572 * With a different +record_separator+:
9573 *
9574 * record_separator = ' is '
9575 * s.lines(record_separator)
9576 * # =>
9577 * ["This is ",
9578 * "the first line.\nThis is ",
9579 * "line two.\n\nThis is ",
9580 * "line four.\nThis is ",
9581 * "line five.\n"]
9582 *
9583 * With keyword argument +chomp+ as +true+,
9584 * removes the trailing newline from each line:
9585 *
9586 * s.lines(chomp: true)
9587 * # =>
9588 * ["This is the first line.",
9589 * "This is line two.",
9590 * "",
9591 * "This is line four.",
9592 * "This is line five."]
9593 *
9594 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9595 */
9596
9597static VALUE
9598rb_str_lines(int argc, VALUE *argv, VALUE str)
9599{
9600 VALUE ary = WANTARRAY("lines", 0);
9601 return rb_str_enumerate_lines(argc, argv, str, ary);
9602}
9603
9604static VALUE
9605rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9606{
9607 return LONG2FIX(RSTRING_LEN(str));
9608}
9609
9610static VALUE
9611rb_str_enumerate_bytes(VALUE str, VALUE ary)
9612{
9613 long i;
9614
9615 for (i=0; i<RSTRING_LEN(str); i++) {
9616 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9617 }
9618 if (ary)
9619 return ary;
9620 else
9621 return str;
9622}
9623
9624/*
9625 * call-seq:
9626 * each_byte {|byte| ... } -> self
9627 * each_byte -> enumerator
9628 *
9629 * :include: doc/string/each_byte.rdoc
9630 *
9631 */
9632
9633static VALUE
9634rb_str_each_byte(VALUE str)
9635{
9636 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9637 return rb_str_enumerate_bytes(str, 0);
9638}
9639
9640/*
9641 * call-seq:
9642 * bytes -> array_of_bytes
9643 *
9644 * :include: doc/string/bytes.rdoc
9645 *
9646 */
9647
9648static VALUE
9649rb_str_bytes(VALUE str)
9650{
9651 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9652 return rb_str_enumerate_bytes(str, ary);
9653}
9654
9655static VALUE
9656rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9657{
9658 return rb_str_length(str);
9659}
9660
9661static VALUE
9662rb_str_enumerate_chars(VALUE str, VALUE ary)
9663{
9664 VALUE orig = str;
9665 long i, len, n;
9666 const char *ptr;
9667 rb_encoding *enc;
9668
9669 str = rb_str_new_frozen(str);
9670 ptr = RSTRING_PTR(str);
9671 len = RSTRING_LEN(str);
9672 enc = rb_enc_get(str);
9673
9675 for (i = 0; i < len; i += n) {
9676 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9677 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9678 }
9679 }
9680 else {
9681 for (i = 0; i < len; i += n) {
9682 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9683 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9684 }
9685 }
9686 RB_GC_GUARD(str);
9687 if (ary)
9688 return ary;
9689 else
9690 return orig;
9691}
9692
9693/*
9694 * call-seq:
9695 * each_char {|char| ... } -> self
9696 * each_char -> enumerator
9697 *
9698 * :include: doc/string/each_char.rdoc
9699 *
9700 */
9701
9702static VALUE
9703rb_str_each_char(VALUE str)
9704{
9705 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9706 return rb_str_enumerate_chars(str, 0);
9707}
9708
9709/*
9710 * call-seq:
9711 * chars -> array_of_characters
9712 *
9713 * :include: doc/string/chars.rdoc
9714 *
9715 */
9716
9717static VALUE
9718rb_str_chars(VALUE str)
9719{
9720 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9721 return rb_str_enumerate_chars(str, ary);
9722}
9723
9724static VALUE
9725rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9726{
9727 VALUE orig = str;
9728 int n;
9729 unsigned int c;
9730 const char *ptr, *end;
9731 rb_encoding *enc;
9732
9733 if (single_byte_optimizable(str))
9734 return rb_str_enumerate_bytes(str, ary);
9735
9736 str = rb_str_new_frozen(str);
9737 ptr = RSTRING_PTR(str);
9738 end = RSTRING_END(str);
9739 enc = STR_ENC_GET(str);
9740
9741 while (ptr < end) {
9742 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9743 ENUM_ELEM(ary, UINT2NUM(c));
9744 ptr += n;
9745 }
9746 RB_GC_GUARD(str);
9747 if (ary)
9748 return ary;
9749 else
9750 return orig;
9751}
9752
9753/*
9754 * call-seq:
9755 * each_codepoint {|codepoint| ... } -> self
9756 * each_codepoint -> enumerator
9757 *
9758 * :include: doc/string/each_codepoint.rdoc
9759 *
9760 */
9761
9762static VALUE
9763rb_str_each_codepoint(VALUE str)
9764{
9765 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9766 return rb_str_enumerate_codepoints(str, 0);
9767}
9768
9769/*
9770 * call-seq:
9771 * codepoints -> array_of_integers
9772 *
9773 * :include: doc/string/codepoints.rdoc
9774 *
9775 */
9776
9777static VALUE
9778rb_str_codepoints(VALUE str)
9779{
9780 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9781 return rb_str_enumerate_codepoints(str, ary);
9782}
9783
9784static regex_t *
9785get_reg_grapheme_cluster(rb_encoding *enc)
9786{
9787 int encidx = rb_enc_to_index(enc);
9788
9789 const OnigUChar source_ascii[] = "\\X";
9790 const OnigUChar *source = source_ascii;
9791 size_t source_len = sizeof(source_ascii) - 1;
9792
9793 switch (encidx) {
9794#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9795#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9796#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9797#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9798#define CASE_UTF(e) \
9799 case ENCINDEX_UTF_##e: { \
9800 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9801 source = source_UTF_##e; \
9802 source_len = sizeof(source_UTF_##e); \
9803 break; \
9804 }
9805 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9806#undef CASE_UTF
9807#undef CHARS_16BE
9808#undef CHARS_16LE
9809#undef CHARS_32BE
9810#undef CHARS_32LE
9811 }
9812
9813 regex_t *reg_grapheme_cluster;
9814 OnigErrorInfo einfo;
9815 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9816 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9817 if (r) {
9818 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9819 onig_error_code_to_str(message, r, &einfo);
9820 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9821 }
9822
9823 return reg_grapheme_cluster;
9824}
9825
9826static regex_t *
9827get_cached_reg_grapheme_cluster(rb_encoding *enc)
9828{
9829 int encidx = rb_enc_to_index(enc);
9830 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9831
9832 if (encidx == rb_utf8_encindex()) {
9833 if (!reg_grapheme_cluster_utf8) {
9834 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9835 }
9836
9837 return reg_grapheme_cluster_utf8;
9838 }
9839
9840 return NULL;
9841}
9842
9843static VALUE
9844rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9845{
9846 size_t grapheme_cluster_count = 0;
9847 rb_encoding *enc = get_encoding(str);
9848 const char *ptr, *end;
9849
9850 if (!rb_enc_unicode_p(enc)) {
9851 return rb_str_length(str);
9852 }
9853
9854 bool cached_reg_grapheme_cluster = true;
9855 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9856 if (!reg_grapheme_cluster) {
9857 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9858 cached_reg_grapheme_cluster = false;
9859 }
9860
9861 ptr = RSTRING_PTR(str);
9862 end = RSTRING_END(str);
9863
9864 while (ptr < end) {
9865 OnigPosition len = onig_match(reg_grapheme_cluster,
9866 (const OnigUChar *)ptr, (const OnigUChar *)end,
9867 (const OnigUChar *)ptr, NULL, 0);
9868 if (len <= 0) break;
9869 grapheme_cluster_count++;
9870 ptr += len;
9871 }
9872
9873 if (!cached_reg_grapheme_cluster) {
9874 onig_free(reg_grapheme_cluster);
9875 }
9876
9877 return SIZET2NUM(grapheme_cluster_count);
9878}
9879
9880static VALUE
9881rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9882{
9883 VALUE orig = str;
9884 rb_encoding *enc = get_encoding(str);
9885 const char *ptr0, *ptr, *end;
9886
9887 if (!rb_enc_unicode_p(enc)) {
9888 return rb_str_enumerate_chars(str, ary);
9889 }
9890
9891 if (!ary) str = rb_str_new_frozen(str);
9892
9893 bool cached_reg_grapheme_cluster = true;
9894 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9895 if (!reg_grapheme_cluster) {
9896 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9897 cached_reg_grapheme_cluster = false;
9898 }
9899
9900 ptr0 = ptr = RSTRING_PTR(str);
9901 end = RSTRING_END(str);
9902
9903 while (ptr < end) {
9904 OnigPosition len = onig_match(reg_grapheme_cluster,
9905 (const OnigUChar *)ptr, (const OnigUChar *)end,
9906 (const OnigUChar *)ptr, NULL, 0);
9907 if (len <= 0) break;
9908 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9909 ptr += len;
9910 }
9911
9912 if (!cached_reg_grapheme_cluster) {
9913 onig_free(reg_grapheme_cluster);
9914 }
9915
9916 RB_GC_GUARD(str);
9917 if (ary)
9918 return ary;
9919 else
9920 return orig;
9921}
9922
9923/*
9924 * call-seq:
9925 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9926 * each_grapheme_cluster -> enumerator
9927 *
9928 * :include: doc/string/each_grapheme_cluster.rdoc
9929 *
9930 */
9931
9932static VALUE
9933rb_str_each_grapheme_cluster(VALUE str)
9934{
9935 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9936 return rb_str_enumerate_grapheme_clusters(str, 0);
9937}
9938
9939/*
9940 * call-seq:
9941 * grapheme_clusters -> array_of_grapheme_clusters
9942 *
9943 * :include: doc/string/grapheme_clusters.rdoc
9944 *
9945 */
9946
9947static VALUE
9948rb_str_grapheme_clusters(VALUE str)
9949{
9950 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9951 return rb_str_enumerate_grapheme_clusters(str, ary);
9952}
9953
9954static long
9955chopped_length(VALUE str)
9956{
9957 rb_encoding *enc = STR_ENC_GET(str);
9958 const char *p, *p2, *beg, *end;
9959
9960 beg = RSTRING_PTR(str);
9961 end = beg + RSTRING_LEN(str);
9962 if (beg >= end) return 0;
9963 p = rb_enc_prev_char(beg, end, end, enc);
9964 if (!p) return 0;
9965 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9966 p2 = rb_enc_prev_char(beg, p, end, enc);
9967 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9968 }
9969 return p - beg;
9970}
9971
9972/*
9973 * call-seq:
9974 * chop! -> self or nil
9975 *
9976 * Like String#chop, except that:
9977 *
9978 * - Removes trailing characters from +self+ (not from a copy of +self+).
9979 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9980 *
9981 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9982 */
9983
9984static VALUE
9985rb_str_chop_bang(VALUE str)
9986{
9987 str_modify_keep_cr(str);
9988 if (RSTRING_LEN(str) > 0) {
9989 long len;
9990 len = chopped_length(str);
9991 STR_SET_LEN(str, len);
9992 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9993 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9995 }
9996 return str;
9997 }
9998 return Qnil;
9999}
10000
10001
10002/*
10003 * call-seq:
10004 * chop -> new_string
10005 *
10006 * :include: doc/string/chop.rdoc
10007 *
10008 */
10009
10010static VALUE
10011rb_str_chop(VALUE str)
10012{
10013 return rb_str_subseq(str, 0, chopped_length(str));
10014}
10015
10016static long
10017smart_chomp(VALUE str, const char *e, const char *p)
10018{
10019 rb_encoding *enc = rb_enc_get(str);
10020 if (rb_enc_mbminlen(enc) > 1) {
10021 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10022 if (rb_enc_is_newline(pp, e, enc)) {
10023 e = pp;
10024 }
10025 pp = e - rb_enc_mbminlen(enc);
10026 if (pp >= p) {
10027 pp = rb_enc_left_char_head(p, pp, e, enc);
10028 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10029 e = pp;
10030 }
10031 }
10032 }
10033 else {
10034 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10035 case '\n':
10036 if (--e > p && *(e-1) == '\r') {
10037 --e;
10038 }
10039 break;
10040 case '\r':
10041 --e;
10042 break;
10043 }
10044 }
10045 return e - p;
10046}
10047
10048static long
10049chompped_length(VALUE str, VALUE rs)
10050{
10051 rb_encoding *enc;
10052 int newline;
10053 char *pp, *e, *rsptr;
10054 long rslen;
10055 char *const p = RSTRING_PTR(str);
10056 long len = RSTRING_LEN(str);
10057
10058 if (len == 0) return 0;
10059 e = p + len;
10060 if (rs == rb_default_rs) {
10061 return smart_chomp(str, e, p);
10062 }
10063
10064 enc = rb_enc_get(str);
10065 RSTRING_GETMEM(rs, rsptr, rslen);
10066 if (rslen == 0) {
10067 if (rb_enc_mbminlen(enc) > 1) {
10068 while (e > p) {
10069 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10070 if (!rb_enc_is_newline(pp, e, enc)) break;
10071 e = pp;
10072 pp -= rb_enc_mbminlen(enc);
10073 if (pp >= p) {
10074 pp = rb_enc_left_char_head(p, pp, e, enc);
10075 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10076 e = pp;
10077 }
10078 }
10079 }
10080 }
10081 else {
10082 while (e > p && *(e-1) == '\n') {
10083 --e;
10084 if (e > p && *(e-1) == '\r')
10085 --e;
10086 }
10087 }
10088 return e - p;
10089 }
10090 if (rslen > len) return len;
10091
10092 enc = rb_enc_get(rs);
10093 newline = rsptr[rslen-1];
10094 if (rslen == rb_enc_mbminlen(enc)) {
10095 if (rslen == 1) {
10096 if (newline == '\n')
10097 return smart_chomp(str, e, p);
10098 }
10099 else {
10100 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10101 return smart_chomp(str, e, p);
10102 }
10103 }
10104
10105 enc = rb_enc_check(str, rs);
10106 if (is_broken_string(rs)) {
10107 return len;
10108 }
10109 pp = e - rslen;
10110 if (p[len-1] == newline &&
10111 (rslen <= 1 ||
10112 memcmp(rsptr, pp, rslen) == 0)) {
10113 if (at_char_boundary(p, pp, e, enc))
10114 return len - rslen;
10115 RB_GC_GUARD(rs);
10116 }
10117 return len;
10118}
10119
10125static VALUE
10126chomp_rs(int argc, const VALUE *argv)
10127{
10128 rb_check_arity(argc, 0, 1);
10129 if (argc > 0) {
10130 VALUE rs = argv[0];
10131 if (!NIL_P(rs)) StringValue(rs);
10132 return rs;
10133 }
10134 else {
10135 return rb_rs;
10136 }
10137}
10138
10139VALUE
10140rb_str_chomp_string(VALUE str, VALUE rs)
10141{
10142 long olen = RSTRING_LEN(str);
10143 long len = chompped_length(str, rs);
10144 if (len >= olen) return Qnil;
10145 str_modify_keep_cr(str);
10146 STR_SET_LEN(str, len);
10147 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10148 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10150 }
10151 return str;
10152}
10153
10154/*
10155 * call-seq:
10156 * chomp!(line_sep = $/) -> self or nil
10157 *
10158 * Like String#chomp, except that:
10159 *
10160 * - Removes trailing characters from +self+ (not from a copy of +self+).
10161 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10162 *
10163 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10164 */
10165
10166static VALUE
10167rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10168{
10169 VALUE rs;
10170 str_modifiable(str);
10171 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10172 rs = chomp_rs(argc, argv);
10173 if (NIL_P(rs)) return Qnil;
10174 return rb_str_chomp_string(str, rs);
10175}
10176
10177
10178/*
10179 * call-seq:
10180 * chomp(line_sep = $/) -> new_string
10181 *
10182 * :include: doc/string/chomp.rdoc
10183 *
10184 */
10185
10186static VALUE
10187rb_str_chomp(int argc, VALUE *argv, VALUE str)
10188{
10189 VALUE rs = chomp_rs(argc, argv);
10190 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10191 return rb_str_subseq(str, 0, chompped_length(str, rs));
10192}
10193
10194static void
10195tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10196 VALUE str, int num_selectors, VALUE *selectors)
10197{
10198 int i;
10199
10200 for (i=0; i<num_selectors; i++) {
10201 VALUE selector = selectors[i];
10202 rb_encoding *enc;
10203
10204 StringValue(selector);
10205 enc = rb_enc_check(str, selector);
10206 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10207 }
10208}
10209
10210static long
10211lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10212{
10213 const char *const start = s;
10214
10215 if (!s || s >= e) return 0;
10216
10217 /* remove spaces at head */
10218 if (single_byte_optimizable(str)) {
10219 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10220 }
10221 else {
10222 while (s < e) {
10223 int n;
10224 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10225
10226 if (cc && !rb_isspace(cc)) break;
10227 s += n;
10228 }
10229 }
10230 return s - start;
10231}
10232
10233static long
10234lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10235 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10236{
10237 const char *const start = s;
10238
10239 if (!s || s >= e) return 0;
10240
10241 /* remove leading characters in the table */
10242 while (s < e) {
10243 int n;
10244 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10245
10246 if (!tr_find(cc, table, del, nodel)) break;
10247 s += n;
10248 }
10249 return s - start;
10250}
10251
10252/*
10253 * call-seq:
10254 * lstrip!(*selectors) -> self or nil
10255 *
10256 * Like String#lstrip, except that:
10257 *
10258 * - Performs stripping in +self+ (not in a copy of +self+).
10259 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10260 *
10261 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10262 */
10263
10264static VALUE
10265rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10266{
10267 rb_encoding *enc;
10268 char *start, *s;
10269 long olen, loffset;
10270
10271 str_modify_keep_cr(str);
10272 enc = STR_ENC_GET(str);
10273 RSTRING_GETMEM(str, start, olen);
10274 if (argc > 0) {
10275 char table[TR_TABLE_SIZE];
10276 VALUE del = 0, nodel = 0;
10277
10278 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10279 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10280 }
10281 else {
10282 loffset = lstrip_offset(str, start, start+olen, enc);
10283 }
10284
10285 if (loffset > 0) {
10286 long len = olen-loffset;
10287 s = start + loffset;
10288 memmove(start, s, len);
10289 STR_SET_LEN(str, len);
10290 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10291 return str;
10292 }
10293 return Qnil;
10294}
10295
10296
10297/*
10298 * call-seq:
10299 * lstrip(*selectors) -> new_string
10300 *
10301 * Returns a copy of +self+ with leading whitespace removed;
10302 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10303 *
10304 * whitespace = "\x00\t\n\v\f\r "
10305 * s = whitespace + 'abc' + whitespace
10306 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10307 * s.lstrip
10308 * # => "abc\u0000\t\n\v\f\r "
10309 *
10310 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10311 *
10312 * s = "---abc+++"
10313 * s.lstrip("-") # => "abc+++"
10314 *
10315 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10316 * and may use any of its valid forms, including negation, ranges, and escapes:
10317 *
10318 * "01234abc56789".lstrip("0-9") # "abc56789"
10319 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10320 *
10321 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10322 */
10323
10324static VALUE
10325rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10326{
10327 char *start;
10328 long len, loffset;
10329
10330 RSTRING_GETMEM(str, start, len);
10331 if (argc > 0) {
10332 char table[TR_TABLE_SIZE];
10333 VALUE del = 0, nodel = 0;
10334
10335 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10336 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10337 }
10338 else {
10339 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10340 }
10341 if (loffset <= 0) return str_duplicate(rb_cString, str);
10342 return rb_str_subseq(str, loffset, len - loffset);
10343}
10344
10345static long
10346rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10347{
10348 const char *t;
10349
10350 rb_str_check_dummy_enc(enc);
10352 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10353 }
10354 if (!s || s >= e) return 0;
10355 t = e;
10356
10357 /* remove trailing spaces or '\0's */
10358 if (single_byte_optimizable(str)) {
10359 unsigned char c;
10360 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10361 }
10362 else {
10363 char *tp;
10364
10365 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10366 unsigned int c = rb_enc_codepoint(tp, e, enc);
10367 if (c && !rb_isspace(c)) break;
10368 t = tp;
10369 }
10370 }
10371 return e - t;
10372}
10373
10374static long
10375rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10376 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10377{
10378 const char *t;
10379 char *tp;
10380
10381 rb_str_check_dummy_enc(enc);
10383 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10384 }
10385 if (!s || s >= e) return 0;
10386 t = e;
10387
10388 /* remove trailing characters in the table */
10389 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10390 unsigned int c = rb_enc_codepoint(tp, e, enc);
10391 if (!tr_find(c, table, del, nodel)) break;
10392 t = tp;
10393 }
10394
10395 return e - t;
10396}
10397
10398/*
10399 * call-seq:
10400 * rstrip!(*selectors) -> self or nil
10401 *
10402 * Like String#rstrip, except that:
10403 *
10404 * - Performs stripping in +self+ (not in a copy of +self+).
10405 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10406 *
10407 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10408 */
10409
10410static VALUE
10411rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10412{
10413 rb_encoding *enc;
10414 char *start;
10415 long olen, roffset;
10416
10417 str_modify_keep_cr(str);
10418 enc = STR_ENC_GET(str);
10419 RSTRING_GETMEM(str, start, olen);
10420 if (argc > 0) {
10421 char table[TR_TABLE_SIZE];
10422 VALUE del = 0, nodel = 0;
10423
10424 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10425 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10426 }
10427 else {
10428 roffset = rstrip_offset(str, start, start+olen, enc);
10429 }
10430 if (roffset > 0) {
10431 long len = olen - roffset;
10432
10433 STR_SET_LEN(str, len);
10434 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10435 return str;
10436 }
10437 return Qnil;
10438}
10439
10440
10441/*
10442 * call-seq:
10443 * rstrip(*selectors) -> new_string
10444 *
10445 * Returns a copy of +self+ with trailing whitespace removed;
10446 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10447 *
10448 * whitespace = "\x00\t\n\v\f\r "
10449 * s = whitespace + 'abc' + whitespace
10450 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10451 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10452 *
10453 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10454 *
10455 * s = "---abc+++"
10456 * s.rstrip("+") # => "---abc"
10457 *
10458 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10459 * and may use any of its valid forms, including negation, ranges, and escapes:
10460 *
10461 * "01234abc56789".rstrip("0-9") # "01234abc"
10462 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10463 *
10464 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10465 */
10466
10467static VALUE
10468rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10469{
10470 rb_encoding *enc;
10471 char *start;
10472 long olen, roffset;
10473
10474 enc = STR_ENC_GET(str);
10475 RSTRING_GETMEM(str, start, olen);
10476 if (argc > 0) {
10477 char table[TR_TABLE_SIZE];
10478 VALUE del = 0, nodel = 0;
10479
10480 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10481 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10482 }
10483 else {
10484 roffset = rstrip_offset(str, start, start+olen, enc);
10485 }
10486 if (roffset <= 0) return str_duplicate(rb_cString, str);
10487 return rb_str_subseq(str, 0, olen-roffset);
10488}
10489
10490
10491/*
10492 * call-seq:
10493 * strip!(*selectors) -> self or nil
10494 *
10495 * Like String#strip, except that:
10496 *
10497 * - Any modifications are made to +self+.
10498 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10499 *
10500 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10501 */
10502
10503static VALUE
10504rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10505{
10506 char *start;
10507 long olen, loffset, roffset;
10508 rb_encoding *enc;
10509
10510 str_modify_keep_cr(str);
10511 enc = STR_ENC_GET(str);
10512 RSTRING_GETMEM(str, start, olen);
10513
10514 if (argc > 0) {
10515 char table[TR_TABLE_SIZE];
10516 VALUE del = 0, nodel = 0;
10517
10518 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10519 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10520 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10521 }
10522 else {
10523 loffset = lstrip_offset(str, start, start+olen, enc);
10524 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10525 }
10526
10527 if (loffset > 0 || roffset > 0) {
10528 long len = olen-roffset;
10529 if (loffset > 0) {
10530 len -= loffset;
10531 memmove(start, start + loffset, len);
10532 }
10533 STR_SET_LEN(str, len);
10534 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10535 return str;
10536 }
10537 return Qnil;
10538}
10539
10540
10541/*
10542 * call-seq:
10543 * strip(*selectors) -> new_string
10544 *
10545 * Returns a copy of +self+ with leading and trailing whitespace removed;
10546 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10547 *
10548 * whitespace = "\x00\t\n\v\f\r "
10549 * s = whitespace + 'abc' + whitespace
10550 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10551 * s.strip # => "abc"
10552 *
10553 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10554 *
10555 * s = "---abc+++"
10556 * s.strip("-+") # => "abc"
10557 * s.strip("+-") # => "abc"
10558 *
10559 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10560 * and may use any of its valid forms, including negation, ranges, and escapes:
10561 *
10562 * "01234abc56789".strip("0-9") # "abc"
10563 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10564 *
10565 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10566 */
10567
10568static VALUE
10569rb_str_strip(int argc, VALUE *argv, VALUE str)
10570{
10571 char *start;
10572 long olen, loffset, roffset;
10573 rb_encoding *enc = STR_ENC_GET(str);
10574
10575 RSTRING_GETMEM(str, start, olen);
10576
10577 if (argc > 0) {
10578 char table[TR_TABLE_SIZE];
10579 VALUE del = 0, nodel = 0;
10580
10581 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10582 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10583 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10584 }
10585 else {
10586 loffset = lstrip_offset(str, start, start+olen, enc);
10587 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10588 }
10589
10590 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10591 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10592}
10593
10594static VALUE
10595scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10596{
10597 VALUE result = Qnil;
10598 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10599 if (pos >= 0) {
10600 VALUE match;
10601 struct re_registers *regs;
10602 if (BUILTIN_TYPE(pat) == T_STRING) {
10603 regs = NULL;
10604 end = pos + RSTRING_LEN(pat);
10605 }
10606 else {
10607 match = rb_backref_get();
10608 regs = RMATCH_REGS(match);
10609 pos = BEG(0);
10610 end = END(0);
10611 }
10612
10613 if (pos == end) {
10614 rb_encoding *enc = STR_ENC_GET(str);
10615 /*
10616 * Always consume at least one character of the input string
10617 */
10618 if (RSTRING_LEN(str) > end)
10619 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10620 RSTRING_END(str), enc);
10621 else
10622 *start = end + 1;
10623 }
10624 else {
10625 *start = end;
10626 }
10627
10628 if (!regs || regs->num_regs == 1) {
10629 result = rb_str_subseq(str, pos, end - pos);
10630 return result;
10631 }
10632 else {
10633 result = rb_ary_new2(regs->num_regs);
10634 for (int i = 1; i < regs->num_regs; i++) {
10635 VALUE s = Qnil;
10636 if (BEG(i) >= 0) {
10637 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10638 }
10639
10640 rb_ary_push(result, s);
10641 }
10642 }
10643
10644 RB_GC_GUARD(match);
10645 }
10646
10647 return result;
10648}
10649
10650
10651/*
10652 * call-seq:
10653 * scan(pattern) -> array_of_results
10654 * scan(pattern) {|result| ... } -> self
10655 *
10656 * :include: doc/string/scan.rdoc
10657 *
10658 */
10659
10660static VALUE
10661rb_str_scan(VALUE str, VALUE pat)
10662{
10663 VALUE result;
10664 long start = 0;
10665 long last = -1, prev = 0;
10666 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10667
10668 pat = get_pat_quoted(pat, 1);
10669 mustnot_broken(str);
10670 if (!rb_block_given_p()) {
10671 VALUE ary = rb_ary_new();
10672
10673 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10674 last = prev;
10675 prev = start;
10676 rb_ary_push(ary, result);
10677 }
10678 if (last >= 0) rb_pat_search(pat, str, last, 1);
10679 else rb_backref_set(Qnil);
10680 return ary;
10681 }
10682
10683 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10684 last = prev;
10685 prev = start;
10686 rb_yield(result);
10687 str_mod_check(str, p, len);
10688 }
10689 if (last >= 0) rb_pat_search(pat, str, last, 1);
10690 return str;
10691}
10692
10693
10694/*
10695 * call-seq:
10696 * hex -> integer
10697 *
10698 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10699 * returns its value as an integer.
10700 *
10701 * The leading substring is interpreted as hexadecimal when it begins with:
10702 *
10703 * - One or more characters representing hexadecimal digits
10704 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10705 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10706 *
10707 * 'f'.hex # => 15
10708 * '11'.hex # => 17
10709 * 'FFF'.hex # => 4095
10710 * 'fffg'.hex # => 4095
10711 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10712 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10713 * 'deadbeef'.hex # => 3735928559
10714 *
10715 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10716 *
10717 * '0xfff'.hex # => 4095
10718 * '0xfffg'.hex # => 4095
10719 *
10720 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10721 *
10722 * '-fff'.hex # => -4095
10723 * '-0xFFF'.hex # => -4095
10724 *
10725 * For any substring not described above, returns zero:
10726 *
10727 * 'xxx'.hex # => 0
10728 * ''.hex # => 0
10729 *
10730 * Note that, unlike #oct, this method interprets only hexadecimal,
10731 * and not binary, octal, or decimal notations:
10732 *
10733 * '0b111'.hex # => 45329
10734 * '0o777'.hex # => 0
10735 * '0d999'.hex # => 55705
10736 *
10737 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10738 */
10739
10740static VALUE
10741rb_str_hex(VALUE str)
10742{
10743 return rb_str_to_inum(str, 16, FALSE);
10744}
10745
10746
10747/*
10748 * call-seq:
10749 * oct -> integer
10750 *
10751 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10752 * returns its value as an integer.
10753 *
10754 * In brief:
10755 *
10756 * # Interpreted as octal.
10757 * '777'.oct # => 511
10758 * '777x'.oct # => 511
10759 * '0777'.oct # => 511
10760 * '0o777'.oct # => 511
10761 * '-777'.oct # => -511
10762 * # Not interpreted as octal.
10763 * '0b111'.oct # => 7 # Interpreted as binary.
10764 * '0d999'.oct # => 999 # Interpreted as decimal.
10765 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10766 *
10767 * The leading substring is interpreted as octal when it begins with:
10768 *
10769 * - One or more characters representing octal digits
10770 * (each in the range <tt>'0'..'7'</tt>);
10771 * the string to be interpreted ends at the first character that does not represent an octal digit:
10772 *
10773 * '7'.oct # => 7
10774 * '11'.oct # => 9
10775 * '777'.oct # => 511
10776 * '0777'.oct # => 511
10777 * '7778'.oct # => 511
10778 * '777x'.oct # => 511
10779 *
10780 * - <tt>'0o'</tt>, followed by one or more octal digits:
10781 *
10782 * '0o777'.oct # => 511
10783 * '0o7778'.oct # => 511
10784 *
10785 * The leading substring is _not_ interpreted as octal when it begins with:
10786 *
10787 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10788 * (each in the range <tt>'0'..'1'</tt>);
10789 * the string to be interpreted ends at the first character that does not represent a binary digit.
10790 * the string is interpreted as binary digits (base 2):
10791 *
10792 * '0b111'.oct # => 7
10793 * '0b1112'.oct # => 7
10794 *
10795 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10796 * (each in the range <tt>'0'..'9'</tt>);
10797 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10798 * the string is interpreted as decimal digits (base 10):
10799 *
10800 * '0d999'.oct # => 999
10801 * '0d999x'.oct # => 999
10802 *
10803 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10804 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10805 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10806 * the string is interpreted as hexadecimal digits (base 16):
10807 *
10808 * '0xfff'.oct # => 4095
10809 * '0xfffg'.oct # => 4095
10810 *
10811 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10812 *
10813 * '-777'.oct # => -511
10814 * '-0777'.oct # => -511
10815 * '-0b111'.oct # => -7
10816 * '-0xfff'.oct # => -4095
10817 *
10818 * For any substring not described above, returns zero:
10819 *
10820 * 'foo'.oct # => 0
10821 * ''.oct # => 0
10822 *
10823 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10824 */
10825
10826static VALUE
10827rb_str_oct(VALUE str)
10828{
10829 return rb_str_to_inum(str, -8, FALSE);
10830}
10831
10832#ifndef HAVE_CRYPT_R
10833# include "ruby/thread_native.h"
10834# include "ruby/atomic.h"
10835
10836static struct {
10837 rb_nativethread_lock_t lock;
10838} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10839#endif
10840
10841/*
10842 * call-seq:
10843 * crypt(salt_str) -> new_string
10844 *
10845 * Returns the string generated by calling <code>crypt(3)</code>
10846 * standard library function with <code>str</code> and
10847 * <code>salt_str</code>, in this order, as its arguments. Please do
10848 * not use this method any longer. It is legacy; provided only for
10849 * backward compatibility with ruby scripts in earlier days. It is
10850 * bad to use in contemporary programs for several reasons:
10851 *
10852 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10853 * run. The generated string lacks data portability.
10854 *
10855 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10856 * (i.e. silently ends up in unexpected results).
10857 *
10858 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10859 * thread safe.
10860 *
10861 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10862 * very very weak. According to its manpage, Linux's traditional
10863 * <code>crypt(3)</code> output has only 2**56 variations; too
10864 * easy to brute force today. And this is the default behaviour.
10865 *
10866 * * In order to make things robust some OSes implement so-called
10867 * "modular" usage. To go through, you have to do a complex
10868 * build-up of the <code>salt_str</code> parameter, by hand.
10869 * Failure in generation of a proper salt string tends not to
10870 * yield any errors; typos in parameters are normally not
10871 * detectable.
10872 *
10873 * * For instance, in the following example, the second invocation
10874 * of String#crypt is wrong; it has a typo in "round=" (lacks
10875 * "s"). However the call does not fail and something unexpected
10876 * is generated.
10877 *
10878 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10879 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10880 *
10881 * * Even in the "modular" mode, some hash functions are considered
10882 * archaic and no longer recommended at all; for instance module
10883 * <code>$1$</code> is officially abandoned by its author: see
10884 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10885 * instance module <code>$3$</code> is considered completely
10886 * broken: see the manpage of FreeBSD.
10887 *
10888 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10889 * written above, <code>crypt(3)</code> on Mac OS never fails.
10890 * This means even if you build up a proper salt string it
10891 * generates a traditional DES hash anyways, and there is no way
10892 * for you to be aware of.
10893 *
10894 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10895 *
10896 * If for some reason you cannot migrate to other secure contemporary
10897 * password hashing algorithms, install the string-crypt gem and
10898 * <code>require 'string/crypt'</code> to continue using it.
10899 */
10900
10901static VALUE
10902rb_str_crypt(VALUE str, VALUE salt)
10903{
10904#ifdef HAVE_CRYPT_R
10905 VALUE databuf;
10906 struct crypt_data *data;
10907# define CRYPT_END() ALLOCV_END(databuf)
10908#else
10909 char *tmp_buf;
10910 extern char *crypt(const char *, const char *);
10911# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10912#endif
10913 VALUE result;
10914 const char *s, *saltp;
10915 char *res;
10916#ifdef BROKEN_CRYPT
10917 char salt_8bit_clean[3];
10918#endif
10919
10920 StringValue(salt);
10921 mustnot_wchar(str);
10922 mustnot_wchar(salt);
10923 s = StringValueCStr(str);
10924 saltp = RSTRING_PTR(salt);
10925 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10926 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10927 }
10928
10929#ifdef BROKEN_CRYPT
10930 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10931 salt_8bit_clean[0] = saltp[0] & 0x7f;
10932 salt_8bit_clean[1] = saltp[1] & 0x7f;
10933 salt_8bit_clean[2] = '\0';
10934 saltp = salt_8bit_clean;
10935 }
10936#endif
10937#ifdef HAVE_CRYPT_R
10938 data = ALLOCV(databuf, sizeof(struct crypt_data));
10939# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10940 data->initialized = 0;
10941# endif
10942 res = crypt_r(s, saltp, data);
10943#else
10944 rb_nativethread_lock_lock(&crypt_mutex.lock);
10945 res = crypt(s, saltp);
10946#endif
10947 if (!res) {
10948 int err = errno;
10949 CRYPT_END();
10950 rb_syserr_fail(err, "crypt");
10951 }
10952#ifdef HAVE_CRYPT_R
10953 result = rb_str_new_cstr(res);
10954 CRYPT_END();
10955#else
10956 // We need to copy this buffer because it's static and we need to unlock the mutex
10957 // before allocating a new object (the string to be returned). If we allocate while
10958 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10959 // if other ractors are waiting on this lock.
10960 size_t res_size = strlen(res)+1;
10961 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10962 memcpy(tmp_buf, res, res_size);
10963 res = tmp_buf;
10964 CRYPT_END();
10965 result = rb_str_new_cstr(res);
10966#endif
10967 return result;
10968}
10969
10970
10971/*
10972 * call-seq:
10973 * ord -> integer
10974 *
10975 * :include: doc/string/ord.rdoc
10976 *
10977 */
10978
10979static VALUE
10980rb_str_ord(VALUE s)
10981{
10982 unsigned int c;
10983
10984 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10985 return UINT2NUM(c);
10986}
10987/*
10988 * call-seq:
10989 * sum(n = 16) -> integer
10990 *
10991 * :include: doc/string/sum.rdoc
10992 *
10993 */
10994
10995static VALUE
10996rb_str_sum(int argc, VALUE *argv, VALUE str)
10997{
10998 int bits = 16;
10999 char *ptr, *p, *pend;
11000 long len;
11001 VALUE sum = INT2FIX(0);
11002 unsigned long sum0 = 0;
11003
11004 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11005 bits = 0;
11006 }
11007 ptr = p = RSTRING_PTR(str);
11008 len = RSTRING_LEN(str);
11009 pend = p + len;
11010
11011 while (p < pend) {
11012 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11013 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11014 str_mod_check(str, ptr, len);
11015 sum0 = 0;
11016 }
11017 sum0 += (unsigned char)*p;
11018 p++;
11019 }
11020
11021 if (bits == 0) {
11022 if (sum0) {
11023 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11024 }
11025 }
11026 else {
11027 if (sum == INT2FIX(0)) {
11028 if (bits < (int)sizeof(long)*CHAR_BIT) {
11029 sum0 &= (((unsigned long)1)<<bits)-1;
11030 }
11031 sum = LONG2FIX(sum0);
11032 }
11033 else {
11034 VALUE mod;
11035
11036 if (sum0) {
11037 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11038 }
11039
11040 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11041 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11042 sum = rb_funcall(sum, '&', 1, mod);
11043 }
11044 }
11045 return sum;
11046}
11047
11048static VALUE
11049rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11050{
11051 rb_encoding *enc;
11052 VALUE w;
11053 long width, len, flen = 1, fclen = 1;
11054 VALUE res;
11055 char *p;
11056 const char *f = " ";
11057 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11058 VALUE pad;
11059 int singlebyte = 1, cr;
11060 int termlen;
11061
11062 rb_scan_args(argc, argv, "11", &w, &pad);
11063 enc = STR_ENC_GET(str);
11064 termlen = rb_enc_mbminlen(enc);
11065 width = NUM2LONG(w);
11066 if (argc == 2) {
11067 StringValue(pad);
11068 enc = rb_enc_check(str, pad);
11069 f = RSTRING_PTR(pad);
11070 flen = RSTRING_LEN(pad);
11071 fclen = str_strlen(pad, enc); /* rb_enc_check */
11072 singlebyte = single_byte_optimizable(pad);
11073 if (flen == 0 || fclen == 0) {
11074 rb_raise(rb_eArgError, "zero width padding");
11075 }
11076 }
11077 len = str_strlen(str, enc); /* rb_enc_check */
11078 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11079 n = width - len;
11080 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11081 rlen = n - llen;
11082 cr = ENC_CODERANGE(str);
11083 if (flen > 1) {
11084 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11085 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11086 }
11087 size = RSTRING_LEN(str);
11088 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11089 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11090 (len += llen2 + rlen2) >= LONG_MAX - size) {
11091 rb_raise(rb_eArgError, "argument too big");
11092 }
11093 len += size;
11094 res = str_enc_new(rb_cString, 0, len, enc);
11095 p = RSTRING_PTR(res);
11096 if (flen <= 1) {
11097 memset(p, *f, llen);
11098 p += llen;
11099 }
11100 else {
11101 while (llen >= fclen) {
11102 memcpy(p,f,flen);
11103 p += flen;
11104 llen -= fclen;
11105 }
11106 if (llen > 0) {
11107 memcpy(p, f, llen2);
11108 p += llen2;
11109 }
11110 }
11111 memcpy(p, RSTRING_PTR(str), size);
11112 p += size;
11113 if (flen <= 1) {
11114 memset(p, *f, rlen);
11115 p += rlen;
11116 }
11117 else {
11118 while (rlen >= fclen) {
11119 memcpy(p,f,flen);
11120 p += flen;
11121 rlen -= fclen;
11122 }
11123 if (rlen > 0) {
11124 memcpy(p, f, rlen2);
11125 p += rlen2;
11126 }
11127 }
11128 TERM_FILL(p, termlen);
11129 STR_SET_LEN(res, p-RSTRING_PTR(res));
11130
11131 if (argc == 2)
11132 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11133 if (cr != ENC_CODERANGE_BROKEN)
11134 ENC_CODERANGE_SET(res, cr);
11135
11136 RB_GC_GUARD(pad);
11137 return res;
11138}
11139
11140
11141/*
11142 * call-seq:
11143 * ljust(width, pad_string = ' ') -> new_string
11144 *
11145 * :include: doc/string/ljust.rdoc
11146 *
11147 */
11148
11149static VALUE
11150rb_str_ljust(int argc, VALUE *argv, VALUE str)
11151{
11152 return rb_str_justify(argc, argv, str, 'l');
11153}
11154
11155/*
11156 * call-seq:
11157 * rjust(width, pad_string = ' ') -> new_string
11158 *
11159 * :include: doc/string/rjust.rdoc
11160 *
11161 */
11162
11163static VALUE
11164rb_str_rjust(int argc, VALUE *argv, VALUE str)
11165{
11166 return rb_str_justify(argc, argv, str, 'r');
11167}
11168
11169
11170/*
11171 * call-seq:
11172 * center(size, pad_string = ' ') -> new_string
11173 *
11174 * :include: doc/string/center.rdoc
11175 *
11176 */
11177
11178static VALUE
11179rb_str_center(int argc, VALUE *argv, VALUE str)
11180{
11181 return rb_str_justify(argc, argv, str, 'c');
11182}
11183
11184/*
11185 * call-seq:
11186 * partition(pattern) -> [pre_match, first_match, post_match]
11187 *
11188 * :include: doc/string/partition.rdoc
11189 *
11190 */
11191
11192static VALUE
11193rb_str_partition(VALUE str, VALUE sep)
11194{
11195 long pos;
11196
11197 sep = get_pat_quoted(sep, 0);
11198 if (RB_TYPE_P(sep, T_REGEXP)) {
11199 if (rb_reg_search(sep, str, 0, 0) < 0) {
11200 goto failed;
11201 }
11202 VALUE match = rb_backref_get();
11203 struct re_registers *regs = RMATCH_REGS(match);
11204
11205 pos = BEG(0);
11206 sep = rb_str_subseq(str, pos, END(0) - pos);
11207 }
11208 else {
11209 pos = rb_str_index(str, sep, 0);
11210 if (pos < 0) goto failed;
11211 }
11212 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11213 sep,
11214 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11215 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11216
11217 failed:
11218 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11219}
11220
11221/*
11222 * call-seq:
11223 * rpartition(pattern) -> [pre_match, last_match, post_match]
11224 *
11225 * :include: doc/string/rpartition.rdoc
11226 *
11227 */
11228
11229static VALUE
11230rb_str_rpartition(VALUE str, VALUE sep)
11231{
11232 long pos = RSTRING_LEN(str);
11233
11234 sep = get_pat_quoted(sep, 0);
11235 if (RB_TYPE_P(sep, T_REGEXP)) {
11236 if (rb_reg_search(sep, str, pos, 1) < 0) {
11237 goto failed;
11238 }
11239 VALUE match = rb_backref_get();
11240 struct re_registers *regs = RMATCH_REGS(match);
11241
11242 pos = BEG(0);
11243 sep = rb_str_subseq(str, pos, END(0) - pos);
11244 }
11245 else {
11246 pos = rb_str_sublen(str, pos);
11247 pos = rb_str_rindex(str, sep, pos);
11248 if (pos < 0) {
11249 goto failed;
11250 }
11251 }
11252
11253 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11254 sep,
11255 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11256 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11257 failed:
11258 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11259}
11260
11261/*
11262 * call-seq:
11263 * start_with?(*patterns) -> true or false
11264 *
11265 * :include: doc/string/start_with_p.rdoc
11266 *
11267 */
11268
11269static VALUE
11270rb_str_start_with(int argc, VALUE *argv, VALUE str)
11271{
11272 int i;
11273
11274 for (i=0; i<argc; i++) {
11275 VALUE tmp = argv[i];
11276 if (RB_TYPE_P(tmp, T_REGEXP)) {
11277 if (rb_reg_start_with_p(tmp, str))
11278 return Qtrue;
11279 }
11280 else {
11281 const char *p, *s, *e;
11282 long slen, tlen;
11283 rb_encoding *enc;
11284
11285 StringValue(tmp);
11286 enc = rb_enc_check(str, tmp);
11287 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11288 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11289 p = RSTRING_PTR(str);
11290 e = p + slen;
11291 s = p + tlen;
11292 if (!at_char_right_boundary(p, s, e, enc))
11293 continue;
11294 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11295 return Qtrue;
11296 }
11297 }
11298 return Qfalse;
11299}
11300
11301/*
11302 * call-seq:
11303 * end_with?(*strings) -> true or false
11304 *
11305 * :include: doc/string/end_with_p.rdoc
11306 *
11307 */
11308
11309static VALUE
11310rb_str_end_with(int argc, VALUE *argv, VALUE str)
11311{
11312 int i;
11313
11314 for (i=0; i<argc; i++) {
11315 VALUE tmp = argv[i];
11316 const char *p, *s, *e;
11317 long slen, tlen;
11318 rb_encoding *enc;
11319
11320 StringValue(tmp);
11321 enc = rb_enc_check(str, tmp);
11322 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11323 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11324 p = RSTRING_PTR(str);
11325 e = p + slen;
11326 s = e - tlen;
11327 if (!at_char_boundary(p, s, e, enc))
11328 continue;
11329 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11330 return Qtrue;
11331 }
11332 return Qfalse;
11333}
11334
11344static long
11345deleted_prefix_length(VALUE str, VALUE prefix)
11346{
11347 const char *strptr, *prefixptr;
11348 long olen, prefixlen;
11349 rb_encoding *enc = rb_enc_get(str);
11350
11351 StringValue(prefix);
11352
11353 if (!is_broken_string(prefix) ||
11354 !rb_enc_asciicompat(enc) ||
11355 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11356 enc = rb_enc_check(str, prefix);
11357 }
11358
11359 /* return 0 if not start with prefix */
11360 prefixlen = RSTRING_LEN(prefix);
11361 if (prefixlen <= 0) return 0;
11362 olen = RSTRING_LEN(str);
11363 if (olen < prefixlen) return 0;
11364 strptr = RSTRING_PTR(str);
11365 prefixptr = RSTRING_PTR(prefix);
11366 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11367 if (is_broken_string(prefix)) {
11368 if (!is_broken_string(str)) {
11369 /* prefix in a valid string cannot be broken */
11370 return 0;
11371 }
11372 const char *strend = strptr + olen;
11373 const char *after_prefix = strptr + prefixlen;
11374 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11375 /* prefix does not end at char-boundary */
11376 return 0;
11377 }
11378 }
11379 /* prefix part in `str` also should be valid. */
11380
11381 return prefixlen;
11382}
11383
11384/*
11385 * call-seq:
11386 * delete_prefix!(prefix) -> self or nil
11387 *
11388 * Like String#delete_prefix, except that +self+ is modified in place;
11389 * returns +self+ if the prefix is removed, +nil+ otherwise.
11390 *
11391 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11392 */
11393
11394static VALUE
11395rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11396{
11397 long prefixlen;
11398 str_modify_keep_cr(str);
11399
11400 prefixlen = deleted_prefix_length(str, prefix);
11401 if (prefixlen <= 0) return Qnil;
11402
11403 return rb_str_drop_bytes(str, prefixlen);
11404}
11405
11406/*
11407 * call-seq:
11408 * delete_prefix(prefix) -> new_string
11409 *
11410 * :include: doc/string/delete_prefix.rdoc
11411 *
11412 */
11413
11414static VALUE
11415rb_str_delete_prefix(VALUE str, VALUE prefix)
11416{
11417 long prefixlen;
11418
11419 prefixlen = deleted_prefix_length(str, prefix);
11420 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11421
11422 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11423}
11424
11434static long
11435deleted_suffix_length(VALUE str, VALUE suffix)
11436{
11437 const char *strptr, *suffixptr;
11438 long olen, suffixlen;
11439 rb_encoding *enc;
11440
11441 StringValue(suffix);
11442 if (is_broken_string(suffix)) return 0;
11443 enc = rb_enc_check(str, suffix);
11444
11445 /* return 0 if not start with suffix */
11446 suffixlen = RSTRING_LEN(suffix);
11447 if (suffixlen <= 0) return 0;
11448 olen = RSTRING_LEN(str);
11449 if (olen < suffixlen) return 0;
11450 strptr = RSTRING_PTR(str);
11451 suffixptr = RSTRING_PTR(suffix);
11452 const char *strend = strptr + olen;
11453 const char *before_suffix = strend - suffixlen;
11454 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11455 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11456
11457 return suffixlen;
11458}
11459
11460/*
11461 * call-seq:
11462 * delete_suffix!(suffix) -> self or nil
11463 *
11464 * Like String#delete_suffix, except that +self+ is modified in place;
11465 * returns +self+ if the suffix is removed, +nil+ otherwise.
11466 *
11467 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11468 */
11469
11470static VALUE
11471rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11472{
11473 long olen, suffixlen, len;
11474 str_modifiable(str);
11475
11476 suffixlen = deleted_suffix_length(str, suffix);
11477 if (suffixlen <= 0) return Qnil;
11478
11479 olen = RSTRING_LEN(str);
11480 str_modify_keep_cr(str);
11481 len = olen - suffixlen;
11482 STR_SET_LEN(str, len);
11483 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11484 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11486 }
11487 return str;
11488}
11489
11490/*
11491 * call-seq:
11492 * delete_suffix(suffix) -> new_string
11493 *
11494 * :include: doc/string/delete_suffix.rdoc
11495 *
11496 */
11497
11498static VALUE
11499rb_str_delete_suffix(VALUE str, VALUE suffix)
11500{
11501 long suffixlen;
11502
11503 suffixlen = deleted_suffix_length(str, suffix);
11504 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11505
11506 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11507}
11508
11509void
11510rb_str_setter(VALUE val, ID id, VALUE *var)
11511{
11512 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11513 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11514 }
11515 *var = val;
11516}
11517
11518static void
11519nil_setter_warning(ID id)
11520{
11521 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11522}
11523
11524void
11525rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11526{
11527 rb_str_setter(val, id, var);
11528 if (!NIL_P(*var)) {
11529 nil_setter_warning(id);
11530 }
11531}
11532
11533static void
11534rb_fs_setter(VALUE val, ID id, VALUE *var)
11535{
11536 val = rb_fs_check(val);
11537 if (!val) {
11538 rb_raise(rb_eTypeError,
11539 "value of %"PRIsVALUE" must be String or Regexp",
11540 rb_id2str(id));
11541 }
11542 if (!NIL_P(val)) {
11543 nil_setter_warning(id);
11544 }
11545 *var = val;
11546}
11547
11548
11549/*
11550 * call-seq:
11551 * force_encoding(encoding) -> self
11552 *
11553 * :include: doc/string/force_encoding.rdoc
11554 *
11555 */
11556
11557static VALUE
11558rb_str_force_encoding(VALUE str, VALUE enc)
11559{
11560 str_modifiable(str);
11561
11562 rb_encoding *encoding = rb_to_encoding(enc);
11563 int idx = rb_enc_to_index(encoding);
11564
11565 // If the encoding is unchanged, we do nothing.
11566 if (ENCODING_GET(str) == idx) {
11567 return str;
11568 }
11569
11570 rb_enc_associate_index(str, idx);
11571
11572 // If the coderange was 7bit and the new encoding is ASCII-compatible
11573 // we can keep the coderange.
11574 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11575 return str;
11576 }
11577
11579 return str;
11580}
11581
11582/*
11583 * call-seq:
11584 * b -> new_string
11585 *
11586 * :include: doc/string/b.rdoc
11587 *
11588 */
11589
11590static VALUE
11591rb_str_b(VALUE str)
11592{
11593 VALUE str2;
11594 if (STR_EMBED_P(str)) {
11595 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11596 }
11597 else {
11598 str2 = str_alloc_heap(rb_cString);
11599 }
11600 str_replace_shared_without_enc(str2, str);
11601
11602 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11603 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11604 // If we know the receiver's code range then we know the result's code range.
11605 int cr = ENC_CODERANGE(str);
11606 switch (cr) {
11607 case ENC_CODERANGE_7BIT:
11609 break;
11613 break;
11614 default:
11615 ENC_CODERANGE_CLEAR(str2);
11616 break;
11617 }
11618 }
11619
11620 return str2;
11621}
11622
11623/*
11624 * call-seq:
11625 * valid_encoding? -> true or false
11626 *
11627 * :include: doc/string/valid_encoding_p.rdoc
11628 *
11629 */
11630
11631static VALUE
11632rb_str_valid_encoding_p(VALUE str)
11633{
11634 int cr = rb_enc_str_coderange(str);
11635
11636 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11637}
11638
11639/*
11640 * call-seq:
11641 * ascii_only? -> true or false
11642 *
11643 * Returns whether +self+ contains only ASCII characters:
11644 *
11645 * 'abc'.ascii_only? # => true
11646 * "abc\u{6666}".ascii_only? # => false
11647 *
11648 * Related: see {Querying}[rdoc-ref:String@Querying].
11649 */
11650
11651static VALUE
11652rb_str_is_ascii_only_p(VALUE str)
11653{
11654 int cr = rb_enc_str_coderange(str);
11655
11656 return RBOOL(cr == ENC_CODERANGE_7BIT);
11657}
11658
11659VALUE
11661{
11662 static const char ellipsis[] = "...";
11663 const long ellipsislen = sizeof(ellipsis) - 1;
11664 rb_encoding *const enc = rb_enc_get(str);
11665 const long blen = RSTRING_LEN(str);
11666 const char *const p = RSTRING_PTR(str), *e = p + blen;
11667 VALUE estr, ret = 0;
11668
11669 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11670 if (len * rb_enc_mbminlen(enc) >= blen ||
11671 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11672 ret = str;
11673 }
11674 else if (len <= ellipsislen ||
11675 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11676 if (rb_enc_asciicompat(enc)) {
11677 ret = rb_str_new(ellipsis, len);
11678 rb_enc_associate(ret, enc);
11679 }
11680 else {
11681 estr = rb_usascii_str_new(ellipsis, len);
11682 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11683 }
11684 }
11685 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11686 rb_str_cat(ret, ellipsis, ellipsislen);
11687 }
11688 else {
11689 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11690 rb_enc_from_encoding(enc), 0, Qnil);
11691 rb_str_append(ret, estr);
11692 }
11693 return ret;
11694}
11695
11696static VALUE
11697str_compat_and_valid(VALUE str, rb_encoding *enc)
11698{
11699 int cr;
11700 str = StringValue(str);
11701 cr = rb_enc_str_coderange(str);
11702 if (cr == ENC_CODERANGE_BROKEN) {
11703 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11704 }
11705 else {
11706 rb_encoding *e = STR_ENC_GET(str);
11707 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11708 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11709 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11710 }
11711 }
11712 return str;
11713}
11714
11715static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11716
11717VALUE
11719{
11720 rb_encoding *enc = STR_ENC_GET(str);
11721 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11722}
11723
11724VALUE
11725rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11726{
11727 int cr = ENC_CODERANGE_UNKNOWN;
11728 if (enc == STR_ENC_GET(str)) {
11729 /* cached coderange makes sense only when enc equals the
11730 * actual encoding of str */
11731 cr = ENC_CODERANGE(str);
11732 }
11733 return enc_str_scrub(enc, str, repl, cr);
11734}
11735
11736static VALUE
11737enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11738{
11739 int encidx;
11740 VALUE buf = Qnil;
11741 const char *rep, *p, *e, *p1, *sp;
11742 long replen = -1;
11743 long slen;
11744
11745 if (rb_block_given_p()) {
11746 if (!NIL_P(repl))
11747 rb_raise(rb_eArgError, "both of block and replacement given");
11748 replen = 0;
11749 }
11750
11751 if (ENC_CODERANGE_CLEAN_P(cr))
11752 return Qnil;
11753
11754 if (!NIL_P(repl)) {
11755 repl = str_compat_and_valid(repl, enc);
11756 }
11757
11758 if (rb_enc_dummy_p(enc)) {
11759 return Qnil;
11760 }
11761 encidx = rb_enc_to_index(enc);
11762
11763#define DEFAULT_REPLACE_CHAR(str) do { \
11764 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11765 rep = replace; replen = (int)sizeof(replace); \
11766 } while (0)
11767
11768 slen = RSTRING_LEN(str);
11769 p = RSTRING_PTR(str);
11770 e = RSTRING_END(str);
11771 p1 = p;
11772 sp = p;
11773
11774 if (rb_enc_asciicompat(enc)) {
11775 int rep7bit_p;
11776 if (!replen) {
11777 rep = NULL;
11778 rep7bit_p = FALSE;
11779 }
11780 else if (!NIL_P(repl)) {
11781 rep = RSTRING_PTR(repl);
11782 replen = RSTRING_LEN(repl);
11783 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11784 }
11785 else if (encidx == rb_utf8_encindex()) {
11786 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11787 rep7bit_p = FALSE;
11788 }
11789 else {
11790 DEFAULT_REPLACE_CHAR("?");
11791 rep7bit_p = TRUE;
11792 }
11793 cr = ENC_CODERANGE_7BIT;
11794
11795 p = search_nonascii(p, e);
11796 if (!p) {
11797 p = e;
11798 }
11799 while (p < e) {
11800 int ret = rb_enc_precise_mbclen(p, e, enc);
11801 if (MBCLEN_NEEDMORE_P(ret)) {
11802 break;
11803 }
11804 else if (MBCLEN_CHARFOUND_P(ret)) {
11806 p += MBCLEN_CHARFOUND_LEN(ret);
11807 }
11808 else if (MBCLEN_INVALID_P(ret)) {
11809 /*
11810 * p1~p: valid ascii/multibyte chars
11811 * p ~e: invalid bytes + unknown bytes
11812 */
11813 long clen = rb_enc_mbmaxlen(enc);
11814 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11815 if (p > p1) {
11816 rb_str_buf_cat(buf, p1, p - p1);
11817 }
11818
11819 if (e - p < clen) clen = e - p;
11820 if (clen <= 2) {
11821 clen = 1;
11822 }
11823 else {
11824 const char *q = p;
11825 clen--;
11826 for (; clen > 1; clen--) {
11827 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11828 if (MBCLEN_NEEDMORE_P(ret)) break;
11829 if (MBCLEN_INVALID_P(ret)) continue;
11831 }
11832 }
11833 if (rep) {
11834 rb_str_buf_cat(buf, rep, replen);
11835 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11836 }
11837 else {
11838 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11839 str_mod_check(str, sp, slen);
11840 repl = str_compat_and_valid(repl, enc);
11841 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11844 }
11845 p += clen;
11846 p1 = p;
11847 p = search_nonascii(p, e);
11848 if (!p) {
11849 p = e;
11850 break;
11851 }
11852 }
11853 else {
11855 }
11856 }
11857 if (NIL_P(buf)) {
11858 if (p == e) {
11859 ENC_CODERANGE_SET(str, cr);
11860 return Qnil;
11861 }
11862 buf = rb_str_buf_new(RSTRING_LEN(str));
11863 }
11864 if (p1 < p) {
11865 rb_str_buf_cat(buf, p1, p - p1);
11866 }
11867 if (p < e) {
11868 if (rep) {
11869 rb_str_buf_cat(buf, rep, replen);
11870 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11871 }
11872 else {
11873 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11874 str_mod_check(str, sp, slen);
11875 repl = str_compat_and_valid(repl, enc);
11876 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11879 }
11880 }
11881 }
11882 else {
11883 /* ASCII incompatible */
11884 long mbminlen = rb_enc_mbminlen(enc);
11885 if (!replen) {
11886 rep = NULL;
11887 }
11888 else if (!NIL_P(repl)) {
11889 rep = RSTRING_PTR(repl);
11890 replen = RSTRING_LEN(repl);
11891 }
11892 else if (encidx == ENCINDEX_UTF_16BE) {
11893 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11894 }
11895 else if (encidx == ENCINDEX_UTF_16LE) {
11896 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11897 }
11898 else if (encidx == ENCINDEX_UTF_32BE) {
11899 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11900 }
11901 else if (encidx == ENCINDEX_UTF_32LE) {
11902 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11903 }
11904 else {
11905 DEFAULT_REPLACE_CHAR("?");
11906 }
11907
11908 while (p < e) {
11909 int ret = rb_enc_precise_mbclen(p, e, enc);
11910 if (MBCLEN_NEEDMORE_P(ret)) {
11911 break;
11912 }
11913 else if (MBCLEN_CHARFOUND_P(ret)) {
11914 p += MBCLEN_CHARFOUND_LEN(ret);
11915 }
11916 else if (MBCLEN_INVALID_P(ret)) {
11917 const char *q = p;
11918 long clen = rb_enc_mbmaxlen(enc);
11919 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11920 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11921
11922 if (e - p < clen) clen = e - p;
11923 if (clen <= mbminlen * 2) {
11924 clen = mbminlen;
11925 }
11926 else {
11927 clen -= mbminlen;
11928 for (; clen > mbminlen; clen-=mbminlen) {
11929 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11930 if (MBCLEN_NEEDMORE_P(ret)) break;
11931 if (MBCLEN_INVALID_P(ret)) continue;
11933 }
11934 }
11935 if (rep) {
11936 rb_str_buf_cat(buf, rep, replen);
11937 }
11938 else {
11939 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11940 str_mod_check(str, sp, slen);
11941 repl = str_compat_and_valid(repl, enc);
11942 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11943 }
11944 p += clen;
11945 p1 = p;
11946 }
11947 else {
11949 }
11950 }
11951 if (NIL_P(buf)) {
11952 if (p == e) {
11954 return Qnil;
11955 }
11956 buf = rb_str_buf_new(RSTRING_LEN(str));
11957 }
11958 if (p1 < p) {
11959 rb_str_buf_cat(buf, p1, p - p1);
11960 }
11961 if (p < e) {
11962 if (rep) {
11963 rb_str_buf_cat(buf, rep, replen);
11964 }
11965 else {
11966 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11967 str_mod_check(str, sp, slen);
11968 repl = str_compat_and_valid(repl, enc);
11969 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11970 }
11971 }
11973 }
11974 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11975 return buf;
11976}
11977
11978/*
11979 * call-seq:
11980 * scrub(replacement_string = default_replacement_string) -> new_string
11981 * scrub{|sequence| ... } -> new_string
11982 *
11983 * :include: doc/string/scrub.rdoc
11984 *
11985 */
11986static VALUE
11987str_scrub(int argc, VALUE *argv, VALUE str)
11988{
11989 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11990 VALUE new = rb_str_scrub(str, repl);
11991 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11992}
11993
11994/*
11995 * call-seq:
11996 * scrub!(replacement_string = default_replacement_string) -> self
11997 * scrub!{|sequence| ... } -> self
11998 *
11999 * Like String#scrub, except that:
12000 *
12001 * - Any replacements are made in +self+.
12002 * - Returns +self+.
12003 *
12004 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12005 *
12006 */
12007static VALUE
12008str_scrub_bang(int argc, VALUE *argv, VALUE str)
12009{
12010 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12011 VALUE new = rb_str_scrub(str, repl);
12012 if (!NIL_P(new)) rb_str_replace(str, new);
12013 return str;
12014}
12015
12016static ID id_normalize;
12017static ID id_normalized_p;
12018static VALUE mUnicodeNormalize;
12019
12020static VALUE
12021unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12022{
12023 static int UnicodeNormalizeRequired = 0;
12024 VALUE argv2[2];
12025
12026 if (!UnicodeNormalizeRequired) {
12027 rb_require("unicode_normalize/normalize.rb");
12028 UnicodeNormalizeRequired = 1;
12029 }
12030 argv2[0] = str;
12031 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12032 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12033}
12034
12035/*
12036 * call-seq:
12037 * unicode_normalize(form = :nfc) -> string
12038 *
12039 * :include: doc/string/unicode_normalize.rdoc
12040 *
12041 */
12042static VALUE
12043rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12044{
12045 return unicode_normalize_common(argc, argv, str, id_normalize);
12046}
12047
12048/*
12049 * call-seq:
12050 * unicode_normalize!(form = :nfc) -> self
12051 *
12052 * Like String#unicode_normalize, except that the normalization
12053 * is performed on +self+ (not on a copy of +self+).
12054 *
12055 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12056 *
12057 */
12058static VALUE
12059rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12060{
12061 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12062}
12063
12064/* call-seq:
12065 * unicode_normalized?(form = :nfc) -> true or false
12066 *
12067 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12068 * see String#unicode_normalize.
12069 *
12070 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12071 *
12072 * Examples:
12073 *
12074 * "a\u0300".unicode_normalized? # => false
12075 * "a\u0300".unicode_normalized?(:nfd) # => true
12076 * "\u00E0".unicode_normalized? # => true
12077 * "\u00E0".unicode_normalized?(:nfd) # => false
12078 *
12079 *
12080 * Raises an exception if +self+ is not in a Unicode encoding:
12081 *
12082 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12083 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12084 *
12085 * Related: see {Querying}[rdoc-ref:String@Querying].
12086 */
12087static VALUE
12088rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12089{
12090 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12091}
12092
12093/**********************************************************************
12094 * Document-class: Symbol
12095 *
12096 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12097 *
12098 * You can create a +Symbol+ object explicitly with:
12099 *
12100 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12101 *
12102 * The same +Symbol+ object will be
12103 * created for a given name or string for the duration of a program's
12104 * execution, regardless of the context or meaning of that name. Thus
12105 * if <code>Fred</code> is a constant in one context, a method in
12106 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12107 * will be the same object in all three contexts.
12108 *
12109 * module One
12110 * class Fred
12111 * end
12112 * $f1 = :Fred
12113 * end
12114 * module Two
12115 * Fred = 1
12116 * $f2 = :Fred
12117 * end
12118 * def Fred()
12119 * end
12120 * $f3 = :Fred
12121 * $f1.object_id #=> 2514190
12122 * $f2.object_id #=> 2514190
12123 * $f3.object_id #=> 2514190
12124 *
12125 * Constant, method, and variable names are returned as symbols:
12126 *
12127 * module One
12128 * Two = 2
12129 * def three; 3 end
12130 * @four = 4
12131 * @@five = 5
12132 * $six = 6
12133 * end
12134 * seven = 7
12135 *
12136 * One.constants
12137 * # => [:Two]
12138 * One.instance_methods(true)
12139 * # => [:three]
12140 * One.instance_variables
12141 * # => [:@four]
12142 * One.class_variables
12143 * # => [:@@five]
12144 * global_variables.grep(/six/)
12145 * # => [:$six]
12146 * local_variables
12147 * # => [:seven]
12148 *
12149 * A +Symbol+ object differs from a String object in that
12150 * a +Symbol+ object represents an identifier, while a String object
12151 * represents text or data.
12152 *
12153 * == What's Here
12154 *
12155 * First, what's elsewhere. Class +Symbol+:
12156 *
12157 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12158 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12159 *
12160 * Here, class +Symbol+ provides methods that are useful for:
12161 *
12162 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12163 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12164 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12165 *
12166 * === Methods for Querying
12167 *
12168 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12169 * - #=~: Returns the index of the first substring in symbol that matches a
12170 * given Regexp or other object; returns +nil+ if no match is found.
12171 * - #[], #slice: Returns a substring of symbol
12172 * determined by a given index, start/length, range, regexp, or string.
12173 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12174 * - #encoding: Returns the Encoding object that represents the encoding
12175 * of symbol.
12176 * - #end_with?: Returns +true+ if symbol ends with
12177 * any of the given strings.
12178 * - #match: Returns a MatchData object if symbol
12179 * matches a given Regexp; +nil+ otherwise.
12180 * - #match?: Returns +true+ if symbol
12181 * matches a given Regexp; +false+ otherwise.
12182 * - #length, #size: Returns the number of characters in symbol.
12183 * - #start_with?: Returns +true+ if symbol starts with
12184 * any of the given strings.
12185 *
12186 * === Methods for Comparing
12187 *
12188 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12189 * or larger than a given symbol.
12190 * - #==, #===: Returns +true+ if a given symbol has the same content and
12191 * encoding.
12192 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12193 * is smaller than, equal to, or larger than a given symbol.
12194 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12195 * after Unicode case folding; +false+ otherwise.
12196 *
12197 * === Methods for Converting
12198 *
12199 * - #capitalize: Returns symbol with the first character upcased
12200 * and all other characters downcased.
12201 * - #downcase: Returns symbol with all characters downcased.
12202 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12203 * - #name: Returns the frozen string corresponding to symbol.
12204 * - #succ, #next: Returns the symbol that is the successor to symbol.
12205 * - #swapcase: Returns symbol with all upcase characters downcased
12206 * and all downcase characters upcased.
12207 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12208 * - #to_s, #id2name: Returns the string corresponding to +self+.
12209 * - #to_sym, #intern: Returns +self+.
12210 * - #upcase: Returns symbol with all characters upcased.
12211 *
12212 */
12213
12214
12215/*
12216 * call-seq:
12217 * symbol == object -> true or false
12218 *
12219 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12220 */
12221
12222#define sym_equal rb_obj_equal
12223
12224static int
12225sym_printable(const char *s, const char *send, rb_encoding *enc)
12226{
12227 while (s < send) {
12228 int n;
12229 int c = rb_enc_precise_mbclen(s, send, enc);
12230
12231 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12232 n = MBCLEN_CHARFOUND_LEN(c);
12233 c = rb_enc_mbc_to_codepoint(s, send, enc);
12234 if (!rb_enc_isprint(c, enc)) return FALSE;
12235 s += n;
12236 }
12237 return TRUE;
12238}
12239
12240int
12241rb_str_symname_p(VALUE sym)
12242{
12243 rb_encoding *enc;
12244 const char *ptr;
12245 long len;
12246 rb_encoding *resenc = rb_default_internal_encoding();
12247
12248 if (resenc == NULL) resenc = rb_default_external_encoding();
12249 enc = STR_ENC_GET(sym);
12250 ptr = RSTRING_PTR(sym);
12251 len = RSTRING_LEN(sym);
12252 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12253 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12254 return FALSE;
12255 }
12256 return TRUE;
12257}
12258
12259VALUE
12260rb_str_quote_unprintable(VALUE str)
12261{
12262 rb_encoding *enc;
12263 const char *ptr;
12264 long len;
12265 rb_encoding *resenc;
12266
12267 Check_Type(str, T_STRING);
12268 resenc = rb_default_internal_encoding();
12269 if (resenc == NULL) resenc = rb_default_external_encoding();
12270 enc = STR_ENC_GET(str);
12271 ptr = RSTRING_PTR(str);
12272 len = RSTRING_LEN(str);
12273 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12274 !sym_printable(ptr, ptr + len, enc)) {
12275 return rb_str_escape(str);
12276 }
12277 return str;
12278}
12279
12280VALUE
12281rb_id_quote_unprintable(ID id)
12282{
12283 VALUE str = rb_id2str(id);
12284 if (!rb_str_symname_p(str)) {
12285 return rb_str_escape(str);
12286 }
12287 return str;
12288}
12289
12290/*
12291 * call-seq:
12292 * inspect -> string
12293 *
12294 * Returns a string representation of +self+ (including the leading colon):
12295 *
12296 * :foo.inspect # => ":foo"
12297 *
12298 * Related: Symbol#to_s, Symbol#name.
12299 *
12300 */
12301
12302static VALUE
12303sym_inspect(VALUE sym)
12304{
12305 VALUE str = rb_sym2str(sym);
12306 const char *ptr;
12307 long len;
12308 char *dest;
12309
12310 if (!rb_str_symname_p(str)) {
12311 str = rb_str_inspect(str);
12312 len = RSTRING_LEN(str);
12313 rb_str_resize(str, len + 1);
12314 dest = RSTRING_PTR(str);
12315 memmove(dest + 1, dest, len);
12316 }
12317 else {
12318 rb_encoding *enc = STR_ENC_GET(str);
12319 VALUE orig_str = str;
12320
12321 len = RSTRING_LEN(orig_str);
12322 str = rb_enc_str_new(0, len + 1, enc);
12323
12324 // Get data pointer after allocation
12325 ptr = RSTRING_PTR(orig_str);
12326 dest = RSTRING_PTR(str);
12327 memcpy(dest + 1, ptr, len);
12328
12329 RB_GC_GUARD(orig_str);
12330 }
12331 dest[0] = ':';
12332
12334
12335 return str;
12336}
12337
12338VALUE
12340{
12341 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12342 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12343 return str;
12344}
12345
12346VALUE
12347rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12348{
12349 VALUE obj;
12350
12351 if (argc < 1) {
12352 rb_raise(rb_eArgError, "no receiver given");
12353 }
12354 obj = argv[0];
12355 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12356}
12357
12358/*
12359 * call-seq:
12360 * succ
12361 *
12362 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12363 *
12364 * :foo.succ # => :fop
12365 *
12366 * Related: String#succ.
12367 */
12368
12369static VALUE
12370sym_succ(VALUE sym)
12371{
12372 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12373}
12374
12375/*
12376 * call-seq:
12377 * self <=> other -> -1, 0, 1, or nil
12378 *
12379 * Compares +self+ and +other+, using String#<=>.
12380 *
12381 * Returns:
12382 *
12383 * - <tt>symbol.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12384 * - +nil+, otherwise.
12385 *
12386 * Examples:
12387 *
12388 * :bar <=> :foo # => -1
12389 * :foo <=> :foo # => 0
12390 * :foo <=> :bar # => 1
12391 * :foo <=> 'bar' # => nil
12392 *
12393 * \Class \Symbol includes module Comparable,
12394 * each of whose methods uses Symbol#<=> for comparison.
12395 *
12396 * Related: String#<=>.
12397 */
12398
12399static VALUE
12400sym_cmp(VALUE sym, VALUE other)
12401{
12402 if (!SYMBOL_P(other)) {
12403 return Qnil;
12404 }
12405 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12406}
12407
12408/*
12409 * call-seq:
12410 * casecmp(object) -> -1, 0, 1, or nil
12411 *
12412 * :include: doc/symbol/casecmp.rdoc
12413 *
12414 */
12415
12416static VALUE
12417sym_casecmp(VALUE sym, VALUE other)
12418{
12419 if (!SYMBOL_P(other)) {
12420 return Qnil;
12421 }
12422 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12423}
12424
12425/*
12426 * call-seq:
12427 * casecmp?(object) -> true, false, or nil
12428 *
12429 * :include: doc/symbol/casecmp_p.rdoc
12430 *
12431 */
12432
12433static VALUE
12434sym_casecmp_p(VALUE sym, VALUE other)
12435{
12436 if (!SYMBOL_P(other)) {
12437 return Qnil;
12438 }
12439 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12440}
12441
12442/*
12443 * call-seq:
12444 * symbol =~ object -> integer or nil
12445 *
12446 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12447 * including possible updates to global variables;
12448 * see String#=~.
12449 *
12450 */
12451
12452static VALUE
12453sym_match(VALUE sym, VALUE other)
12454{
12455 return rb_str_match(rb_sym2str(sym), other);
12456}
12457
12458/*
12459 * call-seq:
12460 * match(pattern, offset = 0) -> matchdata or nil
12461 * match(pattern, offset = 0) {|matchdata| } -> object
12462 *
12463 * Equivalent to <tt>self.to_s.match</tt>,
12464 * including possible updates to global variables;
12465 * see String#match.
12466 *
12467 */
12468
12469static VALUE
12470sym_match_m(int argc, VALUE *argv, VALUE sym)
12471{
12472 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12473}
12474
12475/*
12476 * call-seq:
12477 * match?(pattern, offset = 0) -> true or false
12478 *
12479 * Equivalent to <tt>sym.to_s.match?</tt>;
12480 * see String#match?.
12481 *
12482 */
12483
12484static VALUE
12485sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12486{
12487 return rb_str_match_m_p(argc, argv, sym);
12488}
12489
12490/*
12491 * call-seq:
12492 * symbol[index] -> string or nil
12493 * symbol[start, length] -> string or nil
12494 * symbol[range] -> string or nil
12495 * symbol[regexp, capture = 0] -> string or nil
12496 * symbol[substring] -> string or nil
12497 *
12498 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12499 *
12500 */
12501
12502static VALUE
12503sym_aref(int argc, VALUE *argv, VALUE sym)
12504{
12505 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12506}
12507
12508/*
12509 * call-seq:
12510 * length -> integer
12511 *
12512 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12513 */
12514
12515static VALUE
12516sym_length(VALUE sym)
12517{
12518 return rb_str_length(rb_sym2str(sym));
12519}
12520
12521/*
12522 * call-seq:
12523 * empty? -> true or false
12524 *
12525 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12526 *
12527 */
12528
12529static VALUE
12530sym_empty(VALUE sym)
12531{
12532 return rb_str_empty(rb_sym2str(sym));
12533}
12534
12535/*
12536 * call-seq:
12537 * upcase(mapping) -> symbol
12538 *
12539 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12540 *
12541 * See String#upcase.
12542 *
12543 */
12544
12545static VALUE
12546sym_upcase(int argc, VALUE *argv, VALUE sym)
12547{
12548 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12549}
12550
12551/*
12552 * call-seq:
12553 * downcase(mapping) -> symbol
12554 *
12555 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12556 *
12557 * See String#downcase.
12558 *
12559 * Related: Symbol#upcase.
12560 *
12561 */
12562
12563static VALUE
12564sym_downcase(int argc, VALUE *argv, VALUE sym)
12565{
12566 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12567}
12568
12569/*
12570 * call-seq:
12571 * capitalize(mapping) -> symbol
12572 *
12573 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12574 *
12575 * See String#capitalize.
12576 *
12577 */
12578
12579static VALUE
12580sym_capitalize(int argc, VALUE *argv, VALUE sym)
12581{
12582 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12583}
12584
12585/*
12586 * call-seq:
12587 * swapcase(mapping) -> symbol
12588 *
12589 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12590 *
12591 * See String#swapcase.
12592 *
12593 */
12594
12595static VALUE
12596sym_swapcase(int argc, VALUE *argv, VALUE sym)
12597{
12598 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12599}
12600
12601/*
12602 * call-seq:
12603 * start_with?(*string_or_regexp) -> true or false
12604 *
12605 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12606 *
12607 */
12608
12609static VALUE
12610sym_start_with(int argc, VALUE *argv, VALUE sym)
12611{
12612 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12613}
12614
12615/*
12616 * call-seq:
12617 * end_with?(*strings) -> true or false
12618 *
12619 *
12620 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12621 *
12622 */
12623
12624static VALUE
12625sym_end_with(int argc, VALUE *argv, VALUE sym)
12626{
12627 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12628}
12629
12630/*
12631 * call-seq:
12632 * encoding -> encoding
12633 *
12634 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12635 *
12636 */
12637
12638static VALUE
12639sym_encoding(VALUE sym)
12640{
12641 return rb_obj_encoding(rb_sym2str(sym));
12642}
12643
12644static VALUE
12645string_for_symbol(VALUE name)
12646{
12647 if (!RB_TYPE_P(name, T_STRING)) {
12648 VALUE tmp = rb_check_string_type(name);
12649 if (NIL_P(tmp)) {
12650 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12651 name);
12652 }
12653 name = tmp;
12654 }
12655 return name;
12656}
12657
12658ID
12660{
12661 if (SYMBOL_P(name)) {
12662 return SYM2ID(name);
12663 }
12664 name = string_for_symbol(name);
12665 return rb_intern_str(name);
12666}
12667
12668VALUE
12670{
12671 if (SYMBOL_P(name)) {
12672 return name;
12673 }
12674 name = string_for_symbol(name);
12675 return rb_str_intern(name);
12676}
12677
12678/*
12679 * call-seq:
12680 * Symbol.all_symbols -> array_of_symbols
12681 *
12682 * Returns an array of all symbols currently in Ruby's symbol table:
12683 *
12684 * Symbol.all_symbols.size # => 9334
12685 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12686 *
12687 */
12688
12689static VALUE
12690sym_all_symbols(VALUE _)
12691{
12692 return rb_sym_all_symbols();
12693}
12694
12695VALUE
12696rb_str_to_interned_str(VALUE str)
12697{
12698 return rb_fstring(str);
12699}
12700
12701VALUE
12702rb_interned_str(const char *ptr, long len)
12703{
12704 struct RString fake_str = {RBASIC_INIT};
12705 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12706}
12707
12708VALUE
12710{
12711 return rb_interned_str(ptr, strlen(ptr));
12712}
12713
12714VALUE
12715rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12716{
12717 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12718 rb_enc_autoload(enc);
12719 }
12720
12721 struct RString fake_str = {RBASIC_INIT};
12722 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12723}
12724
12725VALUE
12726rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12727{
12728 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12729 rb_enc_autoload(enc);
12730 }
12731
12732 struct RString fake_str = {RBASIC_INIT};
12733 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12734 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12735 return str;
12736}
12737
12738VALUE
12740{
12741 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12742}
12743
12744#if USE_YJIT || USE_ZJIT
12745void
12746rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
12747{
12748 if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
12749 ssize_t code = RB_NUM2SSIZE(codepoint);
12750
12751 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12752 rb_str_buf_cat_byte(str, (char) code);
12753 return;
12754 }
12755 }
12756
12757 rb_str_concat(str, codepoint);
12758}
12759#endif
12760
12761static int
12762fstring_set_class_i(VALUE *str, void *data)
12763{
12764 RBASIC_SET_CLASS(*str, rb_cString);
12765
12766 return ST_CONTINUE;
12767}
12768
12769void
12770Init_String(void)
12771{
12772 rb_cString = rb_define_class("String", rb_cObject);
12773
12774 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12775
12777 rb_define_alloc_func(rb_cString, empty_str_alloc);
12778 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12779 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12780 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12782 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12783 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12786 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12787 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12788 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12789 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12792 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12793 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12794 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12795 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12798 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12799 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12800 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12801 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12802 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12804 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12806 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12807 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12808 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12809 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12810 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12811 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12812 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12813 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12814 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12815 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12816 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12817 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12818 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12819 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12821 rb_define_method(rb_cString, "+@", str_uplus, 0);
12822 rb_define_method(rb_cString, "-@", str_uminus, 0);
12823 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12824 rb_define_alias(rb_cString, "dedup", "-@");
12825
12826 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12827 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12828 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12829 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12832 rb_define_method(rb_cString, "undump", str_undump, 0);
12833
12834 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12835 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12836 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12837 sym_fold = ID2SYM(rb_intern_const("fold"));
12838
12839 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12840 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12841 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12842 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12843
12844 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12845 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12846 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12847 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12848
12849 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12850 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12851 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12852 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12853 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12854 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12855 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12856 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12857 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12858 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12859 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12860 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12862 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12863 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12864 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12865 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12866 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12867
12868 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12869 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12870 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12871
12872 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12873
12874 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12875 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12876 rb_define_method(rb_cString, "center", rb_str_center, -1);
12877
12878 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12879 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12880 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12881 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12882 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12883 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12884 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12885 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12886 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12887
12888 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12889 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12890 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12891 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12892 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12893 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12894 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12895 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12896 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12897
12898 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12899 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12900 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12901 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12902 rb_define_method(rb_cString, "count", rb_str_count, -1);
12903
12904 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12905 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12906 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12907 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12908
12909 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12910 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12911 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12912 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12913 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12914
12915 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12916
12917 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12918 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12919
12920 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12921 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12922
12923 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12924 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12925 rb_define_method(rb_cString, "b", rb_str_b, 0);
12926 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12927 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12928
12929 /* define UnicodeNormalize module here so that we don't have to look it up */
12930 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12931 id_normalize = rb_intern_const("normalize");
12932 id_normalized_p = rb_intern_const("normalized?");
12933
12934 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12935 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12936 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12937
12938 rb_fs = Qnil;
12939 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12940 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12941 rb_gc_register_address(&rb_fs);
12942
12943 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12947 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12948
12949 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12950 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12951 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12952 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12953 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12954 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12955
12956 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12957 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12958 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12959 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12960
12961 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12962 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12963 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12964 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12965 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12966 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12967 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12968
12969 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12970 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12971 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12972 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12973
12974 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12975 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12976
12977 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12978}
12979
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1796
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1589
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1702
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2956
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2768
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3246
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1010
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3035
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2192
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition object.c:2210
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3606
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3290
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1342
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:947
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1207
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3028
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1226
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12715
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2334
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3732
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1155
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1447
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1348
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:966
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12739
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:831
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1752
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1512
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2487
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3797
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1423
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12339
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2560
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1399
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1746
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3056
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5336
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4160
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3153
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11660
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1788
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1189
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1001
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1518
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1996
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4146
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3565
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2423
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2014
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6543
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3161
rb_gvar_setter_t rb_str_setter
This is an rb_gvar_setter_t that rejects non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12709
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1429
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3763
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3103
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4267
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3387
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7222
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2790
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12702
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4214
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4034
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4189
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3739
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3278
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5820
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11718
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1702
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2950
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3250
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3369
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1201
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2744
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7329
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1411
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1718
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2437
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5738
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9336
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1195
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1850
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2017
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2096
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3402
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1650
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12669
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12659
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1441
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2927
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2809
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1435
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2822
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1779
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:461
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:208
Definition string.c:8216
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113