Ruby 4.0.0dev (2025-11-28 revision dcb9e17f467683b32f429736cf7598b1ce89cdc5)
string.c (dcb9e17f467683b32f429736cf7598b1ce89cdc5)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthands for the begin/end offset of capture `no`.  Both expect a
 * variable named `regs` (with `beg` and `end` arrays) in the using scope. */
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
/* Drop any macro definitions of these names before the code below; several
 * of them (e.g. rb_str_new, rb_fstring_cstr) are defined later in this file
 * as real functions.
 * NOTE(review): presumably the macros come from the public headers -- confirm. */
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * on a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): RUBY_MAX_CHAR_LEN looks like an upper bound on the byte
 * length of a single character -- confirm against its users. */
131#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits; their meaning is described in the
 * "Flags of RString" table above. */
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Mark `str` as non-embedded and clear all sharing-related flags. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Mark `str` as embedded: clears STR_NOEMBED, STR_SHARED and STR_NOFREE. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` as the length of `str`; nothing else is updated. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length of `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length (rb_enc_mbminlen) of the string's encoding. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` zero bytes at `ptr`.  The first byte is
 * always cleared; the memset only runs for terminators wider than one byte.
 * Both arguments are evaluated exactly once. */
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* Resize the buffer of `str` to `capacity` bytes plus the terminator length
 * of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *  - Embedded string: if the slot is too small, the contents are copied to a
 *    freshly allocated heap buffer and the string becomes non-embedded;
 *    otherwise nothing changes.
 *  - Heap string: the buffer is reallocated in place; the string must not be
 *    shared.
 * `capacity` and `termlen` are parenthesized at every use so that arbitrary
 * expressions can be passed safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Record `shared_str` as the shared root of `str`.
 * Does nothing for fake strings (STR_FAKESTR).  Otherwise it asserts that the
 * buffer of `str` lies within the buffer of `shared_str`, stores the
 * reference through RB_OBJ_WRITE, and sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`.  A root whose class is 0 is additionally
 * flagged STR_BORROWED. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
211 } \
212} while (0)
213
/* Heap buffer pointer of a non-embedded string. */
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the stored capacity plus the terminator. */
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
216/* TODO: include the terminator size in capa. */
217
/* Encoding object of `str` (see get_encoding). */
218#define STR_ENC_GET(str) get_encoding(str)
219
/* Unless SHARABLE_MIDDLE_SUBSTRING is enabled, only a substring that reaches
 * the end of the source ((beg) + (len) == (end)) counts as sharable. */
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
222#endif
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#else
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
227#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/* Ask the GC whether an object big enough to embed `len` content bytes and
 * `termlen` terminator bytes can be allocated. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(object_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/*
 * Print a diagnostic to stderr saying that `func` is about to return NULL
 * and that a crash is expected.  Only prints; it does not abort.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
/* Defined when `long` and pointers have the same size.  In that case
 * register_fstring() stores the precomputed hash of a fake string in its
 * `capa` field and fstring_concurrent_set_hash() reads it back from there. */
367#if SIZEOF_LONG == SIZEOF_VOIDP
368#define PRECOMPUTED_FAKESTR_HASH 1
369#else
370#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options passed from register_fstring() to fstring_concurrent_set_create(). */
struct fstr_create_arg {
    /* For fake strings: copy the bytes into a new string instead of
     * wrapping the original pointer as a static string. */
    bool copy;
    /* When copying: allocate the copy embedded with room for a precomputed
     * hash if such an object is allocatable. */
    bool force_precompute_hash;
};
487
/*
 * Create callback of the fstring table: produce the string object that is
 * stored in the table for the lookup key `str`.
 *
 * `data` points to a struct fstr_create_arg.
 *
 *  - Fake string key: it is either copied into a new String (arg->copy) or
 *    wrapped with str_new_static(), then frozen.
 *  - Real string key: it is replaced by a frozen copy when it is not frozen
 *    or is chilled, made independent when shared, and copied once more when
 *    it is not a bare String.
 *
 * The result gets the coderange of the original key, the RSTRING_FSTR flag,
 * and is marked shareable.
 */
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
/* capacity with room for the hash stored behind the terminator */
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
/* embedded copy: bytes, terminator, encoding, then the hash */
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
/* reuse the hash register_fstring() stored in capa, if it fits */
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
/* no copy requested: refer to the caller's buffer directly */
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
/* restore the coderange captured from the original key */
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_obj_exivar_p(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
/* Callback table handed to rb_concurrent_set_new() for the fstring table;
 * no `free` callback is installed. */
559static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
563 .free = NULL,
564};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
610void
611rb_gc_free_fstring(VALUE obj)
612{
613 ASSERT_vm_locking_with_barrier();
614
615 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
617 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
618
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
620
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
622
623 FL_UNSET(obj, RSTRING_FSTR);
624}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
/*
 * Return a pointer to the first byte in [p, e) that has its high bit (0x80)
 * set, or NULL when there is none.
 *
 * Bytes are examined a machine word at a time where possible, using
 * NONASCII_MASK; leading unaligned bytes (when unaligned word access is not
 * available) and the trailing bytes are checked one by one through
 * fall-through switches.
 */
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const uintptr_t *s, *t;
720
/* NONASCII_MASK has the high bit of every byte of a uintptr_t set. */
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
/* advance p to the next word boundary, checking the skipped bytes */
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (uintptr_t *)(value)
764#endif
765 s = aligned_ptr(p);
/* t: words starting at or after it would extend past e */
766 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
768 for (;s < t; s++) {
769 if (*s & NONASCII_MASK) {
/* locate the first flagged byte within the word; which bit-count
 * helper applies depends on the byte order */
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
/* check the remaining bytes (fewer than one word) one by one */
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
946{
947 int cr = ENC_CODERANGE(str);
948
949 if (cr == ENC_CODERANGE_UNKNOWN) {
950 cr = enc_coderange_scan(str, get_encoding(str));
951 ENC_CODERANGE_SET(str, cr);
952 }
953 return cr;
954}
955
956static inline bool
957rb_enc_str_asciicompat(VALUE str)
958{
959 int encindex = ENCODING_GET_INLINED(str);
960 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
961}
962
963int
965{
966 switch(ENC_CODERANGE(str)) {
968 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
970 return true;
971 default:
972 return false;
973 }
974}
975
976static inline void
977str_mod_check(VALUE s, const char *p, long len)
978{
979 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
980 rb_raise(rb_eRuntimeError, "string modified");
981 }
982}
983
984static size_t
985str_capacity(VALUE str, const int termlen)
986{
987 if (STR_EMBED_P(str)) {
988 return str_embed_capa(str) - termlen;
989 }
990 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
991 return RSTRING(str)->len;
992 }
993 else {
994 return RSTRING(str)->as.heap.aux.capa;
995 }
996}
997
998size_t
1000{
1001 return str_capacity(str, TERM_LEN(str));
1002}
1003
1004static inline void
1005must_not_null(const char *ptr)
1006{
1007 if (!ptr) {
1008 rb_raise(rb_eArgError, "NULL pointer given");
1009 }
1010}
1011
1012static inline VALUE
1013str_alloc_embed(VALUE klass, size_t capa)
1014{
1015 size_t size = rb_str_embed_size(capa, 0);
1016 RUBY_ASSERT(size > 0);
1017 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1018
1019 NEWOBJ_OF(str, struct RString, klass,
1021
1022 str->len = 0;
1023 str->as.embed.ary[0] = 0;
1024
1025 return (VALUE)str;
1026}
1027
1028static inline VALUE
1029str_alloc_heap(VALUE klass)
1030{
1031 NEWOBJ_OF(str, struct RString, klass,
1032 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1033
1034 str->len = 0;
1035 str->as.heap.aux.capa = 0;
1036 str->as.heap.ptr = NULL;
1037
1038 return (VALUE)str;
1039}
1040
1041static inline VALUE
1042empty_str_alloc(VALUE klass)
1043{
1044 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1045 VALUE str = str_alloc_embed(klass, 0);
1046 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1048 return str;
1049}
1050
1051static VALUE
1052str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1053{
1054 VALUE str;
1055
1056 if (len < 0) {
1057 rb_raise(rb_eArgError, "negative string size (or size too big)");
1058 }
1059
1060 if (enc == NULL) {
1061 enc = rb_ascii8bit_encoding();
1062 }
1063
1064 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1065
1066 int termlen = rb_enc_mbminlen(enc);
1067
1068 if (STR_EMBEDDABLE_P(len, termlen)) {
1069 str = str_alloc_embed(klass, len + termlen);
1070 if (len == 0) {
1071 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1072 }
1073 }
1074 else {
1075 str = str_alloc_heap(klass);
1076 RSTRING(str)->as.heap.aux.capa = len;
1077 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1078 * integer overflow. If we can STATIC_ASSERT that, the following
1079 * mul_add_mul can be reverted to a simple ALLOC_N. */
1080 RSTRING(str)->as.heap.ptr =
1081 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1082 }
1083
1084 rb_enc_raw_set(str, enc);
1085
1086 if (ptr) {
1087 memcpy(RSTRING_PTR(str), ptr, len);
1088 }
1089 else {
1090 memset(RSTRING_PTR(str), 0, len);
1091 }
1092
1093 STR_SET_LEN(str, len);
1094 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1095 return str;
1096}
1097
1098static VALUE
1099str_new(VALUE klass, const char *ptr, long len)
1100{
1101 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1102}
1103
1104VALUE
1105rb_str_new(const char *ptr, long len)
1106{
1107 return str_new(rb_cString, ptr, len);
1108}
1109
1110VALUE
1111rb_usascii_str_new(const char *ptr, long len)
1112{
1113 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1114}
1115
1116VALUE
1117rb_utf8_str_new(const char *ptr, long len)
1118{
1119 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1120}
1121
1122VALUE
1123rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1124{
1125 return str_enc_new(rb_cString, ptr, len, enc);
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1133 * memory regions, and that cannot be detected by the MSAN. Just
1134 * trust the programmer that the argument passed here is a sane C
1135 * string. */
1136 __msan_unpoison_string(ptr);
1137 return rb_str_new(ptr, strlen(ptr));
1138}
1139
1140VALUE
1142{
1143 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1144}
1145
1146VALUE
1148{
1149 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1150}
1151
1152VALUE
1154{
1155 must_not_null(ptr);
1156 if (rb_enc_mbminlen(enc) != 1) {
1157 rb_raise(rb_eArgError, "wchar encoding given");
1158 }
1159 return rb_enc_str_new(ptr, strlen(ptr), enc);
1160}
1161
1162static VALUE
1163str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1164{
1165 VALUE str;
1166
1167 if (len < 0) {
1168 rb_raise(rb_eArgError, "negative string size (or size too big)");
1169 }
1170
1171 if (!ptr) {
1172 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1173 }
1174 else {
1175 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1176 str = str_alloc_heap(klass);
1177 RSTRING(str)->len = len;
1178 RSTRING(str)->as.heap.ptr = (char *)ptr;
1179 RSTRING(str)->as.heap.aux.capa = len;
1180 RBASIC(str)->flags |= STR_NOFREE;
1181 rb_enc_associate_index(str, encindex);
1182 }
1183 return str;
1184}
1185
1186VALUE
1187rb_str_new_static(const char *ptr, long len)
1188{
1189 return str_new_static(rb_cString, ptr, len, 0);
1190}
1191
1192VALUE
1194{
1195 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1196}
1197
1198VALUE
1200{
1201 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1202}
1203
1204VALUE
1206{
1207 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1208}
1209
/* Forward declaration: rb_str_conv_enc_opts() and rb_str_cat_conv_enc_opts()
 * below call this transcoding worker before its definition appears. */
1210static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1211 rb_encoding *from, rb_encoding *to,
1212 int ecflags, VALUE ecopts);
1213
1214static inline bool
1215is_enc_ascii_string(VALUE str, rb_encoding *enc)
1216{
1217 int encidx = rb_enc_to_index(enc);
1218 if (rb_enc_get_index(str) == encidx)
1219 return is_ascii_string(str);
1220 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1221}
1222
1223VALUE
1224rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1225{
1226 long len;
1227 const char *ptr;
1228 VALUE newstr;
1229
1230 if (!to) return str;
1231 if (!from) from = rb_enc_get(str);
1232 if (from == to) return str;
1233 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1234 rb_is_ascii8bit_enc(to)) {
1235 if (STR_ENC_GET(str) != to) {
1236 str = rb_str_dup(str);
1237 rb_enc_associate(str, to);
1238 }
1239 return str;
1240 }
1241
1242 RSTRING_GETMEM(str, ptr, len);
1243 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1244 from, to, ecflags, ecopts);
1245 if (NIL_P(newstr)) {
1246 /* some error, return original */
1247 return str;
1248 }
1249 return newstr;
1250}
1251
1252VALUE
1253rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1254 rb_encoding *from, int ecflags, VALUE ecopts)
1255{
1256 long olen;
1257
1258 olen = RSTRING_LEN(newstr);
1259 if (ofs < -olen || olen < ofs)
1260 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1261 if (ofs < 0) ofs += olen;
1262 if (!from) {
1263 STR_SET_LEN(newstr, ofs);
1264 return rb_str_cat(newstr, ptr, len);
1265 }
1266
1267 rb_str_modify(newstr);
1268 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1269 rb_enc_get(newstr),
1270 ecflags, ecopts);
1271}
1272
1273VALUE
1274rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1275{
1276 STR_SET_LEN(str, 0);
1277 rb_enc_associate(str, enc);
1278 rb_str_cat(str, ptr, len);
1279 return str;
1280}
1281
1282static VALUE
1283str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1284 rb_encoding *from, rb_encoding *to,
1285 int ecflags, VALUE ecopts)
1286{
1287 rb_econv_t *ec;
1289 long olen;
1290 VALUE econv_wrapper;
1291 const unsigned char *start, *sp;
1292 unsigned char *dest, *dp;
1293 size_t converted_output = (size_t)ofs;
1294
1295 olen = rb_str_capacity(newstr);
1296
1297 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1298 RBASIC_CLEAR_CLASS(econv_wrapper);
1299 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1300 if (!ec) return Qnil;
1301 DATA_PTR(econv_wrapper) = ec;
1302
1303 sp = (unsigned char*)ptr;
1304 start = sp;
1305 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1306 (dp = dest + converted_output),
1307 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1309 /* destination buffer short */
1310 size_t converted_input = sp - start;
1311 size_t rest = len - converted_input;
1312 converted_output = dp - dest;
1313 rb_str_set_len(newstr, converted_output);
1314 if (converted_input && converted_output &&
1315 rest < (LONG_MAX / converted_output)) {
1316 rest = (rest * converted_output) / converted_input;
1317 }
1318 else {
1319 rest = olen;
1320 }
1321 olen += rest < 2 ? 2 : rest;
1322 rb_str_resize(newstr, olen);
1323 }
1324 DATA_PTR(econv_wrapper) = 0;
1325 RB_GC_GUARD(econv_wrapper);
1326 rb_econv_close(ec);
1327 switch (ret) {
1328 case econv_finished:
1329 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1330 rb_str_set_len(newstr, len);
1331 rb_enc_associate(newstr, to);
1332 return newstr;
1333
1334 default:
1335 return Qnil;
1336 }
1337}
1338
1339VALUE
1341{
1342 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1343}
1344
1345VALUE
1347{
1348 rb_encoding *ienc;
1349 VALUE str;
1350 const int eidx = rb_enc_to_index(eenc);
1351
1352 if (!ptr) {
1353 return rb_enc_str_new(ptr, len, eenc);
1354 }
1355
1356 /* ASCII-8BIT case, no conversion */
1357 if ((eidx == rb_ascii8bit_encindex()) ||
1358 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1359 return rb_str_new(ptr, len);
1360 }
1361 /* no default_internal or same encoding, no conversion */
1362 ienc = rb_default_internal_encoding();
1363 if (!ienc || eenc == ienc) {
1364 return rb_enc_str_new(ptr, len, eenc);
1365 }
1366 /* ASCII compatible, and ASCII only string, no conversion in
1367 * default_internal */
1368 if ((eidx == rb_ascii8bit_encindex()) ||
1369 (eidx == rb_usascii_encindex()) ||
1370 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1371 return rb_enc_str_new(ptr, len, ienc);
1372 }
1373 /* convert from the given encoding to default_internal */
1374 str = rb_enc_str_new(NULL, 0, ienc);
1375 /* when the conversion failed for some reason, just ignore the
1376 * default_internal and result in the given encoding as-is. */
1377 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1378 rb_str_initialize(str, ptr, len, eenc);
1379 }
1380 return str;
1381}
1382
1383VALUE
1384rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1385{
1386 int eidx = rb_enc_to_index(eenc);
1387 if (eidx == rb_usascii_encindex() &&
1388 !is_ascii_string(str)) {
1389 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1390 return str;
1391 }
1392 rb_enc_associate_index(str, eidx);
1393 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1394}
1395
1396VALUE
1397rb_external_str_new(const char *ptr, long len)
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1406}
1407
1408VALUE
1409rb_locale_str_new(const char *ptr, long len)
1410{
1411 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1424}
1425
1426VALUE
1428{
1429 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1430}
1431
1432VALUE
1434{
1435 return rb_str_export_to_enc(str, rb_default_external_encoding());
1436}
1437
1438VALUE
1440{
1441 return rb_str_export_to_enc(str, rb_locale_encoding());
1442}
1443
1444VALUE
1446{
1447 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1448}
1449
/*
 * Makes +str2+ hold the same bytes as +str+ without touching the encoding
 * or code range of +str2+.
 *
 * If the bytes plus terminator fit in the embedded area of +str2+ they are
 * copied there.  Otherwise +str2+ is pointed at the buffer of a frozen root
 * string (the existing shared root of +str+, or a frozen copy made with
 * rb_str_new_frozen()) and marked as sharing it.
 *
 * Returns +str2+.
 */
1450static VALUE
1451str_replace_shared_without_enc(VALUE str2, VALUE str)
1452{
1453 const int termlen = TERM_LEN(str);
1454 char *ptr;
1455 long len;
1456
1457 RSTRING_GETMEM(str, ptr, len);
1458 if (str_embed_capa(str2) >= len + termlen) {
 /* Small enough: copy the bytes into str2's own embedded storage. */
1459 char *ptr2 = RSTRING(str2)->as.embed.ary;
1460 STR_SET_EMBED(str2);
1461 memcpy(ptr2, RSTRING_PTR(str), len);
1462 TERM_FILL(ptr2+len, termlen);
1463 }
1464 else {
1465 VALUE root;
1466 if (STR_SHARED_P(str)) {
1467 root = RSTRING(str)->as.heap.aux.shared;
1468 RSTRING_GETMEM(str, ptr, len);
1469 }
1470 else {
 /* Take pointer and length from the frozen root, not from str. */
1471 root = rb_str_new_frozen(str);
1472 RSTRING_GETMEM(root, ptr, len);
1473 }
1474 RUBY_ASSERT(OBJ_FROZEN(root));
1475
 /* Free str2's old heap buffer only when str2 owns it (not embedded,
  * not shared, not STR_NOFREE) and it is not the buffer being adopted. */
1476 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1477 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1478 rb_fatal("about to free a possible shared root");
1479 }
1480 char *ptr2 = STR_HEAP_PTR(str2);
1481 if (ptr2 != ptr) {
1482 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1483 }
1484 }
1485 FL_SET(str2, STR_NOEMBED);
1486 RSTRING(str2)->as.heap.ptr = ptr;
1487 STR_SET_SHARED(str2, root);
1488 }
1489
1490 STR_SET_LEN(str2, len);
1491
1492 return str2;
1493}
1494
1495static VALUE
1496str_replace_shared(VALUE str2, VALUE str)
1497{
1498 str_replace_shared_without_enc(str2, str);
1499 rb_enc_cr_str_exact_copy(str2, str);
1500 return str2;
1501}
1502
1503static VALUE
1504str_new_shared(VALUE klass, VALUE str)
1505{
1506 return str_replace_shared(str_alloc_heap(klass), str);
1507}
1508
1509VALUE
1511{
1512 return str_new_shared(rb_obj_class(str), str);
1513}
1514
1515VALUE
1517{
1518 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1519 return str_new_frozen(rb_obj_class(orig), orig);
1520}
1521
1522static VALUE
1523rb_str_new_frozen_String(VALUE orig)
1524{
1525 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1526 return str_new_frozen(rb_cString, orig);
1527}
1528
1529
1530VALUE
1531rb_str_frozen_bare_string(VALUE orig)
1532{
1533 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1534 return str_new_frozen(rb_cString, orig);
1535}
1536
1537VALUE
1538rb_str_tmp_frozen_acquire(VALUE orig)
1539{
1540 if (OBJ_FROZEN_RAW(orig)) return orig;
1541 return str_new_frozen_buffer(0, orig, FALSE);
1542}
1543
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned string is never
 * embedded: its bytes always live behind as.heap.ptr.
 *
 * Depending on the state of +orig+ this returns +orig+ itself, delegates to
 * rb_str_tmp_frozen_acquire(), copies the bytes into a fresh heap buffer,
 * or takes over +orig+'s heap buffer and makes +orig+ share it.
 */
1544VALUE
1545rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1546{
1547 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1548 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1549
 /* Classless (klass 0), frozen, heap-backed temporary. */
1550 VALUE str = str_alloc_heap(0);
1551 OBJ_FREEZE(str);
1552 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1553 FL_SET(str, STR_SHARED_ROOT);
1554
1555 size_t capa = str_capacity(orig, TERM_LEN(orig));
1556
1557 /* If the string is embedded then we want to create a copy that is heap
1558 * allocated. If the string is shared then the shared root must be
1559 * embedded, so we want to create a copy. If the string is a shared root
1560 * then it must be embedded, so we want to create a copy. */
1561 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
 /* Room for capa bytes plus the terminator; only capa bytes are copied. */
1562 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1563 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1564 }
1565 else {
1566 /* orig must be heap allocated and not shared, so we can safely transfer
1567 * the pointer to str. */
1568 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
 /* STR_NOFREE moves from orig to str along with the buffer. */
1569 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1570 RBASIC(orig)->flags &= ~STR_NOFREE;
1571 STR_SET_SHARED(orig, str);
1572 if (RB_OBJ_SHAREABLE_P(orig)) {
1573 RB_OBJ_SET_SHAREABLE(str);
1574 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1575 }
1576 }
1577
1578 RSTRING(str)->len = RSTRING(orig)->len;
1579 RSTRING(str)->as.heap.aux.capa = capa;
1580
1581 return str;
1582}
1583
1584void
1585rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1586{
1587 if (RBASIC_CLASS(tmp) != 0)
1588 return;
1589
1590 if (STR_EMBED_P(tmp)) {
1592 }
1593 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1594 !OBJ_FROZEN_RAW(orig)) {
1595 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1596
1597 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1598 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1599 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1600
1601 /* Unshare orig since the root (tmp) only has this one child. */
1602 FL_UNSET_RAW(orig, STR_SHARED);
1603 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1604 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1606
1607 /* Make tmp embedded and empty so it is safe for sweeping. */
1608 STR_SET_EMBED(tmp);
1609 STR_SET_LEN(tmp, 0);
1610 }
1611 }
1612}
1613
1614static VALUE
1615str_new_frozen(VALUE klass, VALUE orig)
1616{
1617 return str_new_frozen_buffer(klass, orig, TRUE);
1618}
1619
/*
 * Turns the heap-backed, unshared string +orig+ into a child of a newly
 * allocated string of class +klass+: the new string takes over the buffer
 * pointer, capacity and STR_NOFREE flag, and +orig+ is marked as sharing
 * it.  Returns the new string.
 */
1620static VALUE
1621heap_str_make_shared(VALUE klass, VALUE orig)
1622{
1623 RUBY_ASSERT(!STR_EMBED_P(orig));
1624 RUBY_ASSERT(!STR_SHARED_P(orig));
1626
1627 VALUE str = str_alloc_heap(klass);
1628 STR_SET_LEN(str, RSTRING_LEN(orig));
1629 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1630 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
 /* STR_NOFREE follows the buffer to its new owner. */
1631 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1632 RBASIC(orig)->flags &= ~STR_NOFREE;
1633 STR_SET_SHARED(orig, str);
 /* A classless result starts out with STR_BORROWED cleared. */
1634 if (klass == 0)
1635 FL_UNSET_RAW(str, STR_BORROWED);
1636 return str;
1637}
1638
/*
 * Returns a frozen string of class +klass+ with the contents of +orig+.
 *
 * When +copy_encoding+ is false the encoding used is ASCII-8BIT and the
 * terminator length is 1; otherwise both come from +orig+, and the encoding
 * and code range are copied onto the result.
 *
 * Small contents are copied into an embedded string.  Larger contents reuse
 * or share a buffer: an existing shared root may be returned directly, a
 * new string may be made to share that root, or +orig+ itself is converted
 * into a child of the result.
 */
1639static VALUE
1640str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1641{
1642 VALUE str;
1643
1644 long len = RSTRING_LEN(orig);
1645 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1646 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1647
1648 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1649 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1650 RUBY_ASSERT(STR_EMBED_P(str));
1651 }
1652 else {
1653 if (FL_TEST_RAW(orig, STR_SHARED)) {
1654 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
 /* ofs/rest: bytes of the root before and after orig's window. */
1655 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1656 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1657 RUBY_ASSERT(ofs >= 0);
1658 RUBY_ASSERT(rest >= 0);
1659 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1661
 /* The root can only be returned as is when orig covers all of it
  * and class and encoding match. */
1662 if ((ofs > 0) || (rest > 0) ||
1663 (klass != RBASIC(shared)->klass) ||
1664 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1665 str = str_new_shared(klass, shared);
1666 RUBY_ASSERT(!STR_EMBED_P(str));
1667 RSTRING(str)->as.heap.ptr += ofs;
1668 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1669 }
1670 else {
 /* Returning a classless root: flag it STR_BORROWED. */
1671 if (RBASIC_CLASS(shared) == 0)
1672 FL_SET_RAW(shared, STR_BORROWED);
1673 return shared;
1674 }
1675 }
1676 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1677 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1678 STR_SET_EMBED(str);
1679 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1680 STR_SET_LEN(str, RSTRING_LEN(orig));
1681 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1682 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1683 }
1684 else {
 /* A shareable orig gets a private copy instead of being turned
  * into a child of the new string. */
1685 if (RB_OBJ_SHAREABLE_P(orig)) {
1686 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 }
1688 else {
1689 str = heap_str_make_shared(klass, orig);
1690 }
1691 }
1692 }
1693
1694 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1695 OBJ_FREEZE(str);
1696 return str;
1697}
1698
1699VALUE
1700rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1701{
1702 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1703}
1704
1705static VALUE
1706str_new_empty_String(VALUE str)
1707{
1708 VALUE v = rb_str_new(0, 0);
1709 rb_enc_copy(v, str);
1710 return v;
1711}
1712
/* Lower bound applied to an explicitly requested capacity (see the
 * capacity handling in rb_str_init()). */
1713#define STR_BUF_MIN_SIZE 63
1714
1715VALUE
1717{
1718 if (STR_EMBEDDABLE_P(capa, 1)) {
1719 return str_alloc_embed(rb_cString, capa + 1);
1720 }
1721
1722 VALUE str = str_alloc_heap(rb_cString);
1723
1724 RSTRING(str)->as.heap.aux.capa = capa;
1725 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1726 RSTRING(str)->as.heap.ptr[0] = '\0';
1727
1728 return str;
1729}
1730
1731VALUE
1733{
1734 VALUE str;
1735 long len = strlen(ptr);
1736
1737 str = rb_str_buf_new(len);
1738 rb_str_buf_cat(str, ptr, len);
1739
1740 return str;
1741}
1742
1743VALUE
1745{
1746 return str_new(0, 0, len);
1747}
1748
1749void
1751{
1752 if (STR_EMBED_P(str)) {
1753 RB_DEBUG_COUNTER_INC(obj_str_embed);
1754 }
1755 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1756 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1757 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1758 }
1759 else {
1760 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1761 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1762 }
1763}
1764
1765size_t
1766rb_str_memsize(VALUE str)
1767{
1768 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1769 return STR_HEAP_SIZE(str);
1770 }
1771 else {
1772 return 0;
1773 }
1774}
1775
1776VALUE
1778{
1779 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1780}
1781
/* Forward declarations for helpers used before their definitions:
 * str_discard() is called by str_shared_replace(), which in turn is called
 * by rb_str_shared_replace() just below. */
1782static inline void str_discard(VALUE str);
1783static void str_shared_replace(VALUE str, VALUE str2);
1784
1785void
1787{
1788 if (str != str2) str_shared_replace(str, str2);
1789}
1790
/*
 * Moves the contents, encoding and code range of +str2+ into +str+ and
 * leaves +str2+ as an empty embedded string.  +str+ and +str2+ must be
 * different objects.
 *
 * If the bytes fit, they are copied into +str+'s embedded storage;
 * otherwise +str+ takes over +str2+'s heap pointer (an embedded +str2+ is
 * first moved to a heap buffer).
 */
1791static void
1792str_shared_replace(VALUE str, VALUE str2)
1793{
1794 rb_encoding *enc;
1795 int cr;
1796 int termlen;
1797
1798 RUBY_ASSERT(str2 != str);
1799 enc = STR_ENC_GET(str2);
1800 cr = ENC_CODERANGE(str2);
1801 str_discard(str);
1802 termlen = rb_enc_mbminlen(enc);
1803
1804 STR_SET_LEN(str, RSTRING_LEN(str2));
1805
1806 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
 /* Fits: copy bytes and terminator into str's embedded storage. */
1807 STR_SET_EMBED(str);
1808 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812 else {
1813 if (STR_EMBED_P(str2)) {
 /* Move str2's embedded bytes to a heap buffer so the pointer can
  * be handed to str below. */
1814 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1815 long len = RSTRING_LEN(str2);
1816 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1817
1818 char *new_ptr = ALLOC_N(char, len + termlen);
1819 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1820 RSTRING(str2)->as.heap.ptr = new_ptr;
1821 STR_SET_LEN(str2, len);
1822 RSTRING(str2)->as.heap.aux.capa = len;
1823 STR_SET_NOEMBED(str2);
1824 }
1825
1826 STR_SET_NOEMBED(str);
1827 FL_UNSET(str, STR_SHARED);
1828 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1829
 /* aux holds either the shared root or the capacity; carry over
  * whichever one str2 had. */
1830 if (FL_TEST(str2, STR_SHARED)) {
1831 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1832 STR_SET_SHARED(str, shared);
1833 }
1834 else {
1835 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1836 }
1837
1838 /* abandon str2 */
1839 STR_SET_EMBED(str2);
1840 RSTRING_PTR(str2)[0] = 0;
1841 STR_SET_LEN(str2, 0);
1842 rb_enc_associate(str, enc);
1843 ENC_CODERANGE_SET(str, cr);
1844 }
1845}
1846
1847VALUE
1849{
1850 VALUE str;
1851
1852 if (RB_TYPE_P(obj, T_STRING)) {
1853 return obj;
1854 }
1855 str = rb_funcall(obj, idTo_s, 0);
1856 return rb_obj_as_string_result(str, obj);
1857}
1858
1859VALUE
1860rb_obj_as_string_result(VALUE str, VALUE obj)
1861{
1862 if (!RB_TYPE_P(str, T_STRING))
1863 return rb_any_to_s(obj);
1864 return str;
1865}
1866
/*
 * Makes +str+ hold the contents, encoding and code range of +str2+.
 *
 * When +str2+ already shares a root buffer, +str+ is pointed at the same
 * bytes and attached to the same root; otherwise the work is done by
 * str_replace_shared().  Returns +str+.
 */
1867static VALUE
1868str_replace(VALUE str, VALUE str2)
1869{
1870 long len;
1871
1872 len = RSTRING_LEN(str2);
1873 if (STR_SHARED_P(str2)) {
1874 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1876 STR_SET_NOEMBED(str);
1877 STR_SET_LEN(str, len);
1878 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1879 STR_SET_SHARED(str, shared);
1880 rb_enc_cr_str_exact_copy(str, str2);
1881 }
1882 else {
1883 str_replace_shared(str, str2);
1884 }
1885
1886 return str;
1887}
1888
1889static inline VALUE
1890ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1891{
1892 size_t size = rb_str_embed_size(capa, 0);
1893 RUBY_ASSERT(size > 0);
1894 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1895
1896 NEWOBJ_OF(str, struct RString, klass,
1898
1899 str->len = 0;
1900
1901 return (VALUE)str;
1902}
1903
/*
 * Allocates a non-embedded (STR_NOEMBED) string object of class +klass+,
 * using the execution context +ec+ for the allocation.  The heap pointer
 * starts out NULL and the capacity 0; the caller fills them in.
 */
1904static inline VALUE
1905ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1906{
1907 NEWOBJ_OF(str, struct RString, klass,
1908 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1909
1910 str->as.heap.aux.capa = 0;
1911 str->as.heap.ptr = NULL;
1912
1913 return (VALUE)str;
1914}
1915
1916static inline VALUE
1917str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1918{
1919 int encidx = 0;
1920 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1921 encidx = rb_enc_get_index(str);
1922 flags &= ~ENCODING_MASK;
1923 }
1924 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1925 if (encidx) rb_enc_associate_index(dup, encidx);
1926 return dup;
1927}
1928
/* Flag bits read from the source string when duplicating: code range,
 * encoding and frozen state (the frozen bit is dropped again in
 * str_duplicate_setup_encoding()). */
1929static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1930
1931static inline VALUE
1932str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1933{
1934 VALUE flags = FL_TEST_RAW(str, flag_mask);
1935 long len = RSTRING_LEN(str);
1936
1937 RUBY_ASSERT(STR_EMBED_P(dup));
1938 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1939 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1940 STR_SET_LEN(dup, RSTRING_LEN(str));
1941 return str_duplicate_setup_encoding(str, dup, flags);
1942}
1943
/*
 * Sets up the already allocated heap string +dup+ as a duplicate of the
 * heap string +str+ that shares its buffer instead of copying it.
 *
 * The shared root is the existing root of +str+, +str+ itself when it is
 * frozen, or a frozen copy made with str_new_frozen() when +str+ is
 * neither shared nor frozen.  Returns +dup+.
 */
1944static inline VALUE
1945str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1946{
1947 VALUE flags = FL_TEST_RAW(str, flag_mask);
1948 VALUE root = str;
1949 if (FL_TEST_RAW(str, STR_SHARED)) {
1950 root = RSTRING(str)->as.heap.aux.shared;
1951 }
1952 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
 /* From here on str refers to the frozen copy, so flags are re-read. */
1953 root = str = str_new_frozen(klass, str);
1954 flags = FL_TEST_RAW(str, flag_mask);
1955 }
1956 RUBY_ASSERT(!STR_SHARED_P(root));
1958
1959 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1960 FL_SET(root, STR_SHARED_ROOT);
 /* Store the root through RB_OBJ_WRITE rather than a plain assignment. */
1961 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1962 flags |= RSTRING_NOEMBED | STR_SHARED;
1963
1964 STR_SET_LEN(dup, RSTRING_LEN(str));
1965 return str_duplicate_setup_encoding(str, dup, flags);
1966}
1967
1968static inline VALUE
1969str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1970{
1971 if (STR_EMBED_P(str)) {
1972 return str_duplicate_setup_embed(klass, str, dup);
1973 }
1974 else {
1975 return str_duplicate_setup_heap(klass, str, dup);
1976 }
1977}
1978
1979static inline VALUE
1980str_duplicate(VALUE klass, VALUE str)
1981{
1982 VALUE dup;
1983 if (STR_EMBED_P(str)) {
1984 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1985 }
1986 else {
1987 dup = str_alloc_heap(klass);
1988 }
1989
1990 return str_duplicate_setup(klass, str, dup);
1991}
1992
1993VALUE
1995{
1996 return str_duplicate(rb_obj_class(str), str);
1997}
1998
1999/* :nodoc: */
2000VALUE
2001rb_str_dup_m(VALUE str)
2002{
2003 if (LIKELY(BARE_STRING_P(str))) {
2004 return str_duplicate(rb_cString, str);
2005 }
2006 else {
2007 return rb_obj_dup(str);
2008 }
2009}
2010
2011VALUE
2013{
2014 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2015 return str_duplicate(rb_cString, str);
2016}
2017
2018VALUE
2019rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2020{
2021 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2022 VALUE new_str, klass = rb_cString;
2023
2024 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2025 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2026 str_duplicate_setup_embed(klass, str, new_str);
2027 }
2028 else {
2029 new_str = ec_str_alloc_heap(ec, klass);
2030 str_duplicate_setup_heap(klass, str, new_str);
2031 }
2032 if (chilled) {
2033 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2034 }
2035 return new_str;
2036}
2037
2038VALUE
2039rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2040{
2041 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2042 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2043 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2044 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2045 return rb_str_freeze(str);
2046}
2047
2048/*
2049 * The documentation block below uses an include (instead of inline text)
2050 * because the included text has non-ASCII characters (which are not allowed in a C file).
2051 */
2052
2053/*
2054 *
2055 * call-seq:
2056 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2057 *
2058 * :include: doc/string/new.rdoc
2059 *
2060 */
2061
2062static VALUE
2063rb_str_init(int argc, VALUE *argv, VALUE str)
2064{
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, venc, vcapa;
2067 VALUE kwargs[2];
2068 rb_encoding *enc = 0;
2069 int n;
2070
2071 if (!keyword_ids[0]) {
2072 keyword_ids[0] = rb_id_encoding();
2073 CONST_ID(keyword_ids[1], "capacity");
2074 }
2075
2076 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2077 if (!NIL_P(opt)) {
2078 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2079 venc = kwargs[0];
2080 vcapa = kwargs[1];
2081 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2082 enc = rb_to_encoding(venc);
2083 }
2084 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2085 long capa = NUM2LONG(vcapa);
2086 long len = 0;
2087 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2088
2089 if (capa < STR_BUF_MIN_SIZE) {
2090 capa = STR_BUF_MIN_SIZE;
2091 }
2092 if (n == 1) {
2093 StringValue(orig);
2094 len = RSTRING_LEN(orig);
2095 if (capa < len) {
2096 capa = len;
2097 }
2098 if (orig == str) n = 0;
2099 }
2100 str_modifiable(str);
2101 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2102 /* make noembed always */
2103 const size_t size = (size_t)capa + termlen;
2104 const char *const old_ptr = RSTRING_PTR(str);
2105 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2106 char *new_ptr = ALLOC_N(char, size);
2107 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2108 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2109 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2110 RSTRING(str)->as.heap.ptr = new_ptr;
2111 }
2112 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2113 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2114 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2115 }
2116 STR_SET_LEN(str, len);
2117 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2118 if (n == 1) {
2119 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2120 rb_enc_cr_str_exact_copy(str, orig);
2121 }
2122 FL_SET(str, STR_NOEMBED);
2123 RSTRING(str)->as.heap.aux.capa = capa;
2124 }
2125 else if (n == 1) {
2126 rb_str_replace(str, orig);
2127 }
2128 if (enc) {
2129 rb_enc_associate(str, enc);
2131 }
2132 }
2133 else if (n == 1) {
2134 rb_str_replace(str, orig);
2135 }
2136 return str;
2137}
2138
2139/* :nodoc: */
2140static VALUE
2141rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2142{
 /*
  * Class-level constructor with a fast path for String itself.
  * Subclasses, and calls without keyword options, go through the generic
  * rb_class_new_instance_pass_kw().  Otherwise the +encoding+ and
  * +capacity+ keywords are handled here directly.
  */
2143 if (klass != rb_cString) {
2144 return rb_class_new_instance_pass_kw(argc, argv, klass);
2145 }
2146
2147 static ID keyword_ids[2];
2148 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2149 VALUE kwargs[2];
2150 rb_encoding *enc = NULL;
2151
2152 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2153 if (NIL_P(opt)) {
2154 return rb_class_new_instance_pass_kw(argc, argv, klass);
2155 }
2156
2157 keyword_ids[0] = rb_id_encoding();
2158 CONST_ID(keyword_ids[1], "capacity");
2159 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2160 encoding = kwargs[0];
2161 capacity = kwargs[1];
2162
2163 if (n == 1) {
2164 orig = StringValue(orig);
2165 }
2166 else {
2167 orig = Qnil;
2168 }
2169
 /* No explicit encoding: fall back to the encoding of the source string. */
2170 if (UNDEF_P(encoding)) {
2171 if (!NIL_P(orig)) {
2172 encoding = rb_obj_encoding(orig);
2173 }
2174 }
2175
2176 if (!UNDEF_P(encoding)) {
2177 enc = rb_to_encoding(encoding);
2178 }
2179
2180 // If capacity is nil, we're basically just duping `orig`.
2181 if (UNDEF_P(capacity)) {
2182 if (NIL_P(orig)) {
2183 VALUE empty_str = str_new(klass, "", 0);
2184 if (enc) {
2185 rb_enc_associate(empty_str, enc);
2186 }
2187 return empty_str;
2188 }
 /* orig is non-nil here, so encoding (and hence enc) was set above. */
2189 VALUE copy = str_duplicate(klass, orig);
2190 rb_enc_associate(copy, enc);
2191 ENC_CODERANGE_CLEAR(copy);
2192 return copy;
2193 }
2194
 /* Negative capacities are treated as 0; the capacity is raised to that
  * of the source string when the source is larger. */
2195 long capa = 0;
2196 capa = NUM2LONG(capacity);
2197 if (capa < 0) {
2198 capa = 0;
2199 }
2200
2201 if (!NIL_P(orig)) {
2202 long orig_capa = rb_str_capacity(orig);
2203 if (orig_capa > capa) {
2204 capa = orig_capa;
2205 }
2206 }
2207
2208 VALUE str = str_enc_new(klass, NULL, capa, enc);
2209 STR_SET_LEN(str, 0);
 /* NOTE(review): the terminator is filled with rb_enc_mbmaxlen(enc) bytes,
  * while other code in this file sizes terminators with rb_enc_mbminlen()
  * -- confirm that this width is intended. */
2210 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2211
2212 if (!NIL_P(orig)) {
2213 rb_str_buf_append(str, orig);
2214 }
2215
2216 return str;
2217}
2218
2219#ifdef NONASCII_MASK
/* True unless the top two bits of +c+ are 10, i.e. unless +c+ has the
 * 10xxxxxx bit pattern. */
2220#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2221
2222/*
2223 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2224 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2225 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2226 *
2227 * if (!(byte & 0x80))
2228 * byte |= 0x40; // turn on bit6
2229 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2230 *
2231 * This function calculates whether a byte is leading or not for all bytes
2232 * in the argument word by concurrently using the above logic, and then
2233 * adds up the number of leading bytes in the word.
2234 */
2235static inline uintptr_t
2236count_utf8_lead_bytes_with_word(const uintptr_t *s)
2237{
2238 uintptr_t d = *s;
2239
2240 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2241 d = (d>>6) | (~d>>7);
2242 d &= NONASCII_MASK >> 7;
2243
2244 /* Gather all bytes. */
2245#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2246 /* use only if it can use POPCNT */
2247 return rb_popcount_intptr(d);
2248#else
2249 d += (d>>8);
2250 d += (d>>16);
2251# if SIZEOF_VOIDP == 8
2252 d += (d>>32);
2253# endif
2254 return (d&0xF);
2255#endif
2256}
2257#endif
2258
/*
 * Counts the characters in the byte range [p, e) under encoding +enc+,
 * using the already known code range +cr+ to pick the cheapest strategy:
 *
 * - fixed-width encodings: byte length divided by the character width,
 *   rounded up;
 * - valid UTF-8 (when NONASCII_MASK is available): count leading bytes,
 *   a machine word at a time over the aligned middle part;
 * - other ASCII-compatible encodings: skip ASCII runs with
 *   search_nonascii() and step over the remaining characters one by one;
 * - anything else: step character by character with rb_enc_mbclen().
 */
2259static inline long
2260enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2261{
2262 long c;
2263 const char *q;
2264
2265 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266 long diff = (long)(e - p);
 /* A trailing partial character still counts as one. */
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268 }
2269#ifdef NONASCII_MASK
2270 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2271 uintptr_t len = 0;
2272 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2273 const uintptr_t *s, *t;
2274 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
 /* s: p rounded up, t: e rounded down, both to a word boundary. */
2275 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2276 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2277 while (p < (const char *)s) {
2278 if (is_utf8_lead_byte(*p)) len++;
2279 p++;
2280 }
2281 while (s < t) {
2282 len += count_utf8_lead_bytes_with_word(s);
2283 s++;
2284 }
2285 p = (const char *)s;
2286 }
 /* Remaining tail (or the whole range when it was too short). */
2287 while (p < e) {
2288 if (is_utf8_lead_byte(*p)) len++;
2289 p++;
2290 }
2291 return (long)len;
2292 }
2293#endif
2294 else if (rb_enc_asciicompat(enc)) {
2295 c = 0;
 /* The two loops differ only in the length function they call:
  * rb_enc_fast_mbclen() when the code range is clean,
  * rb_enc_mbclen() otherwise. */
2296 if (ENC_CODERANGE_CLEAN_P(cr)) {
2297 while (p < e) {
2298 if (ISASCII(*p)) {
2299 q = search_nonascii(p, e);
2300 if (!q)
2301 return c + (e - p);
2302 c += q - p;
2303 p = q;
2304 }
2305 p += rb_enc_fast_mbclen(p, e, enc);
2306 c++;
2307 }
2308 }
2309 else {
2310 while (p < e) {
2311 if (ISASCII(*p)) {
2312 q = search_nonascii(p, e);
2313 if (!q)
2314 return c + (e - p);
2315 c += q - p;
2316 p = q;
2317 }
2318 p += rb_enc_mbclen(p, e, enc);
2319 c++;
2320 }
2321 }
2322 return c;
2323 }
2324
2325 for (c=0; p<e; c++) {
2326 p += rb_enc_mbclen(p, e, enc);
2327 }
2328 return c;
2329}
2330
2331long
2332rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2333{
2334 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2335}
2336
2337/* To get strlen with cr
2338 * Note that given cr is not used.
2339 */
2340long
2341rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2342{
2343 long c;
2344 const char *q;
2345 int ret;
2346
2347 *cr = 0;
2348 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2349 long diff = (long)(e - p);
2350 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2351 }
2352 else if (rb_enc_asciicompat(enc)) {
2353 c = 0;
2354 while (p < e) {
2355 if (ISASCII(*p)) {
2356 q = search_nonascii(p, e);
2357 if (!q) {
2358 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2359 return c + (e - p);
2360 }
2361 c += q - p;
2362 p = q;
2363 }
2364 ret = rb_enc_precise_mbclen(p, e, enc);
2365 if (MBCLEN_CHARFOUND_P(ret)) {
2366 *cr |= ENC_CODERANGE_VALID;
2367 p += MBCLEN_CHARFOUND_LEN(ret);
2368 }
2369 else {
2371 p++;
2372 }
2373 c++;
2374 }
2375 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2376 return c;
2377 }
2378
2379 for (c=0; p<e; c++) {
2380 ret = rb_enc_precise_mbclen(p, e, enc);
2381 if (MBCLEN_CHARFOUND_P(ret)) {
2382 *cr |= ENC_CODERANGE_VALID;
2383 p += MBCLEN_CHARFOUND_LEN(ret);
2384 }
2385 else {
2387 if (p + rb_enc_mbminlen(enc) <= e)
2388 p += rb_enc_mbminlen(enc);
2389 else
2390 p = e;
2391 }
2392 }
2393 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2394 return c;
2395}
2396
2397/* enc must be str's enc or rb_enc_check(str, str2) */
2398static long
2399str_strlen(VALUE str, rb_encoding *enc)
2400{
2401 const char *p, *e;
2402 int cr;
2403
2404 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2405 if (!enc) enc = STR_ENC_GET(str);
2406 p = RSTRING_PTR(str);
2407 e = RSTRING_END(str);
2408 cr = ENC_CODERANGE(str);
2409
2410 if (cr == ENC_CODERANGE_UNKNOWN) {
2411 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2412 if (cr) ENC_CODERANGE_SET(str, cr);
2413 return n;
2414 }
2415 else {
2416 return enc_strlen(p, e, enc, cr);
2417 }
2418}
2419
2420long
2422{
2423 return str_strlen(str, NULL);
2424}
2425
2426/*
2427 * call-seq:
2428 * length -> integer
2429 *
2430 * :include: doc/string/length.rdoc
2431 *
2432 */
2433
2434VALUE
2436{
2437 return LONG2NUM(str_strlen(str, NULL));
2438}
2439
2440/*
2441 * call-seq:
2442 * bytesize -> integer
2443 *
2444 * :include: doc/string/bytesize.rdoc
2445 *
2446 */
2447
2448VALUE
2449rb_str_bytesize(VALUE str)
2450{
2451 return LONG2NUM(RSTRING_LEN(str));
2452}
2453
2454/*
2455 * call-seq:
2456 * empty? -> true or false
2457 *
2458 * Returns whether the length of +self+ is zero:
2459 *
2460 * 'hello'.empty? # => false
2461 * ' '.empty? # => false
2462 * ''.empty? # => true
2463 *
2464 * Related: see {Querying}[rdoc-ref:String@Querying].
2465 */
2466
2467static VALUE
2468rb_str_empty(VALUE str)
2469{
2470 return RBOOL(RSTRING_LEN(str) == 0);
2471}
2472
2473/*
2474 * call-seq:
2475 * self + other_string -> new_string
2476 *
2477 * Returns a new string containing +other_string+ concatenated to +self+:
2478 *
2479 * 'Hello from ' + self.to_s # => "Hello from main"
2480 *
2481 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2482 */
2483
2484VALUE
2486{
2487 VALUE str3;
2488 rb_encoding *enc;
2489 char *ptr1, *ptr2, *ptr3;
2490 long len1, len2;
2491 int termlen;
2492
2493 StringValue(str2);
2494 enc = rb_enc_check_str(str1, str2);
2495 RSTRING_GETMEM(str1, ptr1, len1);
2496 RSTRING_GETMEM(str2, ptr2, len2);
2497 termlen = rb_enc_mbminlen(enc);
2498 if (len1 > LONG_MAX - len2) {
2499 rb_raise(rb_eArgError, "string size too big");
2500 }
2501 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2502 ptr3 = RSTRING_PTR(str3);
2503 memcpy(ptr3, ptr1, len1);
2504 memcpy(ptr3+len1, ptr2, len2);
2505 TERM_FILL(&ptr3[len1+len2], termlen);
2506
2507 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2509 RB_GC_GUARD(str1);
2510 RB_GC_GUARD(str2);
2511 return str3;
2512}
2513
2514/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2515VALUE
2516rb_str_opt_plus(VALUE str1, VALUE str2)
2517{
2520 long len1, len2;
2521 MAYBE_UNUSED(char) *ptr1, *ptr2;
2522 RSTRING_GETMEM(str1, ptr1, len1);
2523 RSTRING_GETMEM(str2, ptr2, len2);
2524 int enc1 = rb_enc_get_index(str1);
2525 int enc2 = rb_enc_get_index(str2);
2526
2527 if (enc1 < 0) {
2528 return Qundef;
2529 }
2530 else if (enc2 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc1 != enc2) {
2534 return Qundef;
2535 }
2536 else if (len1 > LONG_MAX - len2) {
2537 return Qundef;
2538 }
2539 else {
2540 return rb_str_plus(str1, str2);
2541 }
2542
2543}
2544
2545/*
2546 * call-seq:
2547 * self * n -> new_string
2548 *
2549 * Returns a new string containing +n+ copies of +self+:
2550 *
2551 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2552 * 'No!' * 0 # => ""
2553 *
2554 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2555 */
2556
2557VALUE
2559{
2560 VALUE str2;
2561 long n, len;
2562 char *ptr2;
2563 int termlen;
2564
2565 if (times == INT2FIX(1)) {
2566 return str_duplicate(rb_cString, str);
2567 }
2568 if (times == INT2FIX(0)) {
2569 str2 = str_alloc_embed(rb_cString, 0);
2570 rb_enc_copy(str2, str);
2571 return str2;
2572 }
2573 len = NUM2LONG(times);
2574 if (len < 0) {
2575 rb_raise(rb_eArgError, "negative argument");
2576 }
2577 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2578 if (STR_EMBEDDABLE_P(len, 1)) {
2579 str2 = str_alloc_embed(rb_cString, len + 1);
2580 memset(RSTRING_PTR(str2), 0, len + 1);
2581 }
2582 else {
2583 str2 = str_alloc_heap(rb_cString);
2584 RSTRING(str2)->as.heap.aux.capa = len;
2585 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2586 }
2587 STR_SET_LEN(str2, len);
2588 rb_enc_copy(str2, str);
2589 return str2;
2590 }
2591 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2592 rb_raise(rb_eArgError, "argument too big");
2593 }
2594
2595 len *= RSTRING_LEN(str);
2596 termlen = TERM_LEN(str);
2597 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2598 ptr2 = RSTRING_PTR(str2);
2599 if (len) {
2600 n = RSTRING_LEN(str);
2601 memcpy(ptr2, RSTRING_PTR(str), n);
2602 while (n <= len/2) {
2603 memcpy(ptr2 + n, ptr2, n);
2604 n *= 2;
2605 }
2606 memcpy(ptr2 + n, ptr2, len-n);
2607 }
2608 STR_SET_LEN(str2, len);
2609 TERM_FILL(&ptr2[len], termlen);
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2611
2612 return str2;
2613}
2614
2615/*
2616 * call-seq:
2617 * self % object -> new_string
2618 *
2619 * Returns the result of formatting +object+ into the format specifications
2620 * contained in +self+
2621 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2622 *
2623 * '%05d' % 123 # => "00123"
2624 *
2625 * If +self+ contains multiple format specifications,
2626 * +object+ must be an array or hash containing the objects to be formatted:
2627 *
2628 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2629 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2630 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2631 *
2632 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2633 */
2634
2635static VALUE
2636rb_str_format_m(VALUE str, VALUE arg)
2637{
2638 VALUE tmp = rb_check_array_type(arg);
2639
2640 if (!NIL_P(tmp)) {
2641 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2642 }
2643 return rb_str_format(1, &arg, str);
2644}
2645
2646static inline void
2647rb_check_lockedtmp(VALUE str)
2648{
2649 if (FL_TEST(str, STR_TMPLOCK)) {
2650 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2651 }
2652}
2653
2654// If none of these flags are set, we know we have an modifiable string.
2655// If any is set, we need to do more detailed checks.
2656#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2657static inline void
2658str_modifiable(VALUE str)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2663 if (CHILLED_STRING_P(str)) {
2664 CHILLED_STRING_MUTATED(str);
2665 }
2666 rb_check_lockedtmp(str);
2667 rb_check_frozen(str);
2668 }
2669}
2670
2671static inline int
2672str_dependent_p(VALUE str)
2673{
2674 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2675 return FALSE;
2676 }
2677 else {
2678 return TRUE;
2679 }
2680}
2681
2682// If none of these flags are set, we know we have an independent string.
2683// If any is set, we need to do more detailed checks.
2684#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2685static inline int
2686str_independent(VALUE str)
2687{
2688 RUBY_ASSERT(ruby_thread_has_gvl_p());
2689
2690 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2691 str_modifiable(str);
2692 return !str_dependent_p(str);
2693 }
2694 return TRUE;
2695}
2696
2697static void
2698str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2699{
2700 RUBY_ASSERT(ruby_thread_has_gvl_p());
2701
2702 char *ptr;
2703 char *oldptr;
2704 long capa = len + expand;
2705
2706 if (len > capa) len = capa;
2707
2708 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2709 ptr = RSTRING(str)->as.heap.ptr;
2710 STR_SET_EMBED(str);
2711 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2712 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2713 STR_SET_LEN(str, len);
2714 return;
2715 }
2716
2717 ptr = ALLOC_N(char, (size_t)capa + termlen);
2718 oldptr = RSTRING_PTR(str);
2719 if (oldptr) {
2720 memcpy(ptr, oldptr, len);
2721 }
2722 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2723 xfree(oldptr);
2724 }
2725 STR_SET_NOEMBED(str);
2726 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2727 TERM_FILL(ptr + len, termlen);
2728 RSTRING(str)->as.heap.ptr = ptr;
2729 STR_SET_LEN(str, len);
2730 RSTRING(str)->as.heap.aux.capa = capa;
2731}
2732
2733void
2734rb_str_modify(VALUE str)
2735{
2736 if (!str_independent(str))
2737 str_make_independent(str);
2739}
2740
2741void
2743{
2744 RUBY_ASSERT(ruby_thread_has_gvl_p());
2745
2746 int termlen = TERM_LEN(str);
2747 long len = RSTRING_LEN(str);
2748
2749 if (expand < 0) {
2750 rb_raise(rb_eArgError, "negative expanding string size");
2751 }
2752 if (expand >= LONG_MAX - len) {
2753 rb_raise(rb_eArgError, "string size too big");
2754 }
2755
2756 if (!str_independent(str)) {
2757 str_make_independent_expand(str, len, expand, termlen);
2758 }
2759 else if (expand > 0) {
2760 RESIZE_CAPA_TERM(str, len + expand, termlen);
2761 }
2763}
2764
2765/* As rb_str_modify(), but don't clear coderange */
2766static void
2767str_modify_keep_cr(VALUE str)
2768{
2769 if (!str_independent(str))
2770 str_make_independent(str);
2772 /* Force re-scan later */
2774}
2775
2776static inline void
2777str_discard(VALUE str)
2778{
2779 str_modifiable(str);
2780 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2781 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2782 RSTRING(str)->as.heap.ptr = 0;
2783 STR_SET_LEN(str, 0);
2784 }
2785}
2786
2787void
2789{
2790 int encindex = rb_enc_get_index(str);
2791
2792 if (RB_UNLIKELY(encindex == -1)) {
2793 rb_raise(rb_eTypeError, "not encoding capable object");
2794 }
2795
2796 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2797 return;
2798 }
2799
2800 rb_encoding *enc = rb_enc_from_index(encindex);
2801 if (!rb_enc_asciicompat(enc)) {
2802 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2803 }
2804}
2805
2806VALUE
2808{
2809 RUBY_ASSERT(ruby_thread_has_gvl_p());
2810
2811 VALUE s = *ptr;
2812 if (!RB_TYPE_P(s, T_STRING)) {
2813 s = rb_str_to_str(s);
2814 *ptr = s;
2815 }
2816 return s;
2817}
2818
2819char *
2821{
2822 VALUE str = rb_string_value(ptr);
2823 return RSTRING_PTR(str);
2824}
2825
/* Returns 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1. */
static int
zero_filled(const char *s, int n)
{
    for (int i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2834
2835static const char *
2836str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2837{
2838 const char *e = s + len;
2839
2840 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2841 if (zero_filled(s, minlen)) return s;
2842 }
2843 return 0;
2844}
2845
2846static char *
2847str_fill_term(VALUE str, char *s, long len, int termlen)
2848{
2849 /* This function assumes that (capa + termlen) bytes of memory
2850 * is allocated, like many other functions in this file.
2851 */
2852 if (str_dependent_p(str)) {
2853 if (!zero_filled(s + len, termlen))
2854 str_make_independent_expand(str, len, 0L, termlen);
2855 }
2856 else {
2857 TERM_FILL(s + len, termlen);
2858 return s;
2859 }
2860 return RSTRING_PTR(str);
2861}
2862
2863void
2864rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2865{
2866 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2867 long len = RSTRING_LEN(str);
2868
2869 RUBY_ASSERT(capa >= len);
2870 if (capa - len < termlen) {
2871 rb_check_lockedtmp(str);
2872 str_make_independent_expand(str, len, 0L, termlen);
2873 }
2874 else if (str_dependent_p(str)) {
2875 if (termlen > oldtermlen)
2876 str_make_independent_expand(str, len, 0L, termlen);
2877 }
2878 else {
2879 if (!STR_EMBED_P(str)) {
2880 /* modify capa instead of realloc */
2881 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2882 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2883 }
2884 if (termlen > oldtermlen) {
2885 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2886 }
2887 }
2888
2889 return;
2890}
2891
2892static char *
2893str_null_check(VALUE str, int *w)
2894{
2895 char *s = RSTRING_PTR(str);
2896 long len = RSTRING_LEN(str);
2897 rb_encoding *enc = rb_enc_get(str);
2898 const int minlen = rb_enc_mbminlen(enc);
2899
2900 if (minlen > 1) {
2901 *w = 1;
2902 if (str_null_char(s, len, minlen, enc)) {
2903 return NULL;
2904 }
2905 return str_fill_term(str, s, len, minlen);
2906 }
2907 *w = 0;
2908 if (!s || memchr(s, 0, len)) {
2909 return NULL;
2910 }
2911 if (s[len]) {
2912 s = str_fill_term(str, s, len, minlen);
2913 }
2914 return s;
2915}
2916
2917char *
2918rb_str_to_cstr(VALUE str)
2919{
2920 int w;
2921 return str_null_check(str, &w);
2922}
2923
2924char *
2926{
2927 VALUE str = rb_string_value(ptr);
2928 int w;
2929 char *s = str_null_check(str, &w);
2930 if (!s) {
2931 if (w) {
2932 rb_raise(rb_eArgError, "string contains null char");
2933 }
2934 rb_raise(rb_eArgError, "string contains null byte");
2935 }
2936 return s;
2937}
2938
2939char *
2940rb_str_fill_terminator(VALUE str, const int newminlen)
2941{
2942 char *s = RSTRING_PTR(str);
2943 long len = RSTRING_LEN(str);
2944 return str_fill_term(str, s, len, newminlen);
2945}
2946
2947VALUE
2949{
2950 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2951 return str;
2952}
2953
2954/*
2955 * call-seq:
2956 * String.try_convert(object) -> object, new_string, or nil
2957 *
2958 * Attempts to convert the given +object+ to a string.
2959 *
2960 * If +object+ is already a string, returns +object+, unmodified.
2961 *
2962 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2963 * calls <tt>object.to_str</tt> and returns the result.
2964 *
2965 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2966 *
2967 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2968 */
2969static VALUE
2970rb_str_s_try_convert(VALUE dummy, VALUE str)
2971{
2972 return rb_check_string_type(str);
2973}
2974
2975static char*
2976str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2977{
2978 long nth = *nthp;
2979 if (rb_enc_mbmaxlen(enc) == 1) {
2980 p += nth;
2981 }
2982 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2983 p += nth * rb_enc_mbmaxlen(enc);
2984 }
2985 else if (rb_enc_asciicompat(enc)) {
2986 const char *p2, *e2;
2987 int n;
2988
2989 while (p < e && 0 < nth) {
2990 e2 = p + nth;
2991 if (e < e2) {
2992 *nthp = nth;
2993 return (char *)e;
2994 }
2995 if (ISASCII(*p)) {
2996 p2 = search_nonascii(p, e2);
2997 if (!p2) {
2998 nth -= e2 - p;
2999 *nthp = nth;
3000 return (char *)e2;
3001 }
3002 nth -= p2 - p;
3003 p = p2;
3004 }
3005 n = rb_enc_mbclen(p, e, enc);
3006 p += n;
3007 nth--;
3008 }
3009 *nthp = nth;
3010 if (nth != 0) {
3011 return (char *)e;
3012 }
3013 return (char *)p;
3014 }
3015 else {
3016 while (p < e && nth--) {
3017 p += rb_enc_mbclen(p, e, enc);
3018 }
3019 }
3020 if (p > e) p = e;
3021 *nthp = nth;
3022 return (char*)p;
3023}
3024
3025char*
3026rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3027{
3028 return str_nth_len(p, e, &nth, enc);
3029}
3030
3031static char*
3032str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3033{
3034 if (singlebyte)
3035 p += nth;
3036 else {
3037 p = str_nth_len(p, e, &nth, enc);
3038 }
3039 if (!p) return 0;
3040 if (p > e) p = e;
3041 return (char *)p;
3042}
3043
3044/* char offset to byte offset */
3045static long
3046str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3047{
3048 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3049 if (!pp) return e - p;
3050 return pp - p;
3051}
3052
3053long
3054rb_str_offset(VALUE str, long pos)
3055{
3056 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3057 STR_ENC_GET(str), single_byte_optimizable(str));
3058}
3059
3060#ifdef NONASCII_MASK
3061static char *
3062str_utf8_nth(const char *p, const char *e, long *nthp)
3063{
3064 long nth = *nthp;
3065 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3066 const uintptr_t *s, *t;
3067 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3068 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3069 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3070 while (p < (const char *)s) {
3071 if (is_utf8_lead_byte(*p)) nth--;
3072 p++;
3073 }
3074 do {
3075 nth -= count_utf8_lead_bytes_with_word(s);
3076 s++;
3077 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3078 p = (char *)s;
3079 }
3080 while (p < e) {
3081 if (is_utf8_lead_byte(*p)) {
3082 if (nth == 0) break;
3083 nth--;
3084 }
3085 p++;
3086 }
3087 *nthp = nth;
3088 return (char *)p;
3089}
3090
/* Byte distance from +p+ to the position str_utf8_nth() reaches for
 * +nth+ characters. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
3097#endif
3098
3099/* byte offset to char offset */
3100long
3101rb_str_sublen(VALUE str, long pos)
3102{
3103 if (single_byte_optimizable(str) || pos < 0)
3104 return pos;
3105 else {
3106 char *p = RSTRING_PTR(str);
3107 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3108 }
3109}
3110
3111static VALUE
3112str_subseq(VALUE str, long beg, long len)
3113{
3114 VALUE str2;
3115
3116 RUBY_ASSERT(beg >= 0);
3117 RUBY_ASSERT(len >= 0);
3118 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3119
3120 const int termlen = TERM_LEN(str);
3121 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3122 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3123 RB_GC_GUARD(str);
3124 return str2;
3125 }
3126
3127 str2 = str_alloc_heap(rb_cString);
3128 if (str_embed_capa(str2) >= len + termlen) {
3129 char *ptr2 = RSTRING(str2)->as.embed.ary;
3130 STR_SET_EMBED(str2);
3131 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3132 TERM_FILL(ptr2+len, termlen);
3133
3134 STR_SET_LEN(str2, len);
3135 RB_GC_GUARD(str);
3136 }
3137 else {
3138 str_replace_shared(str2, str);
3139 RUBY_ASSERT(!STR_EMBED_P(str2));
3140 ENC_CODERANGE_CLEAR(str2);
3141 RSTRING(str2)->as.heap.ptr += beg;
3142 if (RSTRING_LEN(str2) > len) {
3143 STR_SET_LEN(str2, len);
3144 }
3145 }
3146
3147 return str2;
3148}
3149
3150VALUE
3151rb_str_subseq(VALUE str, long beg, long len)
3152{
3153 VALUE str2 = str_subseq(str, beg, len);
3154 rb_enc_cr_str_copy_for_substr(str2, str);
3155 return str2;
3156}
3157
3158char *
3159rb_str_subpos(VALUE str, long beg, long *lenp)
3160{
3161 long len = *lenp;
3162 long slen = -1L;
3163 const long blen = RSTRING_LEN(str);
3164 rb_encoding *enc = STR_ENC_GET(str);
3165 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3166
3167 if (len < 0) return 0;
3168 if (beg < 0 && -beg < 0) return 0;
3169 if (!blen) {
3170 len = 0;
3171 }
3172 if (single_byte_optimizable(str)) {
3173 if (beg > blen) return 0;
3174 if (beg < 0) {
3175 beg += blen;
3176 if (beg < 0) return 0;
3177 }
3178 if (len > blen - beg)
3179 len = blen - beg;
3180 if (len < 0) return 0;
3181 p = s + beg;
3182 goto end;
3183 }
3184 if (beg < 0) {
3185 if (len > -beg) len = -beg;
3186 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3187 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3188 beg = -beg;
3189 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3190 p = e;
3191 if (!p) return 0;
3192 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3193 if (!p) return 0;
3194 len = e - p;
3195 goto end;
3196 }
3197 else {
3198 slen = str_strlen(str, enc);
3199 beg += slen;
3200 if (beg < 0) return 0;
3201 p = s + beg;
3202 if (len == 0) goto end;
3203 }
3204 }
3205 else if (beg > 0 && beg > blen) {
3206 return 0;
3207 }
3208 if (len == 0) {
3209 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3210 p = s + beg;
3211 }
3212#ifdef NONASCII_MASK
3213 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3214 enc == rb_utf8_encoding()) {
3215 p = str_utf8_nth(s, e, &beg);
3216 if (beg > 0) return 0;
3217 len = str_utf8_offset(p, e, len);
3218 }
3219#endif
3220 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3221 int char_sz = rb_enc_mbmaxlen(enc);
3222
3223 p = s + beg * char_sz;
3224 if (p > e) {
3225 return 0;
3226 }
3227 else if (len * char_sz > e - p)
3228 len = e - p;
3229 else
3230 len *= char_sz;
3231 }
3232 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3233 if (beg > 0) return 0;
3234 len = 0;
3235 }
3236 else {
3237 len = str_offset(p, e, len, enc, 0);
3238 }
3239 end:
3240 *lenp = len;
3241 RB_GC_GUARD(str);
3242 return p;
3243}
3244
3245static VALUE str_substr(VALUE str, long beg, long len, int empty);
3246
3247VALUE
3248rb_str_substr(VALUE str, long beg, long len)
3249{
3250 return str_substr(str, beg, len, TRUE);
3251}
3252
3253VALUE
3254rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3255{
3256 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3257}
3258
3259static VALUE
3260str_substr(VALUE str, long beg, long len, int empty)
3261{
3262 char *p = rb_str_subpos(str, beg, &len);
3263
3264 if (!p) return Qnil;
3265 if (!len && !empty) return Qnil;
3266
3267 beg = p - RSTRING_PTR(str);
3268
3269 VALUE str2 = str_subseq(str, beg, len);
3270 rb_enc_cr_str_copy_for_substr(str2, str);
3271 return str2;
3272}
3273
3274/* :nodoc: */
3275VALUE
3277{
3278 if (CHILLED_STRING_P(str)) {
3279 FL_UNSET_RAW(str, STR_CHILLED);
3280 }
3281
3282 if (OBJ_FROZEN(str)) return str;
3283 rb_str_resize(str, RSTRING_LEN(str));
3284 return rb_obj_freeze(str);
3285}
3286
3287/*
3288 * call-seq:
3289 * +string -> new_string or self
3290 *
3291 * Returns +self+ if +self+ is not frozen and can be mutated
3292 * without warning issuance.
3293 *
3294 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3295 *
3296 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3297 */
3298static VALUE
3299str_uplus(VALUE str)
3300{
3301 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3302 return rb_str_dup(str);
3303 }
3304 else {
3305 return str;
3306 }
3307}
3308
3309/*
3310 * call-seq:
3311 * -self -> frozen_string
3312 *
3313 * Returns a frozen string equal to +self+.
3314 *
3315 * The returned string is +self+ if and only if all of the following are true:
3316 *
3317 * - +self+ is already frozen.
3318 * - +self+ is an instance of \String (rather than of a subclass of \String)
3319 * - +self+ has no instance variables set on it.
3320 *
3321 * Otherwise, the returned string is a frozen copy of +self+.
3322 *
3323 * Returning +self+, when possible, saves duplicating +self+;
3324 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3325 *
3326 * It may also save duplicating other, already-existing, strings:
3327 *
3328 * s0 = 'foo'
3329 * s1 = 'foo'
3330 * s0.object_id == s1.object_id # => false
3331 * (-s0).object_id == (-s1).object_id # => true
3332 *
3333 * Note that method #-@ is convenient for defining a constant:
3334 *
3335 * FileName = -'config/database.yml'
3336 *
3337 * While its alias #dedup is better suited for chaining:
3338 *
3339 * 'foo'.dedup.gsub!('o')
3340 *
3341 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3342 */
3343static VALUE
3344str_uminus(VALUE str)
3345{
3346 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3347 str = rb_str_dup(str);
3348 }
3349 return rb_fstring(str);
3350}
3351
3352RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3353#define rb_str_dup_frozen rb_str_new_frozen
3354
3355VALUE
3357{
3358 rb_check_frozen(str);
3359 if (FL_TEST(str, STR_TMPLOCK)) {
3360 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3361 }
3362 FL_SET(str, STR_TMPLOCK);
3363 return str;
3364}
3365
3366VALUE
3368{
3369 rb_check_frozen(str);
3370 if (!FL_TEST(str, STR_TMPLOCK)) {
3371 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3372 }
3373 FL_UNSET(str, STR_TMPLOCK);
3374 return str;
3375}
3376
3377VALUE
3378rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3379{
3380 rb_str_locktmp(str);
3381 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3382}
3383
3384void
3386{
3387 RUBY_ASSERT(ruby_thread_has_gvl_p());
3388
3389 long capa;
3390 const int termlen = TERM_LEN(str);
3391
3392 str_modifiable(str);
3393 if (STR_SHARED_P(str)) {
3394 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3395 }
3396 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3397 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3398 }
3399
3400 int cr = ENC_CODERANGE(str);
3401 if (len == 0) {
3402 /* Empty string does not contain non-ASCII */
3404 }
3405 else if (cr == ENC_CODERANGE_UNKNOWN) {
3406 /* Leave unknown. */
3407 }
3408 else if (len > RSTRING_LEN(str)) {
3409 if (ENC_CODERANGE_CLEAN_P(cr)) {
3410 /* Update the coderange regarding the extended part. */
3411 const char *const prev_end = RSTRING_END(str);
3412 const char *const new_end = RSTRING_PTR(str) + len;
3413 rb_encoding *enc = rb_enc_get(str);
3414 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3415 ENC_CODERANGE_SET(str, cr);
3416 }
3417 else if (cr == ENC_CODERANGE_BROKEN) {
3418 /* May be valid now, by appended part. */
3420 }
3421 }
3422 else if (len < RSTRING_LEN(str)) {
3423 if (cr != ENC_CODERANGE_7BIT) {
3424 /* ASCII-only string is keeping after truncated. Valid
3425 * and broken may be invalid or valid, leave unknown. */
3427 }
3428 }
3429
3430 STR_SET_LEN(str, len);
3431 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3432}
3433
3434VALUE
3435rb_str_resize(VALUE str, long len)
3436{
3437 if (len < 0) {
3438 rb_raise(rb_eArgError, "negative string size (or size too big)");
3439 }
3440
3441 int independent = str_independent(str);
3442 long slen = RSTRING_LEN(str);
3443 const int termlen = TERM_LEN(str);
3444
3445 if (slen > len || (termlen != 1 && slen < len)) {
3447 }
3448
3449 {
3450 long capa;
3451 if (STR_EMBED_P(str)) {
3452 if (len == slen) return str;
3453 if (str_embed_capa(str) >= len + termlen) {
3454 STR_SET_LEN(str, len);
3455 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3456 return str;
3457 }
3458 str_make_independent_expand(str, slen, len - slen, termlen);
3459 }
3460 else if (str_embed_capa(str) >= len + termlen) {
3461 char *ptr = STR_HEAP_PTR(str);
3462 STR_SET_EMBED(str);
3463 if (slen > len) slen = len;
3464 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3465 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3466 STR_SET_LEN(str, len);
3467 if (independent) ruby_xfree(ptr);
3468 return str;
3469 }
3470 else if (!independent) {
3471 if (len == slen) return str;
3472 str_make_independent_expand(str, slen, len - slen, termlen);
3473 }
3474 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3475 (capa - len) > (len < 1024 ? len : 1024)) {
3476 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3477 (size_t)len + termlen, STR_HEAP_SIZE(str));
3478 RSTRING(str)->as.heap.aux.capa = len;
3479 }
3480 else if (len == slen) return str;
3481 STR_SET_LEN(str, len);
3482 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3483 }
3484 return str;
3485}
3486
3487static void
3488str_ensure_available_capa(VALUE str, long len)
3489{
3490 str_modify_keep_cr(str);
3491
3492 const int termlen = TERM_LEN(str);
3493 long olen = RSTRING_LEN(str);
3494
3495 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498
3499 long total = olen + len;
3500 long capa = str_capacity(str, termlen);
3501
3502 if (capa < total) {
3503 if (total >= LONG_MAX / 2) {
3504 capa = total;
3505 }
3506 while (total > capa) {
3507 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3508 }
3509 RESIZE_CAPA_TERM(str, capa, termlen);
3510 }
3511}
3512
3513static VALUE
3514str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3515{
3516 if (keep_cr) {
3517 str_modify_keep_cr(str);
3518 }
3519 else {
3520 rb_str_modify(str);
3521 }
3522 if (len == 0) return 0;
3523
3524 long total, olen, off = -1;
3525 char *sptr;
3526 const int termlen = TERM_LEN(str);
3527
3528 RSTRING_GETMEM(str, sptr, olen);
3529 if (ptr >= sptr && ptr <= sptr + olen) {
3530 off = ptr - sptr;
3531 }
3532
3533 long capa = str_capacity(str, termlen);
3534
3535 if (olen > LONG_MAX - len) {
3536 rb_raise(rb_eArgError, "string sizes too big");
3537 }
3538 total = olen + len;
3539 if (capa < total) {
3540 if (total >= LONG_MAX / 2) {
3541 capa = total;
3542 }
3543 while (total > capa) {
3544 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3545 }
3546 RESIZE_CAPA_TERM(str, capa, termlen);
3547 sptr = RSTRING_PTR(str);
3548 }
3549 if (off != -1) {
3550 ptr = sptr + off;
3551 }
3552 memcpy(sptr + olen, ptr, len);
3553 STR_SET_LEN(str, total);
3554 TERM_FILL(sptr + total, termlen); /* sentinel */
3555
3556 return str;
3557}
3558
/* Shorthands for str_buf_cat4() that always clear the code range;
 * str_buf_cat2 takes a string literal and derives its length. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3561
3562VALUE
3563rb_str_cat(VALUE str, const char *ptr, long len)
3564{
3565 if (len == 0) return str;
3566 if (len < 0) {
3567 rb_raise(rb_eArgError, "negative string size (or size too big)");
3568 }
3569 return str_buf_cat(str, ptr, len);
3570}
3571
3572VALUE
3573rb_str_cat_cstr(VALUE str, const char *ptr)
3574{
3575 must_not_null(ptr);
3576 return rb_str_buf_cat(str, ptr, strlen(ptr));
3577}
3578
3579static void
3580rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3581{
3582 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3583
3584 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3585 if (UNLIKELY(!str_independent(str))) {
3586 str_make_independent(str);
3587 }
3588
3589 long string_length = -1;
3590 const int null_terminator_length = 1;
3591 char *sptr;
3592 RSTRING_GETMEM(str, sptr, string_length);
3593
3594 // Ensure the resulting string wouldn't be too long.
3595 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3596 rb_raise(rb_eArgError, "string sizes too big");
3597 }
3598
3599 long string_capacity = str_capacity(str, null_terminator_length);
3600
3601 // Get the code range before any modifications since those might clear the code range.
3602 int cr = ENC_CODERANGE(str);
3603
3604 // Check if the string has spare string_capacity to write the new byte.
3605 if (LIKELY(string_capacity >= string_length + 1)) {
3606 // In fast path we can write the new byte and note the string's new length.
3607 sptr[string_length] = byte;
3608 STR_SET_LEN(str, string_length + 1);
3609 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3610 }
3611 else {
3612 // If there's not enough string_capacity, make a call into the general string concatenation function.
3613 str_buf_cat(str, (char *)&byte, 1);
3614 }
3615
3616 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3617 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3618 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3619 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3620 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3621 if (ISASCII(byte)) {
3623 }
3624 else {
3626
3627 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3628 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3629 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3630 }
3631 }
3632 }
3633}
3634
3635RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3636RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3637RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3638
3639static VALUE
3640rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3641 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3642{
3643 int str_encindex = ENCODING_GET(str);
3644 int res_encindex;
3645 int str_cr, res_cr;
3646 rb_encoding *str_enc, *ptr_enc;
3647
3648 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3649
3650 if (str_encindex == ptr_encindex) {
3651 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3652 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3653 }
3654 }
3655 else {
3656 str_enc = rb_enc_from_index(str_encindex);
3657 ptr_enc = rb_enc_from_index(ptr_encindex);
3658 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3659 if (len == 0)
3660 return str;
3661 if (RSTRING_LEN(str) == 0) {
3662 rb_str_buf_cat(str, ptr, len);
3663 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3664 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3665 return str;
3666 }
3667 goto incompatible;
3668 }
3669 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3670 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3671 }
3672 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3673 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3674 str_cr = rb_enc_str_coderange(str);
3675 }
3676 }
3677 }
3678 if (ptr_cr_ret)
3679 *ptr_cr_ret = ptr_cr;
3680
3681 if (str_encindex != ptr_encindex &&
3682 str_cr != ENC_CODERANGE_7BIT &&
3683 ptr_cr != ENC_CODERANGE_7BIT) {
3684 str_enc = rb_enc_from_index(str_encindex);
3685 ptr_enc = rb_enc_from_index(ptr_encindex);
3686 goto incompatible;
3687 }
3688
3689 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3690 res_encindex = str_encindex;
3691 res_cr = ENC_CODERANGE_UNKNOWN;
3692 }
3693 else if (str_cr == ENC_CODERANGE_7BIT) {
3694 if (ptr_cr == ENC_CODERANGE_7BIT) {
3695 res_encindex = str_encindex;
3696 res_cr = ENC_CODERANGE_7BIT;
3697 }
3698 else {
3699 res_encindex = ptr_encindex;
3700 res_cr = ptr_cr;
3701 }
3702 }
3703 else if (str_cr == ENC_CODERANGE_VALID) {
3704 res_encindex = str_encindex;
3705 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3706 res_cr = str_cr;
3707 else
3708 res_cr = ptr_cr;
3709 }
3710 else { /* str_cr == ENC_CODERANGE_BROKEN */
3711 res_encindex = str_encindex;
3712 res_cr = str_cr;
3713 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3714 }
3715
3716 if (len < 0) {
3717 rb_raise(rb_eArgError, "negative string size (or size too big)");
3718 }
3719 str_buf_cat(str, ptr, len);
3720 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3721 return str;
3722
3723 incompatible:
3724 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3725 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3727}
3728
3729VALUE
3730rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3731{
3732 return rb_enc_cr_str_buf_cat(str, ptr, len,
3733 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3734}
3735
3736VALUE
3738{
3739 /* ptr must reference NUL terminated ASCII string. */
3740 int encindex = ENCODING_GET(str);
3741 rb_encoding *enc = rb_enc_from_index(encindex);
3742 if (rb_enc_asciicompat(enc)) {
3743 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3744 encindex, ENC_CODERANGE_7BIT, 0);
3745 }
3746 else {
3747 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3748 while (*ptr) {
3749 unsigned int c = (unsigned char)*ptr;
3750 int len = rb_enc_codelen(c, enc);
3751 rb_enc_mbcput(c, buf, enc);
3752 rb_enc_cr_str_buf_cat(str, buf, len,
3753 encindex, ENC_CODERANGE_VALID, 0);
3754 ptr++;
3755 }
3756 return str;
3757 }
3758}
3759
3760VALUE
3762{
3763 int str2_cr = rb_enc_str_coderange(str2);
3764
3765 if (str_enc_fastpath(str)) {
3766 switch (str2_cr) {
3767 case ENC_CODERANGE_7BIT:
3768 // If RHS is 7bit we can do simple concatenation
3769 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3770 RB_GC_GUARD(str2);
3771 return str;
3773 // If RHS is valid, we can do simple concatenation if encodings are the same
3774 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3775 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3776 int str_cr = ENC_CODERANGE(str);
3777 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3778 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3779 }
3780 RB_GC_GUARD(str2);
3781 return str;
3782 }
3783 }
3784 }
3785
3786 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3787 ENCODING_GET(str2), str2_cr, &str2_cr);
3788
3789 ENC_CODERANGE_SET(str2, str2_cr);
3790
3791 return str;
3792}
3793
3794VALUE
3796{
3797 StringValue(str2);
3798 return rb_str_buf_append(str, str2);
3799}
3800
3801VALUE
3802rb_str_concat_literals(size_t num, const VALUE *strary)
3803{
3804 VALUE str;
3805 size_t i, s = 0;
3806 unsigned long len = 1;
3807
3808 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3809 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3810
3811 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3812 str = rb_str_buf_new(len);
3813 str_enc_copy_direct(str, strary[0]);
3814
3815 for (i = s; i < num; ++i) {
3816 const VALUE v = strary[i];
3817 int encidx = ENCODING_GET(v);
3818
3819 rb_str_buf_append(str, v);
3820 if (encidx != ENCINDEX_US_ASCII) {
3821 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3822 rb_enc_set_index(str, encidx);
3823 }
3824 }
3825 return str;
3826}
3827
3828/*
3829 * call-seq:
3830 * concat(*objects) -> string
3831 *
3832 * :include: doc/string/concat.rdoc
3833 */
3834static VALUE
3835rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3836{
3837 str_modifiable(str);
3838
3839 if (argc == 1) {
3840 return rb_str_concat(str, argv[0]);
3841 }
3842 else if (argc > 1) {
3843 int i;
3844 VALUE arg_str = rb_str_tmp_new(0);
3845 rb_enc_copy(arg_str, str);
3846 for (i = 0; i < argc; i++) {
3847 rb_str_concat(arg_str, argv[i]);
3848 }
3849 rb_str_buf_append(str, arg_str);
3850 }
3851
3852 return str;
3853}
3854
3855/*
3856 * call-seq:
3857 * append_as_bytes(*objects) -> self
3858 *
3859 * Concatenates each object in +objects+ into +self+; returns +self+;
3860 * performs no encoding validation or conversion:
3861 *
3862 * s = 'foo'
3863 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3864 * s.valid_encoding? # => false
3865 * s.append_as_bytes("\xAC 12")
3866 * s.valid_encoding? # => true
3867 *
3868 * When a given object is an integer,
3869 * the value is considered an 8-bit byte;
3870 * if the integer occupies more than one byte (i.e., is greater than 255),
3871 * appends only the low-order byte (similar to String#setbyte):
3872 *
3873 * s = ""
3874 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3875 * s.bytesize # => 2
3876 *
3877 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3878 */
3879
3880VALUE
3881rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3882{
3883 long needed_capacity = 0;
3884 volatile VALUE t0;
3885 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3886
3887 for (int index = 0; index < argc; index++) {
3888 VALUE obj = argv[index];
3889 enum ruby_value_type type = types[index] = rb_type(obj);
3890 switch (type) {
3891 case T_FIXNUM:
3892 case T_BIGNUM:
3893 needed_capacity++;
3894 break;
3895 case T_STRING:
3896 needed_capacity += RSTRING_LEN(obj);
3897 break;
3898 default:
3899 rb_raise(
3901 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3902 rb_obj_class(obj)
3903 );
3904 break;
3905 }
3906 }
3907
3908 str_ensure_available_capa(str, needed_capacity);
3909 char *sptr = RSTRING_END(str);
3910
3911 for (int index = 0; index < argc; index++) {
3912 VALUE obj = argv[index];
3913 enum ruby_value_type type = types[index];
3914 switch (type) {
3915 case T_FIXNUM:
3916 case T_BIGNUM: {
3917 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3918 char byte = (char)(NUM2INT(obj) & 0xFF);
3919 *sptr = byte;
3920 sptr++;
3921 break;
3922 }
3923 case T_STRING: {
3924 const char *ptr;
3925 long len;
3926 RSTRING_GETMEM(obj, ptr, len);
3927 memcpy(sptr, ptr, len);
3928 sptr += len;
3929 break;
3930 }
3931 default:
3932 rb_bug("append_as_bytes arguments should have been validated");
3933 }
3934 }
3935
3936 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3937 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3938
3939 int cr = ENC_CODERANGE(str);
3940 switch (cr) {
3941 case ENC_CODERANGE_7BIT: {
3942 for (int index = 0; index < argc; index++) {
3943 VALUE obj = argv[index];
3944 enum ruby_value_type type = types[index];
3945 switch (type) {
3946 case T_FIXNUM:
3947 case T_BIGNUM: {
3948 if (!ISASCII(NUM2INT(obj))) {
3949 goto clear_cr;
3950 }
3951 break;
3952 }
3953 case T_STRING: {
3954 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3955 goto clear_cr;
3956 }
3957 break;
3958 }
3959 default:
3960 rb_bug("append_as_bytes arguments should have been validated");
3961 }
3962 }
3963 break;
3964 }
3966 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3967 goto keep_cr;
3968 }
3969 else {
3970 goto clear_cr;
3971 }
3972 break;
3973 default:
3974 goto clear_cr;
3975 break;
3976 }
3977
3978 RB_GC_GUARD(t0);
3979
3980 clear_cr:
3981 // If no fast path was hit, we clear the coderange.
3982 // append_as_bytes is predominently meant to be used in
3983 // buffering situation, hence it's likely the coderange
3984 // will never be scanned, so it's not worth spending time
3985 // precomputing the coderange except for simple and common
3986 // situations.
3988 keep_cr:
3989 return str;
3990}
3991
3992/*
3993 * call-seq:
3994 * self << object -> self
3995 *
3996 * Appends a string representation of +object+ to +self+;
3997 * returns +self+.
3998 *
3999 * If +object+ is a string, appends it to +self+:
4000 *
4001 * s = 'foo'
4002 * s << 'bar' # => "foobar"
4003 * s # => "foobar"
4004 *
4005 * If +object+ is an integer,
4006 * its value is considered a codepoint;
4007 * converts the value to a character before concatenating:
4008 *
4009 * s = 'foo'
4010 * s << 33 # => "foo!"
4011 *
4012 * Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4013 * and the encoding of +self+ is Encoding::US_ASCII,
4014 * changes the encoding to Encoding::ASCII_8BIT:
4015 *
4016 * s = 'foo'.encode(Encoding::US_ASCII)
4017 * s.encoding # => #<Encoding:US-ASCII>
4018 * s << 0xff # => "foo\xFF"
4019 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4020 *
4021 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4022 *
4023 * s = 'foo'
4024 * s.encoding # => <Encoding:UTF-8>
4025 * s << 0x00110000 # 1114112 out of char range (RangeError)
4026 * s = 'foo'.encode(Encoding::EUC_JP)
4027 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4028 *
4029 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4030 */
4031VALUE
4033{
4034 unsigned int code;
4035 rb_encoding *enc = STR_ENC_GET(str1);
4036 int encidx;
4037
4038 if (RB_INTEGER_TYPE_P(str2)) {
4039 if (rb_num_to_uint(str2, &code) == 0) {
4040 }
4041 else if (FIXNUM_P(str2)) {
4042 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4043 }
4044 else {
4045 rb_raise(rb_eRangeError, "bignum out of char range");
4046 }
4047 }
4048 else {
4049 return rb_str_append(str1, str2);
4050 }
4051
4052 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4053
4054 if (encidx >= 0) {
4055 rb_str_buf_cat_byte(str1, (unsigned char)code);
4056 }
4057 else {
4058 long pos = RSTRING_LEN(str1);
4059 int cr = ENC_CODERANGE(str1);
4060 int len;
4061 char *buf;
4062
4063 switch (len = rb_enc_codelen(code, enc)) {
4064 case ONIGERR_INVALID_CODE_POINT_VALUE:
4065 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4066 break;
4067 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4068 case 0:
4069 rb_raise(rb_eRangeError, "%u out of char range", code);
4070 break;
4071 }
4072 buf = ALLOCA_N(char, len + 1);
4073 rb_enc_mbcput(code, buf, enc);
4074 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4075 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4076 }
4077 rb_str_resize(str1, pos+len);
4078 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4079 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4081 }
4082 else if (cr == ENC_CODERANGE_BROKEN) {
4084 }
4085 ENC_CODERANGE_SET(str1, cr);
4086 }
4087 return str1;
4088}
4089
4090int
4091rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4092{
4093 int encidx = rb_enc_to_index(enc);
4094
4095 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4096 /* US-ASCII automatically extended to ASCII-8BIT */
4097 if (code > 0xFF) {
4098 rb_raise(rb_eRangeError, "%u out of char range", code);
4099 }
4100 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4101 return ENCINDEX_ASCII_8BIT;
4102 }
4103 return encidx;
4104 }
4105 else {
4106 return -1;
4107 }
4108}
4109
4110/*
4111 * call-seq:
4112 * prepend(*other_strings) -> self
4113 *
4114 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4115 *
4116 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4117 *
4118 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4119 *
4120 */
4121
4122static VALUE
4123rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4124{
4125 str_modifiable(str);
4126
4127 if (argc == 1) {
4128 rb_str_update(str, 0L, 0L, argv[0]);
4129 }
4130 else if (argc > 1) {
4131 int i;
4132 VALUE arg_str = rb_str_tmp_new(0);
4133 rb_enc_copy(arg_str, str);
4134 for (i = 0; i < argc; i++) {
4135 rb_str_append(arg_str, argv[i]);
4136 }
4137 rb_str_update(str, 0L, 0L, arg_str);
4138 }
4139
4140 return str;
4141}
4142
4143st_index_t
4145{
4146 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4147 st_index_t precomputed_hash;
4148 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4149
4150 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4151 return precomputed_hash;
4152 }
4153
4154 return str_do_hash(str);
4155}
4156
4157int
4159{
4160 long len1, len2;
4161 const char *ptr1, *ptr2;
4162 RSTRING_GETMEM(str1, ptr1, len1);
4163 RSTRING_GETMEM(str2, ptr2, len2);
4164 return (len1 != len2 ||
4165 !rb_str_comparable(str1, str2) ||
4166 memcmp(ptr1, ptr2, len1) != 0);
4167}
4168
4169/*
4170 * call-seq:
4171 * hash -> integer
4172 *
4173 * :include: doc/string/hash.rdoc
4174 *
4175 */
4176
4177static VALUE
4178rb_str_hash_m(VALUE str)
4179{
4180 st_index_t hval = rb_str_hash(str);
4181 return ST2FIX(hval);
4182}
4183
/* Smaller of two values; either argument may be evaluated more than once. */
#define lesser(a,b) (((b)<(a))?(b):(a))
4185
4186int
4188{
4189 int idx1, idx2;
4190 int rc1, rc2;
4191
4192 if (RSTRING_LEN(str1) == 0) return TRUE;
4193 if (RSTRING_LEN(str2) == 0) return TRUE;
4194 idx1 = ENCODING_GET(str1);
4195 idx2 = ENCODING_GET(str2);
4196 if (idx1 == idx2) return TRUE;
4197 rc1 = rb_enc_str_coderange(str1);
4198 rc2 = rb_enc_str_coderange(str2);
4199 if (rc1 == ENC_CODERANGE_7BIT) {
4200 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4201 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4202 return TRUE;
4203 }
4204 if (rc2 == ENC_CODERANGE_7BIT) {
4205 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4206 return TRUE;
4207 }
4208 return FALSE;
4209}
4210
4211int
4213{
4214 long len1, len2;
4215 const char *ptr1, *ptr2;
4216 int retval;
4217
4218 if (str1 == str2) return 0;
4219 RSTRING_GETMEM(str1, ptr1, len1);
4220 RSTRING_GETMEM(str2, ptr2, len2);
4221 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4222 if (len1 == len2) {
4223 if (!rb_str_comparable(str1, str2)) {
4224 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4225 return 1;
4226 return -1;
4227 }
4228 return 0;
4229 }
4230 if (len1 > len2) return 1;
4231 return -1;
4232 }
4233 if (retval > 0) return 1;
4234 return -1;
4235}
4236
4237/*
4238 * call-seq:
4239 * self == object -> true or false
4240 *
4241 * Returns whether +object+ is equal to +self+.
4242 *
4243 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4244 *
4245 * s = 'foo'
4246 * s == 'foo' # => true
4247 * s == 'food' # => false
4248 * s == 'FOO' # => false
4249 *
4250 * Returns +false+ if the two strings' encodings are not compatible:
4251 *
4252 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4253 *
4254 * When +object+ is not a string:
4255 *
4256 * - If +object+ responds to method <tt>to_str</tt>,
4257 * <tt>object == self</tt> is called and its return value is returned.
4258 * - If +object+ does not respond to <tt>to_str</tt>,
4259 * +false+ is returned.
4260 *
4261 * Related: {Comparing}[rdoc-ref:String@Comparing].
4262 */
4263
4264VALUE
4266{
4267 if (str1 == str2) return Qtrue;
4268 if (!RB_TYPE_P(str2, T_STRING)) {
4269 if (!rb_respond_to(str2, idTo_str)) {
4270 return Qfalse;
4271 }
4272 return rb_equal(str2, str1);
4273 }
4274 return rb_str_eql_internal(str1, str2);
4275}
4276
4277/*
4278 * call-seq:
4279 * eql?(object) -> true or false
4280 *
4281 * :include: doc/string/eql_p.rdoc
4282 *
4283 */
4284
4285VALUE
4286rb_str_eql(VALUE str1, VALUE str2)
4287{
4288 if (str1 == str2) return Qtrue;
4289 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4290 return rb_str_eql_internal(str1, str2);
4291}
4292
4293/*
4294 * call-seq:
4295 * self <=> other_string -> -1, 0, 1, or nil
4296 *
4297 * Compares +self+ and +other_string+, returning:
4298 *
4299 * - -1 if +other_string+ is larger.
4300 * - 0 if the two are equal.
4301 * - 1 if +other_string+ is smaller.
4302 * - +nil+ if the two are incomparable.
4303 *
4304 * Examples:
4305 *
4306 * 'foo' <=> 'foo' # => 0
4307 * 'foo' <=> 'food' # => -1
4308 * 'food' <=> 'foo' # => 1
4309 * 'FOO' <=> 'foo' # => -1
4310 * 'foo' <=> 'FOO' # => 1
4311 * 'foo' <=> 1 # => nil
4312 *
4313 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4314 */
4315
4316static VALUE
4317rb_str_cmp_m(VALUE str1, VALUE str2)
4318{
4319 int result;
4320 VALUE s = rb_check_string_type(str2);
4321 if (NIL_P(s)) {
4322 return rb_invcmp(str1, str2);
4323 }
4324 result = rb_str_cmp(str1, s);
4325 return INT2FIX(result);
4326}
4327
4328static VALUE str_casecmp(VALUE str1, VALUE str2);
4329static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4330
4331/*
4332 * call-seq:
4333 * casecmp(other_string) -> -1, 0, 1, or nil
4334 *
4335 * Ignoring case, compares +self+ and +other_string+; returns:
4336 *
4337 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4338 * - 0 if the two are equal.
4339 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4340 * - +nil+ if the two are incomparable.
4341 *
4342 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4343 *
4344 * Examples:
4345 *
4346 * 'foo'.casecmp('goo') # => -1
4347 * 'goo'.casecmp('foo') # => 1
4348 * 'foo'.casecmp('food') # => -1
4349 * 'food'.casecmp('foo') # => 1
4350 * 'FOO'.casecmp('foo') # => 0
4351 * 'foo'.casecmp('FOO') # => 0
4352 * 'foo'.casecmp(1) # => nil
4353 *
4354 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4355 */
4356
4357static VALUE
4358rb_str_casecmp(VALUE str1, VALUE str2)
4359{
4360 VALUE s = rb_check_string_type(str2);
4361 if (NIL_P(s)) {
4362 return Qnil;
4363 }
4364 return str_casecmp(str1, s);
4365}
4366
4367static VALUE
4368str_casecmp(VALUE str1, VALUE str2)
4369{
4370 long len;
4371 rb_encoding *enc;
4372 const char *p1, *p1end, *p2, *p2end;
4373
4374 enc = rb_enc_compatible(str1, str2);
4375 if (!enc) {
4376 return Qnil;
4377 }
4378
4379 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4380 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4381 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4382 while (p1 < p1end && p2 < p2end) {
4383 if (*p1 != *p2) {
4384 unsigned int c1 = TOLOWER(*p1 & 0xff);
4385 unsigned int c2 = TOLOWER(*p2 & 0xff);
4386 if (c1 != c2)
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4388 }
4389 p1++;
4390 p2++;
4391 }
4392 }
4393 else {
4394 while (p1 < p1end && p2 < p2end) {
4395 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4396 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4397
4398 if (0 <= c1 && 0 <= c2) {
4399 c1 = TOLOWER(c1);
4400 c2 = TOLOWER(c2);
4401 if (c1 != c2)
4402 return INT2FIX(c1 < c2 ? -1 : 1);
4403 }
4404 else {
4405 int r;
4406 l1 = rb_enc_mbclen(p1, p1end, enc);
4407 l2 = rb_enc_mbclen(p2, p2end, enc);
4408 len = l1 < l2 ? l1 : l2;
4409 r = memcmp(p1, p2, len);
4410 if (r != 0)
4411 return INT2FIX(r < 0 ? -1 : 1);
4412 if (l1 != l2)
4413 return INT2FIX(l1 < l2 ? -1 : 1);
4414 }
4415 p1 += l1;
4416 p2 += l2;
4417 }
4418 }
4419 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4420 if (p1 == p1end) return INT2FIX(-1);
4421 return INT2FIX(1);
4422}
4423
4424/*
4425 * call-seq:
4426 * casecmp?(other_string) -> true, false, or nil
4427 *
4428 * Returns +true+ if +self+ and +other_string+ are equal after
4429 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4430 *
4431 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4432 *
4433 * Examples:
4434 *
4435 * 'foo'.casecmp?('goo') # => false
4436 * 'goo'.casecmp?('foo') # => false
4437 * 'foo'.casecmp?('food') # => false
4438 * 'food'.casecmp?('foo') # => false
4439 * 'FOO'.casecmp?('foo') # => true
4440 * 'foo'.casecmp?('FOO') # => true
4441 * 'foo'.casecmp?(1) # => nil
4442 *
4443 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4444 */
4445
4446static VALUE
4447rb_str_casecmp_p(VALUE str1, VALUE str2)
4448{
4449 VALUE s = rb_check_string_type(str2);
4450 if (NIL_P(s)) {
4451 return Qnil;
4452 }
4453 return str_casecmp_p(str1, s);
4454}
4455
4456static VALUE
4457str_casecmp_p(VALUE str1, VALUE str2)
4458{
4459 rb_encoding *enc;
4460 VALUE folded_str1, folded_str2;
4461 VALUE fold_opt = sym_fold;
4462
4463 enc = rb_enc_compatible(str1, str2);
4464 if (!enc) {
4465 return Qnil;
4466 }
4467
4468 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4469 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4470
4471 return rb_str_eql(folded_str1, folded_str2);
4472}
4473
4474static long
4475strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4476 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4477{
4478 const char *search_start = str_ptr;
4479 long pos, search_len = str_len - offset;
4480
4481 for (;;) {
4482 const char *t;
4483 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4484 if (pos < 0) return pos;
4485 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4486 if (t == search_start + pos) break;
4487 search_len -= t - search_start;
4488 if (search_len <= 0) return -1;
4489 offset += t - search_start;
4490 search_start = t;
4491 }
4492 return pos + offset;
4493}
4494
4495/* found index in byte */
4496#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4497#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4498
4499static long
4500rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4501{
4502 const char *str_ptr, *str_ptr_end, *sub_ptr;
4503 long str_len, sub_len;
4504 rb_encoding *enc;
4505
4506 enc = rb_enc_check(str, sub);
4507 if (is_broken_string(sub)) return -1;
4508
4509 str_ptr = RSTRING_PTR(str);
4510 str_ptr_end = RSTRING_END(str);
4511 str_len = RSTRING_LEN(str);
4512 sub_ptr = RSTRING_PTR(sub);
4513 sub_len = RSTRING_LEN(sub);
4514
4515 if (str_len < sub_len) return -1;
4516
4517 if (offset != 0) {
4518 long str_len_char, sub_len_char;
4519 int single_byte = single_byte_optimizable(str);
4520 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4521 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4522 if (offset < 0) {
4523 offset += str_len_char;
4524 if (offset < 0) return -1;
4525 }
4526 if (str_len_char - offset < sub_len_char) return -1;
4527 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4528 str_ptr += offset;
4529 }
4530 if (sub_len == 0) return offset;
4531
4532 /* need proceed one character at a time */
4533 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4534}
4535
4536
4537/*
4538 * call-seq:
4539 * index(pattern, offset = 0) -> integer or nil
4540 *
4541 * :include: doc/string/index.rdoc
4542 *
4543 */
4544
4545static VALUE
4546rb_str_index_m(int argc, VALUE *argv, VALUE str)
4547{
4548 VALUE sub;
4549 VALUE initpos;
4550 rb_encoding *enc = STR_ENC_GET(str);
4551 long pos;
4552
4553 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4554 long slen = str_strlen(str, enc); /* str's enc */
4555 pos = NUM2LONG(initpos);
4556 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4557 if (RB_TYPE_P(sub, T_REGEXP)) {
4559 }
4560 return Qnil;
4561 }
4562 }
4563 else {
4564 pos = 0;
4565 }
4566
4567 if (RB_TYPE_P(sub, T_REGEXP)) {
4568 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4569 enc, single_byte_optimizable(str));
4570
4571 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4572 VALUE match = rb_backref_get();
4573 struct re_registers *regs = RMATCH_REGS(match);
4574 pos = rb_str_sublen(str, BEG(0));
4575 return LONG2NUM(pos);
4576 }
4577 }
4578 else {
4579 StringValue(sub);
4580 pos = rb_str_index(str, sub, pos);
4581 if (pos >= 0) {
4582 pos = rb_str_sublen(str, pos);
4583 return LONG2NUM(pos);
4584 }
4585 }
4586 return Qnil;
4587}
4588
4589/* Ensure that the given pos is a valid character boundary.
4590 * Note that in this function, "character" means a code point
4591 * (Unicode scalar value), not a grapheme cluster.
4592 */
4593static void
4594str_ensure_byte_pos(VALUE str, long pos)
4595{
4596 if (!single_byte_optimizable(str)) {
4597 const char *s = RSTRING_PTR(str);
4598 const char *e = RSTRING_END(str);
4599 const char *p = s + pos;
4600 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4601 rb_raise(rb_eIndexError,
4602 "offset %ld does not land on character boundary", pos);
4603 }
4604 }
4605}
4606
4607/*
4608 * call-seq:
4609 * byteindex(object, offset = 0) -> integer or nil
4610 *
4611 * Returns the 0-based integer index of a substring of +self+
4612 * specified by +object+ (a string or Regexp) and +offset+,
4613 * or +nil+ if there is no such substring;
4614 * the returned index is the count of _bytes_ (not characters).
4615 *
4616 * When +object+ is a string,
4617 * returns the index of the first found substring equal to +object+:
4618 *
4619 * s = 'foo' # => "foo"
4620 * s.size # => 3 # Three 1-byte characters.
4621 * s.bytesize # => 3 # Three bytes.
4622 * s.byteindex('f') # => 0
4623 * s.byteindex('o') # => 1
4624 * s.byteindex('oo') # => 1
4625 * s.byteindex('ooo') # => nil
4626 *
4627 * When +object+ is a Regexp,
4628 * returns the index of the first found substring matching +object+;
4629 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4630 *
4631 * s = 'foo'
4632 * s.byteindex(/f/) # => 0
4633 * $~ # => #<MatchData "f">
4634 * s.byteindex(/o/) # => 1
4635 * s.byteindex(/oo/) # => 1
4636 * s.byteindex(/ooo/) # => nil
4637 * $~ # => nil
4638 *
4639 * \Integer argument +offset+, if given, specifies the 0-based index
4640 * of the byte where searching is to begin.
4641 *
4642 * When +offset+ is non-negative,
4643 * searching begins at byte position +offset+:
4644 *
4645 * s = 'foo'
4646 * s.byteindex('o', 1) # => 1
4647 * s.byteindex('o', 2) # => 2
4648 * s.byteindex('o', 3) # => nil
4649 *
4650 * When +offset+ is negative, counts backward from the end of +self+:
4651 *
4652 * s = 'foo'
4653 * s.byteindex('o', -1) # => 2
4654 * s.byteindex('o', -2) # => 1
4655 * s.byteindex('o', -3) # => 1
4656 * s.byteindex('o', -4) # => nil
4657 *
4658 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4659 *
4660 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4661 * s.size # => 2 # Two 3-byte characters.
4662 * s.bytesize # => 6 # Six bytes.
4663 * s.byteindex("\uFFFF") # => 0
4664 * s.byteindex("\uFFFF", 1) # Raises IndexError
4665 * s.byteindex("\uFFFF", 2) # Raises IndexError
4666 * s.byteindex("\uFFFF", 3) # => 3
4667 * s.byteindex("\uFFFF", 4) # Raises IndexError
4668 * s.byteindex("\uFFFF", 5) # Raises IndexError
4669 * s.byteindex("\uFFFF", 6) # => nil
4670 *
4671 * Related: see {Querying}[rdoc-ref:String@Querying].
4672 */
4673
4674static VALUE
4675rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4676{
4677 VALUE sub;
4678 VALUE initpos;
4679 long pos;
4680
4681 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4682 long slen = RSTRING_LEN(str);
4683 pos = NUM2LONG(initpos);
4684 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4685 if (RB_TYPE_P(sub, T_REGEXP)) {
4687 }
4688 return Qnil;
4689 }
4690 }
4691 else {
4692 pos = 0;
4693 }
4694
4695 str_ensure_byte_pos(str, pos);
4696
4697 if (RB_TYPE_P(sub, T_REGEXP)) {
4698 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4699 VALUE match = rb_backref_get();
4700 struct re_registers *regs = RMATCH_REGS(match);
4701 pos = BEG(0);
4702 return LONG2NUM(pos);
4703 }
4704 }
4705 else {
4706 StringValue(sub);
4707 pos = rb_str_byteindex(str, sub, pos);
4708 if (pos >= 0) return LONG2NUM(pos);
4709 }
4710 return Qnil;
4711}
4712
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(): returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or NULL
 * if there is none (including when search_len is 0).
 *
 * chr is converted to unsigned char before comparing, so a byte value handed
 * over as a negative char still matches.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    /* Walk backwards from one past the end of the range. */
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4725
4726static long
4727str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4728{
4729 char *hit, *adjusted;
4730 int c;
4731 long slen, searchlen;
4732 char *sbeg, *e, *t;
4733
4734 sbeg = RSTRING_PTR(str);
4735 slen = RSTRING_LEN(sub);
4736 if (slen == 0) return s - sbeg;
4737 e = RSTRING_END(str);
4738 t = RSTRING_PTR(sub);
4739 c = *t & 0xff;
4740 searchlen = s - sbeg + 1;
4741
4742 if (memcmp(s, t, slen) == 0) {
4743 return s - sbeg;
4744 }
4745
4746 do {
4747 hit = memrchr(sbeg, c, searchlen);
4748 if (!hit) break;
4749 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4750 if (hit != adjusted) {
4751 searchlen = adjusted - sbeg;
4752 continue;
4753 }
4754 if (memcmp(hit, t, slen) == 0)
4755 return hit - sbeg;
4756 searchlen = adjusted - sbeg;
4757 } while (searchlen > 0);
4758
4759 return -1;
4760}
4761
4762/* found index in byte */
4763static long
4764rb_str_rindex(VALUE str, VALUE sub, long pos)
4765{
4766 long len, slen;
4767 char *sbeg, *s;
4768 rb_encoding *enc;
4769 int singlebyte;
4770
4771 enc = rb_enc_check(str, sub);
4772 if (is_broken_string(sub)) return -1;
4773 singlebyte = single_byte_optimizable(str);
4774 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4775 slen = str_strlen(sub, enc); /* rb_enc_check */
4776
4777 /* substring longer than string */
4778 if (len < slen) return -1;
4779 if (len - pos < slen) pos = len - slen;
4780 if (len == 0) return pos;
4781
4782 sbeg = RSTRING_PTR(str);
4783
4784 if (pos == 0) {
4785 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4786 return 0;
4787 else
4788 return -1;
4789 }
4790
4791 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4792 return str_rindex(str, sub, s, enc);
4793}
4794
4795/*
4796 * call-seq:
4797 * rindex(pattern, offset = self.length) -> integer or nil
4798 *
4799 * :include:doc/string/rindex.rdoc
4800 *
4801 */
4802
4803static VALUE
4804rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4805{
4806 VALUE sub;
4807 VALUE initpos;
4808 rb_encoding *enc = STR_ENC_GET(str);
4809 long pos, len = str_strlen(str, enc); /* str's enc */
4810
4811 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4812 pos = NUM2LONG(initpos);
4813 if (pos < 0 && (pos += len) < 0) {
4814 if (RB_TYPE_P(sub, T_REGEXP)) {
4816 }
4817 return Qnil;
4818 }
4819 if (pos > len) pos = len;
4820 }
4821 else {
4822 pos = len;
4823 }
4824
4825 if (RB_TYPE_P(sub, T_REGEXP)) {
4826 /* enc = rb_enc_check(str, sub); */
4827 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4828 enc, single_byte_optimizable(str));
4829
4830 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4831 VALUE match = rb_backref_get();
4832 struct re_registers *regs = RMATCH_REGS(match);
4833 pos = rb_str_sublen(str, BEG(0));
4834 return LONG2NUM(pos);
4835 }
4836 }
4837 else {
4838 StringValue(sub);
4839 pos = rb_str_rindex(str, sub, pos);
4840 if (pos >= 0) {
4841 pos = rb_str_sublen(str, pos);
4842 return LONG2NUM(pos);
4843 }
4844 }
4845 return Qnil;
4846}
4847
4848static long
4849rb_str_byterindex(VALUE str, VALUE sub, long pos)
4850{
4851 long len, slen;
4852 char *sbeg, *s;
4853 rb_encoding *enc;
4854
4855 enc = rb_enc_check(str, sub);
4856 if (is_broken_string(sub)) return -1;
4857 len = RSTRING_LEN(str);
4858 slen = RSTRING_LEN(sub);
4859
4860 /* substring longer than string */
4861 if (len < slen) return -1;
4862 if (len - pos < slen) pos = len - slen;
4863 if (len == 0) return pos;
4864
4865 sbeg = RSTRING_PTR(str);
4866
4867 if (pos == 0) {
4868 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4869 return 0;
4870 else
4871 return -1;
4872 }
4873
4874 s = sbeg + pos;
4875 return str_rindex(str, sub, s, enc);
4876}
4877
4878/*
4879 * call-seq:
4880 * byterindex(object, offset = self.bytesize) -> integer or nil
4881 *
4882 * Returns the 0-based integer index of a substring of +self+
4883 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4884 * or +nil+ if there is no such substring;
4885 * the returned index is the count of _bytes_ (not characters).
4886 *
4887 * When +object+ is a string,
4888 * returns the index of the _last_ found substring equal to +object+:
4889 *
4890 * s = 'foo' # => "foo"
4891 * s.size # => 3 # Three 1-byte characters.
4892 * s.bytesize # => 3 # Three bytes.
4893 * s.byterindex('f') # => 0
 *    s.byterindex('o')   # => 2
 *    s.byterindex('oo')  # => 1
 *    s.byterindex('ooo') # => nil
4897 *
4898 * When +object+ is a Regexp,
4899 * returns the index of the last found substring matching +object+;
4900 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4901 *
4902 * s = 'foo'
4903 * s.byterindex(/f/) # => 0
4904 * $~ # => #<MatchData "f">
4905 * s.byterindex(/o/) # => 2
4906 * s.byterindex(/oo/) # => 1
4907 * s.byterindex(/ooo/) # => nil
4908 * $~ # => nil
4909 *
4910 * The last match means starting at the possible last position,
4911 * not the last of the longest matches:
4912 *
4913 * s = 'foo'
4914 * s.byterindex(/o+/) # => 2
4915 * $~ #=> #<MatchData "o">
4916 *
4917 * To get the last longest match, use a negative lookbehind:
4918 *
4919 * s = 'foo'
4920 * s.byterindex(/(?<!o)o+/) # => 1
4921 * $~ # => #<MatchData "oo">
4922 *
4923 * Or use method #byteindex with negative lookahead:
4924 *
4925 * s = 'foo'
4926 * s.byteindex(/o+(?!.*o)/) # => 1
4927 * $~ #=> #<MatchData "oo">
4928 *
4929 * \Integer argument +offset+, if given, specifies the 0-based index
4930 * of the byte where searching is to end.
4931 *
4932 * When +offset+ is non-negative,
4933 * searching ends at byte position +offset+:
4934 *
4935 * s = 'foo'
4936 * s.byterindex('o', 0) # => nil
4937 * s.byterindex('o', 1) # => 1
4938 * s.byterindex('o', 2) # => 2
4939 * s.byterindex('o', 3) # => 2
4940 *
4941 * When +offset+ is negative, counts backward from the end of +self+:
4942 *
4943 * s = 'foo'
4944 * s.byterindex('o', -1) # => 2
4945 * s.byterindex('o', -2) # => 1
4946 * s.byterindex('o', -3) # => nil
4947 *
4948 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4949 *
4950 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4951 * s.size # => 2 # Two 3-byte characters.
4952 * s.bytesize # => 6 # Six bytes.
4953 * s.byterindex("\uFFFF") # => 3
4954 * s.byterindex("\uFFFF", 1) # Raises IndexError
4955 * s.byterindex("\uFFFF", 2) # Raises IndexError
4956 * s.byterindex("\uFFFF", 3) # => 3
4957 * s.byterindex("\uFFFF", 4) # Raises IndexError
4958 * s.byterindex("\uFFFF", 5) # Raises IndexError
4959 * s.byterindex("\uFFFF", 6) # => nil
4960 *
4961 * Related: see {Querying}[rdoc-ref:String@Querying].
4962 */
4963
4964static VALUE
4965rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4966{
4967 VALUE sub;
4968 VALUE initpos;
4969 long pos, len = RSTRING_LEN(str);
4970
4971 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4972 pos = NUM2LONG(initpos);
4973 if (pos < 0 && (pos += len) < 0) {
4974 if (RB_TYPE_P(sub, T_REGEXP)) {
4976 }
4977 return Qnil;
4978 }
4979 if (pos > len) pos = len;
4980 }
4981 else {
4982 pos = len;
4983 }
4984
4985 str_ensure_byte_pos(str, pos);
4986
4987 if (RB_TYPE_P(sub, T_REGEXP)) {
4988 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4989 VALUE match = rb_backref_get();
4990 struct re_registers *regs = RMATCH_REGS(match);
4991 pos = BEG(0);
4992 return LONG2NUM(pos);
4993 }
4994 }
4995 else {
4996 StringValue(sub);
4997 pos = rb_str_byterindex(str, sub, pos);
4998 if (pos >= 0) return LONG2NUM(pos);
4999 }
5000 return Qnil;
5001}
5002
5003/*
5004 * call-seq:
5005 * self =~ object -> integer or nil
5006 *
5007 * When +object+ is a Regexp, returns the index of the first substring in +self+
5008 * matched by +object+,
5009 * or +nil+ if no match is found;
5010 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5011 *
5012 * 'foo' =~ /f/ # => 0
5013 * $~ # => #<MatchData "f">
5014 * 'foo' =~ /o/ # => 1
5015 * $~ # => #<MatchData "o">
5016 * 'foo' =~ /x/ # => nil
5017 * $~ # => nil
5018 *
5019 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5020 * (see Regexp#=~):
5021 *
5022 * number = nil
5023 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5024 * number # => nil # Not assigned.
5025 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5026 * number # => "9" # Assigned.
5027 *
5028 * If +object+ is not a Regexp, returns the value
5029 * returned by <tt>object =~ self</tt>.
5030 *
5031 * Related: see {Querying}[rdoc-ref:String@Querying].
5032 */
5033
5034static VALUE
5035rb_str_match(VALUE x, VALUE y)
5036{
5037 switch (OBJ_BUILTIN_TYPE(y)) {
5038 case T_STRING:
5039 rb_raise(rb_eTypeError, "type mismatch: String given");
5040
5041 case T_REGEXP:
5042 return rb_reg_match(y, x);
5043
5044 default:
5045 return rb_funcall(y, idEqTilde, 1, x);
5046 }
5047}
5048
5049
5050static VALUE get_pat(VALUE);
5051
5052
5053/*
5054 * call-seq:
5055 * match(pattern, offset = 0) -> matchdata or nil
5056 * match(pattern, offset = 0) {|matchdata| ... } -> object
5057 *
5058 * Creates a MatchData object based on +self+ and the given arguments;
5059 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5060 *
5061 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5062 *
5063 * regexp = Regexp.new(pattern)
5064 *
5065 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5066 * (see Regexp#match):
5067 *
5068 * matchdata = regexp.match(self[offset..])
5069 *
5070 * With no block given, returns the computed +matchdata+ or +nil+:
5071 *
5072 * 'foo'.match('f') # => #<MatchData "f">
5073 * 'foo'.match('o') # => #<MatchData "o">
5074 * 'foo'.match('x') # => nil
5075 * 'foo'.match('f', 1) # => nil
5076 * 'foo'.match('o', 1) # => #<MatchData "o">
5077 *
5078 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5079 * returns the block's return value:
5080 *
5081 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5082 *
5083 * With a block given and +nil+ +matchdata+, does not call the block:
5084 *
5085 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5086 *
5087 * Related: see {Querying}[rdoc-ref:String@Querying].
5088 */
5089
5090static VALUE
5091rb_str_match_m(int argc, VALUE *argv, VALUE str)
5092{
5093 VALUE re, result;
5094 if (argc < 1)
5095 rb_check_arity(argc, 1, 2);
5096 re = argv[0];
5097 argv[0] = str;
5098 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5099 if (!NIL_P(result) && rb_block_given_p()) {
5100 return rb_yield(result);
5101 }
5102 return result;
5103}
5104
5105/*
5106 * call-seq:
5107 * match?(pattern, offset = 0) -> true or false
5108 *
5109 * Returns whether a match is found for +self+ and the given arguments;
5110 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5111 *
5112 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5113 *
5114 * regexp = Regexp.new(pattern)
5115 *
5116 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5117 * +false+ otherwise:
5118 *
5119 * 'foo'.match?(/o/) # => true
5120 * 'foo'.match?('o') # => true
5121 * 'foo'.match?(/x/) # => false
5122 * 'foo'.match?('f', 1) # => false
5123 * 'foo'.match?('o', 1) # => true
5124 *
5125 * Related: see {Querying}[rdoc-ref:String@Querying].
5126 */
5127
5128static VALUE
5129rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5130{
5131 VALUE re;
5132 rb_check_arity(argc, 1, 2);
5133 re = get_pat(argv[0]);
5134 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5135}
5136
/* Result of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbouring character */
    NEIGHBOR_FOUND = 1,     /* buffer now holds the neighbouring character */
    NEIGHBOR_WRAPPED = 2    /* stepping ran past the end of the range */
};
5142
5143static enum neighbor_char
5144enc_succ_char(char *p, long len, rb_encoding *enc)
5145{
5146 long i;
5147 int l;
5148
5149 if (rb_enc_mbminlen(enc) > 1) {
5150 /* wchar, trivial case */
5151 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5152 if (!MBCLEN_CHARFOUND_P(r)) {
5153 return NEIGHBOR_NOT_CHAR;
5154 }
5155 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5156 l = rb_enc_code_to_mbclen(c, enc);
5157 if (!l) return NEIGHBOR_NOT_CHAR;
5158 if (l != len) return NEIGHBOR_WRAPPED;
5159 rb_enc_mbcput(c, p, enc);
5160 r = rb_enc_precise_mbclen(p, p + len, enc);
5161 if (!MBCLEN_CHARFOUND_P(r)) {
5162 return NEIGHBOR_NOT_CHAR;
5163 }
5164 return NEIGHBOR_FOUND;
5165 }
5166 while (1) {
5167 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5168 p[i] = '\0';
5169 if (i < 0)
5170 return NEIGHBOR_WRAPPED;
5171 ++((unsigned char*)p)[i];
5172 l = rb_enc_precise_mbclen(p, p+len, enc);
5173 if (MBCLEN_CHARFOUND_P(l)) {
5174 l = MBCLEN_CHARFOUND_LEN(l);
5175 if (l == len) {
5176 return NEIGHBOR_FOUND;
5177 }
5178 else {
5179 memset(p+l, 0xff, len-l);
5180 }
5181 }
5182 if (MBCLEN_INVALID_P(l) && i < len-1) {
5183 long len2;
5184 int l2;
5185 for (len2 = len-1; 0 < len2; len2--) {
5186 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5187 if (!MBCLEN_INVALID_P(l2))
5188 break;
5189 }
5190 memset(p+len2+1, 0xff, len-(len2+1));
5191 }
5192 }
5193}
5194
5195static enum neighbor_char
5196enc_pred_char(char *p, long len, rb_encoding *enc)
5197{
5198 long i;
5199 int l;
5200 if (rb_enc_mbminlen(enc) > 1) {
5201 /* wchar, trivial case */
5202 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5203 if (!MBCLEN_CHARFOUND_P(r)) {
5204 return NEIGHBOR_NOT_CHAR;
5205 }
5206 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5207 if (!c) return NEIGHBOR_NOT_CHAR;
5208 --c;
5209 l = rb_enc_code_to_mbclen(c, enc);
5210 if (!l) return NEIGHBOR_NOT_CHAR;
5211 if (l != len) return NEIGHBOR_WRAPPED;
5212 rb_enc_mbcput(c, p, enc);
5213 r = rb_enc_precise_mbclen(p, p + len, enc);
5214 if (!MBCLEN_CHARFOUND_P(r)) {
5215 return NEIGHBOR_NOT_CHAR;
5216 }
5217 return NEIGHBOR_FOUND;
5218 }
5219 while (1) {
5220 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5221 p[i] = '\xff';
5222 if (i < 0)
5223 return NEIGHBOR_WRAPPED;
5224 --((unsigned char*)p)[i];
5225 l = rb_enc_precise_mbclen(p, p+len, enc);
5226 if (MBCLEN_CHARFOUND_P(l)) {
5227 l = MBCLEN_CHARFOUND_LEN(l);
5228 if (l == len) {
5229 return NEIGHBOR_FOUND;
5230 }
5231 else {
5232 memset(p+l, 0, len-l);
5233 }
5234 }
5235 if (MBCLEN_INVALID_P(l) && i < len-1) {
5236 long len2;
5237 int l2;
5238 for (len2 = len-1; 0 < len2; len2--) {
5239 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5240 if (!MBCLEN_INVALID_P(l2))
5241 break;
5242 }
5243 memset(p+len2+1, 0, len-(len2+1));
5244 }
5245 }
5246}
5247
5248/*
5249 overwrite +p+ by succeeding letter in +enc+ and returns
5250 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5251 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5252 assuming each ranges are successive, and mbclen
5253 never change in each ranges.
5254 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5255 character.
5256 */
5257static enum neighbor_char
5258enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5259{
5260 enum neighbor_char ret;
5261 unsigned int c;
5262 int ctype;
5263 int range;
5264 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5265
5266 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5267 int try;
5268 const int max_gaps = 1;
5269
5270 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5271 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5272 ctype = ONIGENC_CTYPE_DIGIT;
5273 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5274 ctype = ONIGENC_CTYPE_ALPHA;
5275 else
5276 return NEIGHBOR_NOT_CHAR;
5277
5278 MEMCPY(save, p, char, len);
5279 for (try = 0; try <= max_gaps; ++try) {
5280 ret = enc_succ_char(p, len, enc);
5281 if (ret == NEIGHBOR_FOUND) {
5282 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5283 if (rb_enc_isctype(c, ctype, enc))
5284 return NEIGHBOR_FOUND;
5285 }
5286 }
5287 MEMCPY(p, save, char, len);
5288 range = 1;
5289 while (1) {
5290 MEMCPY(save, p, char, len);
5291 ret = enc_pred_char(p, len, enc);
5292 if (ret == NEIGHBOR_FOUND) {
5293 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5294 if (!rb_enc_isctype(c, ctype, enc)) {
5295 MEMCPY(p, save, char, len);
5296 break;
5297 }
5298 }
5299 else {
5300 MEMCPY(p, save, char, len);
5301 break;
5302 }
5303 range++;
5304 }
5305 if (range == 1) {
5306 return NEIGHBOR_NOT_CHAR;
5307 }
5308
5309 if (ctype != ONIGENC_CTYPE_DIGIT) {
5310 MEMCPY(carry, p, char, len);
5311 return NEIGHBOR_WRAPPED;
5312 }
5313
5314 MEMCPY(carry, p, char, len);
5315 enc_succ_char(carry, len, enc);
5316 return NEIGHBOR_WRAPPED;
5317}
5318
5319
5320static VALUE str_succ(VALUE str);
5321
5322/*
5323 * call-seq:
5324 * succ -> new_str
5325 *
5326 * :include: doc/string/succ.rdoc
5327 *
5328 */
5329
5330VALUE
5332{
5333 VALUE str;
5334 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5335 rb_enc_cr_str_copy_for_substr(str, orig);
5336 return str_succ(str);
5337}
5338
5339static VALUE
5340str_succ(VALUE str)
5341{
5342 rb_encoding *enc;
5343 char *sbeg, *s, *e, *last_alnum = 0;
5344 int found_alnum = 0;
5345 long l, slen;
5346 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5347 long carry_pos = 0, carry_len = 1;
5348 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5349
5350 slen = RSTRING_LEN(str);
5351 if (slen == 0) return str;
5352
5353 enc = STR_ENC_GET(str);
5354 sbeg = RSTRING_PTR(str);
5355 s = e = sbeg + slen;
5356
5357 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5358 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5359 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5360 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5361 break;
5362 }
5363 }
5364 l = rb_enc_precise_mbclen(s, e, enc);
5365 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5366 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5367 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5368 switch (neighbor) {
5369 case NEIGHBOR_NOT_CHAR:
5370 continue;
5371 case NEIGHBOR_FOUND:
5372 return str;
5373 case NEIGHBOR_WRAPPED:
5374 last_alnum = s;
5375 break;
5376 }
5377 found_alnum = 1;
5378 carry_pos = s - sbeg;
5379 carry_len = l;
5380 }
5381 if (!found_alnum) { /* str contains no alnum */
5382 s = e;
5383 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5384 enum neighbor_char neighbor;
5385 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5386 l = rb_enc_precise_mbclen(s, e, enc);
5387 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5388 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5389 MEMCPY(tmp, s, char, l);
5390 neighbor = enc_succ_char(tmp, l, enc);
5391 switch (neighbor) {
5392 case NEIGHBOR_FOUND:
5393 MEMCPY(s, tmp, char, l);
5394 return str;
5395 break;
5396 case NEIGHBOR_WRAPPED:
5397 MEMCPY(s, tmp, char, l);
5398 break;
5399 case NEIGHBOR_NOT_CHAR:
5400 break;
5401 }
5402 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5403 /* wrapped to \0...\0. search next valid char. */
5404 enc_succ_char(s, l, enc);
5405 }
5406 if (!rb_enc_asciicompat(enc)) {
5407 MEMCPY(carry, s, char, l);
5408 carry_len = l;
5409 }
5410 carry_pos = s - sbeg;
5411 }
5413 }
5414 RESIZE_CAPA(str, slen + carry_len);
5415 sbeg = RSTRING_PTR(str);
5416 s = sbeg + carry_pos;
5417 memmove(s + carry_len, s, slen - carry_pos);
5418 memmove(s, carry, carry_len);
5419 slen += carry_len;
5420 STR_SET_LEN(str, slen);
5421 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5423 return str;
5424}
5425
5426
5427/*
5428 * call-seq:
5429 * succ! -> self
5430 *
5431 * Like String#succ, but modifies +self+ in place; returns +self+.
5432 *
5433 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5434 */
5435
5436static VALUE
5437rb_str_succ_bang(VALUE str)
5438{
5439 rb_str_modify(str);
5440 str_succ(str);
5441 return str;
5442}
5443
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT, 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5453
5454static int
5455str_upto_i(VALUE str, VALUE arg)
5456{
5457 rb_yield(str);
5458 return 0;
5459}
5460
5461/*
5462 * call-seq:
5463 * upto(other_string, exclusive = false) {|string| ... } -> self
5464 * upto(other_string, exclusive = false) -> new_enumerator
5465 *
5466 * :include: doc/string/upto.rdoc
5467 *
5468 */
5469
5470static VALUE
5471rb_str_upto(int argc, VALUE *argv, VALUE beg)
5472{
5473 VALUE end, exclusive;
5474
5475 rb_scan_args(argc, argv, "11", &end, &exclusive);
5476 RETURN_ENUMERATOR(beg, argc, argv);
5477 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5478}
5479
5480VALUE
5481rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5482{
5483 VALUE current, after_end;
5484 ID succ;
5485 int n, ascii;
5486 rb_encoding *enc;
5487
5488 CONST_ID(succ, "succ");
5489 StringValue(end);
5490 enc = rb_enc_check(beg, end);
5491 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5492 /* single character */
5493 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5494 char c = RSTRING_PTR(beg)[0];
5495 char e = RSTRING_PTR(end)[0];
5496
5497 if (c > e || (excl && c == e)) return beg;
5498 for (;;) {
5499 VALUE str = rb_enc_str_new(&c, 1, enc);
5501 if ((*each)(str, arg)) break;
5502 if (!excl && c == e) break;
5503 c++;
5504 if (excl && c == e) break;
5505 }
5506 return beg;
5507 }
5508 /* both edges are all digits */
5509 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5510 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5511 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5512 VALUE b, e;
5513 int width;
5514
5515 width = RSTRING_LENINT(beg);
5516 b = rb_str_to_inum(beg, 10, FALSE);
5517 e = rb_str_to_inum(end, 10, FALSE);
5518 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5519 long bi = FIX2LONG(b);
5520 long ei = FIX2LONG(e);
5521 rb_encoding *usascii = rb_usascii_encoding();
5522
5523 while (bi <= ei) {
5524 if (excl && bi == ei) break;
5525 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5526 bi++;
5527 }
5528 }
5529 else {
5530 ID op = excl ? '<' : idLE;
5531 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5532
5533 args[0] = INT2FIX(width);
5534 while (rb_funcall(b, op, 1, e)) {
5535 args[1] = b;
5536 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5537 b = rb_funcallv(b, succ, 0, 0);
5538 }
5539 }
5540 return beg;
5541 }
5542 /* normal case */
5543 n = rb_str_cmp(beg, end);
5544 if (n > 0 || (excl && n == 0)) return beg;
5545
5546 after_end = rb_funcallv(end, succ, 0, 0);
5547 current = str_duplicate(rb_cString, beg);
5548 while (!rb_str_equal(current, after_end)) {
5549 VALUE next = Qnil;
5550 if (excl || !rb_str_equal(current, end))
5551 next = rb_funcallv(current, succ, 0, 0);
5552 if ((*each)(current, arg)) break;
5553 if (NIL_P(next)) break;
5554 current = next;
5555 StringValue(current);
5556 if (excl && rb_str_equal(current, end)) break;
5557 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5558 break;
5559 }
5560
5561 return beg;
5562}
5563
5564VALUE
5565rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5566{
5567 VALUE current;
5568 ID succ;
5569
5570 CONST_ID(succ, "succ");
5571 /* both edges are all digits */
5572 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5573 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5574 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5575 int width = RSTRING_LENINT(beg);
5576 b = rb_str_to_inum(beg, 10, FALSE);
5577 if (FIXNUM_P(b)) {
5578 long bi = FIX2LONG(b);
5579 rb_encoding *usascii = rb_usascii_encoding();
5580
5581 while (FIXABLE(bi)) {
5582 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5583 bi++;
5584 }
5585 b = LONG2NUM(bi);
5586 }
5587 args[0] = INT2FIX(width);
5588 while (1) {
5589 args[1] = b;
5590 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5591 b = rb_funcallv(b, succ, 0, 0);
5592 }
5593 }
5594 /* normal case */
5595 current = str_duplicate(rb_cString, beg);
5596 while (1) {
5597 VALUE next = rb_funcallv(current, succ, 0, 0);
5598 if ((*each)(current, arg)) break;
5599 current = next;
5600 StringValue(current);
5601 if (RSTRING_LEN(current) == 0)
5602 break;
5603 }
5604
5605 return beg;
5606}
5607
5608static int
5609include_range_i(VALUE str, VALUE arg)
5610{
5611 VALUE *argp = (VALUE *)arg;
5612 if (!rb_equal(str, *argp)) return 0;
5613 *argp = Qnil;
5614 return 1;
5615}
5616
5617VALUE
5618rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5619{
5620 beg = rb_str_new_frozen(beg);
5621 StringValue(end);
5622 end = rb_str_new_frozen(end);
5623 if (NIL_P(val)) return Qfalse;
5624 val = rb_check_string_type(val);
5625 if (NIL_P(val)) return Qfalse;
5626 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5627 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5628 rb_enc_asciicompat(STR_ENC_GET(val))) {
5629 const char *bp = RSTRING_PTR(beg);
5630 const char *ep = RSTRING_PTR(end);
5631 const char *vp = RSTRING_PTR(val);
5632 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5633 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5634 return Qfalse;
5635 else {
5636 char b = *bp;
5637 char e = *ep;
5638 char v = *vp;
5639
5640 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5641 if (b <= v && v < e) return Qtrue;
5642 return RBOOL(!RTEST(exclusive) && v == e);
5643 }
5644 }
5645 }
5646#if 0
5647 /* both edges are all digits */
5648 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5649 all_digits_p(bp, RSTRING_LEN(beg)) &&
5650 all_digits_p(ep, RSTRING_LEN(end))) {
5651 /* TODO */
5652 }
5653#endif
5654 }
5655 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5656
5657 return RBOOL(NIL_P(val));
5658}
5659
5660static VALUE
5661rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5662{
5663 if (rb_reg_search(re, str, 0, 0) >= 0) {
5664 VALUE match = rb_backref_get();
5665 int nth = rb_reg_backref_number(match, backref);
5666 return rb_reg_nth_match(nth, match);
5667 }
5668 return Qnil;
5669}
5670
5671static VALUE
5672rb_str_aref(VALUE str, VALUE indx)
5673{
5674 long idx;
5675
5676 if (FIXNUM_P(indx)) {
5677 idx = FIX2LONG(indx);
5678 }
5679 else if (RB_TYPE_P(indx, T_REGEXP)) {
5680 return rb_str_subpat(str, indx, INT2FIX(0));
5681 }
5682 else if (RB_TYPE_P(indx, T_STRING)) {
5683 if (rb_str_index(str, indx, 0) != -1)
5684 return str_duplicate(rb_cString, indx);
5685 return Qnil;
5686 }
5687 else {
5688 /* check if indx is Range */
5689 long beg, len = str_strlen(str, NULL);
5690 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5691 case Qfalse:
5692 break;
5693 case Qnil:
5694 return Qnil;
5695 default:
5696 return rb_str_substr(str, beg, len);
5697 }
5698 idx = NUM2LONG(indx);
5699 }
5700
5701 return str_substr(str, idx, 1, FALSE);
5702}
5703
5704
5705/*
5706 * call-seq:
5707 * self[index] -> new_string or nil
5708 * self[start, length] -> new_string or nil
5709 * self[range] -> new_string or nil
5710 * self[regexp, capture = 0] -> new_string or nil
5711 * self[substring] -> new_string or nil
5712 *
5713 * :include: doc/string/aref.rdoc
5714 *
5715 */
5716
5717static VALUE
5718rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5719{
5720 if (argc == 2) {
5721 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5722 return rb_str_subpat(str, argv[0], argv[1]);
5723 }
5724 else {
5725 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5726 }
5727 }
5728 rb_check_arity(argc, 1, 2);
5729 return rb_str_aref(str, argv[0]);
5730}
5731
5732VALUE
5734{
5735 char *ptr = RSTRING_PTR(str);
5736 long olen = RSTRING_LEN(str), nlen;
5737
5738 str_modifiable(str);
5739 if (len > olen) len = olen;
5740 nlen = olen - len;
5741 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5742 char *oldptr = ptr;
5743 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5744 STR_SET_EMBED(str);
5745 ptr = RSTRING(str)->as.embed.ary;
5746 memmove(ptr, oldptr + len, nlen);
5747 if (fl == STR_NOEMBED) xfree(oldptr);
5748 }
5749 else {
5750 if (!STR_SHARED_P(str)) {
5751 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5752 rb_enc_cr_str_exact_copy(shared, str);
5753 OBJ_FREEZE(shared);
5754 }
5755 ptr = RSTRING(str)->as.heap.ptr += len;
5756 }
5757 STR_SET_LEN(str, nlen);
5758
5759 if (!SHARABLE_MIDDLE_SUBSTRING) {
5760 TERM_FILL(ptr + nlen, TERM_LEN(str));
5761 }
5763 return str;
5764}
5765
5766static void
5767rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5768{
5769 char *sptr;
5770 long slen;
5771 int cr;
5772
5773 if (beg == 0 && vlen == 0) {
5774 rb_str_drop_bytes(str, len);
5775 return;
5776 }
5777
5778 str_modify_keep_cr(str);
5779 RSTRING_GETMEM(str, sptr, slen);
5780 if (len < vlen) {
5781 /* expand string */
5782 RESIZE_CAPA(str, slen + vlen - len);
5783 sptr = RSTRING_PTR(str);
5784 }
5785
5787 cr = rb_enc_str_coderange(val);
5788 else
5790
5791 if (vlen != len) {
5792 memmove(sptr + beg + vlen,
5793 sptr + beg + len,
5794 slen - (beg + len));
5795 }
5796 if (vlen < beg && len < 0) {
5797 MEMZERO(sptr + slen, char, -len);
5798 }
5799 if (vlen > 0) {
5800 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5801 }
5802 slen += vlen - len;
5803 STR_SET_LEN(str, slen);
5804 TERM_FILL(&sptr[slen], TERM_LEN(str));
5805 ENC_CODERANGE_SET(str, cr);
5806}
5807
5808static inline void
5809rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5810{
5811 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5812}
5813
5814void
5815rb_str_update(VALUE str, long beg, long len, VALUE val)
5816{
5817 long slen;
5818 char *p, *e;
5819 rb_encoding *enc;
5820 int singlebyte = single_byte_optimizable(str);
5821 int cr;
5822
5823 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5824
5825 StringValue(val);
5826 enc = rb_enc_check(str, val);
5827 slen = str_strlen(str, enc); /* rb_enc_check */
5828
5829 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5830 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5831 }
5832 if (beg < 0) {
5833 beg += slen;
5834 }
5835 RUBY_ASSERT(beg >= 0);
5836 RUBY_ASSERT(beg <= slen);
5837
5838 if (len > slen - beg) {
5839 len = slen - beg;
5840 }
5841 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5842 if (!p) p = RSTRING_END(str);
5843 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5844 if (!e) e = RSTRING_END(str);
5845 /* error check */
5846 beg = p - RSTRING_PTR(str); /* physical position */
5847 len = e - p; /* physical length */
5848 rb_str_update_0(str, beg, len, val);
5849 rb_enc_associate(str, enc);
5851 if (cr != ENC_CODERANGE_BROKEN)
5852 ENC_CODERANGE_SET(str, cr);
5853}
5854
5855static void
5856rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5857{
5858 int nth;
5859 VALUE match;
5860 long start, end, len;
5861 rb_encoding *enc;
5862 struct re_registers *regs;
5863
5864 if (rb_reg_search(re, str, 0, 0) < 0) {
5865 rb_raise(rb_eIndexError, "regexp not matched");
5866 }
5867 match = rb_backref_get();
5868 nth = rb_reg_backref_number(match, backref);
5869 regs = RMATCH_REGS(match);
5870 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5871 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5872 }
5873 if (nth < 0) {
5874 nth += regs->num_regs;
5875 }
5876
5877 start = BEG(nth);
5878 if (start == -1) {
5879 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5880 }
5881 end = END(nth);
5882 len = end - start;
5883 StringValue(val);
5884 enc = rb_enc_check_str(str, val);
5885 rb_str_update_0(str, start, len, val);
5886 rb_enc_associate(str, enc);
5887}
5888
5889static VALUE
5890rb_str_aset(VALUE str, VALUE indx, VALUE val)
5891{
5892 long idx, beg;
5893
5894 switch (TYPE(indx)) {
5895 case T_REGEXP:
5896 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5897 return val;
5898
5899 case T_STRING:
5900 beg = rb_str_index(str, indx, 0);
5901 if (beg < 0) {
5902 rb_raise(rb_eIndexError, "string not matched");
5903 }
5904 beg = rb_str_sublen(str, beg);
5905 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5906 return val;
5907
5908 default:
5909 /* check if indx is Range */
5910 {
5911 long beg, len;
5912 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5913 rb_str_update(str, beg, len, val);
5914 return val;
5915 }
5916 }
5917 /* FALLTHROUGH */
5918
5919 case T_FIXNUM:
5920 idx = NUM2LONG(indx);
5921 rb_str_update(str, idx, 1, val);
5922 return val;
5923 }
5924}
5925
5926/*
5927 * call-seq:
5928 * self[index] = other_string -> new_string
5929 * self[start, length] = other_string -> new_string
5930 * self[range] = other_string -> new_string
5931 * self[regexp, capture = 0] = other_string -> new_string
5932 * self[substring] = other_string -> new_string
5933 *
5934 * :include: doc/string/aset.rdoc
5935 *
5936 */
5937
5938static VALUE
5939rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5940{
5941 if (argc == 3) {
5942 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5943 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5944 }
5945 else {
5946 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5947 }
5948 return argv[2];
5949 }
5950 rb_check_arity(argc, 2, 3);
5951 return rb_str_aset(str, argv[0], argv[1]);
5952}
5953
5954/*
5955 * call-seq:
5956 * insert(offset, other_string) -> self
5957 *
5958 * :include: doc/string/insert.rdoc
5959 *
5960 */
5961
5962static VALUE
5963rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5964{
5965 long pos = NUM2LONG(idx);
5966
5967 if (pos == -1) {
5968 return rb_str_append(str, str2);
5969 }
5970 else if (pos < 0) {
5971 pos++;
5972 }
5973 rb_str_update(str, pos, 0, str2);
5974 return str;
5975}
5976
5977
5978/*
5979 * call-seq:
5980 * slice!(index) -> new_string or nil
5981 * slice!(start, length) -> new_string or nil
5982 * slice!(range) -> new_string or nil
5983 * slice!(regexp, capture = 0) -> new_string or nil
5984 * slice!(substring) -> new_string or nil
5985 *
5986 * Like String#[] (and its alias String#slice), except that:
5987 *
5988 * - Performs substitutions in +self+ (not in a copy of +self+).
5989 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5990 *
5991 * A few examples:
5992 *
5993 * s = 'hello'
5994 * s.slice!('e') # => "e"
5995 * s # => "hllo"
5996 * s.slice!('e') # => nil
5997 * s # => "hllo"
5998 *
5999 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6000 */
6001
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Dispatches on the type of the first argument, works out the span
     * (beg, len) to cut, builds the returned substring and finally
     * squeezes that span out of +str+.
     * Returns Qnil as soon as any branch finds nothing to remove.
     */
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp, capture = 0): cut the span of the chosen register */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* a negative group number is offset by the register count */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): len is still 1 here */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument itself */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* a Range, or something NUM2LONG can turn into an index */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: handled like the single-index case above */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* rb_str_subpos returns a pointer into str; turn it back into an offset
     * from the start of the buffer */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* copy the removed span before it is overwritten below */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* removing a prefix is delegated to rb_str_drop_bytes */
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* slide the tail down over the removed span, then re-terminate */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6088
6089static VALUE
6090get_pat(VALUE pat)
6091{
6092 VALUE val;
6093
6094 switch (OBJ_BUILTIN_TYPE(pat)) {
6095 case T_REGEXP:
6096 return pat;
6097
6098 case T_STRING:
6099 break;
6100
6101 default:
6102 val = rb_check_string_type(pat);
6103 if (NIL_P(val)) {
6104 Check_Type(pat, T_REGEXP);
6105 }
6106 pat = val;
6107 }
6108
6109 return rb_reg_regcomp(pat);
6110}
6111
6112static VALUE
6113get_pat_quoted(VALUE pat, int check)
6114{
6115 VALUE val;
6116
6117 switch (OBJ_BUILTIN_TYPE(pat)) {
6118 case T_REGEXP:
6119 return pat;
6120
6121 case T_STRING:
6122 break;
6123
6124 default:
6125 val = rb_check_string_type(pat);
6126 if (NIL_P(val)) {
6127 Check_Type(pat, T_REGEXP);
6128 }
6129 pat = val;
6130 }
6131 if (check && is_broken_string(pat)) {
6132 rb_exc_raise(rb_reg_check_preprocess(pat));
6133 }
6134 return pat;
6135}
6136
6137static long
6138rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6139{
6140 if (BUILTIN_TYPE(pat) == T_STRING) {
6141 pos = rb_str_byteindex(str, pat, pos);
6142 if (set_backref_str) {
6143 if (pos >= 0) {
6144 str = rb_str_new_frozen_String(str);
6145 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6146 if (match) {
6147 *match = match_data;
6148 }
6149 }
6150 else {
6152 }
6153 }
6154 return pos;
6155 }
6156 else {
6157 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6158 }
6159}
6160
/*
 * Convenience form of rb_pat_search0 for callers that do not need the
 * MatchData handed back: passes NULL for the +match+ out parameter.
 */
static long
rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
{
    return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
}
6166
6167
6168/*
6169 * call-seq:
6170 * sub!(pattern, replacement) -> self or nil
6171 * sub!(pattern) {|match| ... } -> self or nil
6172 *
6173 * Like String#sub, except that:
6174 *
 * - Changes are made to +self+, not to a copy of +self+.
6176 * - Returns +self+ if any changes are made, +nil+ otherwise.
6177 *
6178 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6179 */
6180
6181static VALUE
6182rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6183{
6184 VALUE pat, repl, hash = Qnil;
6185 int iter = 0;
6186 long plen;
6187 int min_arity = rb_block_given_p() ? 1 : 2;
6188 long beg;
6189
6190 rb_check_arity(argc, min_arity, 2);
6191 if (argc == 1) {
6192 iter = 1;
6193 }
6194 else {
6195 repl = argv[1];
6196 hash = rb_check_hash_type(argv[1]);
6197 if (NIL_P(hash)) {
6198 StringValue(repl);
6199 }
6200 }
6201
6202 pat = get_pat_quoted(argv[0], 1);
6203
6204 str_modifiable(str);
6205 beg = rb_pat_search(pat, str, 0, 1);
6206 if (beg >= 0) {
6207 rb_encoding *enc;
6208 int cr = ENC_CODERANGE(str);
6209 long beg0, end0;
6210 VALUE match, match0 = Qnil;
6211 struct re_registers *regs;
6212 char *p, *rp;
6213 long len, rlen;
6214
6215 match = rb_backref_get();
6216 regs = RMATCH_REGS(match);
6217 if (RB_TYPE_P(pat, T_STRING)) {
6218 beg0 = beg;
6219 end0 = beg0 + RSTRING_LEN(pat);
6220 match0 = pat;
6221 }
6222 else {
6223 beg0 = BEG(0);
6224 end0 = END(0);
6225 if (iter) match0 = rb_reg_nth_match(0, match);
6226 }
6227
6228 if (iter || !NIL_P(hash)) {
6229 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6230
6231 if (iter) {
6232 repl = rb_obj_as_string(rb_yield(match0));
6233 }
6234 else {
6235 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6236 repl = rb_obj_as_string(repl);
6237 }
6238 str_mod_check(str, p, len);
6239 rb_check_frozen(str);
6240 }
6241 else {
6242 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6243 }
6244
6245 enc = rb_enc_compatible(str, repl);
6246 if (!enc) {
6247 rb_encoding *str_enc = STR_ENC_GET(str);
6248 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6249 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6250 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6251 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6252 rb_enc_inspect_name(str_enc),
6253 rb_enc_inspect_name(STR_ENC_GET(repl)));
6254 }
6255 enc = STR_ENC_GET(repl);
6256 }
6257 rb_str_modify(str);
6258 rb_enc_associate(str, enc);
6260 int cr2 = ENC_CODERANGE(repl);
6261 if (cr2 == ENC_CODERANGE_BROKEN ||
6262 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6264 else
6265 cr = cr2;
6266 }
6267 plen = end0 - beg0;
6268 rlen = RSTRING_LEN(repl);
6269 len = RSTRING_LEN(str);
6270 if (rlen > plen) {
6271 RESIZE_CAPA(str, len + rlen - plen);
6272 }
6273 p = RSTRING_PTR(str);
6274 if (rlen != plen) {
6275 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6276 }
6277 rp = RSTRING_PTR(repl);
6278 memmove(p + beg0, rp, rlen);
6279 len += rlen - plen;
6280 STR_SET_LEN(str, len);
6281 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6282 ENC_CODERANGE_SET(str, cr);
6283
6284 RB_GC_GUARD(match);
6285
6286 return str;
6287 }
6288 return Qnil;
6289}
6290
6291
6292/*
6293 * call-seq:
6294 * sub(pattern, replacement) -> new_string
6295 * sub(pattern) {|match| ... } -> new_string
6296 *
6297 * :include: doc/string/sub.rdoc
6298 */
6299
6300static VALUE
6301rb_str_sub(int argc, VALUE *argv, VALUE str)
6302{
6303 str = str_duplicate(rb_cString, str);
6304 rb_str_sub_bang(argc, argv, str);
6305 return str;
6306}
6307
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * The replacement source is selected by +mode+:
 *   STR      - replacement String, expanded through rb_reg_regsub
 *   ITER     - value yielded by the block
 *   FAST_MAP - Hash lookup using a stack-allocated key string
 *   MAP      - Hash lookup using a real substring as the key
 *
 * The result is assembled in a fresh buffer +dest+.  With +bang+ the
 * buffer replaces the contents of +str+ and +str+ is returned (Qnil when
 * nothing matched); otherwise the buffer itself is returned (a duplicate
 * of +str+ when nothing matched).
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, FAST_MAP, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known; settled to 0 or 1 after the first rb_reg_regsub */
    int need_backref_str = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
            mode = FAST_MAP;
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);

    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the receiver so str_mod_check can detect changes */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        struct re_registers *regs = RMATCH_REGS(match);
        if (RB_TYPE_P(pat, T_STRING)) {
            /* String pattern: the match is the pattern itself */
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode != STR) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                struct RString fake_str = {RBASIC_INIT};
                VALUE key;
                if (mode == FAST_MAP) {
                    // It is safe to use a fake_str here because we established that it won't escape,
                    // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
                    // default proc.
                    key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
                }
                else {
                    key = rb_str_subseq(str, beg0, end0 - beg0);
                }
                val = rb_hash_aref(hash, key);
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref_str) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref_str < 0) {
                /* first pass: if rb_reg_regsub handed back repl itself,
                 * later iterations take the plain `val = repl` branch */
                need_backref_str = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* `last` keeps the offset this search started from; it is reused
         * for the final rb_pat_search0 call after the loop */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;

        // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
        if (mode != FAST_MAP && mode != STR) {
            match = Qnil;
        }
        beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);

        RB_GC_GUARD(match);
    } while (beg >= 0);

    /* append whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* search once more from `last` with set_backref_str forced to 1 */
    rb_pat_search0(pat, str, last, 1, &match);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6450
6451
6452/*
6453 * call-seq:
6454 * gsub!(pattern, replacement) -> self or nil
6455 * gsub!(pattern) {|match| ... } -> self or nil
6456 * gsub!(pattern) -> an_enumerator
6457 *
6458 * Like String#gsub, except that:
6459 *
6460 * - Performs substitutions in +self+ (not in a copy of +self+).
 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6462 *
6463 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6464 */
6465
static VALUE
rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
{
    /* str_modify_keep_cr runs before the search starts; the substitution
     * itself is str_gsub with bang = 1. */
    str_modify_keep_cr(str);
    return str_gsub(argc, argv, str, 1);
}
6472
6473
6474/*
6475 * call-seq:
6476 * gsub(pattern, replacement) -> new_string
6477 * gsub(pattern) {|match| ... } -> new_string
6478 * gsub(pattern) -> enumerator
6479 *
6480 * Returns a copy of +self+ with zero or more substrings replaced.
6481 *
6482 * Argument +pattern+ may be a string or a Regexp;
6483 * argument +replacement+ may be a string or a Hash.
6484 * Varying types for the argument values makes this method very versatile.
6485 *
6486 * Below are some simple examples;
6487 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6488 *
6489 * With arguments +pattern+ and string +replacement+ given,
6490 * replaces each matching substring with the given +replacement+ string:
6491 *
6492 * s = 'abracadabra'
6493 * s.gsub('ab', 'AB') # => "ABracadABra"
6494 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6495 *
6496 * With arguments +pattern+ and hash +replacement+ given,
6497 * replaces each matching substring with a value from the given +replacement+ hash,
6498 * or removes it:
6499 *
6500 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6501 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6502 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6503 *
6504 * With argument +pattern+ and a block given,
6505 * calls the block with each matching substring;
6506 * replaces that substring with the block's return value:
6507 *
6508 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6509 * # => "ABrACADABrA"
6510 *
6511 * With argument +pattern+ and no block given,
6512 * returns a new Enumerator.
6513 *
6514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6515 */
6516
static VALUE
rb_str_gsub(int argc, VALUE *argv, VALUE str)
{
    /* bang = 0: str_gsub returns its result instead of replacing str */
    return str_gsub(argc, argv, str, 0);
}
6522
6523
6524/*
6525 * call-seq:
6526 * replace(other_string) -> self
6527 *
6528 * Replaces the contents of +self+ with the contents of +other_string+;
6529 * returns +self+:
6530 *
6531 * s = 'foo' # => "foo"
6532 * s.replace('bar') # => "bar"
6533 *
6534 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6535 */
6536
6537VALUE
6539{
6540 str_modifiable(str);
6541 if (str == str2) return str;
6542
6543 StringValue(str2);
6544 str_discard(str);
6545 return str_replace(str, str2);
6546}
6547
6548/*
6549 * call-seq:
6550 * clear -> self
6551 *
6552 * Removes the contents of +self+:
6553 *
6554 * s = 'foo'
6555 * s.clear # => ""
6556 * s # => ""
6557 *
6558 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6559 */
6560
6561static VALUE
6562rb_str_clear(VALUE str)
6563{
6564 str_discard(str);
6565 STR_SET_EMBED(str);
6566 STR_SET_LEN(str, 0);
6567 RSTRING_PTR(str)[0] = 0;
6568 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6570 else
6572 return str;
6573}
6574
6575/*
6576 * call-seq:
6577 * chr -> string
6578 *
6579 * :include: doc/string/chr.rdoc
6580 *
6581 */
6582
static VALUE
rb_str_chr(VALUE str)
{
    /* substring of length 1 starting at offset 0 */
    return rb_str_substr(str, 0, 1);
}
6588
6589/*
6590 * call-seq:
6591 * getbyte(index) -> integer or nil
6592 *
6593 * :include: doc/string/getbyte.rdoc
6594 *
6595 */
6596VALUE
6597rb_str_getbyte(VALUE str, VALUE index)
6598{
6599 long pos = NUM2LONG(index);
6600
6601 if (pos < 0)
6602 pos += RSTRING_LEN(str);
6603 if (pos < 0 || RSTRING_LEN(str) <= pos)
6604 return Qnil;
6605
6606 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6607}
6608
6609/*
6610 * call-seq:
6611 * setbyte(index, integer) -> integer
6612 *
6613 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6614 * returns +integer+:
6615 *
6616 * s = 'xyzzy'
6617 * s.setbyte(2, 129) # => 129
6618 * s # => "xy\x81zy"
6619 *
6620 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6621 */
6622VALUE
6623rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6624{
6625 long pos = NUM2LONG(index);
6626 long len = RSTRING_LEN(str);
6627 char *ptr, *head, *left = 0;
6628 rb_encoding *enc;
6629 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6630
6631 if (pos < -len || len <= pos)
6632 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6633 if (pos < 0)
6634 pos += len;
6635
6636 VALUE v = rb_to_int(value);
6637 VALUE w = rb_int_and(v, INT2FIX(0xff));
6638 char byte = (char)(NUM2INT(w) & 0xFF);
6639
6640 if (!str_independent(str))
6641 str_make_independent(str);
6642 enc = STR_ENC_GET(str);
6643 head = RSTRING_PTR(str);
6644 ptr = &head[pos];
6645 if (!STR_EMBED_P(str)) {
6646 cr = ENC_CODERANGE(str);
6647 switch (cr) {
6648 case ENC_CODERANGE_7BIT:
6649 left = ptr;
6650 *ptr = byte;
6651 if (ISASCII(byte)) goto end;
6652 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6653 if (!MBCLEN_CHARFOUND_P(nlen))
6655 else
6657 goto end;
6659 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6660 width = rb_enc_precise_mbclen(left, head+len, enc);
6661 *ptr = byte;
6662 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6663 if (!MBCLEN_CHARFOUND_P(nlen))
6665 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6667 goto end;
6668 }
6669 }
6671 *ptr = byte;
6672
6673 end:
6674 return value;
6675}
6676
6677static VALUE
6678str_byte_substr(VALUE str, long beg, long len, int empty)
6679{
6680 long n = RSTRING_LEN(str);
6681
6682 if (beg > n || len < 0) return Qnil;
6683 if (beg < 0) {
6684 beg += n;
6685 if (beg < 0) return Qnil;
6686 }
6687 if (len > n - beg)
6688 len = n - beg;
6689 if (len <= 0) {
6690 if (!empty) return Qnil;
6691 len = 0;
6692 }
6693
6694 VALUE str2 = str_subseq(str, beg, len);
6695
6696 str_enc_copy_direct(str2, str);
6697
6698 if (RSTRING_LEN(str2) == 0) {
6699 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6701 else
6703 }
6704 else {
6705 switch (ENC_CODERANGE(str)) {
6706 case ENC_CODERANGE_7BIT:
6708 break;
6709 default:
6711 break;
6712 }
6713 }
6714
6715 return str2;
6716}
6717
/*
 * VALUE-argument wrapper around str_byte_substr: converts +beg+ and +len+
 * with NUM2LONG and passes TRUE for +empty+.
 */
VALUE
rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
{
    return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
}
6723
6724static VALUE
6725str_byte_aref(VALUE str, VALUE indx)
6726{
6727 long idx;
6728 if (FIXNUM_P(indx)) {
6729 idx = FIX2LONG(indx);
6730 }
6731 else {
6732 /* check if indx is Range */
6733 long beg, len = RSTRING_LEN(str);
6734
6735 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6736 case Qfalse:
6737 break;
6738 case Qnil:
6739 return Qnil;
6740 default:
6741 return str_byte_substr(str, beg, len, TRUE);
6742 }
6743
6744 idx = NUM2LONG(indx);
6745 }
6746 return str_byte_substr(str, idx, 1, FALSE);
6747}
6748
6749/*
6750 * call-seq:
6751 * byteslice(offset, length = 1) -> string or nil
6752 * byteslice(range) -> string or nil
6753 *
6754 * :include: doc/string/byteslice.rdoc
6755 */
6756
6757static VALUE
6758rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6759{
6760 if (argc == 2) {
6761 long beg = NUM2LONG(argv[0]);
6762 long len = NUM2LONG(argv[1]);
6763 return str_byte_substr(str, beg, len, TRUE);
6764 }
6765 rb_check_arity(argc, 1, 2);
6766 return str_byte_aref(str, argv[0]);
6767}
6768
6769static void
6770str_check_beg_len(VALUE str, long *beg, long *len)
6771{
6772 long end, slen = RSTRING_LEN(str);
6773
6774 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6775 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6776 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6777 }
6778 if (*beg < 0) {
6779 *beg += slen;
6780 }
6781 RUBY_ASSERT(*beg >= 0);
6782 RUBY_ASSERT(*beg <= slen);
6783
6784 if (*len > slen - *beg) {
6785 *len = slen - *beg;
6786 }
6787 end = *beg + *len;
6788 str_ensure_byte_pos(str, *beg);
6789 str_ensure_byte_pos(str, end);
6790}
6791
6792/*
6793 * call-seq:
6794 * bytesplice(offset, length, str) -> self
6795 * bytesplice(offset, length, str, str_offset, str_length) -> self
6796 * bytesplice(range, str) -> self
6797 * bytesplice(range, str, str_range) -> self
6798 *
6799 * :include: doc/string/bytesplice.rdoc
6800 */
6801
6802static VALUE
6803rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6804{
6805 long beg, len, vbeg, vlen;
6806 VALUE val;
6807 int cr;
6808
6809 rb_check_arity(argc, 2, 5);
6810 if (!(argc == 2 || argc == 3 || argc == 5)) {
6811 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6812 }
6813 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6814 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6815 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6816 rb_builtin_class_name(argv[0]));
6817 }
6818 val = argv[1];
6819 StringValue(val);
6820 if (argc == 2) {
6821 /* bytesplice(range, str) */
6822 vbeg = 0;
6823 vlen = RSTRING_LEN(val);
6824 }
6825 else {
6826 /* bytesplice(range, str, str_range) */
6827 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6828 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6829 rb_builtin_class_name(argv[2]));
6830 }
6831 }
6832 }
6833 else {
6834 beg = NUM2LONG(argv[0]);
6835 len = NUM2LONG(argv[1]);
6836 val = argv[2];
6837 StringValue(val);
6838 if (argc == 3) {
6839 /* bytesplice(index, length, str) */
6840 vbeg = 0;
6841 vlen = RSTRING_LEN(val);
6842 }
6843 else {
6844 /* bytesplice(index, length, str, str_index, str_length) */
6845 vbeg = NUM2LONG(argv[3]);
6846 vlen = NUM2LONG(argv[4]);
6847 }
6848 }
6849 str_check_beg_len(str, &beg, &len);
6850 str_check_beg_len(val, &vbeg, &vlen);
6851 str_modify_keep_cr(str);
6852
6853 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6854 rb_enc_associate(str, rb_enc_check(str, val));
6855 }
6856
6857 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6859 if (cr != ENC_CODERANGE_BROKEN)
6860 ENC_CODERANGE_SET(str, cr);
6861 return str;
6862}
6863
6864/*
6865 * call-seq:
6866 * reverse -> new_string
6867 *
6868 * Returns a new string with the characters from +self+ in reverse order.
6869 *
6870 * 'drawer'.reverse # => "reward"
6871 * 'reviled'.reverse # => "deliver"
6872 * 'stressed'.reverse # => "desserts"
6873 * 'semordnilaps'.reverse # => "spalindromes"
6874 *
6875 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6876 */
6877
6878static VALUE
6879rb_str_reverse(VALUE str)
6880{
6881 rb_encoding *enc;
6882 VALUE rev;
6883 char *s, *e, *p;
6884 int cr;
6885
6886 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6887 enc = STR_ENC_GET(str);
6888 rev = rb_str_new(0, RSTRING_LEN(str));
6889 s = RSTRING_PTR(str); e = RSTRING_END(str);
6890 p = RSTRING_END(rev);
6891 cr = ENC_CODERANGE(str);
6892
6893 if (RSTRING_LEN(str) > 1) {
6894 if (single_byte_optimizable(str)) {
6895 while (s < e) {
6896 *--p = *s++;
6897 }
6898 }
6899 else if (cr == ENC_CODERANGE_VALID) {
6900 while (s < e) {
6901 int clen = rb_enc_fast_mbclen(s, e, enc);
6902
6903 p -= clen;
6904 memcpy(p, s, clen);
6905 s += clen;
6906 }
6907 }
6908 else {
6909 cr = rb_enc_asciicompat(enc) ?
6911 while (s < e) {
6912 int clen = rb_enc_mbclen(s, e, enc);
6913
6914 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6915 p -= clen;
6916 memcpy(p, s, clen);
6917 s += clen;
6918 }
6919 }
6920 }
6921 STR_SET_LEN(rev, RSTRING_LEN(str));
6922 str_enc_copy_direct(rev, str);
6923 ENC_CODERANGE_SET(rev, cr);
6924
6925 return rev;
6926}
6927
6928
6929/*
6930 * call-seq:
6931 * reverse! -> self
6932 *
6933 * Returns +self+ with its characters reversed:
6934 *
6935 * 'drawer'.reverse! # => "reward"
6936 * 'reviled'.reverse! # => "deliver"
6937 * 'stressed'.reverse! # => "desserts"
6938 * 'semordnilaps'.reverse! # => "spalindromes"
6939 *
6940 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6941 */
6942
6943static VALUE
6944rb_str_reverse_bang(VALUE str)
6945{
6946 if (RSTRING_LEN(str) > 1) {
6947 if (single_byte_optimizable(str)) {
6948 char *s, *e, c;
6949
6950 str_modify_keep_cr(str);
6951 s = RSTRING_PTR(str);
6952 e = RSTRING_END(str) - 1;
6953 while (s < e) {
6954 c = *s;
6955 *s++ = *e;
6956 *e-- = c;
6957 }
6958 }
6959 else {
6960 str_shared_replace(str, rb_str_reverse(str));
6961 }
6962 }
6963 else {
6964 str_modify_keep_cr(str);
6965 }
6966 return str;
6967}
6968
6969
6970/*
6971 * call-seq:
6972 * include?(other_string) -> true or false
6973 *
6974 * Returns whether +self+ contains +other_string+:
6975 *
6976 * s = 'bar'
6977 * s.include?('ba') # => true
6978 * s.include?('ar') # => true
6979 * s.include?('bar') # => true
6980 * s.include?('a') # => true
6981 * s.include?('') # => true
6982 * s.include?('foo') # => false
6983 *
6984 * Related: see {Querying}[rdoc-ref:String@Querying].
6985 */
6986
6987VALUE
6988rb_str_include(VALUE str, VALUE arg)
6989{
6990 long i;
6991
6992 StringValue(arg);
6993 i = rb_str_index(str, arg, 0);
6994
6995 return RBOOL(i != -1);
6996}
6997
6998
6999/*
7000 * call-seq:
7001 * to_i(base = 10) -> integer
7002 *
7003 * Returns the result of interpreting leading characters in +self+
7004 * as an integer in the given +base+;
7005 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7006 *
7007 * '123456'.to_i # => 123456
7008 * '123def'.to_i(16) # => 1195503
7009 *
 * With +base+ zero given, +self+ may contain leading characters
7011 * to specify the actual base:
7012 *
7013 * '123def'.to_i(0) # => 123
7014 * '0123def'.to_i(0) # => 83
7015 * '0b123def'.to_i(0) # => 1
7016 * '0o123def'.to_i(0) # => 83
7017 * '0d123def'.to_i(0) # => 123
7018 * '0x123def'.to_i(0) # => 1195503
7019 *
7020 * Characters past a leading valid number (in the given +base+) are ignored:
7021 *
7022 * '12.345'.to_i # => 12
7023 * '12345'.to_i(2) # => 1
7024 *
7025 * Returns zero if there is no leading valid number:
7026 *
7027 * 'abcdef'.to_i # => 0
7028 * '2'.to_i(2) # => 0
7029 *
7030 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7031 */
7032
7033static VALUE
7034rb_str_to_i(int argc, VALUE *argv, VALUE str)
7035{
7036 int base = 10;
7037
7038 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7039 rb_raise(rb_eArgError, "invalid radix %d", base);
7040 }
7041 return rb_str_to_inum(str, base, FALSE);
7042}
7043
7044
7045/*
7046 * call-seq:
7047 * to_f -> float
7048 *
7049 * Returns the result of interpreting leading characters in +self+ as a Float:
7050 *
7051 * '3.14159'.to_f # => 3.14159
7052 * '1.234e-2'.to_f # => 0.01234
7053 *
7054 * Characters past a leading valid number are ignored:
7055 *
7056 * '3.14 (pi to two places)'.to_f # => 3.14
7057 *
7058 * Returns zero if there is no leading valid number:
7059 *
7060 * 'abcdef'.to_f # => 0.0
7061 *
 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7063 */
7064
static VALUE
rb_str_to_f(VALUE str)
{
    /* FALSE is passed as the second argument of rb_str_to_dbl */
    return DBL2NUM(rb_str_to_dbl(str, FALSE));
}
7070
7071
7072/*
7073 * call-seq:
7074 * to_s -> self or new_string
7075 *
 * Returns +self+ if +self+ is an instance of +String+ itself,
 * or a copy of +self+ as a plain +String+ if +self+ is an instance of a subclass of +String+.
7078 *
7079 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7080 */
7081
7082static VALUE
7083rb_str_to_s(VALUE str)
7084{
7085 if (rb_obj_class(str) != rb_cString) {
7086 return str_duplicate(rb_cString, str);
7087 }
7088 return str;
7089}
7090
#if 0
/*
 * Compiled out by the enclosing `#if 0`.
 * Encodes the code point +c+ in +enc+ into a local buffer and appends
 * the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7102
7103#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7104
7105int
7106rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7107{
7108 char buf[CHAR_ESC_LEN + 1];
7109 int l;
7110
7111#if SIZEOF_INT > 4
7112 c &= 0xffffffff;
7113#endif
7114 if (unicode_p) {
7115 if (c < 0x7F && ISPRINT(c)) {
7116 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7117 }
7118 else if (c < 0x10000) {
7119 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7120 }
7121 else {
7122 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7123 }
7124 }
7125 else {
7126 if (c < 0x100) {
7127 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7128 }
7129 else {
7130 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7131 }
7132 }
7133 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7134 rb_str_buf_cat(result, buf, l);
7135 return l;
7136}
7137
/*
 * Returns the backslash escape sequence for the control character +c+
 * as a static string, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };

    for (size_t i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
7155
/*
 * Returns a new US-ASCII string in which the characters of +str+ that
 * are not printable ASCII are replaced by escape sequences.
 *
 * The loop keeps +prev+ at the start of the pending run of characters
 * that can be copied verbatim; the run is flushed to +result+ whenever
 * an escape has to be emitted, and once more after the loop.
 */
VALUE
rb_str_escape(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* invalid sequence: emit the next mbminlen bytes (or what is
             * left of the string) as \xXX each */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        cc = ruby_escaped_char(c);
        if (cc) {
            /* character with a dedicated escape such as \n or \t */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* printable ASCII: stays in the pending verbatim run */
        }
        else {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
7207
7208/*
7209 * call-seq:
7210 * inspect -> string
7211 *
7212 * :include: doc/string/inspect.rdoc
7213 *
7214 */
7215
7216VALUE
7218{
7219 int encidx = ENCODING_GET(str);
7220 rb_encoding *enc = rb_enc_from_index(encidx);
7221 const char *p, *pend, *prev;
7222 char buf[CHAR_ESC_LEN + 1];
7223 VALUE result = rb_str_buf_new(0);
7224 rb_encoding *resenc = rb_default_internal_encoding();
7225 int unicode_p = rb_enc_unicode_p(enc);
7226 int asciicompat = rb_enc_asciicompat(enc);
7227
7228 if (resenc == NULL) resenc = rb_default_external_encoding();
7229 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7230 rb_enc_associate(result, resenc);
7231 str_buf_cat2(result, "\"");
7232
7233 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7234 prev = p;
7235 while (p < pend) {
7236 unsigned int c, cc;
7237 int n;
7238
7239 n = rb_enc_precise_mbclen(p, pend, enc);
7240 if (!MBCLEN_CHARFOUND_P(n)) {
7241 if (p > prev) str_buf_cat(result, prev, p - prev);
7242 n = rb_enc_mbminlen(enc);
7243 if (pend < p + n)
7244 n = (int)(pend - p);
7245 while (n--) {
7246 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7247 str_buf_cat(result, buf, strlen(buf));
7248 prev = ++p;
7249 }
7250 continue;
7251 }
7252 n = MBCLEN_CHARFOUND_LEN(n);
7253 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7254 p += n;
7255 if ((asciicompat || unicode_p) &&
7256 (c == '"'|| c == '\\' ||
7257 (c == '#' &&
7258 p < pend &&
7259 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7260 (cc = rb_enc_codepoint(p,pend,enc),
7261 (cc == '$' || cc == '@' || cc == '{'))))) {
7262 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7263 str_buf_cat2(result, "\\");
7264 if (asciicompat || enc == resenc) {
7265 prev = p - n;
7266 continue;
7267 }
7268 }
7269 switch (c) {
7270 case '\n': cc = 'n'; break;
7271 case '\r': cc = 'r'; break;
7272 case '\t': cc = 't'; break;
7273 case '\f': cc = 'f'; break;
7274 case '\013': cc = 'v'; break;
7275 case '\010': cc = 'b'; break;
7276 case '\007': cc = 'a'; break;
7277 case 033: cc = 'e'; break;
7278 default: cc = 0; break;
7279 }
7280 if (cc) {
7281 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7282 buf[0] = '\\';
7283 buf[1] = (char)cc;
7284 str_buf_cat(result, buf, 2);
7285 prev = p;
7286 continue;
7287 }
7288 /* The special casing of 0x85 (NEXT_LINE) here is because
7289 * Oniguruma historically treats it as printable, but it
7290 * doesn't match the print POSIX bracket class or character
7291 * property in regexps.
7292 *
7293 * See Ruby Bug #16842 for details:
7294 * https://bugs.ruby-lang.org/issues/16842
7295 */
7296 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7297 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7298 continue;
7299 }
7300 else {
7301 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7302 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7303 prev = p;
7304 continue;
7305 }
7306 }
7307 if (p > prev) str_buf_cat(result, prev, p - prev);
7308 str_buf_cat2(result, "\"");
7309
7310 return result;
7311}
7312
7313#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7314
7315/*
7316 * call-seq:
7317 * dump -> new_string
7318 *
7319 * :include: doc/string/dump.rdoc
7320 *
7321 */
7322
7323VALUE
7325{
7326 int encidx = rb_enc_get_index(str);
7327 rb_encoding *enc = rb_enc_from_index(encidx);
7328 long len;
7329 const char *p, *pend;
7330 char *q, *qend;
7331 VALUE result;
7332 int u8 = (encidx == rb_utf8_encindex());
7333 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7334
7335 len = 2; /* "" */
7336 if (!rb_enc_asciicompat(enc)) {
7337 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7338 len += strlen(enc->name);
7339 }
7340
7341 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7342 while (p < pend) {
7343 int clen;
7344 unsigned char c = *p++;
7345
7346 switch (c) {
7347 case '"': case '\\':
7348 case '\n': case '\r':
7349 case '\t': case '\f':
7350 case '\013': case '\010': case '\007': case '\033':
7351 clen = 2;
7352 break;
7353
7354 case '#':
7355 clen = IS_EVSTR(p, pend) ? 2 : 1;
7356 break;
7357
7358 default:
7359 if (ISPRINT(c)) {
7360 clen = 1;
7361 }
7362 else {
7363 if (u8 && c > 0x7F) { /* \u notation */
7364 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7365 if (MBCLEN_CHARFOUND_P(n)) {
7366 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7367 if (cc <= 0xFFFF)
7368 clen = 6; /* \uXXXX */
7369 else if (cc <= 0xFFFFF)
7370 clen = 9; /* \u{XXXXX} */
7371 else
7372 clen = 10; /* \u{XXXXXX} */
7373 p += MBCLEN_CHARFOUND_LEN(n)-1;
7374 break;
7375 }
7376 }
7377 clen = 4; /* \xNN */
7378 }
7379 break;
7380 }
7381
7382 if (clen > LONG_MAX - len) {
7383 rb_raise(rb_eRuntimeError, "string size too big");
7384 }
7385 len += clen;
7386 }
7387
7388 result = rb_str_new(0, len);
7389 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7390 q = RSTRING_PTR(result); qend = q + len + 1;
7391
7392 *q++ = '"';
7393 while (p < pend) {
7394 unsigned char c = *p++;
7395
7396 if (c == '"' || c == '\\') {
7397 *q++ = '\\';
7398 *q++ = c;
7399 }
7400 else if (c == '#') {
7401 if (IS_EVSTR(p, pend)) *q++ = '\\';
7402 *q++ = '#';
7403 }
7404 else if (c == '\n') {
7405 *q++ = '\\';
7406 *q++ = 'n';
7407 }
7408 else if (c == '\r') {
7409 *q++ = '\\';
7410 *q++ = 'r';
7411 }
7412 else if (c == '\t') {
7413 *q++ = '\\';
7414 *q++ = 't';
7415 }
7416 else if (c == '\f') {
7417 *q++ = '\\';
7418 *q++ = 'f';
7419 }
7420 else if (c == '\013') {
7421 *q++ = '\\';
7422 *q++ = 'v';
7423 }
7424 else if (c == '\010') {
7425 *q++ = '\\';
7426 *q++ = 'b';
7427 }
7428 else if (c == '\007') {
7429 *q++ = '\\';
7430 *q++ = 'a';
7431 }
7432 else if (c == '\033') {
7433 *q++ = '\\';
7434 *q++ = 'e';
7435 }
7436 else if (ISPRINT(c)) {
7437 *q++ = c;
7438 }
7439 else {
7440 *q++ = '\\';
7441 if (u8) {
7442 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7443 if (MBCLEN_CHARFOUND_P(n)) {
7444 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7445 p += n;
7446 if (cc <= 0xFFFF)
7447 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7448 else
7449 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7450 q += strlen(q);
7451 continue;
7452 }
7453 }
7454 snprintf(q, qend-q, "x%02X", c);
7455 q += 3;
7456 }
7457 }
7458 *q++ = '"';
7459 *q = '\0';
7460 if (!rb_enc_asciicompat(enc)) {
7461 snprintf(q, qend-q, nonascii_suffix, enc->name);
7462 encidx = rb_ascii8bit_encindex();
7463 }
7464 /* result from dump is ASCII */
7465 rb_enc_associate_index(result, encidx);
7467 return result;
7468}
7469
/*
 * Map the letter that follows a backslash in a dumped string to the control
 * character it denotes ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns the control character, or -1 when c is not one of the
 * single-letter escapes handled here, so that the function never falls off
 * its end without a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7493
7494static void
7495undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7496{
7497 const char *s = *ss;
7498 unsigned int c;
7499 int codelen;
7500 size_t hexlen;
7501 unsigned char buf[6];
7502 static rb_encoding *enc_utf8 = NULL;
7503
7504 switch (*s) {
7505 case '\\':
7506 case '"':
7507 case '#':
7508 rb_str_cat(undumped, s, 1); /* cat itself */
7509 s++;
7510 break;
7511 case 'n':
7512 case 'r':
7513 case 't':
7514 case 'f':
7515 case 'v':
7516 case 'b':
7517 case 'a':
7518 case 'e':
7519 *buf = unescape_ascii(*s);
7520 rb_str_cat(undumped, (char *)buf, 1);
7521 s++;
7522 break;
7523 case 'u':
7524 if (*binary) {
7525 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7526 }
7527 *utf8 = true;
7528 if (++s >= s_end) {
7529 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7530 }
7531 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7532 if (*penc != enc_utf8) {
7533 *penc = enc_utf8;
7534 rb_enc_associate(undumped, enc_utf8);
7535 }
7536 if (*s == '{') { /* handle \u{...} form */
7537 s++;
7538 for (;;) {
7539 if (s >= s_end) {
7540 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7541 }
7542 if (*s == '}') {
7543 s++;
7544 break;
7545 }
7546 if (ISSPACE(*s)) {
7547 s++;
7548 continue;
7549 }
7550 c = scan_hex(s, s_end-s, &hexlen);
7551 if (hexlen == 0 || hexlen > 6) {
7552 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7553 }
7554 if (c > 0x10ffff) {
7555 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7556 }
7557 if (0xd800 <= c && c <= 0xdfff) {
7558 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7559 }
7560 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7561 rb_str_cat(undumped, (char *)buf, codelen);
7562 s += hexlen;
7563 }
7564 }
7565 else { /* handle \uXXXX form */
7566 c = scan_hex(s, 4, &hexlen);
7567 if (hexlen != 4) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7569 }
7570 if (0xd800 <= c && c <= 0xdfff) {
7571 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7572 }
7573 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7574 rb_str_cat(undumped, (char *)buf, codelen);
7575 s += hexlen;
7576 }
7577 break;
7578 case 'x':
7579 if (*utf8) {
7580 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7581 }
7582 *binary = true;
7583 if (++s >= s_end) {
7584 rb_raise(rb_eRuntimeError, "invalid hex escape");
7585 }
7586 *buf = scan_hex(s, 2, &hexlen);
7587 if (hexlen != 2) {
7588 rb_raise(rb_eRuntimeError, "invalid hex escape");
7589 }
7590 rb_str_cat(undumped, (char *)buf, 1);
7591 s += hexlen;
7592 break;
7593 default:
7594 rb_str_cat(undumped, s-1, 2);
7595 s++;
7596 }
7597
7598 *ss = s;
7599}
7600
7601static VALUE rb_str_is_ascii_only_p(VALUE str);
7602
7603/*
7604 * call-seq:
7605 * undump -> new_string
7606 *
7607 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7608 *
7609 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7610 */
7611
7612static VALUE
7613str_undump(VALUE str)
7614{
7615 const char *s = RSTRING_PTR(str);
7616 const char *s_end = RSTRING_END(str);
7617 rb_encoding *enc = rb_enc_get(str);
7618 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7619 bool utf8 = false;
7620 bool binary = false;
7621 int w;
7622
7624 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7625 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7626 }
7627 if (!str_null_check(str, &w)) {
7628 rb_raise(rb_eRuntimeError, "string contains null byte");
7629 }
7630 if (RSTRING_LEN(str) < 2) goto invalid_format;
7631 if (*s != '"') goto invalid_format;
7632
7633 /* strip '"' at the start */
7634 s++;
7635
7636 for (;;) {
7637 if (s >= s_end) {
7638 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7639 }
7640
7641 if (*s == '"') {
7642 /* epilogue */
7643 s++;
7644 if (s == s_end) {
7645 /* ascii compatible dumped string */
7646 break;
7647 }
7648 else {
7649 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7650 static const char dup_suffix[] = ".dup";
7651 const char *encname;
7652 int encidx;
7653 ptrdiff_t size;
7654
7655 /* check separately for strings dumped by older versions */
7656 size = sizeof(dup_suffix) - 1;
7657 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7658
7659 size = sizeof(force_encoding_suffix) - 1;
7660 if (s_end - s <= size) goto invalid_format;
7661 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7662 s += size;
7663
7664 if (utf8) {
7665 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7666 }
7667
7668 encname = s;
7669 s = memchr(s, '"', s_end-s);
7670 size = s - encname;
7671 if (!s) goto invalid_format;
7672 if (s_end - s != 2) goto invalid_format;
7673 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7674
7675 encidx = rb_enc_find_index2(encname, (long)size);
7676 if (encidx < 0) {
7677 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7678 }
7679 rb_enc_associate_index(undumped, encidx);
7680 }
7681 break;
7682 }
7683
7684 if (*s == '\\') {
7685 s++;
7686 if (s >= s_end) {
7687 rb_raise(rb_eRuntimeError, "invalid escape");
7688 }
7689 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7690 }
7691 else {
7692 rb_str_cat(undumped, s++, 1);
7693 }
7694 }
7695
7696 RB_GC_GUARD(str);
7697
7698 return undumped;
7699invalid_format:
7700 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7701}
7702
7703static void
7704rb_str_check_dummy_enc(rb_encoding *enc)
7705{
7706 if (rb_enc_dummy_p(enc)) {
7707 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7708 rb_enc_name(enc));
7709 }
7710}
7711
7712static rb_encoding *
7713str_true_enc(VALUE str)
7714{
7715 rb_encoding *enc = STR_ENC_GET(str);
7716 rb_str_check_dummy_enc(enc);
7717 return enc;
7718}
7719
7720static OnigCaseFoldType
7721check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7722{
7723 if (argc==0)
7724 return flags;
7725 if (argc>2)
7726 rb_raise(rb_eArgError, "too many options");
7727 if (argv[0]==sym_turkic) {
7728 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7729 if (argc==2) {
7730 if (argv[1]==sym_lithuanian)
7731 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7732 else
7733 rb_raise(rb_eArgError, "invalid second option");
7734 }
7735 }
7736 else if (argv[0]==sym_lithuanian) {
7737 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7738 if (argc==2) {
7739 if (argv[1]==sym_turkic)
7740 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7741 else
7742 rb_raise(rb_eArgError, "invalid second option");
7743 }
7744 }
7745 else if (argc>1)
7746 rb_raise(rb_eArgError, "too many options");
7747 else if (argv[0]==sym_ascii)
7748 flags |= ONIGENC_CASE_ASCII_ONLY;
7749 else if (argv[0]==sym_fold) {
7750 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7751 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7752 else
7753 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7754 }
7755 else
7756 rb_raise(rb_eArgError, "invalid option");
7757 return flags;
7758}
7759
7760static inline bool
7761case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7762{
7763 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7764 return true;
7765 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7766}
7767
7768/* Slack added to the capacity of every case-mapping buffer (see the capa
 * computation in rb_str_casemap).  16 should be long enough to absorb any
 * kind of single character length increase.
 * NOTE(review): the value below is 20, not 16 -- presumably extra headroom;
 * confirm. */
7769#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When nonzero, the case-mapping helpers print tuning diagnostics to stderr. */
7770#ifndef CASEMAP_DEBUG
7771# define CASEMAP_DEBUG 0
7772#endif
7773
7774struct mapping_buffer;
7775typedef struct mapping_buffer {
7776 size_t capa;
7777 size_t used;
7778 struct mapping_buffer *next;
7779 OnigUChar space[FLEX_ARY_LEN];
7781
7782static void
7783mapping_buffer_free(void *p)
7784{
7785 mapping_buffer *previous_buffer;
7786 mapping_buffer *current_buffer = p;
7787 while (current_buffer) {
7788 previous_buffer = current_buffer;
7789 current_buffer = current_buffer->next;
7790 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7791 }
7792}
7793
7794static const rb_data_type_t mapping_buffer_type = {
7795 "mapping_buffer",
7796 {0, mapping_buffer_free,},
7797 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7798};
7799
7800static VALUE
7801rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7802{
7803 VALUE target;
7804
7805 const OnigUChar *source_current, *source_end;
7806 int target_length = 0;
7807 VALUE buffer_anchor;
7808 mapping_buffer *current_buffer = 0;
7809 mapping_buffer **pre_buffer;
7810 size_t buffer_count = 0;
7811 int buffer_length_or_invalid;
7812
7813 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7814
7815 source_current = (OnigUChar*)RSTRING_PTR(source);
7816 source_end = (OnigUChar*)RSTRING_END(source);
7817
7818 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7819 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7820 while (source_current < source_end) {
7821 /* increase multiplier using buffer count to converge quickly */
7822 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7823 if (CASEMAP_DEBUG) {
7824 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7825 }
7826 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7827 *pre_buffer = current_buffer;
7828 pre_buffer = &current_buffer->next;
7829 current_buffer->next = NULL;
7830 current_buffer->capa = capa;
7831 buffer_length_or_invalid = enc->case_map(flags,
7832 &source_current, source_end,
7833 current_buffer->space,
7834 current_buffer->space+current_buffer->capa,
7835 enc);
7836 if (buffer_length_or_invalid < 0) {
7837 current_buffer = DATA_PTR(buffer_anchor);
7838 DATA_PTR(buffer_anchor) = 0;
7839 mapping_buffer_free(current_buffer);
7840 rb_raise(rb_eArgError, "input string invalid");
7841 }
7842 target_length += current_buffer->used = buffer_length_or_invalid;
7843 }
7844 if (CASEMAP_DEBUG) {
7845 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7846 }
7847
7848 if (buffer_count==1) {
7849 target = rb_str_new((const char*)current_buffer->space, target_length);
7850 }
7851 else {
7852 char *target_current;
7853
7854 target = rb_str_new(0, target_length);
7855 target_current = RSTRING_PTR(target);
7856 current_buffer = DATA_PTR(buffer_anchor);
7857 while (current_buffer) {
7858 memcpy(target_current, current_buffer->space, current_buffer->used);
7859 target_current += current_buffer->used;
7860 current_buffer = current_buffer->next;
7861 }
7862 }
7863 current_buffer = DATA_PTR(buffer_anchor);
7864 DATA_PTR(buffer_anchor) = 0;
7865 mapping_buffer_free(current_buffer);
7866
7867 RB_GC_GUARD(buffer_anchor);
7868
7869 /* TODO: check about string terminator character */
7870 str_enc_copy_direct(target, source);
7871 /*ENC_CODERANGE_SET(mapped, cr);*/
7872
7873 return target;
7874}
7875
7876static VALUE
7877rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7878{
7879 const OnigUChar *source_current, *source_end;
7880 OnigUChar *target_current, *target_end;
7881 long old_length = RSTRING_LEN(source);
7882 int length_or_invalid;
7883
7884 if (old_length == 0) return Qnil;
7885
7886 source_current = (OnigUChar*)RSTRING_PTR(source);
7887 source_end = (OnigUChar*)RSTRING_END(source);
7888 if (source == target) {
7889 target_current = (OnigUChar*)source_current;
7890 target_end = (OnigUChar*)source_end;
7891 }
7892 else {
7893 target_current = (OnigUChar*)RSTRING_PTR(target);
7894 target_end = (OnigUChar*)RSTRING_END(target);
7895 }
7896
7897 length_or_invalid = onigenc_ascii_only_case_map(flags,
7898 &source_current, source_end,
7899 target_current, target_end, enc);
7900 if (length_or_invalid < 0)
7901 rb_raise(rb_eArgError, "input string invalid");
7902 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7903 fprintf(stderr, "problem with rb_str_ascii_casemap"
7904 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7905 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7906 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7907 }
7908
7909 str_enc_copy(target, source);
7910
7911 return target;
7912}
7913
7914static bool
7915upcase_single(VALUE str)
7916{
7917 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7918 bool modified = false;
7919
7920 while (s < send) {
7921 unsigned int c = *(unsigned char*)s;
7922
7923 if ('a' <= c && c <= 'z') {
7924 *s = 'A' + (c - 'a');
7925 modified = true;
7926 }
7927 s++;
7928 }
7929 return modified;
7930}
7931
7932/*
7933 * call-seq:
7934 * upcase!(mapping) -> self or nil
7935 *
7936 * Like String#upcase, except that:
7937 *
7938 * - Changes character casings in +self+ (not in a copy of +self+).
7939 * - Returns +self+ if any changes are made, +nil+ otherwise.
7940 *
7941 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7942 */
7943
7944static VALUE
7945rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7946{
7947 rb_encoding *enc;
7948 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7949
7950 flags = check_case_options(argc, argv, flags);
7951 str_modify_keep_cr(str);
7952 enc = str_true_enc(str);
7953 if (case_option_single_p(flags, enc, str)) {
7954 if (upcase_single(str))
7955 flags |= ONIGENC_CASE_MODIFIED;
7956 }
7957 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7958 rb_str_ascii_casemap(str, str, &flags, enc);
7959 else
7960 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7961
7962 if (ONIGENC_CASE_MODIFIED&flags) return str;
7963 return Qnil;
7964}
7965
7966
7967/*
7968 * call-seq:
7969 * upcase(mapping) -> string
7970 *
7971 * :include: doc/string/upcase.rdoc
7972 */
7973
7974static VALUE
7975rb_str_upcase(int argc, VALUE *argv, VALUE str)
7976{
7977 rb_encoding *enc;
7978 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7979 VALUE ret;
7980
7981 flags = check_case_options(argc, argv, flags);
7982 enc = str_true_enc(str);
7983 if (case_option_single_p(flags, enc, str)) {
7984 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7985 str_enc_copy_direct(ret, str);
7986 upcase_single(ret);
7987 }
7988 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7989 ret = rb_str_new(0, RSTRING_LEN(str));
7990 rb_str_ascii_casemap(str, ret, &flags, enc);
7991 }
7992 else {
7993 ret = rb_str_casemap(str, &flags, enc);
7994 }
7995
7996 return ret;
7997}
7998
7999static bool
8000downcase_single(VALUE str)
8001{
8002 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8003 bool modified = false;
8004
8005 while (s < send) {
8006 unsigned int c = *(unsigned char*)s;
8007
8008 if ('A' <= c && c <= 'Z') {
8009 *s = 'a' + (c - 'A');
8010 modified = true;
8011 }
8012 s++;
8013 }
8014
8015 return modified;
8016}
8017
8018/*
8019 * call-seq:
8020 * downcase!(mapping) -> self or nil
8021 *
8022 * Like String#downcase, except that:
8023 *
8024 * - Changes character casings in +self+ (not in a copy of +self+).
8025 * - Returns +self+ if any changes are made, +nil+ otherwise.
8026 *
8027 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8028 */
8029
8030static VALUE
8031rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8032{
8033 rb_encoding *enc;
8034 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8035
8036 flags = check_case_options(argc, argv, flags);
8037 str_modify_keep_cr(str);
8038 enc = str_true_enc(str);
8039 if (case_option_single_p(flags, enc, str)) {
8040 if (downcase_single(str))
8041 flags |= ONIGENC_CASE_MODIFIED;
8042 }
8043 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8044 rb_str_ascii_casemap(str, str, &flags, enc);
8045 else
8046 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8047
8048 if (ONIGENC_CASE_MODIFIED&flags) return str;
8049 return Qnil;
8050}
8051
8052
8053/*
8054 * call-seq:
8055 * downcase(mapping) -> string
8056 *
8057 * :include: doc/string/downcase.rdoc
8058 *
8059 */
8060
8061static VALUE
8062rb_str_downcase(int argc, VALUE *argv, VALUE str)
8063{
8064 rb_encoding *enc;
8065 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8066 VALUE ret;
8067
8068 flags = check_case_options(argc, argv, flags);
8069 enc = str_true_enc(str);
8070 if (case_option_single_p(flags, enc, str)) {
8071 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8072 str_enc_copy_direct(ret, str);
8073 downcase_single(ret);
8074 }
8075 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8076 ret = rb_str_new(0, RSTRING_LEN(str));
8077 rb_str_ascii_casemap(str, ret, &flags, enc);
8078 }
8079 else {
8080 ret = rb_str_casemap(str, &flags, enc);
8081 }
8082
8083 return ret;
8084}
8085
8086
8087/*
8088 * call-seq:
8089 * capitalize!(mapping = :ascii) -> self or nil
8090 *
8091 * Like String#capitalize, except that:
8092 *
8093 * - Changes character casings in +self+ (not in a copy of +self+).
8094 * - Returns +self+ if any changes are made, +nil+ otherwise.
8095 *
8096 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8097 */
8098
8099static VALUE
8100rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8101{
8102 rb_encoding *enc;
8103 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8104
8105 flags = check_case_options(argc, argv, flags);
8106 str_modify_keep_cr(str);
8107 enc = str_true_enc(str);
8108 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8109 if (flags&ONIGENC_CASE_ASCII_ONLY)
8110 rb_str_ascii_casemap(str, str, &flags, enc);
8111 else
8112 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8113
8114 if (ONIGENC_CASE_MODIFIED&flags) return str;
8115 return Qnil;
8116}
8117
8118
8119/*
8120 * call-seq:
8121 * capitalize(mapping = :ascii) -> string
8122 *
8123 * Returns a string containing the characters in +self+,
8124 * each with possibly changed case:
8125 *
8126 * - The first character is upcased.
8127 * - All other characters are downcased.
8128 *
8129 * Examples:
8130 *
8131 * 'hello world'.capitalize # => "Hello world"
8132 * 'HELLO WORLD'.capitalize # => "Hello world"
8133 *
8134 * Some characters do not have upcase and downcase, and so are not changed;
8135 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8136 *
8137 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8138 *
8139 * The casing is affected by the given +mapping+,
8140 * which may be +:ascii+, +:fold+, or +:turkic+;
8141 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8142 *
8143 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8144 */
8145
8146static VALUE
8147rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8148{
8149 rb_encoding *enc;
8150 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8151 VALUE ret;
8152
8153 flags = check_case_options(argc, argv, flags);
8154 enc = str_true_enc(str);
8155 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8156 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8157 ret = rb_str_new(0, RSTRING_LEN(str));
8158 rb_str_ascii_casemap(str, ret, &flags, enc);
8159 }
8160 else {
8161 ret = rb_str_casemap(str, &flags, enc);
8162 }
8163 return ret;
8164}
8165
8166
8167/*
8168 * call-seq:
8169 * swapcase!(mapping) -> self or nil
8170 *
8171 * Like String#swapcase, except that:
8172 *
8173 * - Changes are made to +self+, not to copy of +self+.
8174 * - Returns +self+ if any changes are made, +nil+ otherwise.
8175 *
8176 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8177 */
8178
8179static VALUE
8180rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8181{
8182 rb_encoding *enc;
8183 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8184
8185 flags = check_case_options(argc, argv, flags);
8186 str_modify_keep_cr(str);
8187 enc = str_true_enc(str);
8188 if (flags&ONIGENC_CASE_ASCII_ONLY)
8189 rb_str_ascii_casemap(str, str, &flags, enc);
8190 else
8191 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8192
8193 if (ONIGENC_CASE_MODIFIED&flags) return str;
8194 return Qnil;
8195}
8196
8197
8198/*
8199 * call-seq:
8200 * swapcase(mapping) -> new_string
8201 *
8202 * :include: doc/string/swapcase.rdoc
8203 *
8204 */
8205
8206static VALUE
8207rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8208{
8209 rb_encoding *enc;
8210 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8211 VALUE ret;
8212
8213 flags = check_case_options(argc, argv, flags);
8214 enc = str_true_enc(str);
8215 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8216 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8217 ret = rb_str_new(0, RSTRING_LEN(str));
8218 rb_str_ascii_casemap(str, ret, &flags, enc);
8219 }
8220 else {
8221 ret = rb_str_casemap(str, &flags, enc);
8222 }
8223 return ret;
8224}
8225
/* Shorthand for a pointer to unsigned bytes. */
8226typedef unsigned char *USTR;
8227
/* Cursor over a tr-style character specification; advanced by trnext(). */
8228struct tr {
8229 int gen; /* nonzero while trnext() is stepping through a range */
8230 unsigned int now, max; /* current code point; upper bound of the active range */
8231 char *p, *pend; /* read position in the specification, and its end */
8232};
8233
8234static unsigned int
8235trnext(struct tr *t, rb_encoding *enc)
8236{
8237 int n;
8238
8239 for (;;) {
8240 nextpart:
8241 if (!t->gen) {
8242 if (t->p == t->pend) return -1;
8243 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8244 t->p += n;
8245 }
8246 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8247 t->p += n;
8248 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8249 t->p += n;
8250 if (t->p < t->pend) {
8251 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8252 t->p += n;
8253 if (t->now > c) {
8254 if (t->now < 0x80 && c < 0x80) {
8255 rb_raise(rb_eArgError,
8256 "invalid range \"%c-%c\" in string transliteration",
8257 t->now, c);
8258 }
8259 else {
8260 rb_raise(rb_eArgError, "invalid range in string transliteration");
8261 }
8262 continue; /* not reached */
8263 }
8264 else if (t->now < c) {
8265 t->gen = 1;
8266 t->max = c;
8267 }
8268 }
8269 }
8270 return t->now;
8271 }
8272 else {
8273 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8274 if (t->now == t->max) {
8275 t->gen = 0;
8276 goto nextpart;
8277 }
8278 }
8279 if (t->now < t->max) {
8280 return t->now;
8281 }
8282 else {
8283 t->gen = 0;
8284 return t->max;
8285 }
8286 }
8287 }
8288}
8289
8290static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8291
/*
 * Translate the characters of str in place according to the specifications
 * src and repl.
 *
 * A nonzero sflag selects the variant that also drops a translated
 * character equal to the translated character emitted just before it.
 * Returns Qnil when str is empty, delegates to rb_str_delete_bang when repl
 * is empty, and otherwise returns str if anything was modified or Qnil if
 * not.  Raises ArgumentError on an invalid byte sequence in str.
 *
 * Code points below 256 are looked up in the trans[] table; larger ones go
 * through a lazily created hash.
 */
8292static VALUE
8293tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8294{
/* errc: value trnext() returns at end of spec; also marks "leave unchanged" */
8295 const unsigned int errc = -1;
8296 unsigned int trans[256];
8297 rb_encoding *enc, *e1, *e2;
8298 struct tr trsrc, trrepl;
8299 int cflag = 0;
8300 unsigned int c, c0, last = 0;
8301 int modify = 0, i, l;
8302 unsigned char *s, *send;
8303 VALUE hash = 0;
8304 int singlebyte = single_byte_optimizable(str);
8305 int termlen;
8306 int cr;
8307
/* Downgrade the tracked code range from 7bit to valid once a non-ASCII
 * code point is seen. */
8308#define CHECK_IF_ASCII(c) \
8309 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8310 (cr = ENC_CODERANGE_VALID) : 0)
8311
8312 StringValue(src);
8313 StringValue(repl);
8314 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8315 if (RSTRING_LEN(repl) == 0) {
8316 return rb_str_delete_bang(1, &src, str);
8317 }
8318
8319 cr = ENC_CODERANGE(str);
8320 e1 = rb_enc_check(str, src);
8321 e2 = rb_enc_check(str, repl);
8322 if (e1 == e2) {
8323 enc = e1;
8324 }
8325 else {
8326 enc = rb_enc_check(src, repl);
8327 }
8328 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
/* a leading '^' followed by at least one more character complements src */
8329 if (RSTRING_LEN(src) > 1 &&
8330 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8331 trsrc.p + l < trsrc.pend) {
8332 cflag = 1;
8333 trsrc.p += l;
8334 }
8335 trrepl.p = RSTRING_PTR(repl);
8336 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8337 trsrc.gen = trrepl.gen = 0;
8338 trsrc.now = trrepl.now = 0;
8339 trsrc.max = trrepl.max = 0;
8340
8341 if (cflag) {
/* complement: entries for characters listed in src become errc, every
 * other entry becomes the last character of repl */
8342 for (i=0; i<256; i++) {
8343 trans[i] = 1;
8344 }
8345 while ((c = trnext(&trsrc, enc)) != errc) {
8346 if (c < 256) {
8347 trans[c] = errc;
8348 }
8349 else {
8350 if (!hash) hash = rb_hash_new();
8351 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8352 }
8353 }
8354 while ((c = trnext(&trrepl, enc)) != errc)
8355 /* retrieve last replacer */;
8356 last = trrepl.now;
8357 for (i=0; i<256; i++) {
8358 if (trans[i] != errc) {
8359 trans[i] = last;
8360 }
8361 }
8362 }
8363 else {
8364 unsigned int r;
8365
8366 for (i=0; i<256; i++) {
8367 trans[i] = errc;
8368 }
8369 while ((c = trnext(&trsrc, enc)) != errc) {
8370 r = trnext(&trrepl, enc);
/* repl ran out: keep reusing the last code point it produced */
8371 if (r == errc) r = trrepl.now;
8372 if (c < 256) {
8373 trans[c] = r;
8374 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8375 }
8376 else {
8377 if (!hash) hash = rb_hash_new();
8378 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8379 }
8380 }
8381 }
8382
/* start from 7bit; CHECK_IF_ASCII moves cr back to valid when needed */
8383 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8384 cr = ENC_CODERANGE_7BIT;
8385 str_modify_keep_cr(str);
8386 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8387 termlen = rb_enc_mbminlen(enc);
8388 if (sflag) {
/* squeezing variant: the result is built in a separately allocated buffer */
8389 int clen, tlen;
8390 long offset, max = RSTRING_LEN(str);
8391 unsigned int save = -1;
8392 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8393
8394 while (s < send) {
8395 int may_modify = 0;
8396
8397 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8398 if (!MBCLEN_CHARFOUND_P(r)) {
8399 xfree(buf);
8400 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8401 }
8402 clen = MBCLEN_CHARFOUND_LEN(r);
8403 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8404
8405 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8406
8407 s += clen;
8408 if (c < 256) {
8409 c = trans[c];
8410 }
8411 else if (hash) {
8412 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8413 if (NIL_P(tmp)) {
8414 if (cflag) c = last;
8415 else c = errc;
8416 }
8417 else if (cflag) c = errc;
8418 else c = NUM2INT(tmp);
8419 }
8420 else {
/* NOTE(review): the non-squeezing loop below uses `cflag ? last : errc`
 * at this point; confirm this difference is intended. */
8421 c = errc;
8422 }
8423 if (c != (unsigned int)-1) {
/* same translated character as the previous one: drop it */
8424 if (save == c) {
8425 CHECK_IF_ASCII(c);
8426 continue;
8427 }
8428 save = c;
8429 tlen = rb_enc_codelen(c, enc);
8430 modify = 1;
8431 }
8432 else {
8433 save = -1;
8434 c = c0;
8435 if (enc != e1) may_modify = 1;
8436 }
/* grow the output buffer when the next character would not fit */
8437 if ((offset = t - buf) + tlen > max) {
8438 size_t MAYBE_UNUSED(old) = max + termlen;
8439 max = offset + tlen + (send - s);
8440 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8441 t = buf + offset;
8442 }
8443 rb_enc_mbcput(c, t, enc);
/* NOTE(review): s was already advanced past this character above, unlike
 * in the non-squeezing loop where the comparison happens before s moves;
 * confirm which bytes are meant to be compared here. */
8444 if (may_modify && memcmp(s, t, tlen) != 0) {
8445 modify = 1;
8446 }
8447 CHECK_IF_ASCII(c);
8448 t += tlen;
8449 }
/* install buf as str's heap storage, releasing the old heap storage */
8450 if (!STR_EMBED_P(str)) {
8451 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8452 }
8453 TERM_FILL((char *)t, termlen);
8454 RSTRING(str)->as.heap.ptr = (char *)buf;
8455 STR_SET_LEN(str, t - buf);
8456 STR_SET_NOEMBED(str);
8457 RSTRING(str)->as.heap.aux.capa = max;
8458 }
8459 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
/* single-byte fast path: translate each byte in place via trans[] */
8460 while (s < send) {
8461 c = (unsigned char)*s;
8462 if (trans[c] != errc) {
8463 if (!cflag) {
8464 c = trans[c];
8465 *s = c;
8466 modify = 1;
8467 }
8468 else {
8469 *s = last;
8470 modify = 1;
8471 }
8472 }
8473 CHECK_IF_ASCII(c);
8474 s++;
8475 }
8476 }
8477 else {
/* general path: characters may change length, so the result is built in
 * a new buffer whose initial capacity is 1.2 times the input length */
8478 int clen, tlen;
8479 long offset, max = (long)((send - s) * 1.2);
8480 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8481
8482 while (s < send) {
8483 int may_modify = 0;
8484
8485 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8486 if (!MBCLEN_CHARFOUND_P(r)) {
8487 xfree(buf);
8488 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8489 }
8490 clen = MBCLEN_CHARFOUND_LEN(r);
8491 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8492
8493 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8494
8495 if (c < 256) {
8496 c = trans[c];
8497 }
8498 else if (hash) {
8499 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8500 if (NIL_P(tmp)) {
8501 if (cflag) c = last;
8502 else c = errc;
8503 }
8504 else if (cflag) c = errc;
8505 else c = NUM2INT(tmp);
8506 }
8507 else {
8508 c = cflag ? last : errc;
8509 }
8510 if (c != errc) {
8511 tlen = rb_enc_codelen(c, enc);
8512 modify = 1;
8513 }
8514 else {
/* no translation: emit the original character */
8515 c = c0;
8516 if (enc != e1) may_modify = 1;
8517 }
8518 if ((offset = t - buf) + tlen > max) {
8519 size_t MAYBE_UNUSED(old) = max + termlen;
8520 max = offset + tlen + (long)((send - s) * 1.2);
8521 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8522 t = buf + offset;
8523 }
8524 if (s != t) {
8525 rb_enc_mbcput(c, t, enc);
8526 if (may_modify && memcmp(s, t, tlen) != 0) {
8527 modify = 1;
8528 }
8529 }
8530 CHECK_IF_ASCII(c);
8531 s += clen;
8532 t += tlen;
8533 }
/* install buf as str's heap storage, releasing the old heap storage */
8534 if (!STR_EMBED_P(str)) {
8535 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8536 }
8537 TERM_FILL((char *)t, termlen);
8538 RSTRING(str)->as.heap.ptr = (char *)buf;
8539 STR_SET_LEN(str, t - buf);
8540 STR_SET_NOEMBED(str);
8541 RSTRING(str)->as.heap.aux.capa = max;
8542 }
8543
8544 if (modify) {
8545 if (cr != ENC_CODERANGE_BROKEN)
8546 ENC_CODERANGE_SET(str, cr);
8547 rb_enc_associate(str, enc);
8548 return str;
8549 }
8550 return Qnil;
8551}
8552
8553
8554/*
8555 * call-seq:
8556 * tr!(selector, replacements) -> self or nil
8557 *
8558 * Like String#tr, except:
8559 *
8560 * - Performs substitutions in +self+ (not in a copy of +self+).
8561 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8562 *
8563 * Related: {Modifying}[rdoc-ref:String@Modifying].
8564 */
8565
8566static VALUE
8567rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8568{
8569 return tr_trans(str, src, repl, 0);
8570}
8571
8572
8573/*
8574 * call-seq:
8575 * tr(selector, replacements) -> new_string
8576 *
8577 * Returns a copy of +self+ with each character specified by string +selector+
8578 * translated to the corresponding character in string +replacements+.
8579 * The correspondence is _positional_:
8580 *
8581 * - Each occurrence of the first character specified by +selector+
8582 * is translated to the first character in +replacements+.
8583 * - Each occurrence of the second character specified by +selector+
8584 * is translated to the second character in +replacements+.
8585 * - And so on.
8586 *
8587 * Example:
8588 *
8589 * 'hello'.tr('el', 'ip') #=> "hippo"
8590 *
8591 * If +replacements+ is shorter than +selector+,
8592 * it is implicitly padded with its own last character:
8593 *
8594 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8595 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8596 *
8597 * Arguments +selector+ and +replacements+ must be valid character selectors
8598 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8599 * and may use any of its valid forms, including negation, ranges, and escapes:
8600 *
8601 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8602 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8603 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8604 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8605 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8606 *
8607 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8608 */
8609
8610static VALUE
8611rb_str_tr(VALUE str, VALUE src, VALUE repl)
8612{
8613 str = str_duplicate(rb_cString, str);
8614 tr_trans(str, src, repl, 0);
8615 return str;
8616}
8617
8618#define TR_TABLE_MAX (UCHAR_MAX+1)
8619#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8620static void
8621tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8622 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8623{
8624 const unsigned int errc = -1;
8625 char buf[TR_TABLE_MAX];
8626 struct tr tr;
8627 unsigned int c;
8628 VALUE table = 0, ptable = 0;
8629 int i, l, cflag = 0;
8630
8631 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8632 tr.gen = tr.now = tr.max = 0;
8633
8634 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8635 cflag = 1;
8636 tr.p += l;
8637 }
8638 if (first) {
8639 for (i=0; i<TR_TABLE_MAX; i++) {
8640 stable[i] = 1;
8641 }
8642 stable[TR_TABLE_MAX] = cflag;
8643 }
8644 else if (stable[TR_TABLE_MAX] && !cflag) {
8645 stable[TR_TABLE_MAX] = 0;
8646 }
8647 for (i=0; i<TR_TABLE_MAX; i++) {
8648 buf[i] = cflag;
8649 }
8650
8651 while ((c = trnext(&tr, enc)) != errc) {
8652 if (c < TR_TABLE_MAX) {
8653 buf[(unsigned char)c] = !cflag;
8654 }
8655 else {
8656 VALUE key = UINT2NUM(c);
8657
8658 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8659 if (cflag) {
8660 ptable = *ctablep;
8661 table = ptable ? ptable : rb_hash_new();
8662 *ctablep = table;
8663 }
8664 else {
8665 table = rb_hash_new();
8666 ptable = *tablep;
8667 *tablep = table;
8668 }
8669 }
8670 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8671 rb_hash_aset(table, key, Qtrue);
8672 }
8673 }
8674 }
8675 for (i=0; i<TR_TABLE_MAX; i++) {
8676 stable[i] = stable[i] && buf[i];
8677 }
8678 if (!table && !cflag) {
8679 *tablep = 0;
8680 }
8681}
8682
8683
8684static int
8685tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8686{
8687 if (c < TR_TABLE_MAX) {
8688 return table[c] != 0;
8689 }
8690 else {
8691 VALUE v = UINT2NUM(c);
8692
8693 if (del) {
8694 if (!NIL_P(rb_hash_lookup(del, v)) &&
8695 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8696 return TRUE;
8697 }
8698 }
8699 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8700 return FALSE;
8701 }
8702 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8703 }
8704}
8705
8706/*
8707 * call-seq:
8708 * delete!(*selectors) -> self or nil
8709 *
8710 * Like String#delete, but modifies +self+ in place;
8711 * returns +self+ if any characters were deleted, +nil+ otherwise.
8712 *
8713 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8714 */
8715
8716static VALUE
8717rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8718{
8719 char squeez[TR_TABLE_SIZE];
8720 rb_encoding *enc = 0;
8721 char *s, *send, *t;
8722 VALUE del = 0, nodel = 0;
8723 int modify = 0;
8724 int i, ascompat, cr;
8725
8726 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8728 for (i=0; i<argc; i++) {
8729 VALUE s = argv[i];
8730
8731 StringValue(s);
8732 enc = rb_enc_check(str, s);
8733 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8734 }
8735
8736 str_modify_keep_cr(str);
8737 ascompat = rb_enc_asciicompat(enc);
8738 s = t = RSTRING_PTR(str);
8739 send = RSTRING_END(str);
8740 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8741 while (s < send) {
8742 unsigned int c;
8743 int clen;
8744
8745 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8746 if (squeez[c]) {
8747 modify = 1;
8748 }
8749 else {
8750 if (t != s) *t = c;
8751 t++;
8752 }
8753 s++;
8754 }
8755 else {
8756 c = rb_enc_codepoint_len(s, send, &clen, enc);
8757
8758 if (tr_find(c, squeez, del, nodel)) {
8759 modify = 1;
8760 }
8761 else {
8762 if (t != s) rb_enc_mbcput(c, t, enc);
8763 t += clen;
8765 }
8766 s += clen;
8767 }
8768 }
8769 TERM_FILL(t, TERM_LEN(str));
8770 STR_SET_LEN(str, t - RSTRING_PTR(str));
8771 ENC_CODERANGE_SET(str, cr);
8772
8773 if (modify) return str;
8774 return Qnil;
8775}
8776
8777
8778/*
8779 * call-seq:
8780 * delete(*selectors) -> new_string
8781 *
8782 * :include: doc/string/delete.rdoc
8783 *
8784 */
8785
8786static VALUE
8787rb_str_delete(int argc, VALUE *argv, VALUE str)
8788{
8789 str = str_duplicate(rb_cString, str);
8790 rb_str_delete_bang(argc, argv, str);
8791 return str;
8792}
8793
8794
8795/*
8796 * call-seq:
8797 * squeeze!(*selectors) -> self or nil
8798 *
8799 * Like String#squeeze, except that:
8800 *
8801 * - Characters are squeezed in +self+ (not in a copy of +self+).
8802 * - Returns +self+ if any changes are made, +nil+ otherwise.
8803 *
8804 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8805 */
8806
8807static VALUE
8808rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8809{
8810 char squeez[TR_TABLE_SIZE];
8811 rb_encoding *enc = 0;
8812 VALUE del = 0, nodel = 0;
8813 unsigned char *s, *send, *t;
8814 int i, modify = 0;
8815 int ascompat, singlebyte = single_byte_optimizable(str);
8816 unsigned int save;
8817
8818 if (argc == 0) {
8819 enc = STR_ENC_GET(str);
8820 }
8821 else {
8822 for (i=0; i<argc; i++) {
8823 VALUE s = argv[i];
8824
8825 StringValue(s);
8826 enc = rb_enc_check(str, s);
8827 if (singlebyte && !single_byte_optimizable(s))
8828 singlebyte = 0;
8829 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8830 }
8831 }
8832
8833 str_modify_keep_cr(str);
8834 s = t = (unsigned char *)RSTRING_PTR(str);
8835 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8836 send = (unsigned char *)RSTRING_END(str);
8837 save = -1;
8838 ascompat = rb_enc_asciicompat(enc);
8839
8840 if (singlebyte) {
8841 while (s < send) {
8842 unsigned int c = *s++;
8843 if (c != save || (argc > 0 && !squeez[c])) {
8844 *t++ = save = c;
8845 }
8846 }
8847 }
8848 else {
8849 while (s < send) {
8850 unsigned int c;
8851 int clen;
8852
8853 if (ascompat && (c = *s) < 0x80) {
8854 if (c != save || (argc > 0 && !squeez[c])) {
8855 *t++ = save = c;
8856 }
8857 s++;
8858 }
8859 else {
8860 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8861
8862 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8863 if (t != s) rb_enc_mbcput(c, t, enc);
8864 save = c;
8865 t += clen;
8866 }
8867 s += clen;
8868 }
8869 }
8870 }
8871
8872 TERM_FILL((char *)t, TERM_LEN(str));
8873 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8874 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8875 modify = 1;
8876 }
8877
8878 if (modify) return str;
8879 return Qnil;
8880}
8881
8882
8883/*
8884 * call-seq:
8885 * squeeze(*selectors) -> new_string
8886 *
8887 * :include: doc/string/squeeze.rdoc
8888 *
8889 */
8890
8891static VALUE
8892rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8893{
8894 str = str_duplicate(rb_cString, str);
8895 rb_str_squeeze_bang(argc, argv, str);
8896 return str;
8897}
8898
8899
8900/*
8901 * call-seq:
8902 * tr_s!(selector, replacements) -> self or nil
8903 *
8904 * Like String#tr_s, except:
8905 *
8906 * - Modifies +self+ in place (not a copy of +self+).
8907 * - Returns +self+ if any changes were made, +nil+ otherwise.
8908 *
8909 * Related: {Modifying}[rdoc-ref:String@Modifying].
8910 */
8911
8912static VALUE
8913rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8914{
8915 return tr_trans(str, src, repl, 1);
8916}
8917
8918
8919/*
8920 * call-seq:
8921 * tr_s(selector, replacements) -> new_string
8922 *
8923 * Like String#tr, except:
8924 *
8925 * - Also squeezes the modified portions of the translated string;
8926 * see String#squeeze.
8927 * - Returns the translated and squeezed string.
8928 *
8929 * Examples:
8930 *
8931 * 'hello'.tr_s('l', 'r') #=> "hero"
8932 * 'hello'.tr_s('el', '-') #=> "h-o"
8933 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8934 *
8935 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8936 *
8937 */
8938
8939static VALUE
8940rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8941{
8942 str = str_duplicate(rb_cString, str);
8943 tr_trans(str, src, repl, 1);
8944 return str;
8945}
8946
8947
8948/*
8949 * call-seq:
8950 * count(*selectors) -> integer
8951 *
8952 * :include: doc/string/count.rdoc
8953 */
8954
8955static VALUE
8956rb_str_count(int argc, VALUE *argv, VALUE str)
8957{
8958 char table[TR_TABLE_SIZE];
8959 rb_encoding *enc = 0;
8960 VALUE del = 0, nodel = 0, tstr;
8961 char *s, *send;
8962 int i;
8963 int ascompat;
8964 size_t n = 0;
8965
8967
8968 tstr = argv[0];
8969 StringValue(tstr);
8970 enc = rb_enc_check(str, tstr);
8971 if (argc == 1) {
8972 const char *ptstr;
8973 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8974 (ptstr = RSTRING_PTR(tstr),
8975 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8976 !is_broken_string(str)) {
8977 int clen;
8978 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8979
8980 s = RSTRING_PTR(str);
8981 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8982 send = RSTRING_END(str);
8983 while (s < send) {
8984 if (*(unsigned char*)s++ == c) n++;
8985 }
8986 return SIZET2NUM(n);
8987 }
8988 }
8989
8990 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8991 for (i=1; i<argc; i++) {
8992 tstr = argv[i];
8993 StringValue(tstr);
8994 enc = rb_enc_check(str, tstr);
8995 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8996 }
8997
8998 s = RSTRING_PTR(str);
8999 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9000 send = RSTRING_END(str);
9001 ascompat = rb_enc_asciicompat(enc);
9002 while (s < send) {
9003 unsigned int c;
9004
9005 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9006 if (table[c]) {
9007 n++;
9008 }
9009 s++;
9010 }
9011 else {
9012 int clen;
9013 c = rb_enc_codepoint_len(s, send, &clen, enc);
9014 if (tr_find(c, table, del, nodel)) {
9015 n++;
9016 }
9017 s += clen;
9018 }
9019 }
9020
9021 return SIZET2NUM(n);
9022}
9023
9024static VALUE
9025rb_fs_check(VALUE val)
9026{
9027 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9028 val = rb_check_string_type(val);
9029 if (NIL_P(val)) return 0;
9030 }
9031 return val;
9032}
9033
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * bytes (TAB, LF, VT, FF, CR, SPACE), 0 for every other byte.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9054
9055static long
9056split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9057{
9058 if (empty_count >= 0 && len == 0) {
9059 return empty_count + 1;
9060 }
9061 if (empty_count > 0) {
9062 /* make different substrings */
9063 if (result) {
9064 do {
9065 rb_ary_push(result, str_new_empty_String(str));
9066 } while (--empty_count > 0);
9067 }
9068 else {
9069 do {
9070 rb_yield(str_new_empty_String(str));
9071 } while (--empty_count > 0);
9072 }
9073 }
9074 str = rb_str_subseq(str, beg, len);
9075 if (result) {
9076 rb_ary_push(result, str);
9077 }
9078 else {
9079 rb_yield(str);
9080 }
9081 return empty_count;
9082}
9083
9084typedef enum {
9085 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9086} split_type_t;
9087
9088static split_type_t
9089literal_split_pattern(VALUE spat, split_type_t default_type)
9090{
9091 rb_encoding *enc = STR_ENC_GET(spat);
9092 const char *ptr;
9093 long len;
9094 RSTRING_GETMEM(spat, ptr, len);
9095 if (len == 0) {
9096 /* Special case - split into chars */
9097 return SPLIT_TYPE_CHARS;
9098 }
9099 else if (rb_enc_asciicompat(enc)) {
9100 if (len == 1 && ptr[0] == ' ') {
9101 return SPLIT_TYPE_AWK;
9102 }
9103 }
9104 else {
9105 int l;
9106 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9107 return SPLIT_TYPE_AWK;
9108 }
9109 }
9110 return default_type;
9111}
9112
9113/*
9114 * call-seq:
9115 * split(field_sep = $;, limit = 0) -> array_of_substrings
9116 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9117 *
9118 * :include: doc/string/split.rdoc
9119 *
9120 */
9121
9122static VALUE
9123rb_str_split_m(int argc, VALUE *argv, VALUE str)
9124{
9125 rb_encoding *enc;
9126 VALUE spat;
9127 VALUE limit;
9128 split_type_t split_type;
9129 long beg, end, i = 0, empty_count = -1;
9130 int lim = 0;
9131 VALUE result, tmp;
9132
9133 result = rb_block_given_p() ? Qfalse : Qnil;
9134 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9135 lim = NUM2INT(limit);
9136 if (lim <= 0) limit = Qnil;
9137 else if (lim == 1) {
9138 if (RSTRING_LEN(str) == 0)
9139 return result ? rb_ary_new2(0) : str;
9140 tmp = str_duplicate(rb_cString, str);
9141 if (!result) {
9142 rb_yield(tmp);
9143 return str;
9144 }
9145 return rb_ary_new3(1, tmp);
9146 }
9147 i = 1;
9148 }
9149 if (NIL_P(limit) && !lim) empty_count = 0;
9150
9151 enc = STR_ENC_GET(str);
9152 split_type = SPLIT_TYPE_REGEXP;
9153 if (!NIL_P(spat)) {
9154 spat = get_pat_quoted(spat, 0);
9155 }
9156 else if (NIL_P(spat = rb_fs)) {
9157 split_type = SPLIT_TYPE_AWK;
9158 }
9159 else if (!(spat = rb_fs_check(spat))) {
9160 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9161 }
9162 else {
9163 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9164 }
9165 if (split_type != SPLIT_TYPE_AWK) {
9166 switch (BUILTIN_TYPE(spat)) {
9167 case T_REGEXP:
9168 rb_reg_options(spat); /* check if uninitialized */
9169 tmp = RREGEXP_SRC(spat);
9170 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9171 if (split_type == SPLIT_TYPE_AWK) {
9172 spat = tmp;
9173 split_type = SPLIT_TYPE_STRING;
9174 }
9175 break;
9176
9177 case T_STRING:
9178 mustnot_broken(spat);
9179 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9180 break;
9181
9182 default:
9184 }
9185 }
9186
9187#define SPLIT_STR(beg, len) ( \
9188 empty_count = split_string(result, str, beg, len, empty_count), \
9189 str_mod_check(str, str_start, str_len))
9190
9191 beg = 0;
9192 char *ptr = RSTRING_PTR(str);
9193 char *const str_start = ptr;
9194 const long str_len = RSTRING_LEN(str);
9195 char *const eptr = str_start + str_len;
9196 if (split_type == SPLIT_TYPE_AWK) {
9197 char *bptr = ptr;
9198 int skip = 1;
9199 unsigned int c;
9200
9201 if (result) result = rb_ary_new();
9202 end = beg;
9203 if (is_ascii_string(str)) {
9204 while (ptr < eptr) {
9205 c = (unsigned char)*ptr++;
9206 if (skip) {
9207 if (ascii_isspace(c)) {
9208 beg = ptr - bptr;
9209 }
9210 else {
9211 end = ptr - bptr;
9212 skip = 0;
9213 if (!NIL_P(limit) && lim <= i) break;
9214 }
9215 }
9216 else if (ascii_isspace(c)) {
9217 SPLIT_STR(beg, end-beg);
9218 skip = 1;
9219 beg = ptr - bptr;
9220 if (!NIL_P(limit)) ++i;
9221 }
9222 else {
9223 end = ptr - bptr;
9224 }
9225 }
9226 }
9227 else {
9228 while (ptr < eptr) {
9229 int n;
9230
9231 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9232 ptr += n;
9233 if (skip) {
9234 if (rb_isspace(c)) {
9235 beg = ptr - bptr;
9236 }
9237 else {
9238 end = ptr - bptr;
9239 skip = 0;
9240 if (!NIL_P(limit) && lim <= i) break;
9241 }
9242 }
9243 else if (rb_isspace(c)) {
9244 SPLIT_STR(beg, end-beg);
9245 skip = 1;
9246 beg = ptr - bptr;
9247 if (!NIL_P(limit)) ++i;
9248 }
9249 else {
9250 end = ptr - bptr;
9251 }
9252 }
9253 }
9254 }
9255 else if (split_type == SPLIT_TYPE_STRING) {
9256 char *substr_start = ptr;
9257 char *sptr = RSTRING_PTR(spat);
9258 long slen = RSTRING_LEN(spat);
9259
9260 if (result) result = rb_ary_new();
9261 mustnot_broken(str);
9262 enc = rb_enc_check(str, spat);
9263 while (ptr < eptr &&
9264 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9265 /* Check we are at the start of a char */
9266 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9267 if (t != ptr + end) {
9268 ptr = t;
9269 continue;
9270 }
9271 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9272 str_mod_check(spat, sptr, slen);
9273 ptr += end + slen;
9274 substr_start = ptr;
9275 if (!NIL_P(limit) && lim <= ++i) break;
9276 }
9277 beg = ptr - str_start;
9278 }
9279 else if (split_type == SPLIT_TYPE_CHARS) {
9280 int n;
9281
9282 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9283 mustnot_broken(str);
9284 enc = rb_enc_get(str);
9285 while (ptr < eptr &&
9286 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9287 SPLIT_STR(ptr - str_start, n);
9288 ptr += n;
9289 if (!NIL_P(limit) && lim <= ++i) break;
9290 }
9291 beg = ptr - str_start;
9292 }
9293 else {
9294 if (result) result = rb_ary_new();
9295 long len = RSTRING_LEN(str);
9296 long start = beg;
9297 long idx;
9298 int last_null = 0;
9299 struct re_registers *regs;
9300 VALUE match = 0;
9301
9302 for (; rb_reg_search(spat, str, start, 0) >= 0;
9303 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9304 match = rb_backref_get();
9305 if (!result) rb_match_busy(match);
9306 regs = RMATCH_REGS(match);
9307 end = BEG(0);
9308 if (start == end && BEG(0) == END(0)) {
9309 if (!ptr) {
9310 SPLIT_STR(0, 0);
9311 break;
9312 }
9313 else if (last_null == 1) {
9314 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9315 beg = start;
9316 }
9317 else {
9318 if (start == len)
9319 start++;
9320 else
9321 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9322 last_null = 1;
9323 continue;
9324 }
9325 }
9326 else {
9327 SPLIT_STR(beg, end-beg);
9328 beg = start = END(0);
9329 }
9330 last_null = 0;
9331
9332 for (idx=1; idx < regs->num_regs; idx++) {
9333 if (BEG(idx) == -1) continue;
9334 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9335 }
9336 if (!NIL_P(limit) && lim <= ++i) break;
9337 }
9338 if (match) rb_match_unbusy(match);
9339 }
9340 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9341 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9342 }
9343
9344 return result ? result : str;
9345}
9346
9347VALUE
9348rb_str_split(VALUE str, const char *sep0)
9349{
9350 VALUE sep;
9351
9352 StringValue(str);
9353 sep = rb_str_new_cstr(sep0);
9354 return rb_str_split_m(1, &sep, str);
9355}
9356
9357#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9358
9359static inline int
9360enumerator_element(VALUE ary, VALUE e)
9361{
9362 if (ary) {
9363 rb_ary_push(ary, e);
9364 return 0;
9365 }
9366 else {
9367 rb_yield(e);
9368 return 1;
9369 }
9370}
9371
9372#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9373
9374static const char *
9375chomp_newline(const char *p, const char *e, rb_encoding *enc)
9376{
9377 const char *prev = rb_enc_prev_char(p, e, e, enc);
9378 if (rb_enc_is_newline(prev, e, enc)) {
9379 e = prev;
9380 prev = rb_enc_prev_char(p, e, e, enc);
9381 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9382 e = prev;
9383 }
9384 return e;
9385}
9386
9387static VALUE
9388get_rs(void)
9389{
9390 VALUE rs = rb_rs;
9391 if (!NIL_P(rs) &&
9392 (!RB_TYPE_P(rs, T_STRING) ||
9393 RSTRING_LEN(rs) != 1 ||
9394 RSTRING_PTR(rs)[0] != '\n')) {
9395 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9396 }
9397 return rs;
9398}
9399
9400#define rb_rs get_rs()
9401
/*
 * Shared worker that cuts +str+ into lines.
 *
 * argv holds an optional record separator plus an optional keyword hash
 * (only +chomp+ is looked up).  Each line is pushed onto +ary+ when it
 * is non-zero, or yielded to the block when +ary+ is 0.
 * Returns +ary+ in the former case and the original +str+ in the latter.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* No separator given: use rb_rs (a macro for get_rs() at this point). */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* Reduce chomp to a plain C truth value. */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the single element. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Work on a frozen string so pointers into it stay usable while
     * elements are being delivered. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;     /* end of the first newline after text */
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the length of a leading '\r', or 0 without one. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the recorded one ends the
                     * paragraph; rslen keeps this newline's length. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            /* rslen == 0 here means the inner loop ran off the end of the
             * string rather than stopping at a blank line. */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* ENUM_ELEM is nonzero only when the line was yielded. */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* rsnewline: the separator is one minimum-length newline char. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* The default separator is re-encoded into enc when enc is not
     * ASCII-compatible, before it is searched for below. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Skip hits that do not begin on a character boundary. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Text after the last separator forms the final line. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9541
9542/*
9543 * call-seq:
9544 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9545 * each_line(record_separator = $/, chomp: false) -> enumerator
9546 *
9547 * :include: doc/string/each_line.rdoc
9548 *
9549 */
9550
9551static VALUE
9552rb_str_each_line(int argc, VALUE *argv, VALUE str)
9553{
9554 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9555 return rb_str_enumerate_lines(argc, argv, str, 0);
9556}
9557
9558/*
9559 * call-seq:
9560 * lines(record_separator = $/, chomp: false) -> array_of_strings
9561 *
9562 * Returns substrings ("lines") of +self+
9563 * according to the given arguments:
9564 *
9565 * s = <<~EOT
9566 * This is the first line.
9567 * This is line two.
9568 *
9569 * This is line four.
9570 * This is line five.
9571 * EOT
9572 *
9573 * With the default argument values:
9574 *
9575 * $/ # => "\n"
9576 * s.lines
9577 * # =>
9578 * ["This is the first line.\n",
9579 * "This is line two.\n",
9580 * "\n",
9581 * "This is line four.\n",
9582 * "This is line five.\n"]
9583 *
9584 * With a different +record_separator+:
9585 *
9586 * record_separator = ' is '
9587 * s.lines(record_separator)
9588 * # =>
9589 * ["This is ",
9590 * "the first line.\nThis is ",
9591 * "line two.\n\nThis is ",
9592 * "line four.\nThis is ",
9593 * "line five.\n"]
9594 *
9595 * With keyword argument +chomp+ as +true+,
9596 * removes the trailing newline from each line:
9597 *
9598 * s.lines(chomp: true)
9599 * # =>
9600 * ["This is the first line.",
9601 * "This is line two.",
9602 * "",
9603 * "This is line four.",
9604 * "This is line five."]
9605 *
9606 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9607 */
9608
9609static VALUE
9610rb_str_lines(int argc, VALUE *argv, VALUE str)
9611{
9612 VALUE ary = WANTARRAY("lines", 0);
9613 return rb_str_enumerate_lines(argc, argv, str, ary);
9614}
9615
9616static VALUE
9617rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9618{
9619 return LONG2FIX(RSTRING_LEN(str));
9620}
9621
9622static VALUE
9623rb_str_enumerate_bytes(VALUE str, VALUE ary)
9624{
9625 long i;
9626
9627 for (i=0; i<RSTRING_LEN(str); i++) {
9628 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9629 }
9630 if (ary)
9631 return ary;
9632 else
9633 return str;
9634}
9635
9636/*
9637 * call-seq:
9638 * each_byte {|byte| ... } -> self
9639 * each_byte -> enumerator
9640 *
9641 * :include: doc/string/each_byte.rdoc
9642 *
9643 */
9644
9645static VALUE
9646rb_str_each_byte(VALUE str)
9647{
9648 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9649 return rb_str_enumerate_bytes(str, 0);
9650}
9651
9652/*
9653 * call-seq:
9654 * bytes -> array_of_bytes
9655 *
9656 * :include: doc/string/bytes.rdoc
9657 *
9658 */
9659
9660static VALUE
9661rb_str_bytes(VALUE str)
9662{
9663 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9664 return rb_str_enumerate_bytes(str, ary);
9665}
9666
9667static VALUE
9668rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9669{
9670 return rb_str_length(str);
9671}
9672
9673static VALUE
9674rb_str_enumerate_chars(VALUE str, VALUE ary)
9675{
9676 VALUE orig = str;
9677 long i, len, n;
9678 const char *ptr;
9679 rb_encoding *enc;
9680
9681 str = rb_str_new_frozen(str);
9682 ptr = RSTRING_PTR(str);
9683 len = RSTRING_LEN(str);
9684 enc = rb_enc_get(str);
9685
9687 for (i = 0; i < len; i += n) {
9688 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9689 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9690 }
9691 }
9692 else {
9693 for (i = 0; i < len; i += n) {
9694 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9695 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9696 }
9697 }
9698 RB_GC_GUARD(str);
9699 if (ary)
9700 return ary;
9701 else
9702 return orig;
9703}
9704
9705/*
9706 * call-seq:
9707 * each_char {|char| ... } -> self
9708 * each_char -> enumerator
9709 *
9710 * :include: doc/string/each_char.rdoc
9711 *
9712 */
9713
9714static VALUE
9715rb_str_each_char(VALUE str)
9716{
9717 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9718 return rb_str_enumerate_chars(str, 0);
9719}
9720
9721/*
9722 * call-seq:
9723 * chars -> array_of_characters
9724 *
9725 * :include: doc/string/chars.rdoc
9726 *
9727 */
9728
9729static VALUE
9730rb_str_chars(VALUE str)
9731{
9732 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9733 return rb_str_enumerate_chars(str, ary);
9734}
9735
9736static VALUE
9737rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9738{
9739 VALUE orig = str;
9740 int n;
9741 unsigned int c;
9742 const char *ptr, *end;
9743 rb_encoding *enc;
9744
9745 if (single_byte_optimizable(str))
9746 return rb_str_enumerate_bytes(str, ary);
9747
9748 str = rb_str_new_frozen(str);
9749 ptr = RSTRING_PTR(str);
9750 end = RSTRING_END(str);
9751 enc = STR_ENC_GET(str);
9752
9753 while (ptr < end) {
9754 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9755 ENUM_ELEM(ary, UINT2NUM(c));
9756 ptr += n;
9757 }
9758 RB_GC_GUARD(str);
9759 if (ary)
9760 return ary;
9761 else
9762 return orig;
9763}
9764
9765/*
9766 * call-seq:
9767 * each_codepoint {|codepoint| ... } -> self
9768 * each_codepoint -> enumerator
9769 *
9770 * :include: doc/string/each_codepoint.rdoc
9771 *
9772 */
9773
9774static VALUE
9775rb_str_each_codepoint(VALUE str)
9776{
9777 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9778 return rb_str_enumerate_codepoints(str, 0);
9779}
9780
9781/*
9782 * call-seq:
9783 * codepoints -> array_of_integers
9784 *
9785 * :include: doc/string/codepoints.rdoc
9786 *
9787 */
9788
9789static VALUE
9790rb_str_codepoints(VALUE str)
9791{
9792 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9793 return rb_str_enumerate_codepoints(str, ary);
9794}
9795
/*
 * Compiles and returns a new regex for the pattern "\X" in encoding
 * +enc+.  The pattern bytes are spelled in UTF-16/UTF-32 (big or little
 * endian) when +enc+ is one of those encodings, and as plain ASCII
 * otherwise.  Calls rb_fatal if onig_new reports an error.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    /* Default: the two ASCII bytes '\\' 'X' (terminating NUL excluded). */
    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
        /* CHARS_* expand one code unit into its bytes in the given byte
         * order; CASE_UTF builds the pattern for one wide encoding. */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* Turn the error code into text before aborting. */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9837
9838static regex_t *
9839get_cached_reg_grapheme_cluster(rb_encoding *enc)
9840{
9841 int encidx = rb_enc_to_index(enc);
9842 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9843
9844 if (encidx == rb_utf8_encindex()) {
9845 if (!reg_grapheme_cluster_utf8) {
9846 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9847 }
9848
9849 return reg_grapheme_cluster_utf8;
9850 }
9851
9852 return NULL;
9853}
9854
9855static VALUE
9856rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9857{
9858 size_t grapheme_cluster_count = 0;
9859 rb_encoding *enc = get_encoding(str);
9860 const char *ptr, *end;
9861
9862 if (!rb_enc_unicode_p(enc)) {
9863 return rb_str_length(str);
9864 }
9865
9866 bool cached_reg_grapheme_cluster = true;
9867 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9868 if (!reg_grapheme_cluster) {
9869 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9870 cached_reg_grapheme_cluster = false;
9871 }
9872
9873 ptr = RSTRING_PTR(str);
9874 end = RSTRING_END(str);
9875
9876 while (ptr < end) {
9877 OnigPosition len = onig_match(reg_grapheme_cluster,
9878 (const OnigUChar *)ptr, (const OnigUChar *)end,
9879 (const OnigUChar *)ptr, NULL, 0);
9880 if (len <= 0) break;
9881 grapheme_cluster_count++;
9882 ptr += len;
9883 }
9884
9885 if (!cached_reg_grapheme_cluster) {
9886 onig_free(reg_grapheme_cluster);
9887 }
9888
9889 return SIZET2NUM(grapheme_cluster_count);
9890}
9891
9892static VALUE
9893rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9894{
9895 VALUE orig = str;
9896 rb_encoding *enc = get_encoding(str);
9897 const char *ptr0, *ptr, *end;
9898
9899 if (!rb_enc_unicode_p(enc)) {
9900 return rb_str_enumerate_chars(str, ary);
9901 }
9902
9903 if (!ary) str = rb_str_new_frozen(str);
9904
9905 bool cached_reg_grapheme_cluster = true;
9906 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9907 if (!reg_grapheme_cluster) {
9908 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9909 cached_reg_grapheme_cluster = false;
9910 }
9911
9912 ptr0 = ptr = RSTRING_PTR(str);
9913 end = RSTRING_END(str);
9914
9915 while (ptr < end) {
9916 OnigPosition len = onig_match(reg_grapheme_cluster,
9917 (const OnigUChar *)ptr, (const OnigUChar *)end,
9918 (const OnigUChar *)ptr, NULL, 0);
9919 if (len <= 0) break;
9920 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9921 ptr += len;
9922 }
9923
9924 if (!cached_reg_grapheme_cluster) {
9925 onig_free(reg_grapheme_cluster);
9926 }
9927
9928 RB_GC_GUARD(str);
9929 if (ary)
9930 return ary;
9931 else
9932 return orig;
9933}
9934
9935/*
9936 * call-seq:
9937 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9938 * each_grapheme_cluster -> enumerator
9939 *
9940 * :include: doc/string/each_grapheme_cluster.rdoc
9941 *
9942 */
9943
9944static VALUE
9945rb_str_each_grapheme_cluster(VALUE str)
9946{
9947 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9948 return rb_str_enumerate_grapheme_clusters(str, 0);
9949}
9950
9951/*
9952 * call-seq:
9953 * grapheme_clusters -> array_of_grapheme_clusters
9954 *
9955 * :include: doc/string/grapheme_clusters.rdoc
9956 *
9957 */
9958
9959static VALUE
9960rb_str_grapheme_clusters(VALUE str)
9961{
9962 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9963 return rb_str_enumerate_grapheme_clusters(str, ary);
9964}
9965
9966static long
9967chopped_length(VALUE str)
9968{
9969 rb_encoding *enc = STR_ENC_GET(str);
9970 const char *p, *p2, *beg, *end;
9971
9972 beg = RSTRING_PTR(str);
9973 end = beg + RSTRING_LEN(str);
9974 if (beg >= end) return 0;
9975 p = rb_enc_prev_char(beg, end, end, enc);
9976 if (!p) return 0;
9977 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9978 p2 = rb_enc_prev_char(beg, p, end, enc);
9979 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9980 }
9981 return p - beg;
9982}
9983
9984/*
9985 * call-seq:
9986 * chop! -> self or nil
9987 *
9988 * Like String#chop, except that:
9989 *
9990 * - Removes trailing characters from +self+ (not from a copy of +self+).
9991 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9992 *
9993 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9994 */
9995
9996static VALUE
9997rb_str_chop_bang(VALUE str)
9998{
9999 str_modify_keep_cr(str);
10000 if (RSTRING_LEN(str) > 0) {
10001 long len;
10002 len = chopped_length(str);
10003 STR_SET_LEN(str, len);
10004 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10005 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10007 }
10008 return str;
10009 }
10010 return Qnil;
10011}
10012
10013
10014/*
10015 * call-seq:
10016 * chop -> new_string
10017 *
10018 * :include: doc/string/chop.rdoc
10019 *
10020 */
10021
10022static VALUE
10023rb_str_chop(VALUE str)
10024{
10025 return rb_str_subseq(str, 0, chopped_length(str));
10026}
10027
10028static long
10029smart_chomp(VALUE str, const char *e, const char *p)
10030{
10031 rb_encoding *enc = rb_enc_get(str);
10032 if (rb_enc_mbminlen(enc) > 1) {
10033 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10034 if (rb_enc_is_newline(pp, e, enc)) {
10035 e = pp;
10036 }
10037 pp = e - rb_enc_mbminlen(enc);
10038 if (pp >= p) {
10039 pp = rb_enc_left_char_head(p, pp, e, enc);
10040 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10041 e = pp;
10042 }
10043 }
10044 }
10045 else {
10046 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10047 case '\n':
10048 if (--e > p && *(e-1) == '\r') {
10049 --e;
10050 }
10051 break;
10052 case '\r':
10053 --e;
10054 break;
10055 }
10056 }
10057 return e - p;
10058}
10059
10060static long
10061chompped_length(VALUE str, VALUE rs)
10062{
10063 rb_encoding *enc;
10064 int newline;
10065 char *pp, *e, *rsptr;
10066 long rslen;
10067 char *const p = RSTRING_PTR(str);
10068 long len = RSTRING_LEN(str);
10069
10070 if (len == 0) return 0;
10071 e = p + len;
10072 if (rs == rb_default_rs) {
10073 return smart_chomp(str, e, p);
10074 }
10075
10076 enc = rb_enc_get(str);
10077 RSTRING_GETMEM(rs, rsptr, rslen);
10078 if (rslen == 0) {
10079 if (rb_enc_mbminlen(enc) > 1) {
10080 while (e > p) {
10081 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10082 if (!rb_enc_is_newline(pp, e, enc)) break;
10083 e = pp;
10084 pp -= rb_enc_mbminlen(enc);
10085 if (pp >= p) {
10086 pp = rb_enc_left_char_head(p, pp, e, enc);
10087 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10088 e = pp;
10089 }
10090 }
10091 }
10092 }
10093 else {
10094 while (e > p && *(e-1) == '\n') {
10095 --e;
10096 if (e > p && *(e-1) == '\r')
10097 --e;
10098 }
10099 }
10100 return e - p;
10101 }
10102 if (rslen > len) return len;
10103
10104 enc = rb_enc_get(rs);
10105 newline = rsptr[rslen-1];
10106 if (rslen == rb_enc_mbminlen(enc)) {
10107 if (rslen == 1) {
10108 if (newline == '\n')
10109 return smart_chomp(str, e, p);
10110 }
10111 else {
10112 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10113 return smart_chomp(str, e, p);
10114 }
10115 }
10116
10117 enc = rb_enc_check(str, rs);
10118 if (is_broken_string(rs)) {
10119 return len;
10120 }
10121 pp = e - rslen;
10122 if (p[len-1] == newline &&
10123 (rslen <= 1 ||
10124 memcmp(rsptr, pp, rslen) == 0)) {
10125 if (at_char_boundary(p, pp, e, enc))
10126 return len - rslen;
10127 RB_GC_GUARD(rs);
10128 }
10129 return len;
10130}
10131
10137static VALUE
10138chomp_rs(int argc, const VALUE *argv)
10139{
10140 rb_check_arity(argc, 0, 1);
10141 if (argc > 0) {
10142 VALUE rs = argv[0];
10143 if (!NIL_P(rs)) StringValue(rs);
10144 return rs;
10145 }
10146 else {
10147 return rb_rs;
10148 }
10149}
10150
10151VALUE
10152rb_str_chomp_string(VALUE str, VALUE rs)
10153{
10154 long olen = RSTRING_LEN(str);
10155 long len = chompped_length(str, rs);
10156 if (len >= olen) return Qnil;
10157 str_modify_keep_cr(str);
10158 STR_SET_LEN(str, len);
10159 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10160 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10162 }
10163 return str;
10164}
10165
10166/*
10167 * call-seq:
10168 * chomp!(line_sep = $/) -> self or nil
10169 *
10170 * Like String#chomp, except that:
10171 *
10172 * - Removes trailing characters from +self+ (not from a copy of +self+).
10173 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10174 *
10175 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10176 */
10177
10178static VALUE
10179rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10180{
10181 VALUE rs;
10182 str_modifiable(str);
10183 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10184 rs = chomp_rs(argc, argv);
10185 if (NIL_P(rs)) return Qnil;
10186 return rb_str_chomp_string(str, rs);
10187}
10188
10189
10190/*
10191 * call-seq:
10192 * chomp(line_sep = $/) -> new_string
10193 *
10194 * :include: doc/string/chomp.rdoc
10195 *
10196 */
10197
10198static VALUE
10199rb_str_chomp(int argc, VALUE *argv, VALUE str)
10200{
10201 VALUE rs = chomp_rs(argc, argv);
10202 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10203 return rb_str_subseq(str, 0, chompped_length(str, rs));
10204}
10205
10206static long
10207lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10208{
10209 const char *const start = s;
10210
10211 if (!s || s >= e) return 0;
10212
10213 /* remove spaces at head */
10214 if (single_byte_optimizable(str)) {
10215 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10216 }
10217 else {
10218 while (s < e) {
10219 int n;
10220 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10221
10222 if (cc && !rb_isspace(cc)) break;
10223 s += n;
10224 }
10225 }
10226 return s - start;
10227}
10228
10229/*
10230 * call-seq:
10231 * lstrip! -> self or nil
10232 *
10233 * Like String#lstrip, except that:
10234 *
10235 * - Performs stripping in +self+ (not in a copy of +self+).
10236 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10237 *
10238 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10239 */
10240
10241static VALUE
10242rb_str_lstrip_bang(VALUE str)
10243{
10244 rb_encoding *enc;
10245 char *start, *s;
10246 long olen, loffset;
10247
10248 str_modify_keep_cr(str);
10249 enc = STR_ENC_GET(str);
10250 RSTRING_GETMEM(str, start, olen);
10251 loffset = lstrip_offset(str, start, start+olen, enc);
10252 if (loffset > 0) {
10253 long len = olen-loffset;
10254 s = start + loffset;
10255 memmove(start, s, len);
10256 STR_SET_LEN(str, len);
10257 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10258 return str;
10259 }
10260 return Qnil;
10261}
10262
10263
10264/*
10265 * call-seq:
10266 * lstrip -> new_string
10267 *
10268 * Returns a copy of +self+ with leading whitespace removed;
10269 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10270 *
10271 * whitespace = "\x00\t\n\v\f\r "
10272 * s = whitespace + 'abc' + whitespace
10273 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10274 * s.lstrip
10275 * # => "abc\u0000\t\n\v\f\r "
10276 *
10277 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10278 */
10279
10280static VALUE
10281rb_str_lstrip(VALUE str)
10282{
10283 char *start;
10284 long len, loffset;
10285 RSTRING_GETMEM(str, start, len);
10286 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10287 if (loffset <= 0) return str_duplicate(rb_cString, str);
10288 return rb_str_subseq(str, loffset, len - loffset);
10289}
10290
10291static long
10292rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10293{
10294 const char *t;
10295
10296 rb_str_check_dummy_enc(enc);
10298 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10299 }
10300 if (!s || s >= e) return 0;
10301 t = e;
10302
10303 /* remove trailing spaces or '\0's */
10304 if (single_byte_optimizable(str)) {
10305 unsigned char c;
10306 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10307 }
10308 else {
10309 char *tp;
10310
10311 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10312 unsigned int c = rb_enc_codepoint(tp, e, enc);
10313 if (c && !rb_isspace(c)) break;
10314 t = tp;
10315 }
10316 }
10317 return e - t;
10318}
10319
10320/*
10321 * call-seq:
10322 * rstrip! -> self or nil
10323 *
10324 * Like String#rstrip, except that:
10325 *
10326 * - Performs stripping in +self+ (not in a copy of +self+).
10327 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10328 *
10329 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10330 */
10331
10332static VALUE
10333rb_str_rstrip_bang(VALUE str)
10334{
10335 rb_encoding *enc;
10336 char *start;
10337 long olen, roffset;
10338
10339 str_modify_keep_cr(str);
10340 enc = STR_ENC_GET(str);
10341 RSTRING_GETMEM(str, start, olen);
10342 roffset = rstrip_offset(str, start, start+olen, enc);
10343 if (roffset > 0) {
10344 long len = olen - roffset;
10345
10346 STR_SET_LEN(str, len);
10347 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10348 return str;
10349 }
10350 return Qnil;
10351}
10352
10353
10354/*
10355 * call-seq:
10356 * rstrip -> new_string
10357 *
10358 * Returns a copy of +self+ with trailing whitespace removed;
10359 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10360 *
10361 * whitespace = "\x00\t\n\v\f\r "
10362 * s = whitespace + 'abc' + whitespace
10363 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10364 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10365 *
10366 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10367 */
10368
10369static VALUE
10370rb_str_rstrip(VALUE str)
10371{
10372 rb_encoding *enc;
10373 char *start;
10374 long olen, roffset;
10375
10376 enc = STR_ENC_GET(str);
10377 RSTRING_GETMEM(str, start, olen);
10378 roffset = rstrip_offset(str, start, start+olen, enc);
10379
10380 if (roffset <= 0) return str_duplicate(rb_cString, str);
10381 return rb_str_subseq(str, 0, olen-roffset);
10382}
10383
10384
10385/*
10386 * call-seq:
10387 * strip! -> self or nil
10388 *
10389 * Like String#strip, except that:
10390 *
10391 * - Any modifications are made to +self+.
 *  - Returns +self+ if any modifications are made, +nil+ otherwise.
10393 *
10394 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10395 */
10396
10397static VALUE
10398rb_str_strip_bang(VALUE str)
10399{
10400 char *start;
10401 long olen, loffset, roffset;
10402 rb_encoding *enc;
10403
10404 str_modify_keep_cr(str);
10405 enc = STR_ENC_GET(str);
10406 RSTRING_GETMEM(str, start, olen);
10407 loffset = lstrip_offset(str, start, start+olen, enc);
10408 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10409
10410 if (loffset > 0 || roffset > 0) {
10411 long len = olen-roffset;
10412 if (loffset > 0) {
10413 len -= loffset;
10414 memmove(start, start + loffset, len);
10415 }
10416 STR_SET_LEN(str, len);
10417 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10418 return str;
10419 }
10420 return Qnil;
10421}
10422
10423
10424/*
10425 * call-seq:
10426 * strip -> new_string
10427 *
10428 * Returns a copy of +self+ with leading and trailing whitespace removed;
10429 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10430 *
10431 * whitespace = "\x00\t\n\v\f\r "
10432 * s = whitespace + 'abc' + whitespace
10433 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10434 * s.strip # => "abc"
10435 *
10436 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10437 */
10438
10439static VALUE
10440rb_str_strip(VALUE str)
10441{
10442 char *start;
10443 long olen, loffset, roffset;
10444 rb_encoding *enc = STR_ENC_GET(str);
10445
10446 RSTRING_GETMEM(str, start, olen);
10447 loffset = lstrip_offset(str, start, start+olen, enc);
10448 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10449
10450 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10451 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10452}
10453
10454static VALUE
10455scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10456{
10457 VALUE result = Qnil;
10458 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10459 if (pos >= 0) {
10460 VALUE match;
10461 struct re_registers *regs;
10462 if (BUILTIN_TYPE(pat) == T_STRING) {
10463 regs = NULL;
10464 end = pos + RSTRING_LEN(pat);
10465 }
10466 else {
10467 match = rb_backref_get();
10468 regs = RMATCH_REGS(match);
10469 pos = BEG(0);
10470 end = END(0);
10471 }
10472
10473 if (pos == end) {
10474 rb_encoding *enc = STR_ENC_GET(str);
10475 /*
10476 * Always consume at least one character of the input string
10477 */
10478 if (RSTRING_LEN(str) > end)
10479 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10480 RSTRING_END(str), enc);
10481 else
10482 *start = end + 1;
10483 }
10484 else {
10485 *start = end;
10486 }
10487
10488 if (!regs || regs->num_regs == 1) {
10489 result = rb_str_subseq(str, pos, end - pos);
10490 return result;
10491 }
10492 else {
10493 result = rb_ary_new2(regs->num_regs);
10494 for (int i = 1; i < regs->num_regs; i++) {
10495 VALUE s = Qnil;
10496 if (BEG(i) >= 0) {
10497 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10498 }
10499
10500 rb_ary_push(result, s);
10501 }
10502 }
10503
10504 RB_GC_GUARD(match);
10505 }
10506
10507 return result;
10508}
10509
10510
10511/*
10512 * call-seq:
10513 * scan(pattern) -> array_of_results
10514 * scan(pattern) {|result| ... } -> self
10515 *
10516 * :include: doc/string/scan.rdoc
10517 *
10518 */
10519
10520static VALUE
10521rb_str_scan(VALUE str, VALUE pat)
10522{
10523 VALUE result;
10524 long start = 0;
10525 long last = -1, prev = 0;
10526 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10527
10528 pat = get_pat_quoted(pat, 1);
10529 mustnot_broken(str);
10530 if (!rb_block_given_p()) {
10531 VALUE ary = rb_ary_new();
10532
10533 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10534 last = prev;
10535 prev = start;
10536 rb_ary_push(ary, result);
10537 }
10538 if (last >= 0) rb_pat_search(pat, str, last, 1);
10539 else rb_backref_set(Qnil);
10540 return ary;
10541 }
10542
10543 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10544 last = prev;
10545 prev = start;
10546 rb_yield(result);
10547 str_mod_check(str, p, len);
10548 }
10549 if (last >= 0) rb_pat_search(pat, str, last, 1);
10550 return str;
10551}
10552
10553
10554/*
10555 * call-seq:
10556 * hex -> integer
10557 *
10558 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10559 * returns its value as an integer.
10560 *
10561 * The leading substring is interpreted as hexadecimal when it begins with:
10562 *
 *  - One or more characters representing hexadecimal digits
10564 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10565 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10566 *
10567 * 'f'.hex # => 15
10568 * '11'.hex # => 17
10569 * 'FFF'.hex # => 4095
10570 * 'fffg'.hex # => 4095
10571 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10572 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10573 * 'deadbeef'.hex # => 3735928559
10574 *
10575 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10576 *
10577 * '0xfff'.hex # => 4095
10578 * '0xfffg'.hex # => 4095
10579 *
 *  Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10581 *
10582 * '-fff'.hex # => -4095
10583 * '-0xFFF'.hex # => -4095
10584 *
10585 * For any substring not described above, returns zero:
10586 *
10587 * 'xxx'.hex # => 0
10588 * ''.hex # => 0
10589 *
10590 * Note that, unlike #oct, this method interprets only hexadecimal,
10591 * and not binary, octal, or decimal notations:
10592 *
10593 * '0b111'.hex # => 45329
10594 * '0o777'.hex # => 0
10595 * '0d999'.hex # => 55705
10596 *
10597 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10598 */
10599
10600static VALUE
10601rb_str_hex(VALUE str)
10602{
10603 return rb_str_to_inum(str, 16, FALSE);
10604}
10605
10606
10607/*
10608 * call-seq:
10609 * oct -> integer
10610 *
10611 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10612 * returns their value as an integer.
10613 *
10614 * In brief:
10615 *
10616 * # Interpreted as octal.
10617 * '777'.oct # => 511
10618 * '777x'.oct # => 511
10619 * '0777'.oct # => 511
10620 * '0o777'.oct # => 511
10621 * '-777'.oct # => -511
10622 * # Not interpreted as octal.
10623 * '0b111'.oct # => 7 # Interpreted as binary.
10624 * '0d999'.oct # => 999 # Interpreted as decimal.
10625 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10626 *
10627 * The leading substring is interpreted as octal when it begins with:
10628 *
 *  - One or more characters representing octal digits
10630 * (each in the range <tt>'0'..'7'</tt>);
10631 * the string to be interpreted ends at the first character that does not represent an octal digit:
10632 *
 *      '7'.oct # => 7
10634 * '11'.oct # => 9
10635 * '777'.oct # => 511
10636 * '0777'.oct # => 511
10637 * '7778'.oct # => 511
10638 * '777x'.oct # => 511
10639 *
10640 * - <tt>'0o'</tt>, followed by one or more octal digits:
10641 *
10642 * '0o777'.oct # => 511
10643 * '0o7778'.oct # => 511
10644 *
10645 * The leading substring is _not_ interpreted as octal when it begins with:
10646 *
10647 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10648 * (each in the range <tt>'0'..'1'</tt>);
10649 * the string to be interpreted ends at the first character that does not represent a binary digit.
10650 * the string is interpreted as binary digits (base 2):
10651 *
10652 * '0b111'.oct # => 7
10653 * '0b1112'.oct # => 7
10654 *
10655 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10656 * (each in the range <tt>'0'..'9'</tt>);
10657 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10658 * the string is interpreted as decimal digits (base 10):
10659 *
10660 * '0d999'.oct # => 999
10661 * '0d999x'.oct # => 999
10662 *
10663 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10664 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10665 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10666 * the string is interpreted as hexadecimal digits (base 16):
10667 *
10668 * '0xfff'.oct # => 4095
10669 * '0xfffg'.oct # => 4095
10670 *
 *  Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10672 *
10673 * '-777'.oct # => -511
10674 * '-0777'.oct # => -511
10675 * '-0b111'.oct # => -7
10676 * '-0xfff'.oct # => -4095
10677 *
10678 * For any substring not described above, returns zero:
10679 *
10680 * 'foo'.oct # => 0
10681 * ''.oct # => 0
10682 *
10683 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10684 */
10685
10686static VALUE
10687rb_str_oct(VALUE str)
10688{
10689 return rb_str_to_inum(str, -8, FALSE);
10690}
10691
10692#ifndef HAVE_CRYPT_R
10693# include "ruby/thread_native.h"
10694# include "ruby/atomic.h"
10695
10696static struct {
10697 rb_nativethread_lock_t lock;
10698} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10699#endif
10700
10701/*
10702 * call-seq:
10703 * crypt(salt_str) -> new_string
10704 *
10705 * Returns the string generated by calling <code>crypt(3)</code>
10706 * standard library function with <code>str</code> and
10707 * <code>salt_str</code>, in this order, as its arguments. Please do
10708 * not use this method any longer. It is legacy; provided only for
10709 * backward compatibility with ruby scripts in earlier days. It is
10710 * bad to use in contemporary programs for several reasons:
10711 *
10712 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10713 * run. The generated string lacks data portability.
10714 *
10715 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10716 * (i.e. silently ends up in unexpected results).
10717 *
10718 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10719 * thread safe.
10720 *
10721 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10722 * very very weak. According to its manpage, Linux's traditional
10723 * <code>crypt(3)</code> output has only 2**56 variations; too
10724 * easy to brute force today. And this is the default behaviour.
10725 *
10726 * * In order to make things robust some OSes implement so-called
10727 * "modular" usage. To go through, you have to do a complex
10728 * build-up of the <code>salt_str</code> parameter, by hand.
10729 * Failure in generation of a proper salt string tends not to
10730 * yield any errors; typos in parameters are normally not
10731 * detectable.
10732 *
10733 * * For instance, in the following example, the second invocation
10734 * of String#crypt is wrong; it has a typo in "round=" (lacks
10735 * "s"). However the call does not fail and something unexpected
10736 * is generated.
10737 *
10738 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10739 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10740 *
10741 * * Even in the "modular" mode, some hash functions are considered
10742 * archaic and no longer recommended at all; for instance module
10743 * <code>$1$</code> is officially abandoned by its author: see
10744 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10745 * instance module <code>$3$</code> is considered completely
10746 * broken: see the manpage of FreeBSD.
10747 *
10748 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10749 * written above, <code>crypt(3)</code> on Mac OS never fails.
10750 * This means even if you build up a proper salt string it
10751 * generates a traditional DES hash anyways, and there is no way
10752 * for you to be aware of.
10753 *
10754 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10755 *
10756 * If for some reason you cannot migrate to other secure contemporary
10757 * password hashing algorithms, install the string-crypt gem and
10758 * <code>require 'string/crypt'</code> to continue using it.
10759 */
10760
10761static VALUE
10762rb_str_crypt(VALUE str, VALUE salt)
10763{
10764#ifdef HAVE_CRYPT_R
10765 VALUE databuf;
10766 struct crypt_data *data;
10767# define CRYPT_END() ALLOCV_END(databuf)
10768#else
10769 char *tmp_buf;
10770 extern char *crypt(const char *, const char *);
10771# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10772#endif
10773 VALUE result;
10774 const char *s, *saltp;
10775 char *res;
10776#ifdef BROKEN_CRYPT
10777 char salt_8bit_clean[3];
10778#endif
10779
10780 StringValue(salt);
10781 mustnot_wchar(str);
10782 mustnot_wchar(salt);
10783 s = StringValueCStr(str);
10784 saltp = RSTRING_PTR(salt);
10785 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10786 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10787 }
10788
10789#ifdef BROKEN_CRYPT
10790 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10791 salt_8bit_clean[0] = saltp[0] & 0x7f;
10792 salt_8bit_clean[1] = saltp[1] & 0x7f;
10793 salt_8bit_clean[2] = '\0';
10794 saltp = salt_8bit_clean;
10795 }
10796#endif
10797#ifdef HAVE_CRYPT_R
10798 data = ALLOCV(databuf, sizeof(struct crypt_data));
10799# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10800 data->initialized = 0;
10801# endif
10802 res = crypt_r(s, saltp, data);
10803#else
10804 rb_nativethread_lock_lock(&crypt_mutex.lock);
10805 res = crypt(s, saltp);
10806#endif
10807 if (!res) {
10808 int err = errno;
10809 CRYPT_END();
10810 rb_syserr_fail(err, "crypt");
10811 }
10812#ifdef HAVE_CRYPT_R
10813 result = rb_str_new_cstr(res);
10814 CRYPT_END();
10815#else
10816 // We need to copy this buffer because it's static and we need to unlock the mutex
10817 // before allocating a new object (the string to be returned). If we allocate while
10818 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10819 // if other ractors are waiting on this lock.
10820 size_t res_size = strlen(res)+1;
10821 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10822 memcpy(tmp_buf, res, res_size);
10823 res = tmp_buf;
10824 CRYPT_END();
10825 result = rb_str_new_cstr(res);
10826#endif
10827 return result;
10828}
10829
10830
10831/*
10832 * call-seq:
10833 * ord -> integer
10834 *
10835 * :include: doc/string/ord.rdoc
10836 *
10837 */
10838
10839static VALUE
10840rb_str_ord(VALUE s)
10841{
10842 unsigned int c;
10843
10844 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10845 return UINT2NUM(c);
10846}
10847/*
10848 * call-seq:
10849 * sum(n = 16) -> integer
10850 *
10851 * :include: doc/string/sum.rdoc
10852 *
10853 */
10854
10855static VALUE
10856rb_str_sum(int argc, VALUE *argv, VALUE str)
10857{
10858 int bits = 16;
10859 char *ptr, *p, *pend;
10860 long len;
10861 VALUE sum = INT2FIX(0);
10862 unsigned long sum0 = 0;
10863
10864 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10865 bits = 0;
10866 }
10867 ptr = p = RSTRING_PTR(str);
10868 len = RSTRING_LEN(str);
10869 pend = p + len;
10870
10871 while (p < pend) {
10872 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10873 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10874 str_mod_check(str, ptr, len);
10875 sum0 = 0;
10876 }
10877 sum0 += (unsigned char)*p;
10878 p++;
10879 }
10880
10881 if (bits == 0) {
10882 if (sum0) {
10883 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10884 }
10885 }
10886 else {
10887 if (sum == INT2FIX(0)) {
10888 if (bits < (int)sizeof(long)*CHAR_BIT) {
10889 sum0 &= (((unsigned long)1)<<bits)-1;
10890 }
10891 sum = LONG2FIX(sum0);
10892 }
10893 else {
10894 VALUE mod;
10895
10896 if (sum0) {
10897 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10898 }
10899
10900 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10901 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10902 sum = rb_funcall(sum, '&', 1, mod);
10903 }
10904 }
10905 return sum;
10906}
10907
10908static VALUE
10909rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10910{
10911 rb_encoding *enc;
10912 VALUE w;
10913 long width, len, flen = 1, fclen = 1;
10914 VALUE res;
10915 char *p;
10916 const char *f = " ";
10917 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10918 VALUE pad;
10919 int singlebyte = 1, cr;
10920 int termlen;
10921
10922 rb_scan_args(argc, argv, "11", &w, &pad);
10923 enc = STR_ENC_GET(str);
10924 termlen = rb_enc_mbminlen(enc);
10925 width = NUM2LONG(w);
10926 if (argc == 2) {
10927 StringValue(pad);
10928 enc = rb_enc_check(str, pad);
10929 f = RSTRING_PTR(pad);
10930 flen = RSTRING_LEN(pad);
10931 fclen = str_strlen(pad, enc); /* rb_enc_check */
10932 singlebyte = single_byte_optimizable(pad);
10933 if (flen == 0 || fclen == 0) {
10934 rb_raise(rb_eArgError, "zero width padding");
10935 }
10936 }
10937 len = str_strlen(str, enc); /* rb_enc_check */
10938 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10939 n = width - len;
10940 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10941 rlen = n - llen;
10942 cr = ENC_CODERANGE(str);
10943 if (flen > 1) {
10944 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10945 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10946 }
10947 size = RSTRING_LEN(str);
10948 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10949 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10950 (len += llen2 + rlen2) >= LONG_MAX - size) {
10951 rb_raise(rb_eArgError, "argument too big");
10952 }
10953 len += size;
10954 res = str_enc_new(rb_cString, 0, len, enc);
10955 p = RSTRING_PTR(res);
10956 if (flen <= 1) {
10957 memset(p, *f, llen);
10958 p += llen;
10959 }
10960 else {
10961 while (llen >= fclen) {
10962 memcpy(p,f,flen);
10963 p += flen;
10964 llen -= fclen;
10965 }
10966 if (llen > 0) {
10967 memcpy(p, f, llen2);
10968 p += llen2;
10969 }
10970 }
10971 memcpy(p, RSTRING_PTR(str), size);
10972 p += size;
10973 if (flen <= 1) {
10974 memset(p, *f, rlen);
10975 p += rlen;
10976 }
10977 else {
10978 while (rlen >= fclen) {
10979 memcpy(p,f,flen);
10980 p += flen;
10981 rlen -= fclen;
10982 }
10983 if (rlen > 0) {
10984 memcpy(p, f, rlen2);
10985 p += rlen2;
10986 }
10987 }
10988 TERM_FILL(p, termlen);
10989 STR_SET_LEN(res, p-RSTRING_PTR(res));
10990
10991 if (argc == 2)
10992 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10993 if (cr != ENC_CODERANGE_BROKEN)
10994 ENC_CODERANGE_SET(res, cr);
10995
10996 RB_GC_GUARD(pad);
10997 return res;
10998}
10999
11000
11001/*
11002 * call-seq:
11003 * ljust(width, pad_string = ' ') -> new_string
11004 *
11005 * :include: doc/string/ljust.rdoc
11006 *
11007 */
11008
11009static VALUE
11010rb_str_ljust(int argc, VALUE *argv, VALUE str)
11011{
11012 return rb_str_justify(argc, argv, str, 'l');
11013}
11014
11015/*
11016 * call-seq:
11017 * rjust(width, pad_string = ' ') -> new_string
11018 *
11019 * :include: doc/string/rjust.rdoc
11020 *
11021 */
11022
11023static VALUE
11024rb_str_rjust(int argc, VALUE *argv, VALUE str)
11025{
11026 return rb_str_justify(argc, argv, str, 'r');
11027}
11028
11029
11030/*
11031 * call-seq:
11032 * center(size, pad_string = ' ') -> new_string
11033 *
11034 * :include: doc/string/center.rdoc
11035 *
11036 */
11037
11038static VALUE
11039rb_str_center(int argc, VALUE *argv, VALUE str)
11040{
11041 return rb_str_justify(argc, argv, str, 'c');
11042}
11043
11044/*
11045 * call-seq:
11046 * partition(pattern) -> [pre_match, first_match, post_match]
11047 *
11048 * :include: doc/string/partition.rdoc
11049 *
11050 */
11051
11052static VALUE
11053rb_str_partition(VALUE str, VALUE sep)
11054{
11055 long pos;
11056
11057 sep = get_pat_quoted(sep, 0);
11058 if (RB_TYPE_P(sep, T_REGEXP)) {
11059 if (rb_reg_search(sep, str, 0, 0) < 0) {
11060 goto failed;
11061 }
11062 VALUE match = rb_backref_get();
11063 struct re_registers *regs = RMATCH_REGS(match);
11064
11065 pos = BEG(0);
11066 sep = rb_str_subseq(str, pos, END(0) - pos);
11067 }
11068 else {
11069 pos = rb_str_index(str, sep, 0);
11070 if (pos < 0) goto failed;
11071 }
11072 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11073 sep,
11074 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11075 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11076
11077 failed:
11078 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11079}
11080
11081/*
11082 * call-seq:
11083 * rpartition(pattern) -> [pre_match, last_match, post_match]
11084 *
11085 * :include: doc/string/rpartition.rdoc
11086 *
11087 */
11088
11089static VALUE
11090rb_str_rpartition(VALUE str, VALUE sep)
11091{
11092 long pos = RSTRING_LEN(str);
11093
11094 sep = get_pat_quoted(sep, 0);
11095 if (RB_TYPE_P(sep, T_REGEXP)) {
11096 if (rb_reg_search(sep, str, pos, 1) < 0) {
11097 goto failed;
11098 }
11099 VALUE match = rb_backref_get();
11100 struct re_registers *regs = RMATCH_REGS(match);
11101
11102 pos = BEG(0);
11103 sep = rb_str_subseq(str, pos, END(0) - pos);
11104 }
11105 else {
11106 pos = rb_str_sublen(str, pos);
11107 pos = rb_str_rindex(str, sep, pos);
11108 if (pos < 0) {
11109 goto failed;
11110 }
11111 }
11112
11113 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11114 sep,
11115 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11116 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11117 failed:
11118 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11119}
11120
11121/*
11122 * call-seq:
11123 * start_with?(*patterns) -> true or false
11124 *
11125 * :include: doc/string/start_with_p.rdoc
11126 *
11127 */
11128
11129static VALUE
11130rb_str_start_with(int argc, VALUE *argv, VALUE str)
11131{
11132 int i;
11133
11134 for (i=0; i<argc; i++) {
11135 VALUE tmp = argv[i];
11136 if (RB_TYPE_P(tmp, T_REGEXP)) {
11137 if (rb_reg_start_with_p(tmp, str))
11138 return Qtrue;
11139 }
11140 else {
11141 const char *p, *s, *e;
11142 long slen, tlen;
11143 rb_encoding *enc;
11144
11145 StringValue(tmp);
11146 enc = rb_enc_check(str, tmp);
11147 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11148 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11149 p = RSTRING_PTR(str);
11150 e = p + slen;
11151 s = p + tlen;
11152 if (!at_char_right_boundary(p, s, e, enc))
11153 continue;
11154 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11155 return Qtrue;
11156 }
11157 }
11158 return Qfalse;
11159}
11160
11161/*
11162 * call-seq:
11163 * end_with?(*strings) -> true or false
11164 *
11165 * :include: doc/string/end_with_p.rdoc
11166 *
11167 */
11168
11169static VALUE
11170rb_str_end_with(int argc, VALUE *argv, VALUE str)
11171{
11172 int i;
11173
11174 for (i=0; i<argc; i++) {
11175 VALUE tmp = argv[i];
11176 const char *p, *s, *e;
11177 long slen, tlen;
11178 rb_encoding *enc;
11179
11180 StringValue(tmp);
11181 enc = rb_enc_check(str, tmp);
11182 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11183 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11184 p = RSTRING_PTR(str);
11185 e = p + slen;
11186 s = e - tlen;
11187 if (!at_char_boundary(p, s, e, enc))
11188 continue;
11189 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11190 return Qtrue;
11191 }
11192 return Qfalse;
11193}
11194
11204static long
11205deleted_prefix_length(VALUE str, VALUE prefix)
11206{
11207 const char *strptr, *prefixptr;
11208 long olen, prefixlen;
11209 rb_encoding *enc = rb_enc_get(str);
11210
11211 StringValue(prefix);
11212
11213 if (!is_broken_string(prefix) ||
11214 !rb_enc_asciicompat(enc) ||
11215 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11216 enc = rb_enc_check(str, prefix);
11217 }
11218
11219 /* return 0 if not start with prefix */
11220 prefixlen = RSTRING_LEN(prefix);
11221 if (prefixlen <= 0) return 0;
11222 olen = RSTRING_LEN(str);
11223 if (olen < prefixlen) return 0;
11224 strptr = RSTRING_PTR(str);
11225 prefixptr = RSTRING_PTR(prefix);
11226 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11227 if (is_broken_string(prefix)) {
11228 if (!is_broken_string(str)) {
11229 /* prefix in a valid string cannot be broken */
11230 return 0;
11231 }
11232 const char *strend = strptr + olen;
11233 const char *after_prefix = strptr + prefixlen;
11234 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11235 /* prefix does not end at char-boundary */
11236 return 0;
11237 }
11238 }
11239 /* prefix part in `str` also should be valid. */
11240
11241 return prefixlen;
11242}
11243
11244/*
11245 * call-seq:
11246 * delete_prefix!(prefix) -> self or nil
11247 *
11248 * Like String#delete_prefix, except that +self+ is modified in place;
11249 * returns +self+ if the prefix is removed, +nil+ otherwise.
11250 *
11251 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11252 */
11253
11254static VALUE
11255rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11256{
11257 long prefixlen;
11258 str_modify_keep_cr(str);
11259
11260 prefixlen = deleted_prefix_length(str, prefix);
11261 if (prefixlen <= 0) return Qnil;
11262
11263 return rb_str_drop_bytes(str, prefixlen);
11264}
11265
11266/*
11267 * call-seq:
11268 * delete_prefix(prefix) -> new_string
11269 *
11270 * :include: doc/string/delete_prefix.rdoc
11271 *
11272 */
11273
11274static VALUE
11275rb_str_delete_prefix(VALUE str, VALUE prefix)
11276{
11277 long prefixlen;
11278
11279 prefixlen = deleted_prefix_length(str, prefix);
11280 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11281
11282 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11283}
11284
11294static long
11295deleted_suffix_length(VALUE str, VALUE suffix)
11296{
11297 const char *strptr, *suffixptr;
11298 long olen, suffixlen;
11299 rb_encoding *enc;
11300
11301 StringValue(suffix);
11302 if (is_broken_string(suffix)) return 0;
11303 enc = rb_enc_check(str, suffix);
11304
11305 /* return 0 if not start with suffix */
11306 suffixlen = RSTRING_LEN(suffix);
11307 if (suffixlen <= 0) return 0;
11308 olen = RSTRING_LEN(str);
11309 if (olen < suffixlen) return 0;
11310 strptr = RSTRING_PTR(str);
11311 suffixptr = RSTRING_PTR(suffix);
11312 const char *strend = strptr + olen;
11313 const char *before_suffix = strend - suffixlen;
11314 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11315 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11316
11317 return suffixlen;
11318}
11319
11320/*
11321 * call-seq:
11322 * delete_suffix!(suffix) -> self or nil
11323 *
11324 * Like String#delete_suffix, except that +self+ is modified in place;
11325 * returns +self+ if the suffix is removed, +nil+ otherwise.
11326 *
11327 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11328 */
11329
11330static VALUE
11331rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11332{
11333 long olen, suffixlen, len;
11334 str_modifiable(str);
11335
11336 suffixlen = deleted_suffix_length(str, suffix);
11337 if (suffixlen <= 0) return Qnil;
11338
11339 olen = RSTRING_LEN(str);
11340 str_modify_keep_cr(str);
11341 len = olen - suffixlen;
11342 STR_SET_LEN(str, len);
11343 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11344 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11346 }
11347 return str;
11348}
11349
11350/*
11351 * call-seq:
11352 * delete_suffix(suffix) -> new_string
11353 *
11354 * :include: doc/string/delete_suffix.rdoc
11355 *
11356 */
11357
11358static VALUE
11359rb_str_delete_suffix(VALUE str, VALUE suffix)
11360{
11361 long suffixlen;
11362
11363 suffixlen = deleted_suffix_length(str, suffix);
11364 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11365
11366 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11367}
11368
11369void
11370rb_str_setter(VALUE val, ID id, VALUE *var)
11371{
11372 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11373 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11374 }
11375 *var = val;
11376}
11377
11378static void
11379rb_fs_setter(VALUE val, ID id, VALUE *var)
11380{
11381 val = rb_fs_check(val);
11382 if (!val) {
11383 rb_raise(rb_eTypeError,
11384 "value of %"PRIsVALUE" must be String or Regexp",
11385 rb_id2str(id));
11386 }
11387 if (!NIL_P(val)) {
11388 rb_warn_deprecated("'$;'", NULL);
11389 }
11390 *var = val;
11391}
11392
11393
11394/*
11395 * call-seq:
11396 * force_encoding(encoding) -> self
11397 *
11398 * :include: doc/string/force_encoding.rdoc
11399 *
11400 */
11401
11402static VALUE
11403rb_str_force_encoding(VALUE str, VALUE enc)
11404{
11405 str_modifiable(str);
11406
11407 rb_encoding *encoding = rb_to_encoding(enc);
11408 int idx = rb_enc_to_index(encoding);
11409
11410 // If the encoding is unchanged, we do nothing.
11411 if (ENCODING_GET(str) == idx) {
11412 return str;
11413 }
11414
11415 rb_enc_associate_index(str, idx);
11416
11417 // If the coderange was 7bit and the new encoding is ASCII-compatible
11418 // we can keep the coderange.
11419 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11420 return str;
11421 }
11422
11424 return str;
11425}
11426
11427/*
11428 * call-seq:
11429 * b -> new_string
11430 *
11431 * :include: doc/string/b.rdoc
11432 *
11433 */
11434
11435static VALUE
11436rb_str_b(VALUE str)
11437{
11438 VALUE str2;
11439 if (STR_EMBED_P(str)) {
11440 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11441 }
11442 else {
11443 str2 = str_alloc_heap(rb_cString);
11444 }
11445 str_replace_shared_without_enc(str2, str);
11446
11447 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11448 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11449 // If we know the receiver's code range then we know the result's code range.
11450 int cr = ENC_CODERANGE(str);
11451 switch (cr) {
11452 case ENC_CODERANGE_7BIT:
11454 break;
11458 break;
11459 default:
11460 ENC_CODERANGE_CLEAR(str2);
11461 break;
11462 }
11463 }
11464
11465 return str2;
11466}
11467
11468/*
11469 * call-seq:
11470 * valid_encoding? -> true or false
11471 *
11472 * :include: doc/string/valid_encoding_p.rdoc
11473 *
11474 */
11475
11476static VALUE
11477rb_str_valid_encoding_p(VALUE str)
11478{
11479 int cr = rb_enc_str_coderange(str);
11480
11481 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11482}
11483
11484/*
11485 * call-seq:
11486 * ascii_only? -> true or false
11487 *
11488 * Returns whether +self+ contains only ASCII characters:
11489 *
11490 * 'abc'.ascii_only? # => true
11491 * "abc\u{6666}".ascii_only? # => false
11492 *
11493 * Related: see {Querying}[rdoc-ref:String@Querying].
11494 */
11495
11496static VALUE
11497rb_str_is_ascii_only_p(VALUE str)
11498{
11499 int cr = rb_enc_str_coderange(str);
11500
11501 return RBOOL(cr == ENC_CODERANGE_7BIT);
11502}
11503
11504VALUE
11506{
11507 static const char ellipsis[] = "...";
11508 const long ellipsislen = sizeof(ellipsis) - 1;
11509 rb_encoding *const enc = rb_enc_get(str);
11510 const long blen = RSTRING_LEN(str);
11511 const char *const p = RSTRING_PTR(str), *e = p + blen;
11512 VALUE estr, ret = 0;
11513
11514 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11515 if (len * rb_enc_mbminlen(enc) >= blen ||
11516 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11517 ret = str;
11518 }
11519 else if (len <= ellipsislen ||
11520 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11521 if (rb_enc_asciicompat(enc)) {
11522 ret = rb_str_new(ellipsis, len);
11523 rb_enc_associate(ret, enc);
11524 }
11525 else {
11526 estr = rb_usascii_str_new(ellipsis, len);
11527 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11528 }
11529 }
11530 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11531 rb_str_cat(ret, ellipsis, ellipsislen);
11532 }
11533 else {
11534 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11535 rb_enc_from_encoding(enc), 0, Qnil);
11536 rb_str_append(ret, estr);
11537 }
11538 return ret;
11539}
11540
11541static VALUE
11542str_compat_and_valid(VALUE str, rb_encoding *enc)
11543{
11544 int cr;
11545 str = StringValue(str);
11546 cr = rb_enc_str_coderange(str);
11547 if (cr == ENC_CODERANGE_BROKEN) {
11548 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11549 }
11550 else {
11551 rb_encoding *e = STR_ENC_GET(str);
11552 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11553 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11554 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11555 }
11556 }
11557 return str;
11558}
11559
11560static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11561
11562VALUE
11564{
11565 rb_encoding *enc = STR_ENC_GET(str);
11566 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11567}
11568
11569VALUE
11570rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11571{
11572 int cr = ENC_CODERANGE_UNKNOWN;
11573 if (enc == STR_ENC_GET(str)) {
11574 /* cached coderange makes sense only when enc equals the
11575 * actual encoding of str */
11576 cr = ENC_CODERANGE(str);
11577 }
11578 return enc_str_scrub(enc, str, repl, cr);
11579}
11580
11581static VALUE
11582enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11583{
11584 int encidx;
11585 VALUE buf = Qnil;
11586 const char *rep, *p, *e, *p1, *sp;
11587 long replen = -1;
11588 long slen;
11589
11590 if (rb_block_given_p()) {
11591 if (!NIL_P(repl))
11592 rb_raise(rb_eArgError, "both of block and replacement given");
11593 replen = 0;
11594 }
11595
11596 if (ENC_CODERANGE_CLEAN_P(cr))
11597 return Qnil;
11598
11599 if (!NIL_P(repl)) {
11600 repl = str_compat_and_valid(repl, enc);
11601 }
11602
11603 if (rb_enc_dummy_p(enc)) {
11604 return Qnil;
11605 }
11606 encidx = rb_enc_to_index(enc);
11607
11608#define DEFAULT_REPLACE_CHAR(str) do { \
11609 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11610 rep = replace; replen = (int)sizeof(replace); \
11611 } while (0)
11612
11613 slen = RSTRING_LEN(str);
11614 p = RSTRING_PTR(str);
11615 e = RSTRING_END(str);
11616 p1 = p;
11617 sp = p;
11618
11619 if (rb_enc_asciicompat(enc)) {
11620 int rep7bit_p;
11621 if (!replen) {
11622 rep = NULL;
11623 rep7bit_p = FALSE;
11624 }
11625 else if (!NIL_P(repl)) {
11626 rep = RSTRING_PTR(repl);
11627 replen = RSTRING_LEN(repl);
11628 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11629 }
11630 else if (encidx == rb_utf8_encindex()) {
11631 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11632 rep7bit_p = FALSE;
11633 }
11634 else {
11635 DEFAULT_REPLACE_CHAR("?");
11636 rep7bit_p = TRUE;
11637 }
11638 cr = ENC_CODERANGE_7BIT;
11639
11640 p = search_nonascii(p, e);
11641 if (!p) {
11642 p = e;
11643 }
11644 while (p < e) {
11645 int ret = rb_enc_precise_mbclen(p, e, enc);
11646 if (MBCLEN_NEEDMORE_P(ret)) {
11647 break;
11648 }
11649 else if (MBCLEN_CHARFOUND_P(ret)) {
11651 p += MBCLEN_CHARFOUND_LEN(ret);
11652 }
11653 else if (MBCLEN_INVALID_P(ret)) {
11654 /*
11655 * p1~p: valid ascii/multibyte chars
11656 * p ~e: invalid bytes + unknown bytes
11657 */
11658 long clen = rb_enc_mbmaxlen(enc);
11659 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11660 if (p > p1) {
11661 rb_str_buf_cat(buf, p1, p - p1);
11662 }
11663
11664 if (e - p < clen) clen = e - p;
11665 if (clen <= 2) {
11666 clen = 1;
11667 }
11668 else {
11669 const char *q = p;
11670 clen--;
11671 for (; clen > 1; clen--) {
11672 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11673 if (MBCLEN_NEEDMORE_P(ret)) break;
11674 if (MBCLEN_INVALID_P(ret)) continue;
11676 }
11677 }
11678 if (rep) {
11679 rb_str_buf_cat(buf, rep, replen);
11680 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11681 }
11682 else {
11683 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11684 str_mod_check(str, sp, slen);
11685 repl = str_compat_and_valid(repl, enc);
11686 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11689 }
11690 p += clen;
11691 p1 = p;
11692 p = search_nonascii(p, e);
11693 if (!p) {
11694 p = e;
11695 break;
11696 }
11697 }
11698 else {
11700 }
11701 }
11702 if (NIL_P(buf)) {
11703 if (p == e) {
11704 ENC_CODERANGE_SET(str, cr);
11705 return Qnil;
11706 }
11707 buf = rb_str_buf_new(RSTRING_LEN(str));
11708 }
11709 if (p1 < p) {
11710 rb_str_buf_cat(buf, p1, p - p1);
11711 }
11712 if (p < e) {
11713 if (rep) {
11714 rb_str_buf_cat(buf, rep, replen);
11715 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11716 }
11717 else {
11718 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11719 str_mod_check(str, sp, slen);
11720 repl = str_compat_and_valid(repl, enc);
11721 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11724 }
11725 }
11726 }
11727 else {
11728 /* ASCII incompatible */
11729 long mbminlen = rb_enc_mbminlen(enc);
11730 if (!replen) {
11731 rep = NULL;
11732 }
11733 else if (!NIL_P(repl)) {
11734 rep = RSTRING_PTR(repl);
11735 replen = RSTRING_LEN(repl);
11736 }
11737 else if (encidx == ENCINDEX_UTF_16BE) {
11738 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11739 }
11740 else if (encidx == ENCINDEX_UTF_16LE) {
11741 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11742 }
11743 else if (encidx == ENCINDEX_UTF_32BE) {
11744 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11745 }
11746 else if (encidx == ENCINDEX_UTF_32LE) {
11747 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11748 }
11749 else {
11750 DEFAULT_REPLACE_CHAR("?");
11751 }
11752
11753 while (p < e) {
11754 int ret = rb_enc_precise_mbclen(p, e, enc);
11755 if (MBCLEN_NEEDMORE_P(ret)) {
11756 break;
11757 }
11758 else if (MBCLEN_CHARFOUND_P(ret)) {
11759 p += MBCLEN_CHARFOUND_LEN(ret);
11760 }
11761 else if (MBCLEN_INVALID_P(ret)) {
11762 const char *q = p;
11763 long clen = rb_enc_mbmaxlen(enc);
11764 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11765 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11766
11767 if (e - p < clen) clen = e - p;
11768 if (clen <= mbminlen * 2) {
11769 clen = mbminlen;
11770 }
11771 else {
11772 clen -= mbminlen;
11773 for (; clen > mbminlen; clen-=mbminlen) {
11774 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11775 if (MBCLEN_NEEDMORE_P(ret)) break;
11776 if (MBCLEN_INVALID_P(ret)) continue;
11778 }
11779 }
11780 if (rep) {
11781 rb_str_buf_cat(buf, rep, replen);
11782 }
11783 else {
11784 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11785 str_mod_check(str, sp, slen);
11786 repl = str_compat_and_valid(repl, enc);
11787 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11788 }
11789 p += clen;
11790 p1 = p;
11791 }
11792 else {
11794 }
11795 }
11796 if (NIL_P(buf)) {
11797 if (p == e) {
11799 return Qnil;
11800 }
11801 buf = rb_str_buf_new(RSTRING_LEN(str));
11802 }
11803 if (p1 < p) {
11804 rb_str_buf_cat(buf, p1, p - p1);
11805 }
11806 if (p < e) {
11807 if (rep) {
11808 rb_str_buf_cat(buf, rep, replen);
11809 }
11810 else {
11811 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11812 str_mod_check(str, sp, slen);
11813 repl = str_compat_and_valid(repl, enc);
11814 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11815 }
11816 }
11818 }
11819 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11820 return buf;
11821}
11822
11823/*
11824 * call-seq:
11825 * scrub(replacement_string = default_replacement_string) -> new_string
11826 * scrub{|sequence| ... } -> new_string
11827 *
11828 * :include: doc/string/scrub.rdoc
11829 *
11830 */
11831static VALUE
11832str_scrub(int argc, VALUE *argv, VALUE str)
11833{
11834 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11835 VALUE new = rb_str_scrub(str, repl);
11836 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11837}
11838
11839/*
11840 * call-seq:
11841 * scrub!(replacement_string = default_replacement_string) -> self
11842 * scrub!{|sequence| ... } -> self
11843 *
11844 * Like String#scrub, except that:
11845 *
11846 * - Any replacements are made in +self+.
11847 * - Returns +self+.
11848 *
11849 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11850 *
11851 */
11852static VALUE
11853str_scrub_bang(int argc, VALUE *argv, VALUE str)
11854{
11855 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11856 VALUE new = rb_str_scrub(str, repl);
11857 if (!NIL_P(new)) rb_str_replace(str, new);
11858 return str;
11859}
11860
11861static ID id_normalize;
11862static ID id_normalized_p;
11863static VALUE mUnicodeNormalize;
11864
11865static VALUE
11866unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11867{
11868 static int UnicodeNormalizeRequired = 0;
11869 VALUE argv2[2];
11870
11871 if (!UnicodeNormalizeRequired) {
11872 rb_require("unicode_normalize/normalize.rb");
11873 UnicodeNormalizeRequired = 1;
11874 }
11875 argv2[0] = str;
11876 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11877 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11878}
11879
11880/*
11881 * call-seq:
11882 * unicode_normalize(form = :nfc) -> string
11883 *
11884 * :include: doc/string/unicode_normalize.rdoc
11885 *
11886 */
11887static VALUE
11888rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11889{
11890 return unicode_normalize_common(argc, argv, str, id_normalize);
11891}
11892
11893/*
11894 * call-seq:
11895 * unicode_normalize!(form = :nfc) -> self
11896 *
11897 * Like String#unicode_normalize, except that the normalization
11898 * is performed on +self+ (not on a copy of +self+).
11899 *
11900 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11901 *
11902 */
11903static VALUE
11904rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11905{
11906 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11907}
11908
11909/* call-seq:
11910 * unicode_normalized?(form = :nfc) -> true or false
11911 *
11912 * Returns whether +self+ is in the given +form+ of Unicode normalization;
11913 * see String#unicode_normalize.
11914 *
11915 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11916 *
11917 * Examples:
11918 *
11919 * "a\u0300".unicode_normalized? # => false
11920 * "a\u0300".unicode_normalized?(:nfd) # => true
11921 * "\u00E0".unicode_normalized? # => true
11922 * "\u00E0".unicode_normalized?(:nfd) # => false
11923 *
11924 *
11925 * Raises an exception if +self+ is not in a Unicode encoding:
11926 *
11927 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11928 * s.unicode_normalized? # Raises Encoding::CompatibilityError
11929 *
11930 * Related: see {Querying}[rdoc-ref:String@Querying].
11931 */
11932static VALUE
11933rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11934{
11935 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11936}
11937
11938/**********************************************************************
11939 * Document-class: Symbol
11940 *
11941 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11942 *
11943 * You can create a +Symbol+ object explicitly with:
11944 *
11945 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11946 *
11947 * The same +Symbol+ object will be
11948 * created for a given name or string for the duration of a program's
11949 * execution, regardless of the context or meaning of that name. Thus
11950 * if <code>Fred</code> is a constant in one context, a method in
11951 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11952 * will be the same object in all three contexts.
11953 *
11954 * module One
11955 * class Fred
11956 * end
11957 * $f1 = :Fred
11958 * end
11959 * module Two
11960 * Fred = 1
11961 * $f2 = :Fred
11962 * end
11963 * def Fred()
11964 * end
11965 * $f3 = :Fred
11966 * $f1.object_id #=> 2514190
11967 * $f2.object_id #=> 2514190
11968 * $f3.object_id #=> 2514190
11969 *
11970 * Constant, method, and variable names are returned as symbols:
11971 *
11972 * module One
11973 * Two = 2
11974 * def three; 3 end
11975 * @four = 4
11976 * @@five = 5
11977 * $six = 6
11978 * end
11979 * seven = 7
11980 *
11981 * One.constants
11982 * # => [:Two]
11983 * One.instance_methods(true)
11984 * # => [:three]
11985 * One.instance_variables
11986 * # => [:@four]
11987 * One.class_variables
11988 * # => [:@@five]
11989 * global_variables.grep(/six/)
11990 * # => [:$six]
11991 * local_variables
11992 * # => [:seven]
11993 *
11994 * A +Symbol+ object differs from a String object in that
11995 * a +Symbol+ object represents an identifier, while a String object
11996 * represents text or data.
11997 *
11998 * == What's Here
11999 *
12000 * First, what's elsewhere. Class +Symbol+:
12001 *
12002 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12003 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12004 *
12005 * Here, class +Symbol+ provides methods that are useful for:
12006 *
12007 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12008 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12009 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12010 *
12011 * === Methods for Querying
12012 *
12013 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12014 * - #=~: Returns the index of the first substring in symbol that matches a
12015 * given Regexp or other object; returns +nil+ if no match is found.
12016 * - #[], #slice : Returns a substring of symbol
12017 * determined by a given index, start/length, or range, or string.
12018 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12019 * - #encoding: Returns the Encoding object that represents the encoding
12020 * of symbol.
12021 * - #end_with?: Returns +true+ if symbol ends with
12022 * any of the given strings.
12023 * - #match: Returns a MatchData object if symbol
12024 * matches a given Regexp; +nil+ otherwise.
12025 * - #match?: Returns +true+ if symbol
12026 * matches a given Regexp; +false+ otherwise.
12027 * - #length, #size: Returns the number of characters in symbol.
12028 * - #start_with?: Returns +true+ if symbol starts with
12029 * any of the given strings.
12030 *
12031 * === Methods for Comparing
12032 *
12033 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12034 * or larger than symbol.
12035 * - #==, #===: Returns +true+ if a given symbol has the same content and
12036 * encoding.
12037 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12038 * symbol is smaller than, equal to, or larger than symbol.
12039 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12040 * after Unicode case folding; +false+ otherwise.
12041 *
12042 * === Methods for Converting
12043 *
12044 * - #capitalize: Returns symbol with the first character upcased
12045 * and all other characters downcased.
12046 * - #downcase: Returns symbol with all characters downcased.
12047 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12048 * - #name: Returns the frozen string corresponding to symbol.
12049 * - #succ, #next: Returns the symbol that is the successor to symbol.
12050 * - #swapcase: Returns symbol with all upcase characters downcased
12051 * and all downcase characters upcased.
12052 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12053 * - #to_s, #id2name: Returns the string corresponding to +self+.
12054 * - #to_sym, #intern: Returns +self+.
12055 * - #upcase: Returns symbol with all characters upcased.
12056 *
12057 */
12058
12059
12060/*
12061 * call-seq:
12062 * symbol == object -> true or false
12063 *
12064 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12065 */
12066
12067#define sym_equal rb_obj_equal
12068
12069static int
12070sym_printable(const char *s, const char *send, rb_encoding *enc)
12071{
12072 while (s < send) {
12073 int n;
12074 int c = rb_enc_precise_mbclen(s, send, enc);
12075
12076 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12077 n = MBCLEN_CHARFOUND_LEN(c);
12078 c = rb_enc_mbc_to_codepoint(s, send, enc);
12079 if (!rb_enc_isprint(c, enc)) return FALSE;
12080 s += n;
12081 }
12082 return TRUE;
12083}
12084
12085int
12086rb_str_symname_p(VALUE sym)
12087{
12088 rb_encoding *enc;
12089 const char *ptr;
12090 long len;
12091 rb_encoding *resenc = rb_default_internal_encoding();
12092
12093 if (resenc == NULL) resenc = rb_default_external_encoding();
12094 enc = STR_ENC_GET(sym);
12095 ptr = RSTRING_PTR(sym);
12096 len = RSTRING_LEN(sym);
12097 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12098 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12099 return FALSE;
12100 }
12101 return TRUE;
12102}
12103
12104VALUE
12105rb_str_quote_unprintable(VALUE str)
12106{
12107 rb_encoding *enc;
12108 const char *ptr;
12109 long len;
12110 rb_encoding *resenc;
12111
12112 Check_Type(str, T_STRING);
12113 resenc = rb_default_internal_encoding();
12114 if (resenc == NULL) resenc = rb_default_external_encoding();
12115 enc = STR_ENC_GET(str);
12116 ptr = RSTRING_PTR(str);
12117 len = RSTRING_LEN(str);
12118 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12119 !sym_printable(ptr, ptr + len, enc)) {
12120 return rb_str_escape(str);
12121 }
12122 return str;
12123}
12124
12125VALUE
12126rb_id_quote_unprintable(ID id)
12127{
12128 VALUE str = rb_id2str(id);
12129 if (!rb_str_symname_p(str)) {
12130 return rb_str_escape(str);
12131 }
12132 return str;
12133}
12134
12135/*
12136 * call-seq:
12137 * inspect -> string
12138 *
12139 * Returns a string representation of +self+ (including the leading colon):
12140 *
12141 * :foo.inspect # => ":foo"
12142 *
12143 * Related: Symbol#to_s, Symbol#name.
12144 *
12145 */
12146
12147static VALUE
12148sym_inspect(VALUE sym)
12149{
12150 VALUE str = rb_sym2str(sym);
12151 const char *ptr;
12152 long len;
12153 char *dest;
12154
12155 if (!rb_str_symname_p(str)) {
12156 str = rb_str_inspect(str);
12157 len = RSTRING_LEN(str);
12158 rb_str_resize(str, len + 1);
12159 dest = RSTRING_PTR(str);
12160 memmove(dest + 1, dest, len);
12161 }
12162 else {
12163 rb_encoding *enc = STR_ENC_GET(str);
12164 VALUE orig_str = str;
12165
12166 len = RSTRING_LEN(orig_str);
12167 str = rb_enc_str_new(0, len + 1, enc);
12168
12169 // Get data pointer after allocation
12170 ptr = RSTRING_PTR(orig_str);
12171 dest = RSTRING_PTR(str);
12172 memcpy(dest + 1, ptr, len);
12173
12174 RB_GC_GUARD(orig_str);
12175 }
12176 dest[0] = ':';
12177
12179
12180 return str;
12181}
12182
12183VALUE
12185{
12186 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12187 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12188 return str;
12189}
12190
12191VALUE
12192rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12193{
12194 VALUE obj;
12195
12196 if (argc < 1) {
12197 rb_raise(rb_eArgError, "no receiver given");
12198 }
12199 obj = argv[0];
12200 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12201}
12202
12203/*
12204 * call-seq:
12205 * succ
12206 *
12207 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12208 *
12209 * :foo.succ # => :fop
12210 *
12211 * Related: String#succ.
12212 */
12213
12214static VALUE
12215sym_succ(VALUE sym)
12216{
12217 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12218}
12219
12220/*
12221 * call-seq:
12222 * symbol <=> object -> -1, 0, +1, or nil
12223 *
12224 * If +object+ is a symbol,
12225 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12226 *
12227 * :bar <=> :foo # => -1
12228 * :foo <=> :foo # => 0
12229 * :foo <=> :bar # => 1
12230 *
12231 * Otherwise, returns +nil+:
12232 *
12233 * :foo <=> 'bar' # => nil
12234 *
12235 * Related: String#<=>.
12236 */
12237
12238static VALUE
12239sym_cmp(VALUE sym, VALUE other)
12240{
12241 if (!SYMBOL_P(other)) {
12242 return Qnil;
12243 }
12244 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12245}
12246
12247/*
12248 * call-seq:
12249 * casecmp(object) -> -1, 0, 1, or nil
12250 *
12251 * :include: doc/symbol/casecmp.rdoc
12252 *
12253 */
12254
12255static VALUE
12256sym_casecmp(VALUE sym, VALUE other)
12257{
12258 if (!SYMBOL_P(other)) {
12259 return Qnil;
12260 }
12261 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12262}
12263
12264/*
12265 * call-seq:
12266 * casecmp?(object) -> true, false, or nil
12267 *
12268 * :include: doc/symbol/casecmp_p.rdoc
12269 *
12270 */
12271
12272static VALUE
12273sym_casecmp_p(VALUE sym, VALUE other)
12274{
12275 if (!SYMBOL_P(other)) {
12276 return Qnil;
12277 }
12278 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12279}
12280
12281/*
12282 * call-seq:
12283 * symbol =~ object -> integer or nil
12284 *
12285 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12286 * including possible updates to global variables;
12287 * see String#=~.
12288 *
12289 */
12290
12291static VALUE
12292sym_match(VALUE sym, VALUE other)
12293{
12294 return rb_str_match(rb_sym2str(sym), other);
12295}
12296
12297/*
12298 * call-seq:
12299 * match(pattern, offset = 0) -> matchdata or nil
12300 * match(pattern, offset = 0) {|matchdata| } -> object
12301 *
12302 * Equivalent to <tt>self.to_s.match</tt>,
12303 * including possible updates to global variables;
12304 * see String#match.
12305 *
12306 */
12307
12308static VALUE
12309sym_match_m(int argc, VALUE *argv, VALUE sym)
12310{
12311 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12312}
12313
12314/*
12315 * call-seq:
12316 * match?(pattern, offset) -> true or false
12317 *
12318 * Equivalent to <tt>sym.to_s.match?</tt>;
12319 * see String#match?.
12320 *
12321 */
12322
12323static VALUE
12324sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12325{
12326 return rb_str_match_m_p(argc, argv, sym);
12327}
12328
12329/*
12330 * call-seq:
12331 * symbol[index] -> string or nil
12332 * symbol[start, length] -> string or nil
12333 * symbol[range] -> string or nil
12334 * symbol[regexp, capture = 0] -> string or nil
12335 * symbol[substring] -> string or nil
12336 *
12337 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12338 *
12339 */
12340
12341static VALUE
12342sym_aref(int argc, VALUE *argv, VALUE sym)
12343{
12344 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12345}
12346
12347/*
12348 * call-seq:
12349 * length -> integer
12350 *
12351 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12352 */
12353
12354static VALUE
12355sym_length(VALUE sym)
12356{
12357 return rb_str_length(rb_sym2str(sym));
12358}
12359
12360/*
12361 * call-seq:
12362 * empty? -> true or false
12363 *
12364 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12365 *
12366 */
12367
12368static VALUE
12369sym_empty(VALUE sym)
12370{
12371 return rb_str_empty(rb_sym2str(sym));
12372}
12373
12374/*
12375 * call-seq:
12376 * upcase(mapping) -> symbol
12377 *
12378 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12379 *
12380 * See String#upcase.
12381 *
12382 */
12383
12384static VALUE
12385sym_upcase(int argc, VALUE *argv, VALUE sym)
12386{
12387 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12388}
12389
12390/*
12391 * call-seq:
12392 * downcase(mapping) -> symbol
12393 *
12394 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12395 *
12396 * See String#downcase.
12397 *
12398 * Related: Symbol#upcase.
12399 *
12400 */
12401
12402static VALUE
12403sym_downcase(int argc, VALUE *argv, VALUE sym)
12404{
12405 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12406}
12407
12408/*
12409 * call-seq:
12410 * capitalize(mapping) -> symbol
12411 *
12412 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12413 *
12414 * See String#capitalize.
12415 *
12416 */
12417
12418static VALUE
12419sym_capitalize(int argc, VALUE *argv, VALUE sym)
12420{
12421 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12422}
12423
12424/*
12425 * call-seq:
12426 * swapcase(mapping) -> symbol
12427 *
12428 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12429 *
12430 * See String#swapcase.
12431 *
12432 */
12433
12434static VALUE
12435sym_swapcase(int argc, VALUE *argv, VALUE sym)
12436{
12437 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12438}
12439
12440/*
12441 * call-seq:
12442 * start_with?(*string_or_regexp) -> true or false
12443 *
12444 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12445 *
12446 */
12447
12448static VALUE
12449sym_start_with(int argc, VALUE *argv, VALUE sym)
12450{
12451 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12452}
12453
12454/*
12455 * call-seq:
12456 * end_with?(*strings) -> true or false
12457 *
12458 *
12459 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12460 *
12461 */
12462
12463static VALUE
12464sym_end_with(int argc, VALUE *argv, VALUE sym)
12465{
12466 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12467}
12468
12469/*
12470 * call-seq:
12471 * encoding -> encoding
12472 *
12473 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12474 *
12475 */
12476
12477static VALUE
12478sym_encoding(VALUE sym)
12479{
12480 return rb_obj_encoding(rb_sym2str(sym));
12481}
12482
12483static VALUE
12484string_for_symbol(VALUE name)
12485{
12486 if (!RB_TYPE_P(name, T_STRING)) {
12487 VALUE tmp = rb_check_string_type(name);
12488 if (NIL_P(tmp)) {
12489 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12490 name);
12491 }
12492 name = tmp;
12493 }
12494 return name;
12495}
12496
12497ID
12499{
12500 if (SYMBOL_P(name)) {
12501 return SYM2ID(name);
12502 }
12503 name = string_for_symbol(name);
12504 return rb_intern_str(name);
12505}
12506
12507VALUE
12509{
12510 if (SYMBOL_P(name)) {
12511 return name;
12512 }
12513 name = string_for_symbol(name);
12514 return rb_str_intern(name);
12515}
12516
12517/*
12518 * call-seq:
12519 * Symbol.all_symbols -> array_of_symbols
12520 *
12521 * Returns an array of all symbols currently in Ruby's symbol table:
12522 *
12523 * Symbol.all_symbols.size # => 9334
12524 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12525 *
12526 */
12527
12528static VALUE
12529sym_all_symbols(VALUE _)
12530{
12531 return rb_sym_all_symbols();
12532}
12533
12534VALUE
12535rb_str_to_interned_str(VALUE str)
12536{
12537 return rb_fstring(str);
12538}
12539
12540VALUE
12541rb_interned_str(const char *ptr, long len)
12542{
12543 struct RString fake_str = {RBASIC_INIT};
12544 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12545}
12546
12547VALUE
12549{
12550 return rb_interned_str(ptr, strlen(ptr));
12551}
12552
12553VALUE
12554rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12555{
12556 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12557 rb_enc_autoload(enc);
12558 }
12559
12560 struct RString fake_str = {RBASIC_INIT};
12561 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12562}
12563
12564VALUE
12565rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12566{
12567 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12568 rb_enc_autoload(enc);
12569 }
12570
12571 struct RString fake_str = {RBASIC_INIT};
12572 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12573 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12574 return str;
12575}
12576
12577VALUE
12579{
12580 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12581}
12582
#if USE_YJIT
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    /*
     * Fast path: for an ASCII-8BIT receiver and a small non-negative
     * integer, append a single byte.  Everything else goes through the
     * general rb_str_concat().
     */
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t byte_value = RB_NUM2SSIZE(codepoint);

        /* NOTE(review): the bound is exclusive, so 0xff itself takes the
         * general path below -- confirm whether that is intentional. */
        if (RB_LIKELY(byte_value >= 0 && byte_value < 0xff)) {
            rb_str_buf_cat_byte(str, (char) byte_value);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12599
12600static int
12601fstring_set_class_i(VALUE *str, void *data)
12602{
12603 RBASIC_SET_CLASS(*str, rb_cString);
12604
12605 return ST_CONTINUE;
12606}
12607
/*
 * Init_String: defines the String and Symbol classes and registers their
 * C-implemented methods.  In each registration the last argument is the
 * arity; the functions registered with -1 in this file (e.g. sym_upcase)
 * take (int argc, VALUE *argv, VALUE self).
 *
 * NOTE(review): the listing numbers embedded in this block skip 12615,
 * 12620, 12623-12624, 12629-12630, 12635-12636, 12642, 12644, 12659,
 * 12669-12670, 12700 and 12783-12785, so some statements were presumably
 * dropped from this listing -- confirm against the upstream file before
 * relying on it as the complete method list.
 */
12608void
12609Init_String(void)
12610{
12611 rb_cString = rb_define_class("String", rb_cObject);
12612
 /* Set the class of every entry in fstring_table_obj to String (see
  * fstring_set_class_i); presumably those entries were created before
  * rb_cString existed -- TODO confirm. */
12613 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12614
 /* String: allocation, construction and core instance methods. */
12616 rb_define_alloc_func(rb_cString, empty_str_alloc);
12617 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12618 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12619 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12621 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12622 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12625 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12626 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12627 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12628 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12631 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12632 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12633 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12634 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12637 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12638 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12639 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12640 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12641 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
 /* "succ!" and "next!" share one implementation. */
12643 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12645 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12646 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12647 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12648 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12649 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12650 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12651 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12652 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12653 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12654 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12655 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12656 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12657 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12658 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12660 rb_define_method(rb_cString, "+@", str_uplus, 0);
12661 rb_define_method(rb_cString, "-@", str_uminus, 0);
12662 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
 /* "dedup" is an alias of "-@". */
12663 rb_define_alias(rb_cString, "dedup", "-@");
12664
 /* Conversions; "to_s" and "to_str" share one implementation. */
12665 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12666 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12667 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12668 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12671 rb_define_method(rb_cString, "undump", str_undump, 0);
12672
 /* Cache the symbols :ascii, :turkic, :lithuanian and :fold; presumably
  * the option symbols accepted by the case-mapping methods below -- TODO
  * confirm. */
12673 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12674 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12675 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12676 sym_fold = ID2SYM(rb_intern_const("fold"));
12677
12678 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12679 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12680 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12681 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12682
12683 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12684 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12685 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12686 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12687
12688 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12689 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12690 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12691 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12692 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12693 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12694 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12695 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12696 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12697 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12698 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12699 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12701 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12702 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
 /* "intern" and "to_sym" share one implementation. */
12703 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12704 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12705 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12706
12707 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12708 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12709 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12710
12711 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12712
12713 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12714 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12715 rb_define_method(rb_cString, "center", rb_str_center, -1);
12716
12717 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12718 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12719 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12720 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12721 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12722 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12723 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12724 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12725 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12726
12727 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12728 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12729 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12730 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12731 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12732 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12733 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12734 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12735 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12736
12737 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12738 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12739 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12740 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12741 rb_define_method(rb_cString, "count", rb_str_count, -1);
12742
12743 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12744 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12745 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12746 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12747
12748 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12749 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12750 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12751 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12752 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12753
12754 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12755
 /* "slice" shares its implementation with "[]". */
12756 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12757 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12758
12759 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12760 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12761
12762 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12763 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12764 rb_define_method(rb_cString, "b", rb_str_b, 0);
12765 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12766 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12767
12768 /* define UnicodeNormalize module here so that we don't have to look it up */
12769 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12770 id_normalize = rb_intern_const("normalize");
12771 id_normalized_p = rb_intern_const("normalized?");
12772
12773 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12774 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12775 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12776
 /* rb_fs starts as nil and is exposed as both $; and $-F, with writes
  * going through rb_fs_setter; its address is registered with the GC. */
12777 rb_fs = Qnil;
12778 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12779 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12780 rb_gc_register_address(&rb_fs);
12781
 /* Symbol: class definition and the sym_* wrappers defined in this file. */
12782 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12786 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12787
 /* "==" and "===" both use sym_equal, which is rb_obj_equal. */
12788 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12789 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12790 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12791 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12792 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12793 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12794
12795 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12796 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12797 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12798 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12799
12800 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12801 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12802 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12803 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12804 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12805 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12806 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12807
12808 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12809 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12810 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12811 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12812
12813 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12814 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12815
12816 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12817}
12818
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1779
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1572
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1685
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2931
void rb_undef_method(VALUE klass, const char *name)
Undefines a method of the given class.
Definition class.c:2751
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references, according to the format string.
Definition class.c:3221
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1037
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3010
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:676
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2164
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2182
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1341
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3578
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:583
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:177
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1329
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1340
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:945
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1205
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3026
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1224
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12554
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2332
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3730
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1153
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1445
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1346
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:964
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12578
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e. the newline string "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2024
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2030
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1936
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1750
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1510
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2485
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3795
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1421
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12184
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2558
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1397
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1744
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3054
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5331
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4158
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3151
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11505
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1786
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1187
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:999
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1516
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1994
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4144
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3563
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2421
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2012
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6538
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3159
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12548
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1427
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3761
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3101
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4265
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3385
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7217
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2788
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12541
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4212
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4032
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4187
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3737
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3276
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5815
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11563
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1700
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2948
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3248
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3367
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1199
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2742
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7324
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1409
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1716
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2435
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5733
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9348
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1193
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1848
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2013
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2092
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3367
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1625
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12508
ID rb_to_id(VALUE str)
Definition string.c:12498
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries whether the passed object has previously been classified as shareable.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1439
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2925
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2807
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1433
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2820
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1777
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:455
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1481
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition string.c:8228
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113