Ruby 3.5.0dev (2025-10-30 revision eed9441afc861f10d113102536d0e616f44a069f)
string.c (eed9441afc861f10d113102536d0e616f44a069f)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Mark str as non-embedded and clear the shared / shared-root / borrowed
 * flags in the same step. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Mark str as embedded by clearing the non-embedded, shared and no-free
 * flags. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store n in the length field of str; nothing else is touched (no
 * terminator is written and the buffer is not resized). */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length of str in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of str's encoding. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a zero terminator of termlen bytes at ptr.  The first byte is
 * always stored directly; memset is only used for terminators wider than
 * one byte. */
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* Resize str's buffer to `capacity` bytes plus its own terminator length. */
178#define RESIZE_CAPA(str,capacity) do {\
179 const int termlen = TERM_LEN(str);\
180 RESIZE_CAPA_TERM(str,capacity,termlen);\
181} while (0)
/* Resize str's buffer to capacity + termlen bytes.
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise the contents are copied to a newly allocated heap
 *   buffer and the string is switched to non-embedded.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared (asserted).
 * NOTE(review): `capacity` and `termlen` are used unparenthesized in the
 * size comparison, so callers should pass simple expressions. */
182#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
183 if (STR_EMBED_P(str)) {\
184 if (str_embed_capa(str) < capacity + termlen) {\
185 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
186 const long tlen = RSTRING_LEN(str);\
187 memcpy(tmp, RSTRING_PTR(str), tlen);\
188 RSTRING(str)->as.heap.ptr = tmp;\
189 RSTRING(str)->len = tlen;\
190 STR_SET_NOEMBED(str);\
191 RSTRING(str)->as.heap.aux.capa = (capacity);\
192 }\
193 }\
194 else {\
195 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
196 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
197 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
198 RSTRING(str)->as.heap.aux.capa = (capacity);\
199 }\
200} while (0)
201
/* Make str share the buffer owned by shared_str.  Does nothing for fake
 * strings.  Otherwise it asserts that str's pointer lies within
 * shared_str's buffer, records shared_str as str's shared root (through
 * the write barrier), flags str as STR_SHARED and shared_str as
 * STR_SHARED_ROOT, and additionally flags a class-less shared_str as
 * STR_BORROWED. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
211 } \
212} while (0)
213
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
216/* TODO: include the terminator size in capa. */
217
218#define STR_ENC_GET(str) get_encoding(str)
219
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
222#endif
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
225#else
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
227#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa)
244{
245 return offsetof(struct RString, as.embed.ary) + capa;
246}
247
248size_t
249rb_str_size_as_embedded(VALUE str)
250{
251 size_t real_size;
252 if (STR_EMBED_P(str)) {
253 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
254 }
255 /* if the string is not currently embedded, but it can be embedded, how
256 * much space would it require */
257 else if (rb_str_reembeddable_p(str)) {
258 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
259 }
260 else {
261 real_size = sizeof(struct RString);
262 }
263
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
265 real_size += sizeof(st_index_t);
266 }
267
268 return real_size;
269}
270
/* Whether a string of len bytes plus a termlen-byte terminator fits in an
 * object size the GC is able to allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
276
277static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
278static VALUE str_new_frozen(VALUE klass, VALUE orig);
279static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
280static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
281static VALUE str_new(VALUE klass, const char *ptr, long len);
282static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
283static inline void str_modifiable(VALUE str);
284static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
285static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
286
287static inline void
288str_make_independent(VALUE str)
289{
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str), len, 0L, termlen);
293}
294
295static inline int str_dependent_p(VALUE str);
296
297void
298rb_str_make_independent(VALUE str)
299{
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
302 }
303}
304
305void
306rb_str_make_embedded(VALUE str)
307{
308 RUBY_ASSERT(rb_str_reembeddable_p(str));
309 RUBY_ASSERT(!STR_EMBED_P(str));
310
311 char *buf = RSTRING(str)->as.heap.ptr;
312 long len = RSTRING(str)->len;
313
314 STR_SET_EMBED(str);
315 STR_SET_LEN(str, len);
316
317 if (len > 0) {
318 memcpy(RSTRING_PTR(str), buf, len);
319 ruby_xfree(buf);
320 }
321
322 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
323}
324
/* Diagnostic hook: prints to stderr that `func` is about to return NULL,
 * together with a hint on how to debug it. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s%s",
            func,
            " is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n");
}
334
335/* symbols for [up|down|swap]case/capitalize options */
336static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
337
338static rb_encoding *
339get_encoding(VALUE str)
340{
341 return rb_enc_from_index(ENCODING_GET(str));
342}
343
344static void
345mustnot_broken(VALUE str)
346{
347 if (is_broken_string(str)) {
348 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
349 }
350}
351
352static void
353mustnot_wchar(VALUE str)
354{
355 rb_encoding *enc = STR_ENC_GET(str);
356 if (rb_enc_mbminlen(enc) > 1) {
357 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
358 }
359}
360
361static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
362
363#if SIZEOF_LONG == SIZEOF_VOIDP
364#define PRECOMPUTED_FAKESTR_HASH 1
365#else
366#endif
367
368static inline bool
369BARE_STRING_P(VALUE str)
370{
371 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
372}
373
374static inline st_index_t
375str_do_hash(VALUE str)
376{
377 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
378 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
379 if (e && !is_ascii_string(str)) {
380 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
381 }
382 return h;
383}
384
385static VALUE
386str_store_precomputed_hash(VALUE str, st_index_t hash)
387{
388 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
389 RUBY_ASSERT(STR_EMBED_P(str));
390
391#if RUBY_DEBUG
392 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
393 size_t free_bytes = str_embed_capa(str) - used_bytes;
394 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
395#endif
396
397 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
398
399 FL_SET(str, STR_PRECOMPUTED_HASH);
400
401 return str;
402}
403
404VALUE
405rb_fstring(VALUE str)
406{
407 VALUE fstr;
408 int bare;
409
410 Check_Type(str, T_STRING);
411
412 if (FL_TEST(str, RSTRING_FSTR))
413 return str;
414
415 bare = BARE_STRING_P(str);
416 if (!bare) {
417 if (STR_EMBED_P(str)) {
418 OBJ_FREEZE(str);
419 return str;
420 }
421
422 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
424 return str;
425 }
426 }
427
428 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
429 rb_str_resize(str, RSTRING_LEN(str));
430
431 fstr = register_fstring(str, false, false);
432
433 if (!bare) {
434 str_replace_shared_without_enc(str, fstr);
435 OBJ_FREEZE(str);
436 return str;
437 }
438 return fstr;
439}
440
441static VALUE fstring_table_obj;
442
443static VALUE
444fstring_concurrent_set_hash(VALUE str)
445{
446#ifdef PRECOMPUTED_FAKESTR_HASH
447 st_index_t h;
448 if (FL_TEST_RAW(str, STR_FAKESTR)) {
449 // register_fstring precomputes the hash and stores it in capa for fake strings
450 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
451 }
452 else {
453 h = rb_str_hash(str);
454 }
455 // rb_str_hash doesn't include the encoding for ascii only strings, so
456 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
457 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
458#else
459 return (VALUE)rb_str_hash(str);
460#endif
461}
462
463static bool
464fstring_concurrent_set_cmp(VALUE a, VALUE b)
465{
466 long alen, blen;
467 const char *aptr, *bptr;
468
471
472 RSTRING_GETMEM(a, aptr, alen);
473 RSTRING_GETMEM(b, bptr, blen);
474 return (alen == blen &&
475 ENCODING_GET(a) == ENCODING_GET(b) &&
476 memcmp(aptr, bptr, alen) == 0);
477}
478
/* Options handed from register_fstring() to
 * fstring_concurrent_set_create() through its `data` pointer. */
struct fstr_create_arg {
    bool copy;                  /* fake strings: copy the bytes instead of referencing them */
    bool force_precompute_hash; /* fake strings: allocate room for, and store, the hash */
};
483
/*
 * Create callback of the fstring table: builds the object that is actually
 * stored for `str`.  `data` points to a struct fstr_create_arg.
 *
 * Fake strings are materialized either as a copy (arg->copy) or as a
 * static string referencing the same bytes; other strings are replaced by
 * a frozen, unshared, bare String where necessary.  The result is frozen,
 * gets the coderange read from the input, is flagged RSTRING_FSTR and is
 * marked shareable.
 */
484static VALUE
485fstring_concurrent_set_create(VALUE str, void *data)
486{
487 struct fstr_create_arg *arg = data;
488
489 // Unless the string is empty or binary, its coderange has been precomputed.
490 int coderange = ENC_CODERANGE(str);
491
492 if (FL_TEST_RAW(str, STR_FAKESTR)) {
493 if (arg->copy) {
494 VALUE new_str;
495 long len = RSTRING_LEN(str);
/* capa reserves room for the hash stored after the terminator. */
496 long capa = len + sizeof(st_index_t);
497 int term_len = TERM_LEN(str);
498
499 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
500 new_str = str_alloc_embed(rb_cString, capa + term_len);
501 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
502 STR_SET_LEN(new_str, RSTRING_LEN(str));
503 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
504 rb_enc_copy(new_str, str);
505 str_store_precomputed_hash(new_str, str_do_hash(str));
506 }
507 else {
508 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
509 rb_enc_copy(new_str, str);
510#ifdef PRECOMPUTED_FAKESTR_HASH
/* Reuse the hash register_fstring stored in capa, if the new string
 * happens to have enough spare capacity for it. */
511 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
512 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
513 }
514#endif
515 }
516 str = new_str;
517 }
518 else {
/* No copy requested: reference the fake string's bytes directly. */
519 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
520 RSTRING(str)->len,
521 ENCODING_GET(str));
522 }
523 OBJ_FREEZE(str);
524 }
525 else {
526 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
527 str = str_new_frozen(rb_cString, str);
528 }
529 if (STR_SHARED_P(str)) { /* str should not be shared */
530 /* shared substring */
531 str_make_independent(str);
533 }
534 if (!BARE_STRING_P(str)) {
535 str = str_new_frozen(rb_cString, str);
536 }
537 }
538
539 ENC_CODERANGE_SET(str, coderange);
540 RBASIC(str)->flags |= RSTRING_FSTR;
541 if (!RB_OBJ_SHAREABLE_P(str)) {
542 RB_OBJ_SET_SHAREABLE(str);
543 }
544 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
547 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
548 RUBY_ASSERT(!rb_obj_exivar_p(str));
550 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
551
552 return str;
553}
554
/* Callback table handed to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and entry creation; no free callback is installed. */
555static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
556 .hash = fstring_concurrent_set_hash,
557 .cmp = fstring_concurrent_set_cmp,
558 .create = fstring_concurrent_set_create,
559 .free = NULL,
560};
561
562void
563Init_fstring_table(void)
564{
565 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
566 rb_gc_register_address(&fstring_table_obj);
567}
568
569static VALUE
570register_fstring(VALUE str, bool copy, bool force_precompute_hash)
571{
572 struct fstr_create_arg args = {
573 .copy = copy,
574 .force_precompute_hash = force_precompute_hash
575 };
576
577#if SIZEOF_VOIDP == SIZEOF_LONG
578 if (FL_TEST_RAW(str, STR_FAKESTR)) {
579 // if the string hasn't been interned, we'll need the hash twice, so we
580 // compute it once and store it in capa
581 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
582 }
583#endif
584
585 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
586
587 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
589 RUBY_ASSERT(OBJ_FROZEN(result));
591 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
592 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
594
595 return result;
596}
597
598bool
599rb_obj_is_fstring_table(VALUE obj)
600{
601 ASSERT_vm_locking();
602
603 return obj == fstring_table_obj;
604}
605
/* Removes obj from the fstring table by identity, bumps the obj_str_fstr
 * debug counter and clears obj's RSTRING_FSTR flag.  Asserts that the VM
 * lock is held with a barrier. */
606void
607rb_gc_free_fstring(VALUE obj)
608{
609 ASSERT_vm_locking_with_barrier();
610
611 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
612
613 RB_DEBUG_COUNTER_INC(obj_str_fstr);
614
615 FL_UNSET(obj, RSTRING_FSTR);
616}
617
618void
619rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
620{
621 if (fstring_table_obj) {
622 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
623 }
624}
625
626static VALUE
627setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
628{
629 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
630 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
631
632 if (!name) {
634 name = "";
635 }
636
637 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
638
639 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
640 fake_str->len = len;
641 fake_str->as.heap.ptr = (char *)name;
642 fake_str->as.heap.aux.capa = len;
643 return (VALUE)fake_str;
644}
645
646/*
647 * set up a fake string which refers a static string literal.
648 */
649VALUE
650rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
651{
652 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
653}
654
655/*
656 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
657 * shared string which refers a static string literal. `ptr` must
658 * point a constant string.
659 */
660VALUE
661rb_fstring_new(const char *ptr, long len)
662{
663 struct RString fake_str = {RBASIC_INIT};
664 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
665}
666
667VALUE
668rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
669{
670 struct RString fake_str = {RBASIC_INIT};
671 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
672}
673
674VALUE
675rb_fstring_cstr(const char *ptr)
676{
677 return rb_fstring_new(ptr, strlen(ptr));
678}
679
680static inline bool
681single_byte_optimizable(VALUE str)
682{
683 int encindex = ENCODING_GET(str);
684 switch (encindex) {
685 case ENCINDEX_ASCII_8BIT:
686 case ENCINDEX_US_ASCII:
687 return true;
688 case ENCINDEX_UTF_8:
689 // For UTF-8 it's worth scanning the string coderange when unknown.
691 }
692 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
693 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
694 return true;
695 }
696
697 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
698 return true;
699 }
700
701 /* Conservative. Possibly single byte.
702 * "\xa1" in Shift_JIS for example. */
703 return false;
704}
705
707
/*
 * Returns a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * Bytes are examined one machine word at a time where possible, using
 * NONASCII_MASK to test all bytes of a word at once; the leading unaligned
 * bytes (when unaligned word access is unavailable) and the trailing
 * partial word are checked byte by byte.
 */
708static inline const char *
709search_nonascii(const char *p, const char *e)
710{
711 const uintptr_t *s, *t;
712
/* NONASCII_MASK has 0x80 in every byte of a uintptr_t. */
713#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
714# if SIZEOF_UINTPTR_T == 8
715# define NONASCII_MASK UINT64_C(0x8080808080808080)
716# elif SIZEOF_UINTPTR_T == 4
717# define NONASCII_MASK UINT32_C(0x80808080)
718# else
719# error "don't know what to do."
720# endif
721#else
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK 0x80808080UL /* or...? */
726# else
727# error "don't know what to do."
728# endif
729#endif
730
731 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
732#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking the skipped bytes
 * individually; the cases deliberately fall through. */
733 if ((uintptr_t)p % SIZEOF_VOIDP) {
734 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
735 p += l;
736 switch (l) {
737 default: UNREACHABLE;
738#if SIZEOF_VOIDP > 4
739 case 7: if (p[-7]&0x80) return p-7;
740 case 6: if (p[-6]&0x80) return p-6;
741 case 5: if (p[-5]&0x80) return p-5;
742 case 4: if (p[-4]&0x80) return p-4;
743#endif
744 case 3: if (p[-3]&0x80) return p-3;
745 case 2: if (p[-2]&0x80) return p-2;
746 case 1: if (p[-1]&0x80) return p-1;
747 case 0: break;
748 }
749 }
750#endif
751#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
752#define aligned_ptr(value) \
753 __builtin_assume_aligned((value), sizeof(uintptr_t))
754#else
755#define aligned_ptr(value) (uintptr_t *)(value)
756#endif
757 s = aligned_ptr(p);
/* t bounds the word loop so that a full word is always readable at s. */
758 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
759#undef aligned_ptr
760 for (;s < t; s++) {
761 if (*s & NONASCII_MASK) {
/* Locate the first flagged byte in the word: count zero bits from the
 * end that holds the lowest address, then divide by 8. */
762#ifdef WORDS_BIGENDIAN
763 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
764#else
765 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
766#endif
767 }
768 }
769 p = (const char *)s;
770 }
771
/* Check the remaining (fewer than one word) bytes; the cases deliberately
 * fall through. */
772 switch (e - p) {
773 default: UNREACHABLE;
774#if SIZEOF_VOIDP > 4
775 case 7: if (e[-7]&0x80) return e-7;
776 case 6: if (e[-6]&0x80) return e-6;
777 case 5: if (e[-5]&0x80) return e-5;
778 case 4: if (e[-4]&0x80) return e-4;
779#endif
780 case 3: if (e[-3]&0x80) return e-3;
781 case 2: if (e[-2]&0x80) return e-2;
782 case 1: if (e[-1]&0x80) return e-1;
783 case 0: return NULL;
784 }
785}
786
787static int
788coderange_scan(const char *p, long len, rb_encoding *enc)
789{
790 const char *e = p + len;
791
792 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
793 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
794 p = search_nonascii(p, e);
796 }
797
798 if (rb_enc_asciicompat(enc)) {
799 p = search_nonascii(p, e);
800 if (!p) return ENC_CODERANGE_7BIT;
801 for (;;) {
802 int ret = rb_enc_precise_mbclen(p, e, enc);
804 p += MBCLEN_CHARFOUND_LEN(ret);
805 if (p == e) break;
806 p = search_nonascii(p, e);
807 if (!p) break;
808 }
809 }
810 else {
811 while (p < e) {
812 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p += MBCLEN_CHARFOUND_LEN(ret);
815 }
816 }
817 return ENC_CODERANGE_VALID;
818}
819
820long
821rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
822{
823 const char *p = s;
824
825 if (*cr == ENC_CODERANGE_BROKEN)
826 return e - s;
827
828 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
829 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
830 if (*cr == ENC_CODERANGE_VALID) return e - s;
831 p = search_nonascii(p, e);
833 return e - s;
834 }
835 else if (rb_enc_asciicompat(enc)) {
836 p = search_nonascii(p, e);
837 if (!p) {
838 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
839 return e - s;
840 }
841 for (;;) {
842 int ret = rb_enc_precise_mbclen(p, e, enc);
843 if (!MBCLEN_CHARFOUND_P(ret)) {
845 return p - s;
846 }
847 p += MBCLEN_CHARFOUND_LEN(ret);
848 if (p == e) break;
849 p = search_nonascii(p, e);
850 if (!p) break;
851 }
852 }
853 else {
854 while (p < e) {
855 int ret = rb_enc_precise_mbclen(p, e, enc);
856 if (!MBCLEN_CHARFOUND_P(ret)) {
858 return p - s;
859 }
860 p += MBCLEN_CHARFOUND_LEN(ret);
861 }
862 }
864 return e - s;
865}
866
867static inline void
868str_enc_copy(VALUE str1, VALUE str2)
869{
870 rb_enc_set_index(str1, ENCODING_GET(str2));
871}
872
873/* Like str_enc_copy, but does not check frozen status of str1.
874 * You should use this only if you're certain that str1 is not frozen. */
875static inline void
876str_enc_copy_direct(VALUE str1, VALUE str2)
877{
878 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
879 if (inlined_encoding == ENCODING_INLINE_MAX) {
880 rb_enc_set_index(str1, rb_enc_get_index(str2));
881 }
882 else {
883 ENCODING_SET_INLINED(str1, inlined_encoding);
884 }
885}
886
887static void
888rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
889{
890 /* this function is designed for copying encoding and coderange
891 * from src to new string "dest" which is made from the part of src.
892 */
893 str_enc_copy(dest, src);
894 if (RSTRING_LEN(dest) == 0) {
895 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
897 else
899 return;
900 }
901 switch (ENC_CODERANGE(src)) {
904 break;
906 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
907 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
909 else
911 break;
912 default:
913 break;
914 }
915}
916
917static void
918rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
919{
920 str_enc_copy(dest, src);
922}
923
924static int
925enc_coderange_scan(VALUE str, rb_encoding *enc)
926{
927 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
928}
929
930int
931rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
932{
933 return enc_coderange_scan(str, enc);
934}
935
936int
938{
939 int cr = ENC_CODERANGE(str);
940
941 if (cr == ENC_CODERANGE_UNKNOWN) {
942 cr = enc_coderange_scan(str, get_encoding(str));
943 ENC_CODERANGE_SET(str, cr);
944 }
945 return cr;
946}
947
948static inline bool
949rb_enc_str_asciicompat(VALUE str)
950{
951 int encindex = ENCODING_GET_INLINED(str);
952 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
953}
954
955int
957{
958 switch(ENC_CODERANGE(str)) {
960 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
962 return true;
963 default:
964 return false;
965 }
966}
967
968static inline void
969str_mod_check(VALUE s, const char *p, long len)
970{
971 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
972 rb_raise(rb_eRuntimeError, "string modified");
973 }
974}
975
976static size_t
977str_capacity(VALUE str, const int termlen)
978{
979 if (STR_EMBED_P(str)) {
980 return str_embed_capa(str) - termlen;
981 }
982 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
983 return RSTRING(str)->len;
984 }
985 else {
986 return RSTRING(str)->as.heap.aux.capa;
987 }
988}
989
990size_t
992{
993 return str_capacity(str, TERM_LEN(str));
994}
995
996static inline void
997must_not_null(const char *ptr)
998{
999 if (!ptr) {
1000 rb_raise(rb_eArgError, "NULL pointer given");
1001 }
1002}
1003
1004static inline VALUE
1005str_alloc_embed(VALUE klass, size_t capa)
1006{
1007 size_t size = rb_str_embed_size(capa);
1008 RUBY_ASSERT(size > 0);
1009 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1010
1011 NEWOBJ_OF(str, struct RString, klass,
1013
1014 str->len = 0;
1015 str->as.embed.ary[0] = 0;
1016
1017 return (VALUE)str;
1018}
1019
1020static inline VALUE
1021str_alloc_heap(VALUE klass)
1022{
1023 NEWOBJ_OF(str, struct RString, klass,
1024 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1025
1026 str->len = 0;
1027 str->as.heap.aux.capa = 0;
1028 str->as.heap.ptr = NULL;
1029
1030 return (VALUE)str;
1031}
1032
1033static inline VALUE
1034empty_str_alloc(VALUE klass)
1035{
1036 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1037 VALUE str = str_alloc_embed(klass, 0);
1038 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1040 return str;
1041}
1042
1043static VALUE
1044str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1045{
1046 VALUE str;
1047
1048 if (len < 0) {
1049 rb_raise(rb_eArgError, "negative string size (or size too big)");
1050 }
1051
1052 if (enc == NULL) {
1053 enc = rb_ascii8bit_encoding();
1054 }
1055
1056 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1057
1058 int termlen = rb_enc_mbminlen(enc);
1059
1060 if (STR_EMBEDDABLE_P(len, termlen)) {
1061 str = str_alloc_embed(klass, len + termlen);
1062 if (len == 0) {
1063 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1064 }
1065 }
1066 else {
1067 str = str_alloc_heap(klass);
1068 RSTRING(str)->as.heap.aux.capa = len;
1069 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1070 * integer overflow. If we can STATIC_ASSERT that, the following
1071 * mul_add_mul can be reverted to a simple ALLOC_N. */
1072 RSTRING(str)->as.heap.ptr =
1073 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1074 }
1075
1076 rb_enc_raw_set(str, enc);
1077
1078 if (ptr) {
1079 memcpy(RSTRING_PTR(str), ptr, len);
1080 }
1081 else {
1082 memset(RSTRING_PTR(str), 0, len);
1083 }
1084
1085 STR_SET_LEN(str, len);
1086 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1087 return str;
1088}
1089
1090static VALUE
1091str_new(VALUE klass, const char *ptr, long len)
1092{
1093 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1094}
1095
1096VALUE
1097rb_str_new(const char *ptr, long len)
1098{
1099 return str_new(rb_cString, ptr, len);
1100}
1101
1102VALUE
1103rb_usascii_str_new(const char *ptr, long len)
1104{
1105 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1106}
1107
1108VALUE
1109rb_utf8_str_new(const char *ptr, long len)
1110{
1111 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1112}
1113
1114VALUE
1115rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1116{
1117 return str_enc_new(rb_cString, ptr, len, enc);
1118}
1119
1120VALUE
1122{
1123 must_not_null(ptr);
1124 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1125 * memory regions, and that cannot be detected by the MSAN. Just
1126 * trust the programmer that the argument passed here is a sane C
1127 * string. */
1128 __msan_unpoison_string(ptr);
1129 return rb_str_new(ptr, strlen(ptr));
1130}
1131
1132VALUE
1134{
1135 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1136}
1137
1138VALUE
1140{
1141 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1142}
1143
1144VALUE
1146{
1147 must_not_null(ptr);
1148 if (rb_enc_mbminlen(enc) != 1) {
1149 rb_raise(rb_eArgError, "wchar encoding given");
1150 }
1151 return rb_enc_str_new(ptr, strlen(ptr), enc);
1152}
1153
1154static VALUE
1155str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1156{
1157 VALUE str;
1158
1159 if (len < 0) {
1160 rb_raise(rb_eArgError, "negative string size (or size too big)");
1161 }
1162
1163 if (!ptr) {
1164 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1165 }
1166 else {
1167 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1168 str = str_alloc_heap(klass);
1169 RSTRING(str)->len = len;
1170 RSTRING(str)->as.heap.ptr = (char *)ptr;
1171 RSTRING(str)->as.heap.aux.capa = len;
1172 RBASIC(str)->flags |= STR_NOFREE;
1173 rb_enc_associate_index(str, encindex);
1174 }
1175 return str;
1176}
1177
1178VALUE
1179rb_str_new_static(const char *ptr, long len)
1180{
1181 return str_new_static(rb_cString, ptr, len, 0);
1182}
1183
1184VALUE
1186{
1187 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1188}
1189
1190VALUE
1192{
1193 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1194}
1195
1196VALUE
1198{
1199 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1200}
1201
1202static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1203 rb_encoding *from, rb_encoding *to,
1204 int ecflags, VALUE ecopts);
1205
1206static inline bool
1207is_enc_ascii_string(VALUE str, rb_encoding *enc)
1208{
1209 int encidx = rb_enc_to_index(enc);
1210 if (rb_enc_get_index(str) == encidx)
1211 return is_ascii_string(str);
1212 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1213}
1214
1215VALUE
1216rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1217{
1218 long len;
1219 const char *ptr;
1220 VALUE newstr;
1221
1222 if (!to) return str;
1223 if (!from) from = rb_enc_get(str);
1224 if (from == to) return str;
1225 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1226 rb_is_ascii8bit_enc(to)) {
1227 if (STR_ENC_GET(str) != to) {
1228 str = rb_str_dup(str);
1229 rb_enc_associate(str, to);
1230 }
1231 return str;
1232 }
1233
1234 RSTRING_GETMEM(str, ptr, len);
1235 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1236 from, to, ecflags, ecopts);
1237 if (NIL_P(newstr)) {
1238 /* some error, return original */
1239 return str;
1240 }
1241 return newstr;
1242}
1243
1244VALUE
1245rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1246 rb_encoding *from, int ecflags, VALUE ecopts)
1247{
1248 long olen;
1249
1250 olen = RSTRING_LEN(newstr);
1251 if (ofs < -olen || olen < ofs)
1252 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1253 if (ofs < 0) ofs += olen;
1254 if (!from) {
1255 STR_SET_LEN(newstr, ofs);
1256 return rb_str_cat(newstr, ptr, len);
1257 }
1258
1259 rb_str_modify(newstr);
1260 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1261 rb_enc_get(newstr),
1262 ecflags, ecopts);
1263}
1264
1265VALUE
1266rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1267{
1268 STR_SET_LEN(str, 0);
1269 rb_enc_associate(str, enc);
1270 rb_str_cat(str, ptr, len);
1271 return str;
1272}
1273
1274static VALUE
1275str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1276 rb_encoding *from, rb_encoding *to,
1277 int ecflags, VALUE ecopts)
1278{
1279 rb_econv_t *ec;
1281 long olen;
1282 VALUE econv_wrapper;
1283 const unsigned char *start, *sp;
1284 unsigned char *dest, *dp;
1285 size_t converted_output = (size_t)ofs;
1286
1287 olen = rb_str_capacity(newstr);
1288
1289 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1290 RBASIC_CLEAR_CLASS(econv_wrapper);
1291 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1292 if (!ec) return Qnil;
1293 DATA_PTR(econv_wrapper) = ec;
1294
1295 sp = (unsigned char*)ptr;
1296 start = sp;
1297 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1298 (dp = dest + converted_output),
1299 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1301 /* destination buffer short */
1302 size_t converted_input = sp - start;
1303 size_t rest = len - converted_input;
1304 converted_output = dp - dest;
1305 rb_str_set_len(newstr, converted_output);
1306 if (converted_input && converted_output &&
1307 rest < (LONG_MAX / converted_output)) {
1308 rest = (rest * converted_output) / converted_input;
1309 }
1310 else {
1311 rest = olen;
1312 }
1313 olen += rest < 2 ? 2 : rest;
1314 rb_str_resize(newstr, olen);
1315 }
1316 DATA_PTR(econv_wrapper) = 0;
1317 RB_GC_GUARD(econv_wrapper);
1318 rb_econv_close(ec);
1319 switch (ret) {
1320 case econv_finished:
1321 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1322 rb_str_set_len(newstr, len);
1323 rb_enc_associate(newstr, to);
1324 return newstr;
1325
1326 default:
1327 return Qnil;
1328 }
1329}
1330
1331VALUE
1333{
1334 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1335}
1336
1337VALUE
1339{
1340 rb_encoding *ienc;
1341 VALUE str;
1342 const int eidx = rb_enc_to_index(eenc);
1343
1344 if (!ptr) {
1345 return rb_enc_str_new(ptr, len, eenc);
1346 }
1347
1348 /* ASCII-8BIT case, no conversion */
1349 if ((eidx == rb_ascii8bit_encindex()) ||
1350 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1351 return rb_str_new(ptr, len);
1352 }
1353 /* no default_internal or same encoding, no conversion */
1354 ienc = rb_default_internal_encoding();
1355 if (!ienc || eenc == ienc) {
1356 return rb_enc_str_new(ptr, len, eenc);
1357 }
1358 /* ASCII compatible, and ASCII only string, no conversion in
1359 * default_internal */
1360 if ((eidx == rb_ascii8bit_encindex()) ||
1361 (eidx == rb_usascii_encindex()) ||
1362 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1363 return rb_enc_str_new(ptr, len, ienc);
1364 }
1365 /* convert from the given encoding to default_internal */
1366 str = rb_enc_str_new(NULL, 0, ienc);
1367 /* when the conversion failed for some reason, just ignore the
1368 * default_internal and result in the given encoding as-is. */
1369 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1370 rb_str_initialize(str, ptr, len, eenc);
1371 }
1372 return str;
1373}
1374
1375VALUE
1376rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1377{
1378 int eidx = rb_enc_to_index(eenc);
1379 if (eidx == rb_usascii_encindex() &&
1380 !is_ascii_string(str)) {
1381 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1382 return str;
1383 }
1384 rb_enc_associate_index(str, eidx);
1385 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1386}
1387
1388VALUE
1389rb_external_str_new(const char *ptr, long len)
1390{
1391 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1392}
1393
1394VALUE
1396{
1397 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1398}
1399
1400VALUE
1401rb_locale_str_new(const char *ptr, long len)
1402{
1403 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1404}
1405
1406VALUE
1408{
1409 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1410}
1411
1412VALUE
1414{
1415 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1416}
1417
1418VALUE
1420{
1421 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1422}
1423
1424VALUE
1426{
1427 return rb_str_export_to_enc(str, rb_default_external_encoding());
1428}
1429
1430VALUE
1432{
1433 return rb_str_export_to_enc(str, rb_locale_encoding());
1434}
1435
1436VALUE
1438{
1439 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1440}
1441
1442static VALUE
1443str_replace_shared_without_enc(VALUE str2, VALUE str)
1444{
1445 const int termlen = TERM_LEN(str);
1446 char *ptr;
1447 long len;
1448
1449 RSTRING_GETMEM(str, ptr, len);
1450 if (str_embed_capa(str2) >= len + termlen) {
1451 char *ptr2 = RSTRING(str2)->as.embed.ary;
1452 STR_SET_EMBED(str2);
1453 memcpy(ptr2, RSTRING_PTR(str), len);
1454 TERM_FILL(ptr2+len, termlen);
1455 }
1456 else {
1457 VALUE root;
1458 if (STR_SHARED_P(str)) {
1459 root = RSTRING(str)->as.heap.aux.shared;
1460 RSTRING_GETMEM(str, ptr, len);
1461 }
1462 else {
1463 root = rb_str_new_frozen(str);
1464 RSTRING_GETMEM(root, ptr, len);
1465 }
1466 RUBY_ASSERT(OBJ_FROZEN(root));
1467
1468 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1469 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1470 rb_fatal("about to free a possible shared root");
1471 }
1472 char *ptr2 = STR_HEAP_PTR(str2);
1473 if (ptr2 != ptr) {
1474 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1475 }
1476 }
1477 FL_SET(str2, STR_NOEMBED);
1478 RSTRING(str2)->as.heap.ptr = ptr;
1479 STR_SET_SHARED(str2, root);
1480 }
1481
1482 STR_SET_LEN(str2, len);
1483
1484 return str2;
1485}
1486
1487static VALUE
1488str_replace_shared(VALUE str2, VALUE str)
1489{
1490 str_replace_shared_without_enc(str2, str);
1491 rb_enc_cr_str_exact_copy(str2, str);
1492 return str2;
1493}
1494
1495static VALUE
1496str_new_shared(VALUE klass, VALUE str)
1497{
1498 return str_replace_shared(str_alloc_heap(klass), str);
1499}
1500
1501VALUE
1503{
1504 return str_new_shared(rb_obj_class(str), str);
1505}
1506
1507VALUE
1509{
1510 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1511 return str_new_frozen(rb_obj_class(orig), orig);
1512}
1513
1514static VALUE
1515rb_str_new_frozen_String(VALUE orig)
1516{
1517 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1518 return str_new_frozen(rb_cString, orig);
1519}
1520
1521
1522VALUE
1523rb_str_frozen_bare_string(VALUE orig)
1524{
1525 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1526 return str_new_frozen(rb_cString, orig);
1527}
1528
1529VALUE
1530rb_str_tmp_frozen_acquire(VALUE orig)
1531{
1532 if (OBJ_FROZEN_RAW(orig)) return orig;
1533 return str_new_frozen_buffer(0, orig, FALSE);
1534}
1535
1536VALUE
1537rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1538{
1539 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1540 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1541
1542 VALUE str = str_alloc_heap(0);
1543 OBJ_FREEZE(str);
1544 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1545 FL_SET(str, STR_SHARED_ROOT);
1546
1547 size_t capa = str_capacity(orig, TERM_LEN(orig));
1548
1549 /* If the string is embedded then we want to create a copy that is heap
1550 * allocated. If the string is shared then the shared root must be
1551 * embedded, so we want to create a copy. If the string is a shared root
1552 * then it must be embedded, so we want to create a copy. */
1553 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1554 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1555 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1556 }
1557 else {
1558 /* orig must be heap allocated and not shared, so we can safely transfer
1559 * the pointer to str. */
1560 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1561 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1562 RBASIC(orig)->flags &= ~STR_NOFREE;
1563 STR_SET_SHARED(orig, str);
1564 if (RB_OBJ_SHAREABLE_P(orig)) {
1565 RB_OBJ_SET_SHAREABLE(str);
1566 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1567 }
1568 }
1569
1570 RSTRING(str)->len = RSTRING(orig)->len;
1571 RSTRING(str)->as.heap.aux.capa = capa;
1572
1573 return str;
1574}
1575
1576void
1577rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1578{
1579 if (RBASIC_CLASS(tmp) != 0)
1580 return;
1581
1582 if (STR_EMBED_P(tmp)) {
1584 }
1585 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1586 !OBJ_FROZEN_RAW(orig)) {
1587 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1588
1589 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1590 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1591 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1592
1593 /* Unshare orig since the root (tmp) only has this one child. */
1594 FL_UNSET_RAW(orig, STR_SHARED);
1595 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1596 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1598
1599 /* Make tmp embedded and empty so it is safe for sweeping. */
1600 STR_SET_EMBED(tmp);
1601 STR_SET_LEN(tmp, 0);
1602 }
1603 }
1604}
1605
1606static VALUE
1607str_new_frozen(VALUE klass, VALUE orig)
1608{
1609 return str_new_frozen_buffer(klass, orig, TRUE);
1610}
1611
1612static VALUE
1613heap_str_make_shared(VALUE klass, VALUE orig)
1614{
1615 RUBY_ASSERT(!STR_EMBED_P(orig));
1616 RUBY_ASSERT(!STR_SHARED_P(orig));
1618
1619 VALUE str = str_alloc_heap(klass);
1620 STR_SET_LEN(str, RSTRING_LEN(orig));
1621 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1622 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1623 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1624 RBASIC(orig)->flags &= ~STR_NOFREE;
1625 STR_SET_SHARED(orig, str);
1626 if (klass == 0)
1627 FL_UNSET_RAW(str, STR_BORROWED);
1628 return str;
1629}
1630
1631static VALUE
1632str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1633{
1634 VALUE str;
1635
1636 long len = RSTRING_LEN(orig);
1637 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1638 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1639
1640 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1641 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1642 RUBY_ASSERT(STR_EMBED_P(str));
1643 }
1644 else {
1645 if (FL_TEST_RAW(orig, STR_SHARED)) {
1646 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1647 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1648 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1649 RUBY_ASSERT(ofs >= 0);
1650 RUBY_ASSERT(rest >= 0);
1651 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1653
1654 if ((ofs > 0) || (rest > 0) ||
1655 (klass != RBASIC(shared)->klass) ||
1656 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1657 str = str_new_shared(klass, shared);
1658 RUBY_ASSERT(!STR_EMBED_P(str));
1659 RSTRING(str)->as.heap.ptr += ofs;
1660 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1661 }
1662 else {
1663 if (RBASIC_CLASS(shared) == 0)
1664 FL_SET_RAW(shared, STR_BORROWED);
1665 return shared;
1666 }
1667 }
1668 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1669 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1670 STR_SET_EMBED(str);
1671 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1672 STR_SET_LEN(str, RSTRING_LEN(orig));
1673 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1674 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1675 }
1676 else {
1677 if (RB_OBJ_SHAREABLE_P(orig)) {
1678 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1679 }
1680 else {
1681 str = heap_str_make_shared(klass, orig);
1682 }
1683 }
1684 }
1685
1686 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1687 OBJ_FREEZE(str);
1688 return str;
1689}
1690
1691VALUE
1692rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1693{
1694 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1695}
1696
1697static VALUE
1698str_new_empty_String(VALUE str)
1699{
1700 VALUE v = rb_str_new(0, 0);
1701 rb_enc_copy(v, str);
1702 return v;
1703}
1704
1705#define STR_BUF_MIN_SIZE 63
1706
1707VALUE
1709{
1710 if (STR_EMBEDDABLE_P(capa, 1)) {
1711 return str_alloc_embed(rb_cString, capa + 1);
1712 }
1713
1714 VALUE str = str_alloc_heap(rb_cString);
1715
1716 RSTRING(str)->as.heap.aux.capa = capa;
1717 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1718 RSTRING(str)->as.heap.ptr[0] = '\0';
1719
1720 return str;
1721}
1722
1723VALUE
1725{
1726 VALUE str;
1727 long len = strlen(ptr);
1728
1729 str = rb_str_buf_new(len);
1730 rb_str_buf_cat(str, ptr, len);
1731
1732 return str;
1733}
1734
1735VALUE
1737{
1738 return str_new(0, 0, len);
1739}
1740
1741void
1743{
1744 if (STR_EMBED_P(str)) {
1745 RB_DEBUG_COUNTER_INC(obj_str_embed);
1746 }
1747 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1748 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1749 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1750 }
1751 else {
1752 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1753 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1754 }
1755}
1756
1757size_t
1758rb_str_memsize(VALUE str)
1759{
1760 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1761 return STR_HEAP_SIZE(str);
1762 }
1763 else {
1764 return 0;
1765 }
1766}
1767
1768VALUE
1770{
1771 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1772}
1773
1774static inline void str_discard(VALUE str);
1775static void str_shared_replace(VALUE str, VALUE str2);
1776
1777void
1779{
1780 if (str != str2) str_shared_replace(str, str2);
1781}
1782
1783static void
1784str_shared_replace(VALUE str, VALUE str2)
1785{
1786 rb_encoding *enc;
1787 int cr;
1788 int termlen;
1789
1790 RUBY_ASSERT(str2 != str);
1791 enc = STR_ENC_GET(str2);
1792 cr = ENC_CODERANGE(str2);
1793 str_discard(str);
1794 termlen = rb_enc_mbminlen(enc);
1795
1796 STR_SET_LEN(str, RSTRING_LEN(str2));
1797
1798 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1799 STR_SET_EMBED(str);
1800 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1801 rb_enc_associate(str, enc);
1802 ENC_CODERANGE_SET(str, cr);
1803 }
1804 else {
1805 if (STR_EMBED_P(str2)) {
1806 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1807 long len = RSTRING_LEN(str2);
1808 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1809
1810 char *new_ptr = ALLOC_N(char, len + termlen);
1811 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1812 RSTRING(str2)->as.heap.ptr = new_ptr;
1813 STR_SET_LEN(str2, len);
1814 RSTRING(str2)->as.heap.aux.capa = len;
1815 STR_SET_NOEMBED(str2);
1816 }
1817
1818 STR_SET_NOEMBED(str);
1819 FL_UNSET(str, STR_SHARED);
1820 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1821
1822 if (FL_TEST(str2, STR_SHARED)) {
1823 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1824 STR_SET_SHARED(str, shared);
1825 }
1826 else {
1827 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1828 }
1829
1830 /* abandon str2 */
1831 STR_SET_EMBED(str2);
1832 RSTRING_PTR(str2)[0] = 0;
1833 STR_SET_LEN(str2, 0);
1834 rb_enc_associate(str, enc);
1835 ENC_CODERANGE_SET(str, cr);
1836 }
1837}
1838
1839VALUE
1841{
1842 VALUE str;
1843
1844 if (RB_TYPE_P(obj, T_STRING)) {
1845 return obj;
1846 }
1847 str = rb_funcall(obj, idTo_s, 0);
1848 return rb_obj_as_string_result(str, obj);
1849}
1850
1851VALUE
1852rb_obj_as_string_result(VALUE str, VALUE obj)
1853{
1854 if (!RB_TYPE_P(str, T_STRING))
1855 return rb_any_to_s(obj);
1856 return str;
1857}
1858
1859static VALUE
1860str_replace(VALUE str, VALUE str2)
1861{
1862 long len;
1863
1864 len = RSTRING_LEN(str2);
1865 if (STR_SHARED_P(str2)) {
1866 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1868 STR_SET_NOEMBED(str);
1869 STR_SET_LEN(str, len);
1870 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1871 STR_SET_SHARED(str, shared);
1872 rb_enc_cr_str_exact_copy(str, str2);
1873 }
1874 else {
1875 str_replace_shared(str, str2);
1876 }
1877
1878 return str;
1879}
1880
1881static inline VALUE
1882ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1883{
1884 size_t size = rb_str_embed_size(capa);
1885 RUBY_ASSERT(size > 0);
1886 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1887
1888 NEWOBJ_OF(str, struct RString, klass,
1890
1891 str->len = 0;
1892
1893 return (VALUE)str;
1894}
1895
1896static inline VALUE
1897ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1898{
1899 NEWOBJ_OF(str, struct RString, klass,
1900 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1901
1902 str->as.heap.aux.capa = 0;
1903 str->as.heap.ptr = NULL;
1904
1905 return (VALUE)str;
1906}
1907
1908static inline VALUE
1909str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1910{
1911 int encidx = 0;
1912 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1913 encidx = rb_enc_get_index(str);
1914 flags &= ~ENCODING_MASK;
1915 }
1916 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1917 if (encidx) rb_enc_associate_index(dup, encidx);
1918 return dup;
1919}
1920
1921static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1922
1923static inline VALUE
1924str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1925{
1926 VALUE flags = FL_TEST_RAW(str, flag_mask);
1927 long len = RSTRING_LEN(str);
1928
1929 RUBY_ASSERT(STR_EMBED_P(dup));
1930 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1931 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1932 STR_SET_LEN(dup, RSTRING_LEN(str));
1933 return str_duplicate_setup_encoding(str, dup, flags);
1934}
1935
1936static inline VALUE
1937str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1938{
1939 VALUE flags = FL_TEST_RAW(str, flag_mask);
1940 VALUE root = str;
1941 if (FL_TEST_RAW(str, STR_SHARED)) {
1942 root = RSTRING(str)->as.heap.aux.shared;
1943 }
1944 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1945 root = str = str_new_frozen(klass, str);
1946 flags = FL_TEST_RAW(str, flag_mask);
1947 }
1948 RUBY_ASSERT(!STR_SHARED_P(root));
1950
1951 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1952 FL_SET(root, STR_SHARED_ROOT);
1953 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1954 flags |= RSTRING_NOEMBED | STR_SHARED;
1955
1956 STR_SET_LEN(dup, RSTRING_LEN(str));
1957 return str_duplicate_setup_encoding(str, dup, flags);
1958}
1959
1960static inline VALUE
1961str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1962{
1963 if (STR_EMBED_P(str)) {
1964 return str_duplicate_setup_embed(klass, str, dup);
1965 }
1966 else {
1967 return str_duplicate_setup_heap(klass, str, dup);
1968 }
1969}
1970
1971static inline VALUE
1972str_duplicate(VALUE klass, VALUE str)
1973{
1974 VALUE dup;
1975 if (STR_EMBED_P(str)) {
1976 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1977 }
1978 else {
1979 dup = str_alloc_heap(klass);
1980 }
1981
1982 return str_duplicate_setup(klass, str, dup);
1983}
1984
1985VALUE
1987{
1988 return str_duplicate(rb_obj_class(str), str);
1989}
1990
1991/* :nodoc: */
1992VALUE
1993rb_str_dup_m(VALUE str)
1994{
1995 if (LIKELY(BARE_STRING_P(str))) {
1996 return str_duplicate(rb_cString, str);
1997 }
1998 else {
1999 return rb_obj_dup(str);
2000 }
2001}
2002
2003VALUE
2005{
2006 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2007 return str_duplicate(rb_cString, str);
2008}
2009
2010VALUE
2011rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2012{
2013 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2014 VALUE new_str, klass = rb_cString;
2015
2016 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2017 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2018 str_duplicate_setup_embed(klass, str, new_str);
2019 }
2020 else {
2021 new_str = ec_str_alloc_heap(ec, klass);
2022 str_duplicate_setup_heap(klass, str, new_str);
2023 }
2024 if (chilled) {
2025 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2026 }
2027 return new_str;
2028}
2029
2030VALUE
2031rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2032{
2033 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2034 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2035 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2036 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2037 return rb_str_freeze(str);
2038}
2039
2040/*
2041 * The documentation block below uses an include (instead of inline text)
2042 * because the included text has non-ASCII characters (which are not allowed in a C file).
2043 */
2044
2045/*
2046 *
2047 * call-seq:
2048 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2049 *
2050 * :include: doc/string/new.rdoc
2051 *
2052 */
2053
2054static VALUE
2055rb_str_init(int argc, VALUE *argv, VALUE str)
2056{
2057 static ID keyword_ids[2];
2058 VALUE orig, opt, venc, vcapa;
2059 VALUE kwargs[2];
2060 rb_encoding *enc = 0;
2061 int n;
2062
2063 if (!keyword_ids[0]) {
2064 keyword_ids[0] = rb_id_encoding();
2065 CONST_ID(keyword_ids[1], "capacity");
2066 }
2067
2068 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2069 if (!NIL_P(opt)) {
2070 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2071 venc = kwargs[0];
2072 vcapa = kwargs[1];
2073 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2074 enc = rb_to_encoding(venc);
2075 }
2076 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2077 long capa = NUM2LONG(vcapa);
2078 long len = 0;
2079 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2080
2081 if (capa < STR_BUF_MIN_SIZE) {
2082 capa = STR_BUF_MIN_SIZE;
2083 }
2084 if (n == 1) {
2085 StringValue(orig);
2086 len = RSTRING_LEN(orig);
2087 if (capa < len) {
2088 capa = len;
2089 }
2090 if (orig == str) n = 0;
2091 }
2092 str_modifiable(str);
2093 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2094 /* make noembed always */
2095 const size_t size = (size_t)capa + termlen;
2096 const char *const old_ptr = RSTRING_PTR(str);
2097 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2098 char *new_ptr = ALLOC_N(char, size);
2099 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2100 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2101 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2102 RSTRING(str)->as.heap.ptr = new_ptr;
2103 }
2104 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2105 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2106 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2107 }
2108 STR_SET_LEN(str, len);
2109 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2110 if (n == 1) {
2111 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2112 rb_enc_cr_str_exact_copy(str, orig);
2113 }
2114 FL_SET(str, STR_NOEMBED);
2115 RSTRING(str)->as.heap.aux.capa = capa;
2116 }
2117 else if (n == 1) {
2118 rb_str_replace(str, orig);
2119 }
2120 if (enc) {
2121 rb_enc_associate(str, enc);
2123 }
2124 }
2125 else if (n == 1) {
2126 rb_str_replace(str, orig);
2127 }
2128 return str;
2129}
2130
2131/* :nodoc: */
2132static VALUE
2133rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2134{
2135 if (klass != rb_cString) {
2136 return rb_class_new_instance_pass_kw(argc, argv, klass);
2137 }
2138
2139 static ID keyword_ids[2];
2140 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2141 VALUE kwargs[2];
2142 rb_encoding *enc = NULL;
2143
2144 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2145 if (NIL_P(opt)) {
2146 return rb_class_new_instance_pass_kw(argc, argv, klass);
2147 }
2148
2149 keyword_ids[0] = rb_id_encoding();
2150 CONST_ID(keyword_ids[1], "capacity");
2151 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2152 encoding = kwargs[0];
2153 capacity = kwargs[1];
2154
2155 if (n == 1) {
2156 orig = StringValue(orig);
2157 }
2158 else {
2159 orig = Qnil;
2160 }
2161
2162 if (UNDEF_P(encoding)) {
2163 if (!NIL_P(orig)) {
2164 encoding = rb_obj_encoding(orig);
2165 }
2166 }
2167
2168 if (!UNDEF_P(encoding)) {
2169 enc = rb_to_encoding(encoding);
2170 }
2171
2172 // If capacity is nil, we're basically just duping `orig`.
2173 if (UNDEF_P(capacity)) {
2174 if (NIL_P(orig)) {
2175 VALUE empty_str = str_new(klass, "", 0);
2176 if (enc) {
2177 rb_enc_associate(empty_str, enc);
2178 }
2179 return empty_str;
2180 }
2181 VALUE copy = str_duplicate(klass, orig);
2182 rb_enc_associate(copy, enc);
2183 ENC_CODERANGE_CLEAR(copy);
2184 return copy;
2185 }
2186
2187 long capa = 0;
2188 capa = NUM2LONG(capacity);
2189 if (capa < 0) {
2190 capa = 0;
2191 }
2192
2193 if (!NIL_P(orig)) {
2194 long orig_capa = rb_str_capacity(orig);
2195 if (orig_capa > capa) {
2196 capa = orig_capa;
2197 }
2198 }
2199
2200 VALUE str = str_enc_new(klass, NULL, capa, enc);
2201 STR_SET_LEN(str, 0);
2202 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2203
2204 if (!NIL_P(orig)) {
2205 rb_str_buf_append(str, orig);
2206 }
2207
2208 return str;
2209}
2210
#ifdef NONASCII_MASK
/* True unless +c+ has the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in the machine word at +s+.
 *
 * A leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), i.e. either bit7 is clear or
 * bit6 is set.  Per byte this is
 *
 *   lead = bit6 | ~bit7
 *
 * The function evaluates that expression for every byte of the word at
 * once, leaving the answer in bit0 of each byte, and then sums those bits.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* bit0 of every byte := (bit6 of that byte) | !(bit7 of that byte) */
    uintptr_t marks = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

    /* Add up the per-byte marks. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(marks);
#else
    marks += (marks >> 8);
    marks += (marks >> 16);
# if SIZEOF_VOIDP == 8
    marks += (marks >> 32);
# endif
    return (marks & 0xF);
#endif
}
#endif
2250
2251static inline long
2252enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2253{
2254 long c;
2255 const char *q;
2256
2257 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2258 long diff = (long)(e - p);
2259 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2260 }
2261#ifdef NONASCII_MASK
2262 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2263 uintptr_t len = 0;
2264 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2265 const uintptr_t *s, *t;
2266 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2267 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2268 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2269 while (p < (const char *)s) {
2270 if (is_utf8_lead_byte(*p)) len++;
2271 p++;
2272 }
2273 while (s < t) {
2274 len += count_utf8_lead_bytes_with_word(s);
2275 s++;
2276 }
2277 p = (const char *)s;
2278 }
2279 while (p < e) {
2280 if (is_utf8_lead_byte(*p)) len++;
2281 p++;
2282 }
2283 return (long)len;
2284 }
2285#endif
2286 else if (rb_enc_asciicompat(enc)) {
2287 c = 0;
2288 if (ENC_CODERANGE_CLEAN_P(cr)) {
2289 while (p < e) {
2290 if (ISASCII(*p)) {
2291 q = search_nonascii(p, e);
2292 if (!q)
2293 return c + (e - p);
2294 c += q - p;
2295 p = q;
2296 }
2297 p += rb_enc_fast_mbclen(p, e, enc);
2298 c++;
2299 }
2300 }
2301 else {
2302 while (p < e) {
2303 if (ISASCII(*p)) {
2304 q = search_nonascii(p, e);
2305 if (!q)
2306 return c + (e - p);
2307 c += q - p;
2308 p = q;
2309 }
2310 p += rb_enc_mbclen(p, e, enc);
2311 c++;
2312 }
2313 }
2314 return c;
2315 }
2316
2317 for (c=0; p<e; c++) {
2318 p += rb_enc_mbclen(p, e, enc);
2319 }
2320 return c;
2321}
2322
2323long
2324rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2325{
2326 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2327}
2328
/* Counts the characters in [p, e) and reports the coderange through *cr.
 * Note that the value *cr holds on entry is not used.
 */
2332long
2333rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2334{
2335 long c;
2336 const char *q;
2337 int ret;
2338
2339 *cr = 0;
2340 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2341 long diff = (long)(e - p);
2342 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2343 }
2344 else if (rb_enc_asciicompat(enc)) {
2345 c = 0;
2346 while (p < e) {
2347 if (ISASCII(*p)) {
2348 q = search_nonascii(p, e);
2349 if (!q) {
2350 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2351 return c + (e - p);
2352 }
2353 c += q - p;
2354 p = q;
2355 }
2356 ret = rb_enc_precise_mbclen(p, e, enc);
2357 if (MBCLEN_CHARFOUND_P(ret)) {
2358 *cr |= ENC_CODERANGE_VALID;
2359 p += MBCLEN_CHARFOUND_LEN(ret);
2360 }
2361 else {
2363 p++;
2364 }
2365 c++;
2366 }
2367 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2368 return c;
2369 }
2370
2371 for (c=0; p<e; c++) {
2372 ret = rb_enc_precise_mbclen(p, e, enc);
2373 if (MBCLEN_CHARFOUND_P(ret)) {
2374 *cr |= ENC_CODERANGE_VALID;
2375 p += MBCLEN_CHARFOUND_LEN(ret);
2376 }
2377 else {
2379 if (p + rb_enc_mbminlen(enc) <= e)
2380 p += rb_enc_mbminlen(enc);
2381 else
2382 p = e;
2383 }
2384 }
2385 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2386 return c;
2387}
2388
2389/* enc must be str's enc or rb_enc_check(str, str2) */
2390static long
2391str_strlen(VALUE str, rb_encoding *enc)
2392{
2393 const char *p, *e;
2394 int cr;
2395
2396 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2397 if (!enc) enc = STR_ENC_GET(str);
2398 p = RSTRING_PTR(str);
2399 e = RSTRING_END(str);
2400 cr = ENC_CODERANGE(str);
2401
2402 if (cr == ENC_CODERANGE_UNKNOWN) {
2403 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2404 if (cr) ENC_CODERANGE_SET(str, cr);
2405 return n;
2406 }
2407 else {
2408 return enc_strlen(p, e, enc, cr);
2409 }
2410}
2411
2412long
2414{
2415 return str_strlen(str, NULL);
2416}
2417
2418/*
2419 * call-seq:
2420 * length -> integer
2421 *
2422 * :include: doc/string/length.rdoc
2423 *
2424 */
2425
2426VALUE
2428{
2429 return LONG2NUM(str_strlen(str, NULL));
2430}
2431
2432/*
2433 * call-seq:
2434 * bytesize -> integer
2435 *
2436 * :include: doc/string/bytesize.rdoc
2437 *
2438 */
2439
2440VALUE
2441rb_str_bytesize(VALUE str)
2442{
2443 return LONG2NUM(RSTRING_LEN(str));
2444}
2445
2446/*
2447 * call-seq:
2448 * empty? -> true or false
2449 *
2450 * Returns whether the length of +self+ is zero:
2451 *
2452 * 'hello'.empty? # => false
2453 * ' '.empty? # => false
2454 * ''.empty? # => true
2455 *
2456 * Related: see {Querying}[rdoc-ref:String@Querying].
2457 */
2458
2459static VALUE
2460rb_str_empty(VALUE str)
2461{
2462 return RBOOL(RSTRING_LEN(str) == 0);
2463}
2464
2465/*
2466 * call-seq:
2467 * self + other_string -> new_string
2468 *
2469 * Returns a new string containing +other_string+ concatenated to +self+:
2470 *
2471 * 'Hello from ' + self.to_s # => "Hello from main"
2472 *
2473 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2474 */
2475
2476VALUE
2478{
    /* NOTE(review): two lines are missing from this copy: the declarator
     * line (function name and parameter list) before the opening brace,
     * and the continuation of the ENCODING_CODERANGE_SET() call near the
     * end. Restore both before building. */
2479 VALUE str3;
2480 rb_encoding *enc;
2481 char *ptr1, *ptr2, *ptr3;
2482 long len1, len2;
2483 int termlen;
2484
2485 StringValue(str2);
    /* Result encoding comes from rb_enc_check_str(str1, str2). */
2486 enc = rb_enc_check_str(str1, str2);
2487 RSTRING_GETMEM(str1, ptr1, len1);
2488 RSTRING_GETMEM(str2, ptr2, len2);
2489 termlen = rb_enc_mbminlen(enc);
    /* Refuse a combined length that would overflow long. */
2490 if (len1 > LONG_MAX - len2) {
2491 rb_raise(rb_eArgError, "string size too big");
2492 }
    /* New string of len1+len2 bytes: str1's bytes, then str2's bytes,
     * then a termlen-byte terminator. */
2493 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2494 ptr3 = RSTRING_PTR(str3);
2495 memcpy(ptr3, ptr1, len1);
2496 memcpy(ptr3+len1, ptr2, len2);
2497 TERM_FILL(&ptr3[len1+len2], termlen);
2498
2499 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2501 RB_GC_GUARD(str1);
2502 RB_GC_GUARD(str2);
2503 return str3;
2504}
2505
2506/* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2507VALUE
2508rb_str_opt_plus(VALUE str1, VALUE str2)
2509{
    /* NOTE(review): two lines appear to be missing from this copy right
     * after the opening brace; compare with the upstream file. */
2512 long len1, len2;
2513 MAYBE_UNUSED(char) *ptr1, *ptr2;
2514 RSTRING_GETMEM(str1, ptr1, len1);
2515 RSTRING_GETMEM(str2, ptr2, len2);
2516 int enc1 = rb_enc_get_index(str1);
2517 int enc2 = rb_enc_get_index(str2);
2518
    /* Qundef is returned when either encoding index is negative, when
     * the two indexes differ, or when len1 + len2 would overflow long;
     * only otherwise is rb_str_plus() called. */
2519 if (enc1 < 0) {
2520 return Qundef;
2521 }
2522 else if (enc2 < 0) {
2523 return Qundef;
2524 }
2525 else if (enc1 != enc2) {
2526 return Qundef;
2527 }
2528 else if (len1 > LONG_MAX - len2) {
2529 return Qundef;
2530 }
2531 else {
2532 return rb_str_plus(str1, str2);
2533 }
2534
2535}
2536
2537/*
2538 * call-seq:
2539 * self * n -> new_string
2540 *
2541 * Returns a new string containing +n+ copies of +self+:
2542 *
2543 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2544 * 'No!' * 0 # => ""
2545 *
2546 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2547 */
2548
2549VALUE
2551{
    /* NOTE(review): the declarator line (function name and parameter
     * list) between the return type and the opening brace is missing
     * from this copy; restore it before building. */
2552 VALUE str2;
2553 long n, len;
2554 char *ptr2;
2555 int termlen;
2556
    /* Fixnum 1: a duplicate of str. */
2557 if (times == INT2FIX(1)) {
2558 return str_duplicate(rb_cString, str);
2559 }
    /* Fixnum 0: a new empty embedded string carrying str's encoding. */
2560 if (times == INT2FIX(0)) {
2561 str2 = str_alloc_embed(rb_cString, 0);
2562 rb_enc_copy(str2, str);
2563 return str2;
2564 }
2565 len = NUM2LONG(times);
2566 if (len < 0) {
2567 rb_raise(rb_eArgError, "negative argument");
2568 }
    /* Special case: str is a single zero byte, so the result is len
     * zero bytes; obtain zeroed storage instead of copying. */
2569 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2570 if (STR_EMBEDDABLE_P(len, 1)) {
2571 str2 = str_alloc_embed(rb_cString, len + 1);
2572 memset(RSTRING_PTR(str2), 0, len + 1);
2573 }
2574 else {
2575 str2 = str_alloc_heap(rb_cString);
2576 RSTRING(str2)->as.heap.aux.capa = len;
2577 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2578 }
2579 STR_SET_LEN(str2, len);
2580 rb_enc_copy(str2, str);
2581 return str2;
2582 }
    /* Reject a repeat count whose product with the byte length would
     * exceed LONG_MAX. */
2583 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2584 rb_raise(rb_eArgError, "argument too big");
2585 }
2586
    /* From here on len is the byte length of the result. */
2587 len *= RSTRING_LEN(str);
2588 termlen = TERM_LEN(str);
2589 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2590 ptr2 = RSTRING_PTR(str2);
2591 if (len) {
        /* Copy str once, then keep doubling the filled prefix by
         * copying it onto itself; a final memcpy fills the remainder. */
2592 n = RSTRING_LEN(str);
2593 memcpy(ptr2, RSTRING_PTR(str), n);
2594 while (n <= len/2) {
2595 memcpy(ptr2 + n, ptr2, n);
2596 n *= 2;
2597 }
2598 memcpy(ptr2 + n, ptr2, len-n);
2599 }
2600 STR_SET_LEN(str2, len);
2601 TERM_FILL(&ptr2[len], termlen);
2602 rb_enc_cr_str_copy_for_substr(str2, str);
2603
2604 return str2;
2605}
2606
2607/*
2608 * call-seq:
2609 * self % object -> new_string
2610 *
2611 * Returns the result of formatting +object+ into the format specifications
2612 * contained in +self+
2613 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2614 *
2615 * '%05d' % 123 # => "00123"
2616 *
2617 * If +self+ contains multiple format specifications,
2618 * +object+ must be an array or hash containing the objects to be formatted:
2619 *
2620 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2621 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2622 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2623 *
2624 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2625 */
2626
2627static VALUE
2628rb_str_format_m(VALUE str, VALUE arg)
2629{
2630 VALUE tmp = rb_check_array_type(arg);
2631
2632 if (!NIL_P(tmp)) {
2633 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2634 }
2635 return rb_str_format(1, &arg, str);
2636}
2637
2638static inline void
2639rb_check_lockedtmp(VALUE str)
2640{
2641 if (FL_TEST(str, STR_TMPLOCK)) {
2642 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2643 }
2644}
2645
2646// If none of these flags are set, we know we have an modifiable string.
2647// If any is set, we need to do more detailed checks.
2648#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2649static inline void
2650str_modifiable(VALUE str)
2651{
2652 RUBY_ASSERT(ruby_thread_has_gvl_p());
2653
2654 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2655 if (CHILLED_STRING_P(str)) {
2656 CHILLED_STRING_MUTATED(str);
2657 }
2658 rb_check_lockedtmp(str);
2659 rb_check_frozen(str);
2660 }
2661}
2662
2663static inline int
2664str_dependent_p(VALUE str)
2665{
2666 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2667 return FALSE;
2668 }
2669 else {
2670 return TRUE;
2671 }
2672}
2673
2674// If none of these flags are set, we know we have an independent string.
2675// If any is set, we need to do more detailed checks.
2676#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2677static inline int
2678str_independent(VALUE str)
2679{
2680 RUBY_ASSERT(ruby_thread_has_gvl_p());
2681
2682 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2683 str_modifiable(str);
2684 return !str_dependent_p(str);
2685 }
2686 return TRUE;
2687}
2688
/*
 * Gives str a buffer of its own with capacity len + expand (plus termlen
 * terminator bytes), keeping the first len bytes of the current content.
 * A non-embedded string whose new size fits the embedded area is turned
 * into an embedded string; otherwise a new heap buffer is allocated.
 */
2689static void
2690str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2691{
2692 RUBY_ASSERT(ruby_thread_has_gvl_p());
2693
2694 char *ptr;
2695 char *oldptr;
2696 long capa = len + expand;
2697
    /* Never keep more bytes than the new capacity (only possible when
     * expand is negative). */
2698 if (len > capa) len = capa;
2699
2700 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Read the heap pointer before STR_SET_EMBED(), then copy the
         * bytes into the embedded array.
         * NOTE(review): the old heap pointer is not released on this
         * path; presumably callers only get here with a shared or
         * no-free buffer — confirm. */
2701 ptr = RSTRING(str)->as.heap.ptr;
2702 STR_SET_EMBED(str);
2703 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2704 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2705 STR_SET_LEN(str, len);
2706 return;
2707 }
2708
2709 ptr = ALLOC_N(char, (size_t)capa + termlen);
2710 oldptr = RSTRING_PTR(str);
2711 if (oldptr) {
2712 memcpy(ptr, oldptr, len);
2713 }
    /* Free the old buffer only when the string is non-embedded and has
     * neither STR_NOFREE nor STR_SHARED set. */
2714 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2715 xfree(oldptr);
2716 }
2717 STR_SET_NOEMBED(str);
2718 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2719 TERM_FILL(ptr + len, termlen);
2720 RSTRING(str)->as.heap.ptr = ptr;
2721 STR_SET_LEN(str, len);
2722 RSTRING(str)->as.heap.aux.capa = capa;
2723}
2724
/*
 * Prepares str for in-place modification: when str_independent() reports
 * a dependent buffer, the string is made independent first.
 * NOTE(review): one line is missing from this copy just before the
 * closing brace; compare with the upstream file.
 */
2725void
2726rb_str_modify(VALUE str)
2727{
2728 if (!str_independent(str))
2729 str_make_independent(str);
2731}
2732
/*
 * Prepares str for modification and makes room for `expand` more bytes.
 * Raises ArgumentError for a negative expand or when len + expand would
 * reach LONG_MAX.
 * NOTE(review): two lines are missing from this copy: the declarator line
 * (function name and parameter list) before the opening brace, and one
 * line just before the closing brace. Restore both before building.
 */
2733void
2735{
2736 RUBY_ASSERT(ruby_thread_has_gvl_p());
2737
2738 int termlen = TERM_LEN(str);
2739 long len = RSTRING_LEN(str);
2740
2741 if (expand < 0) {
2742 rb_raise(rb_eArgError, "negative expanding string size");
2743 }
2744 if (expand >= LONG_MAX - len) {
2745 rb_raise(rb_eArgError, "string size too big");
2746 }
2747
    /* Dependent buffer: copy into a new buffer of the expanded size.
     * Independent buffer: only resize when extra room was requested. */
2748 if (!str_independent(str)) {
2749 str_make_independent_expand(str, len, expand, termlen);
2750 }
2751 else if (expand > 0) {
2752 RESIZE_CAPA_TERM(str, len + expand, termlen);
2753 }
2755}
2756
2757/* As rb_str_modify(), but don't clear coderange */
/* NOTE(review): two lines are missing from this copy, one before and one
 * after the "Force re-scan later" comment; compare with the upstream file. */
2758static void
2759str_modify_keep_cr(VALUE str)
2760{
2761 if (!str_independent(str))
2762 str_make_independent(str);
2764 /* Force re-scan later */
2766}
2767
2768static inline void
2769str_discard(VALUE str)
2770{
2771 str_modifiable(str);
2772 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2773 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2774 RSTRING(str)->as.heap.ptr = 0;
2775 STR_SET_LEN(str, 0);
2776 }
2777}
2778
/*
 * Raises TypeError when str has no encoding index, and EncCompatError
 * when its encoding is not ASCII compatible; returns normally otherwise.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
2779void
2781{
2782 int encindex = rb_enc_get_index(str);
2783
2784 if (RB_UNLIKELY(encindex == -1)) {
2785 rb_raise(rb_eTypeError, "not encoding capable object");
2786 }
2787
    /* Indexes accepted by str_encindex_fastpath() need no further check. */
2788 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2789 return;
2790 }
2791
2792 rb_encoding *enc = rb_enc_from_index(encindex);
2793 if (!rb_enc_asciicompat(enc)) {
2794 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2795 }
2796}
2797
/*
 * Returns *ptr as a T_STRING. A non-string value is converted with
 * rb_str_to_str() and the converted value is written back to *ptr.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
2798VALUE
2800{
2801 RUBY_ASSERT(ruby_thread_has_gvl_p());
2802
2803 VALUE s = *ptr;
2804 if (!RB_TYPE_P(s, T_STRING)) {
2805 s = rb_str_to_str(s);
2806 *ptr = s;
2807 }
2808 return s;
2809}
2810
/*
 * Converts *ptr with rb_string_value() and returns the byte pointer of
 * the resulting string.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
2811char *
2813{
2814 VALUE str = rb_string_value(ptr);
2815 return RSTRING_PTR(str);
2816}
2817
/* Returns 1 when the first n bytes at s are all zero (also for n <= 0),
 * 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2826
2827static const char *
2828str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2829{
2830 const char *e = s + len;
2831
2832 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2833 if (zero_filled(s, minlen)) return s;
2834 }
2835 return 0;
2836}
2837
2838static char *
2839str_fill_term(VALUE str, char *s, long len, int termlen)
2840{
2841 /* This function assumes that (capa + termlen) bytes of memory
2842 * is allocated, like many other functions in this file.
2843 */
2844 if (str_dependent_p(str)) {
2845 if (!zero_filled(s + len, termlen))
2846 str_make_independent_expand(str, len, 0L, termlen);
2847 }
2848 else {
2849 TERM_FILL(s + len, termlen);
2850 return s;
2851 }
2852 return RSTRING_PTR(str);
2853}
2854
/*
 * Adjusts str after its terminator width changes from oldtermlen to
 * termlen bytes, reallocating only when the existing room is too small
 * or the buffer is dependent and the terminator grows.
 */
2855void
2856rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2857{
    /* capa here counts the content capacity plus the old terminator. */
2858 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2859 long len = RSTRING_LEN(str);
2860
2861 RUBY_ASSERT(capa >= len);
2862 if (capa - len < termlen) {
        /* Not enough room after the content for the new terminator. */
2863 rb_check_lockedtmp(str);
2864 str_make_independent_expand(str, len, 0L, termlen);
2865 }
2866 else if (str_dependent_p(str)) {
        /* A dependent buffer is only replaced when the terminator grows. */
2867 if (termlen > oldtermlen)
2868 str_make_independent_expand(str, len, 0L, termlen);
2869 }
2870 else {
2871 if (!STR_EMBED_P(str)) {
2872 /* modify capa instead of realloc */
2873 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2874 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2875 }
2876 if (termlen > oldtermlen) {
2877 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2878 }
2879 }
2880
2881 return;
2882}
2883
2884static char *
2885str_null_check(VALUE str, int *w)
2886{
2887 char *s = RSTRING_PTR(str);
2888 long len = RSTRING_LEN(str);
2889 rb_encoding *enc = rb_enc_get(str);
2890 const int minlen = rb_enc_mbminlen(enc);
2891
2892 if (minlen > 1) {
2893 *w = 1;
2894 if (str_null_char(s, len, minlen, enc)) {
2895 return NULL;
2896 }
2897 return str_fill_term(str, s, len, minlen);
2898 }
2899 *w = 0;
2900 if (!s || memchr(s, 0, len)) {
2901 return NULL;
2902 }
2903 if (s[len]) {
2904 s = str_fill_term(str, s, len, minlen);
2905 }
2906 return s;
2907}
2908
2909char *
2910rb_str_to_cstr(VALUE str)
2911{
2912 int w;
2913 return str_null_check(str, &w);
2914}
2915
/*
 * Converts *ptr with rb_string_value() and returns the pointer produced
 * by str_null_check(). Raises ArgumentError when that check fails: the
 * "null char" message when w was set, the "null byte" message otherwise.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
2916char *
2918{
2919 VALUE str = rb_string_value(ptr);
2920 int w;
2921 char *s = str_null_check(str, &w);
2922 if (!s) {
2923 if (w) {
2924 rb_raise(rb_eArgError, "string contains null char");
2925 }
2926 rb_raise(rb_eArgError, "string contains null byte");
2927 }
2928 return s;
2929}
2930
2931char *
2932rb_str_fill_terminator(VALUE str, const int newminlen)
2933{
2934 char *s = RSTRING_PTR(str);
2935 long len = RSTRING_LEN(str);
2936 return str_fill_term(str, s, len, newminlen);
2937}
2938
/*
 * Returns the result of rb_check_convert_type_with_id() for T_STRING,
 * using the "String" type name and the idTo_str method id.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
2939VALUE
2941{
2942 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2943 return str;
2944}
2945
2946/*
2947 * call-seq:
2948 * String.try_convert(object) -> object, new_string, or nil
2949 *
2950 * Attempts to convert the given +object+ to a string.
2951 *
2952 * If +object+ is already a string, returns +object+, unmodified.
2953 *
2954 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2955 * calls <tt>object.to_str</tt> and returns the result.
2956 *
2957 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2958 *
2959 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2960 */
2961static VALUE
2962rb_str_s_try_convert(VALUE dummy, VALUE str)
2963{
2964 return rb_check_string_type(str);
2965}
2966
/*
 * Advances from p by up to *nthp characters inside [p, e) and returns the
 * resulting position, never beyond e.
 * What is stored back in *nthp depends on the branch taken:
 *  - fixed-width encodings: *nthp is left at its input value;
 *  - ASCII-compatible encodings: the number of characters still to skip;
 *  - other encodings: the loop counter as the `nth--` loop left it.
 */
2967static char*
2968str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2969{
2970 long nth = *nthp;
2971 if (rb_enc_mbmaxlen(enc) == 1) {
2972 p += nth;
2973 }
2974 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2975 p += nth * rb_enc_mbmaxlen(enc);
2976 }
2977 else if (rb_enc_asciicompat(enc)) {
2978 const char *p2, *e2;
2979 int n;
2980
2981 while (p < e && 0 < nth) {
            /* e2 is where p would land if the next nth characters were
             * all one byte long; beyond e means the request cannot be met. */
2982 e2 = p + nth;
2983 if (e < e2) {
2984 *nthp = nth;
2985 return (char *)e;
2986 }
2987 if (ISASCII(*p)) {
                /* Skip the ASCII run up to the next non-ASCII byte. */
2988 p2 = search_nonascii(p, e2);
2989 if (!p2) {
2990 nth -= e2 - p;
2991 *nthp = nth;
2992 return (char *)e2;
2993 }
2994 nth -= p2 - p;
2995 p = p2;
2996 }
            /* Step over one multibyte character. */
2997 n = rb_enc_mbclen(p, e, enc);
2998 p += n;
2999 nth--;
3000 }
3001 *nthp = nth;
3002 if (nth != 0) {
3003 return (char *)e;
3004 }
3005 return (char *)p;
3006 }
3007 else {
3008 while (p < e && nth--) {
3009 p += rb_enc_mbclen(p, e, enc);
3010 }
3011 }
    /* Clamp positions computed past the end. */
3012 if (p > e) p = e;
3013 *nthp = nth;
3014 return (char*)p;
3015}
3016
3017char*
3018rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3019{
3020 return str_nth_len(p, e, &nth, enc);
3021}
3022
3023static char*
3024str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3025{
3026 if (singlebyte)
3027 p += nth;
3028 else {
3029 p = str_nth_len(p, e, &nth, enc);
3030 }
3031 if (!p) return 0;
3032 if (p > e) p = e;
3033 return (char *)p;
3034}
3035
3036/* char offset to byte offset */
3037static long
3038str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3039{
3040 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3041 if (!pp) return e - p;
3042 return pp - p;
3043}
3044
3045long
3046rb_str_offset(VALUE str, long pos)
3047{
3048 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3049 STR_ENC_GET(str), single_byte_optimizable(str));
3050}
3051
3052#ifdef NONASCII_MASK
/*
 * Advances from p over *nthp characters of UTF-8 text in [p, e), counting
 * characters by their lead bytes. Returns the position reached and stores
 * the number of characters not consumed in *nthp.
 */
3053static char *
3054str_utf8_nth(const char *p, const char *e, long *nthp)
3055{
3056 long nth = *nthp;
    /* Word-at-a-time pass, taken only when both the byte range and the
     * character count exceed two pointer-sized words. */
3057 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3058 const uintptr_t *s, *t;
3059 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up to a word boundary; t: e rounded down. */
3060 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3061 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Count lead bytes one at a time up to the first boundary. */
3062 while (p < (const char *)s) {
3063 if (is_utf8_lead_byte(*p)) nth--;
3064 p++;
3065 }
        /* Then a whole word per step, while at least SIZEOF_VOIDP
         * characters remain to be skipped. */
3066 do {
3067 nth -= count_utf8_lead_bytes_with_word(s);
3068 s++;
3069 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3070 p = (char *)s;
3071 }
    /* Byte-wise tail: stop at the lead byte reached with nth == 0. */
3072 while (p < e) {
3073 if (is_utf8_lead_byte(*p)) {
3074 if (nth == 0) break;
3075 nth--;
3076 }
3077 p++;
3078 }
3079 *nthp = nth;
3080 return (char *)p;
3081}
3082
/* Byte distance from p to the position str_utf8_nth() reaches for nth
 * characters. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const reached = str_utf8_nth(p, e, &remaining);

    return reached - p;
}
3089#endif
3090
3091/* byte offset to char offset */
3092long
3093rb_str_sublen(VALUE str, long pos)
3094{
3095 if (single_byte_optimizable(str) || pos < 0)
3096 return pos;
3097 else {
3098 char *p = RSTRING_PTR(str);
3099 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3100 }
3101}
3102
/*
 * Builds a new string from the len bytes of str starting at byte offset
 * beg. The caller must pass a range inside str (see the assertions).
 * The bytes are copied when the range is not sharable or fits into an
 * embedded string; otherwise the new string shares str's buffer.
 */
3103static VALUE
3104str_subseq(VALUE str, long beg, long len)
3105{
3106 VALUE str2;
3107
3108 RUBY_ASSERT(beg >= 0);
3109 RUBY_ASSERT(len >= 0);
3110 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3111
3112 const int termlen = TERM_LEN(str);
    /* Range that cannot be shared: plain copy. */
3113 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3114 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3115 RB_GC_GUARD(str);
3116 return str2;
3117 }
3118
3119 str2 = str_alloc_heap(rb_cString);
3120 if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough to embed: copy the bytes into str2 itself. */
3121 char *ptr2 = RSTRING(str2)->as.embed.ary;
3122 STR_SET_EMBED(str2);
3123 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3124 TERM_FILL(ptr2+len, termlen);
3125
3126 STR_SET_LEN(str2, len);
3127 RB_GC_GUARD(str);
3128 }
3129 else {
        /* Share str's buffer, then move the start forward by beg and
         * shorten the length if it exceeds len. */
3130 str_replace_shared(str2, str);
3131 RUBY_ASSERT(!STR_EMBED_P(str2));
3132 ENC_CODERANGE_CLEAR(str2);
3133 RSTRING(str2)->as.heap.ptr += beg;
3134 if (RSTRING_LEN(str2) > len) {
3135 STR_SET_LEN(str2, len);
3136 }
3137 }
3138
3139 return str2;
3140}
3141
3142VALUE
3143rb_str_subseq(VALUE str, long beg, long len)
3144{
3145 VALUE str2 = str_subseq(str, beg, len);
3146 rb_enc_cr_str_copy_for_substr(str2, str);
3147 return str2;
3148}
3149
/*
 * Resolves a character-based (beg, *lenp) range on str to bytes.
 * Returns the byte pointer of the first character and stores the byte
 * length of the range in *lenp; returns 0 when the range is invalid.
 * A negative beg counts from the end of the string.
 */
3150char *
3151rb_str_subpos(VALUE str, long beg, long *lenp)
3152{
3153 long len = *lenp;
3154 long slen = -1L;
3155 const long blen = RSTRING_LEN(str);
3156 rb_encoding *enc = STR_ENC_GET(str);
3157 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3158
3159 if (len < 0) return 0;
    /* Reject a negative beg whose negation is still negative. */
3160 if (beg < 0 && -beg < 0) return 0;
3161 if (!blen) {
3162 len = 0;
3163 }
    /* Single-byte strings: characters are bytes, plain arithmetic. */
3164 if (single_byte_optimizable(str)) {
3165 if (beg > blen) return 0;
3166 if (beg < 0) {
3167 beg += blen;
3168 if (beg < 0) return 0;
3169 }
3170 if (len > blen - beg)
3171 len = blen - beg;
3172 if (len < 0) return 0;
3173 p = s + beg;
3174 goto end;
3175 }
3176 if (beg < 0) {
3177 if (len > -beg) len = -beg;
        /* Valid coderange and a range close to the end (relative to
         * blen / 8): walk backwards from e with rb_enc_prev_char(). */
3178 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3179 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3180 beg = -beg;
3181 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3182 p = e;
3183 if (!p) return 0;
3184 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3185 if (!p) return 0;
3186 len = e - p;
3187 goto end;
3188 }
3189 else {
            /* Otherwise turn beg into a non-negative character index
             * using the full character length. */
3190 slen = str_strlen(str, enc);
3191 beg += slen;
3192 if (beg < 0) return 0;
3193 p = s + beg;
3194 if (len == 0) goto end;
3195 }
3196 }
3197 else if (beg > 0 && beg > blen) {
3198 return 0;
3199 }
3200 if (len == 0) {
3201 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3202 p = s + beg;
3203 }
3204#ifdef NONASCII_MASK
    /* Valid UTF-8: use the lead-byte counting helpers. */
3205 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3206 enc == rb_utf8_encoding()) {
3207 p = str_utf8_nth(s, e, &beg);
3208 if (beg > 0) return 0;
3209 len = str_utf8_offset(p, e, len);
3210 }
3211#endif
    /* Fixed-width encoding: multiply by the character size. */
3212 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3213 int char_sz = rb_enc_mbmaxlen(enc);
3214
3215 p = s + beg * char_sz;
3216 if (p > e) {
3217 return 0;
3218 }
3219 else if (len * char_sz > e - p)
3220 len = e - p;
3221 else
3222 len *= char_sz;
3223 }
    /* General case: walk beg characters; landing on e with characters
     * still to skip means beg is out of range. */
3224 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3225 if (beg > 0) return 0;
3226 len = 0;
3227 }
3228 else {
3229 len = str_offset(p, e, len, enc, 0);
3230 }
3231 end:
3232 *lenp = len;
3233 RB_GC_GUARD(str);
3234 return p;
3235}
3236
3237static VALUE str_substr(VALUE str, long beg, long len, int empty);
3238
3239VALUE
3240rb_str_substr(VALUE str, long beg, long len)
3241{
3242 return str_substr(str, beg, len, TRUE);
3243}
3244
3245VALUE
3246rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3247{
3248 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3249}
3250
3251static VALUE
3252str_substr(VALUE str, long beg, long len, int empty)
3253{
3254 char *p = rb_str_subpos(str, beg, &len);
3255
3256 if (!p) return Qnil;
3257 if (!len && !empty) return Qnil;
3258
3259 beg = p - RSTRING_PTR(str);
3260
3261 VALUE str2 = str_subseq(str, beg, len);
3262 rb_enc_cr_str_copy_for_substr(str2, str);
3263 return str2;
3264}
3265
3266/* :nodoc: */
3267VALUE
3269{
    /* NOTE(review): the declarator line (function name and parameter
     * list) between the return type and the opening brace is missing
     * from this copy; restore it before building. */
    /* The chilled flag is dropped first. */
3270 if (CHILLED_STRING_P(str)) {
3271 FL_UNSET_RAW(str, STR_CHILLED);
3272 }
3273
3274 if (OBJ_FROZEN(str)) return str;
    /* Resize to the current length before freezing. */
3275 rb_str_resize(str, RSTRING_LEN(str));
3276 return rb_obj_freeze(str);
3277}
3278
3279/*
3280 * call-seq:
3281 * +string -> new_string or self
3282 *
3283 * Returns +self+ if +self+ is not frozen and can be mutated
3284 * without warning issuance.
3285 *
3286 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3287 *
3288 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3289 */
3290static VALUE
3291str_uplus(VALUE str)
3292{
3293 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3294 return rb_str_dup(str);
3295 }
3296 else {
3297 return str;
3298 }
3299}
3300
3301/*
3302 * call-seq:
3303 * -self -> frozen_string
3304 *
3305 * Returns a frozen string equal to +self+.
3306 *
3307 * The returned string is +self+ if and only if all of the following are true:
3308 *
3309 * - +self+ is already frozen.
3310 * - +self+ is an instance of \String (rather than of a subclass of \String)
3311 * - +self+ has no instance variables set on it.
3312 *
3313 * Otherwise, the returned string is a frozen copy of +self+.
3314 *
3315 * Returning +self+, when possible, saves duplicating +self+;
3316 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3317 *
3318 * It may also save duplicating other, already-existing, strings:
3319 *
3320 * s0 = 'foo'
3321 * s1 = 'foo'
3322 * s0.object_id == s1.object_id # => false
3323 * (-s0).object_id == (-s1).object_id # => true
3324 *
3325 * Note that method #-@ is convenient for defining a constant:
3326 *
3327 * FileName = -'config/database.yml'
3328 *
3329 * While its alias #dedup is better suited for chaining:
3330 *
3331 * 'foo'.dedup.gsub!('o')
3332 *
3333 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3334 */
3335static VALUE
3336str_uminus(VALUE str)
3337{
3338 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3339 str = rb_str_dup(str);
3340 }
3341 return rb_fstring(str);
3342}
3343
3344RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3345#define rb_str_dup_frozen rb_str_new_frozen
3346
/*
 * Sets the STR_TMPLOCK flag on str and returns str. Checks for a frozen
 * string first and raises RuntimeError when the flag is already set.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
3347VALUE
3349{
3350 rb_check_frozen(str);
3351 if (FL_TEST(str, STR_TMPLOCK)) {
3352 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3353 }
3354 FL_SET(str, STR_TMPLOCK);
3355 return str;
3356}
3357
/*
 * Clears the STR_TMPLOCK flag on str and returns str. Checks for a frozen
 * string first and raises RuntimeError when the flag is not set.
 * NOTE(review): the declarator line (function name and parameter list)
 * between the return type and the opening brace is missing from this
 * copy; restore it before building.
 */
3358VALUE
3360{
3361 rb_check_frozen(str);
3362 if (!FL_TEST(str, STR_TMPLOCK)) {
3363 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3364 }
3365 FL_UNSET(str, STR_TMPLOCK);
3366 return str;
3367}
3368
3369VALUE
3370rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3371{
3372 rb_str_locktmp(str);
3373 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3374}
3375
/*
 * Sets the length of str to len without reallocating and writes the
 * terminator after the new end. Raises RuntimeError for a shared string
 * and reports a bug when len is negative or exceeds the capacity.
 * NOTE(review): several lines are missing from this copy: the declarator
 * line (function name and parameter list) before the opening brace, and
 * the statements of three coderange branches that now contain only a
 * comment. Restore them before building.
 */
3376void
3378{
3379 RUBY_ASSERT(ruby_thread_has_gvl_p());
3380
3381 long capa;
3382 const int termlen = TERM_LEN(str);
3383
3384 str_modifiable(str);
3385 if (STR_SHARED_P(str)) {
3386 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3387 }
3388 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3389 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3390 }
3391
    /* Coderange bookkeeping, by how the length changes. */
3392 int cr = ENC_CODERANGE(str);
3393 if (len == 0) {
3394 /* Empty string does not contain non-ASCII */
3396 }
3397 else if (cr == ENC_CODERANGE_UNKNOWN) {
3398 /* Leave unknown. */
3399 }
3400 else if (len > RSTRING_LEN(str)) {
3401 if (ENC_CODERANGE_CLEAN_P(cr)) {
3402 /* Update the coderange regarding the extended part. */
3403 const char *const prev_end = RSTRING_END(str);
3404 const char *const new_end = RSTRING_PTR(str) + len;
3405 rb_encoding *enc = rb_enc_get(str);
3406 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3407 ENC_CODERANGE_SET(str, cr);
3408 }
3409 else if (cr == ENC_CODERANGE_BROKEN) {
3410 /* May be valid now, by appended part. */
3412 }
3413 }
3414 else if (len < RSTRING_LEN(str)) {
3415 if (cr != ENC_CODERANGE_7BIT) {
3416 /* An ASCII-only string stays ASCII-only after truncation. Valid
3417 * and broken may become invalid or valid, so leave unknown. */
3419 }
3420 }
3421
3422 STR_SET_LEN(str, len);
3423 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3424}
3425
/*
 * Changes the length of str to len bytes, reallocating or switching
 * between embedded and heap storage as needed, and returns str.
 * Raises ArgumentError for a negative len.
 * NOTE(review): the body of the first `if` after the local declarations
 * is missing from this copy (one line); restore it before building.
 */
3426VALUE
3427rb_str_resize(VALUE str, long len)
3428{
3429 if (len < 0) {
3430 rb_raise(rb_eArgError, "negative string size (or size too big)");
3431 }
3432
3433 int independent = str_independent(str);
3434 long slen = RSTRING_LEN(str);
3435 const int termlen = TERM_LEN(str);
3436
3437 if (slen > len || (termlen != 1 && slen < len)) {
3439 }
3440
3441 {
3442 long capa;
3443 if (STR_EMBED_P(str)) {
            /* Embedded: stay embedded while the new size fits,
             * otherwise move to a heap buffer. */
3444 if (len == slen) return str;
3445 if (str_embed_capa(str) >= len + termlen) {
3446 STR_SET_LEN(str, len);
3447 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3448 return str;
3449 }
3450 str_make_independent_expand(str, slen, len - slen, termlen);
3451 }
3452 else if (str_embed_capa(str) >= len + termlen) {
            /* Heap string that now fits the embedded area: copy the
             * kept bytes in; the old buffer is freed only when it was
             * independent. */
3453 char *ptr = STR_HEAP_PTR(str);
3454 STR_SET_EMBED(str);
3455 if (slen > len) slen = len;
3456 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3457 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3458 STR_SET_LEN(str, len);
3459 if (independent) ruby_xfree(ptr);
3460 return str;
3461 }
3462 else if (!independent) {
3463 if (len == slen) return str;
3464 str_make_independent_expand(str, slen, len - slen, termlen);
3465 }
        /* Independent heap buffer: reallocate when it is too small, or
         * when the slack exceeds min(len, 1024) bytes. */
3466 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3467 (capa - len) > (len < 1024 ? len : 1024)) {
3468 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3469 (size_t)len + termlen, STR_HEAP_SIZE(str));
3470 RSTRING(str)->as.heap.aux.capa = len;
3471 }
3472 else if (len == slen) return str;
3473 STR_SET_LEN(str, len);
3474 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3475 }
3476 return str;
3477}
3478
3479static void
3480str_ensure_available_capa(VALUE str, long len)
3481{
3482 str_modify_keep_cr(str);
3483
3484 const int termlen = TERM_LEN(str);
3485 long olen = RSTRING_LEN(str);
3486
3487 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3488 rb_raise(rb_eArgError, "string sizes too big");
3489 }
3490
3491 long total = olen + len;
3492 long capa = str_capacity(str, termlen);
3493
3494 if (capa < total) {
3495 if (total >= LONG_MAX / 2) {
3496 capa = total;
3497 }
3498 while (total > capa) {
3499 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3500 }
3501 RESIZE_CAPA_TERM(str, capa, termlen);
3502 }
3503}
3504
3505static VALUE
3506str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3507{
3508 if (keep_cr) {
3509 str_modify_keep_cr(str);
3510 }
3511 else {
3512 rb_str_modify(str);
3513 }
3514 if (len == 0) return 0;
3515
3516 long total, olen, off = -1;
3517 char *sptr;
3518 const int termlen = TERM_LEN(str);
3519
3520 RSTRING_GETMEM(str, sptr, olen);
3521 if (ptr >= sptr && ptr <= sptr + olen) {
3522 off = ptr - sptr;
3523 }
3524
3525 long capa = str_capacity(str, termlen);
3526
3527 if (olen > LONG_MAX - len) {
3528 rb_raise(rb_eArgError, "string sizes too big");
3529 }
3530 total = olen + len;
3531 if (capa < total) {
3532 if (total >= LONG_MAX / 2) {
3533 capa = total;
3534 }
3535 while (total > capa) {
3536 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3537 }
3538 RESIZE_CAPA_TERM(str, capa, termlen);
3539 sptr = RSTRING_PTR(str);
3540 }
3541 if (off != -1) {
3542 ptr = sptr + off;
3543 }
3544 memcpy(sptr + olen, ptr, len);
3545 STR_SET_LEN(str, total);
3546 TERM_FILL(sptr + total, termlen); /* sentinel */
3547
3548 return str;
3549}
3550
3551#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3552#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3553
3554VALUE
3555rb_str_cat(VALUE str, const char *ptr, long len)
3556{
3557 if (len == 0) return str;
3558 if (len < 0) {
3559 rb_raise(rb_eArgError, "negative string size (or size too big)");
3560 }
3561 return str_buf_cat(str, ptr, len);
3562}
3563
3564VALUE
3565rb_str_cat_cstr(VALUE str, const char *ptr)
3566{
3567 must_not_null(ptr);
3568 return rb_str_buf_cat(str, ptr, strlen(ptr));
3569}
3570
/*
 * Appends one byte to str, whose encoding must be ASCII-8BIT or US-ASCII
 * (asserted below). Writes in place when there is spare capacity and
 * falls back to str_buf_cat() otherwise.
 * NOTE(review): two statements are missing from this copy, one in each
 * branch of the final ISASCII(byte) test (the ASCII branch is now empty);
 * restore them before building.
 */
3571static void
3572rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3573{
3574 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3575
3576 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3577 if (UNLIKELY(!str_independent(str))) {
3578 str_make_independent(str);
3579 }
3580
3581 long string_length = -1;
3582 const int null_terminator_length = 1;
3583 char *sptr;
3584 RSTRING_GETMEM(str, sptr, string_length);
3585
3586 // Ensure the resulting string wouldn't be too long.
3587 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3588 rb_raise(rb_eArgError, "string sizes too big");
3589 }
3590
3591 long string_capacity = str_capacity(str, null_terminator_length);
3592
3593 // Get the code range before any modifications since those might clear the code range.
3594 int cr = ENC_CODERANGE(str);
3595
3596 // Check if the string has spare string_capacity to write the new byte.
3597 if (LIKELY(string_capacity >= string_length + 1)) {
3598 // In fast path we can write the new byte and note the string's new length.
3599 sptr[string_length] = byte;
3600 STR_SET_LEN(str, string_length + 1);
3601 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3602 }
3603 else {
3604 // If there's not enough string_capacity, make a call into the general string concatenation function.
3605 str_buf_cat(str, (char *)&byte, 1);
3606 }
3607
3608 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3609 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3610 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3611 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3612 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3613 if (ISASCII(byte)) {
3615 }
3616 else {
3618
3619 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3620 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3621 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3622 }
3623 }
3624 }
3625}
3626
3627RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3628RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3629RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3630
/*
 * Appends len bytes at ptr, whose encoding index is ptr_encindex and
 * whose coderange is ptr_cr (possibly ENC_CODERANGE_UNKNOWN), to str.
 * Works out the encoding index and coderange of the result and sets both
 * on str. When ptr_cr_ret is non-NULL it receives the coderange of the
 * appended bytes as determined here.
 * Raises EncCompatError for incompatible encodings and ArgumentError for
 * a negative len.
 * NOTE(review): one line is missing from this copy after the final
 * rb_raise() call; compare with the upstream file.
 */
3631static VALUE
3632rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3633 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3634{
3635 int str_encindex = ENCODING_GET(str);
3636 int res_encindex;
3637 int str_cr, res_cr;
3638 rb_encoding *str_enc, *ptr_enc;
3639
    /* An empty receiver is treated as 7-bit. */
3640 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3641
3642 if (str_encindex == ptr_encindex) {
        /* Same encoding: scan the appended bytes only when the
         * receiver's coderange is known. */
3643 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3644 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3645 }
3646 }
3647 else {
3648 str_enc = rb_enc_from_index(str_encindex);
3649 ptr_enc = rb_enc_from_index(ptr_encindex);
3650 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* With an ASCII-incompatible encoding involved, only an
             * empty operand is accepted: appending nothing is a no-op,
             * and an empty receiver takes over ptr's encoding. */
3651 if (len == 0)
3652 return str;
3653 if (RSTRING_LEN(str) == 0) {
3654 rb_str_buf_cat(str, ptr, len);
3655 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3656 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3657 return str;
3658 }
3659 goto incompatible;
3660 }
3661 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3662 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3663 }
3664 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3665 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3666 str_cr = rb_enc_str_coderange(str);
3667 }
3668 }
3669 }
3670 if (ptr_cr_ret)
3671 *ptr_cr_ret = ptr_cr;
3672
    /* Different encodings are only compatible when at least one side
     * is 7-bit. */
3673 if (str_encindex != ptr_encindex &&
3674 str_cr != ENC_CODERANGE_7BIT &&
3675 ptr_cr != ENC_CODERANGE_7BIT) {
3676 str_enc = rb_enc_from_index(str_encindex);
3677 ptr_enc = rb_enc_from_index(ptr_encindex);
3678 goto incompatible;
3679 }
3680
    /* Pick the result's encoding index and coderange. */
3681 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3682 res_encindex = str_encindex;
3683 res_cr = ENC_CODERANGE_UNKNOWN;
3684 }
3685 else if (str_cr == ENC_CODERANGE_7BIT) {
3686 if (ptr_cr == ENC_CODERANGE_7BIT) {
3687 res_encindex = str_encindex;
3688 res_cr = ENC_CODERANGE_7BIT;
3689 }
3690 else {
3691 res_encindex = ptr_encindex;
3692 res_cr = ptr_cr;
3693 }
3694 }
3695 else if (str_cr == ENC_CODERANGE_VALID) {
3696 res_encindex = str_encindex;
3697 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3698 res_cr = str_cr;
3699 else
3700 res_cr = ptr_cr;
3701 }
3702 else { /* str_cr == ENC_CODERANGE_BROKEN */
3703 res_encindex = str_encindex;
3704 res_cr = str_cr;
3705 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3706 }
3707
3708 if (len < 0) {
3709 rb_raise(rb_eArgError, "negative string size (or size too big)");
3710 }
3711 str_buf_cat(str, ptr, len);
3712 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3713 return str;
3714
3715 incompatible:
3716 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3717 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3719}
3720
3721VALUE
3722rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3723{
3724 return rb_enc_cr_str_buf_cat(str, ptr, len,
3725 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3726}
3727
3728VALUE
3730{
3731 /* ptr must reference NUL terminated ASCII string. */
3732 int encindex = ENCODING_GET(str);
3733 rb_encoding *enc = rb_enc_from_index(encindex);
3734 if (rb_enc_asciicompat(enc)) {
3735 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3736 encindex, ENC_CODERANGE_7BIT, 0);
3737 }
3738 else {
3739 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3740 while (*ptr) {
3741 unsigned int c = (unsigned char)*ptr;
3742 int len = rb_enc_codelen(c, enc);
3743 rb_enc_mbcput(c, buf, enc);
3744 rb_enc_cr_str_buf_cat(str, buf, len,
3745 encindex, ENC_CODERANGE_VALID, 0);
3746 ptr++;
3747 }
3748 return str;
3749 }
3750}
3751
3752VALUE
3754{
3755 int str2_cr = rb_enc_str_coderange(str2);
3756
3757 if (str_enc_fastpath(str)) {
3758 switch (str2_cr) {
3759 case ENC_CODERANGE_7BIT:
3760 // If RHS is 7bit we can do simple concatenation
3761 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3762 RB_GC_GUARD(str2);
3763 return str;
3765 // If RHS is valid, we can do simple concatenation if encodings are the same
3766 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3767 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3768 int str_cr = ENC_CODERANGE(str);
3769 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3770 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3771 }
3772 RB_GC_GUARD(str2);
3773 return str;
3774 }
3775 }
3776 }
3777
3778 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3779 ENCODING_GET(str2), str2_cr, &str2_cr);
3780
3781 ENC_CODERANGE_SET(str2, str2_cr);
3782
3783 return str;
3784}
3785
3786VALUE
3788{
3789 StringValue(str2);
3790 return rb_str_buf_append(str, str2);
3791}
3792
3793VALUE
3794rb_str_concat_literals(size_t num, const VALUE *strary)
3795{
3796 VALUE str;
3797 size_t i, s = 0;
3798 unsigned long len = 1;
3799
3800 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3801 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3802
3803 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3804 str = rb_str_buf_new(len);
3805 str_enc_copy_direct(str, strary[0]);
3806
3807 for (i = s; i < num; ++i) {
3808 const VALUE v = strary[i];
3809 int encidx = ENCODING_GET(v);
3810
3811 rb_str_buf_append(str, v);
3812 if (encidx != ENCINDEX_US_ASCII) {
3813 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3814 rb_enc_set_index(str, encidx);
3815 }
3816 }
3817 return str;
3818}
3819
3820/*
3821 * call-seq:
3822 * concat(*objects) -> string
3823 *
3824 * :include: doc/string/concat.rdoc
3825 */
3826static VALUE
3827rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3828{
3829 str_modifiable(str);
3830
3831 if (argc == 1) {
3832 return rb_str_concat(str, argv[0]);
3833 }
3834 else if (argc > 1) {
3835 int i;
3836 VALUE arg_str = rb_str_tmp_new(0);
3837 rb_enc_copy(arg_str, str);
3838 for (i = 0; i < argc; i++) {
3839 rb_str_concat(arg_str, argv[i]);
3840 }
3841 rb_str_buf_append(str, arg_str);
3842 }
3843
3844 return str;
3845}
3846
3847/*
3848 * call-seq:
3849 * append_as_bytes(*objects) -> self
3850 *
3851 * Concatenates each object in +objects+ into +self+; returns +self+;
3852 * performs no encoding validation or conversion:
3853 *
3854 * s = 'foo'
3855 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3856 * s.valid_encoding? # => false
3857 * s.append_as_bytes("\xAC 12")
3858 * s.valid_encoding? # => true
3859 *
3860 * When a given object is an integer,
3861 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3863 * appends only the low-order byte (similar to String#setbyte):
3864 *
3865 * s = ""
3866 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3867 * s.bytesize # => 2
3868 *
3869 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3870 */
3871
3872VALUE
3873rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3874{
3875 long needed_capacity = 0;
3876 volatile VALUE t0;
3877 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3878
3879 for (int index = 0; index < argc; index++) {
3880 VALUE obj = argv[index];
3881 enum ruby_value_type type = types[index] = rb_type(obj);
3882 switch (type) {
3883 case T_FIXNUM:
3884 case T_BIGNUM:
3885 needed_capacity++;
3886 break;
3887 case T_STRING:
3888 needed_capacity += RSTRING_LEN(obj);
3889 break;
3890 default:
3891 rb_raise(
3893 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3894 rb_obj_class(obj)
3895 );
3896 break;
3897 }
3898 }
3899
3900 str_ensure_available_capa(str, needed_capacity);
3901 char *sptr = RSTRING_END(str);
3902
3903 for (int index = 0; index < argc; index++) {
3904 VALUE obj = argv[index];
3905 enum ruby_value_type type = types[index];
3906 switch (type) {
3907 case T_FIXNUM:
3908 case T_BIGNUM: {
3909 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3910 char byte = (char)(NUM2INT(obj) & 0xFF);
3911 *sptr = byte;
3912 sptr++;
3913 break;
3914 }
3915 case T_STRING: {
3916 const char *ptr;
3917 long len;
3918 RSTRING_GETMEM(obj, ptr, len);
3919 memcpy(sptr, ptr, len);
3920 sptr += len;
3921 break;
3922 }
3923 default:
3924 rb_bug("append_as_bytes arguments should have been validated");
3925 }
3926 }
3927
3928 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3929 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3930
3931 int cr = ENC_CODERANGE(str);
3932 switch (cr) {
3933 case ENC_CODERANGE_7BIT: {
3934 for (int index = 0; index < argc; index++) {
3935 VALUE obj = argv[index];
3936 enum ruby_value_type type = types[index];
3937 switch (type) {
3938 case T_FIXNUM:
3939 case T_BIGNUM: {
3940 if (!ISASCII(NUM2INT(obj))) {
3941 goto clear_cr;
3942 }
3943 break;
3944 }
3945 case T_STRING: {
3946 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3947 goto clear_cr;
3948 }
3949 break;
3950 }
3951 default:
3952 rb_bug("append_as_bytes arguments should have been validated");
3953 }
3954 }
3955 break;
3956 }
3958 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3959 goto keep_cr;
3960 }
3961 else {
3962 goto clear_cr;
3963 }
3964 break;
3965 default:
3966 goto clear_cr;
3967 break;
3968 }
3969
3970 RB_GC_GUARD(t0);
3971
3972 clear_cr:
3973 // If no fast path was hit, we clear the coderange.
3974 // append_as_bytes is predominently meant to be used in
3975 // buffering situation, hence it's likely the coderange
3976 // will never be scanned, so it's not worth spending time
3977 // precomputing the coderange except for simple and common
3978 // situations.
3980 keep_cr:
3981 return str;
3982}
3983
3984/*
3985 * call-seq:
3986 * self << object -> self
3987 *
3988 * Appends a string representation of +object+ to +self+;
3989 * returns +self+.
3990 *
3991 * If +object+ is a string, appends it to +self+:
3992 *
3993 * s = 'foo'
3994 * s << 'bar' # => "foobar"
3995 * s # => "foobar"
3996 *
3997 * If +object+ is an integer,
3998 * its value is considered a codepoint;
3999 * converts the value to a character before concatenating:
4000 *
4001 * s = 'foo'
4002 * s << 33 # => "foo!"
4003 *
4004 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4005 * and the encoding of +self+ is Encoding::US_ASCII,
4006 * changes the encoding to Encoding::ASCII_8BIT:
4007 *
4008 * s = 'foo'.encode(Encoding::US_ASCII)
4009 * s.encoding # => #<Encoding:US-ASCII>
4010 * s << 0xff # => "foo\xFF"
4011 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4012 *
4013 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4014 *
4015 * s = 'foo'
4016 * s.encoding # => <Encoding:UTF-8>
4017 * s << 0x00110000 # 1114112 out of char range (RangeError)
4018 * s = 'foo'.encode(Encoding::EUC_JP)
4019 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4020 *
4021 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4022 */
4023VALUE
4025{
4026 unsigned int code;
4027 rb_encoding *enc = STR_ENC_GET(str1);
4028 int encidx;
4029
4030 if (RB_INTEGER_TYPE_P(str2)) {
4031 if (rb_num_to_uint(str2, &code) == 0) {
4032 }
4033 else if (FIXNUM_P(str2)) {
4034 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4035 }
4036 else {
4037 rb_raise(rb_eRangeError, "bignum out of char range");
4038 }
4039 }
4040 else {
4041 return rb_str_append(str1, str2);
4042 }
4043
4044 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4045
4046 if (encidx >= 0) {
4047 rb_str_buf_cat_byte(str1, (unsigned char)code);
4048 }
4049 else {
4050 long pos = RSTRING_LEN(str1);
4051 int cr = ENC_CODERANGE(str1);
4052 int len;
4053 char *buf;
4054
4055 switch (len = rb_enc_codelen(code, enc)) {
4056 case ONIGERR_INVALID_CODE_POINT_VALUE:
4057 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4058 break;
4059 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4060 case 0:
4061 rb_raise(rb_eRangeError, "%u out of char range", code);
4062 break;
4063 }
4064 buf = ALLOCA_N(char, len + 1);
4065 rb_enc_mbcput(code, buf, enc);
4066 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4067 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4068 }
4069 rb_str_resize(str1, pos+len);
4070 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4071 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4073 }
4074 else if (cr == ENC_CODERANGE_BROKEN) {
4076 }
4077 ENC_CODERANGE_SET(str1, cr);
4078 }
4079 return str1;
4080}
4081
4082int
4083rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4084{
4085 int encidx = rb_enc_to_index(enc);
4086
4087 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4088 /* US-ASCII automatically extended to ASCII-8BIT */
4089 if (code > 0xFF) {
4090 rb_raise(rb_eRangeError, "%u out of char range", code);
4091 }
4092 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4093 return ENCINDEX_ASCII_8BIT;
4094 }
4095 return encidx;
4096 }
4097 else {
4098 return -1;
4099 }
4100}
4101
4102/*
4103 * call-seq:
 *    prepend(*other_strings) -> self
4105 *
4106 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4107 *
4108 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4109 *
4110 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4111 *
4112 */
4113
4114static VALUE
4115rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4116{
4117 str_modifiable(str);
4118
4119 if (argc == 1) {
4120 rb_str_update(str, 0L, 0L, argv[0]);
4121 }
4122 else if (argc > 1) {
4123 int i;
4124 VALUE arg_str = rb_str_tmp_new(0);
4125 rb_enc_copy(arg_str, str);
4126 for (i = 0; i < argc; i++) {
4127 rb_str_append(arg_str, argv[i]);
4128 }
4129 rb_str_update(str, 0L, 0L, arg_str);
4130 }
4131
4132 return str;
4133}
4134
4135st_index_t
4137{
4138 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4139 st_index_t precomputed_hash;
4140 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4141
4142 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4143 return precomputed_hash;
4144 }
4145
4146 return str_do_hash(str);
4147}
4148
4149int
4151{
4152 long len1, len2;
4153 const char *ptr1, *ptr2;
4154 RSTRING_GETMEM(str1, ptr1, len1);
4155 RSTRING_GETMEM(str2, ptr2, len2);
4156 return (len1 != len2 ||
4157 !rb_str_comparable(str1, str2) ||
4158 memcmp(ptr1, ptr2, len1) != 0);
4159}
4160
4161/*
4162 * call-seq:
4163 * hash -> integer
4164 *
4165 * :include: doc/string/hash.rdoc
4166 *
4167 */
4168
4169static VALUE
4170rb_str_hash_m(VALUE str)
4171{
4172 st_index_t hval = rb_str_hash(str);
4173 return ST2FIX(hval);
4174}
4175
/* Smaller of two values; yields +a+ when they are equal.  Arguments may be
 * evaluated more than once, so pass side-effect free expressions only. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
4177
4178int
4180{
4181 int idx1, idx2;
4182 int rc1, rc2;
4183
4184 if (RSTRING_LEN(str1) == 0) return TRUE;
4185 if (RSTRING_LEN(str2) == 0) return TRUE;
4186 idx1 = ENCODING_GET(str1);
4187 idx2 = ENCODING_GET(str2);
4188 if (idx1 == idx2) return TRUE;
4189 rc1 = rb_enc_str_coderange(str1);
4190 rc2 = rb_enc_str_coderange(str2);
4191 if (rc1 == ENC_CODERANGE_7BIT) {
4192 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4193 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4194 return TRUE;
4195 }
4196 if (rc2 == ENC_CODERANGE_7BIT) {
4197 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4198 return TRUE;
4199 }
4200 return FALSE;
4201}
4202
4203int
4205{
4206 long len1, len2;
4207 const char *ptr1, *ptr2;
4208 int retval;
4209
4210 if (str1 == str2) return 0;
4211 RSTRING_GETMEM(str1, ptr1, len1);
4212 RSTRING_GETMEM(str2, ptr2, len2);
4213 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4214 if (len1 == len2) {
4215 if (!rb_str_comparable(str1, str2)) {
4216 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4217 return 1;
4218 return -1;
4219 }
4220 return 0;
4221 }
4222 if (len1 > len2) return 1;
4223 return -1;
4224 }
4225 if (retval > 0) return 1;
4226 return -1;
4227}
4228
4229/*
4230 * call-seq:
4231 * self == object -> true or false
4232 *
4233 * Returns whether +object+ is equal to +self+.
4234 *
4235 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4236 *
4237 * s = 'foo'
4238 * s == 'foo' # => true
4239 * s == 'food' # => false
4240 * s == 'FOO' # => false
4241 *
4242 * Returns +false+ if the two strings' encodings are not compatible:
4243 *
4244 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4245 *
4246 * When +object+ is not a string:
4247 *
4248 * - If +object+ responds to method <tt>to_str</tt>,
4249 * <tt>object == self</tt> is called and its return value is returned.
4250 * - If +object+ does not respond to <tt>to_str</tt>,
4251 * +false+ is returned.
4252 *
4253 * Related: {Comparing}[rdoc-ref:String@Comparing].
4254 */
4255
4256VALUE
4258{
4259 if (str1 == str2) return Qtrue;
4260 if (!RB_TYPE_P(str2, T_STRING)) {
4261 if (!rb_respond_to(str2, idTo_str)) {
4262 return Qfalse;
4263 }
4264 return rb_equal(str2, str1);
4265 }
4266 return rb_str_eql_internal(str1, str2);
4267}
4268
4269/*
4270 * call-seq:
4271 * eql?(object) -> true or false
4272 *
4273 * :include: doc/string/eql_p.rdoc
4274 *
4275 */
4276
4277VALUE
4278rb_str_eql(VALUE str1, VALUE str2)
4279{
4280 if (str1 == str2) return Qtrue;
4281 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4282 return rb_str_eql_internal(str1, str2);
4283}
4284
4285/*
4286 * call-seq:
4287 * self <=> other_string -> -1, 0, 1, or nil
4288 *
4289 * Compares +self+ and +other_string+, returning:
4290 *
4291 * - -1 if +other_string+ is larger.
4292 * - 0 if the two are equal.
4293 * - 1 if +other_string+ is smaller.
4294 * - +nil+ if the two are incomparable.
4295 *
4296 * Examples:
4297 *
4298 * 'foo' <=> 'foo' # => 0
4299 * 'foo' <=> 'food' # => -1
4300 * 'food' <=> 'foo' # => 1
4301 * 'FOO' <=> 'foo' # => -1
4302 * 'foo' <=> 'FOO' # => 1
4303 * 'foo' <=> 1 # => nil
4304 *
4305 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4306 */
4307
4308static VALUE
4309rb_str_cmp_m(VALUE str1, VALUE str2)
4310{
4311 int result;
4312 VALUE s = rb_check_string_type(str2);
4313 if (NIL_P(s)) {
4314 return rb_invcmp(str1, str2);
4315 }
4316 result = rb_str_cmp(str1, s);
4317 return INT2FIX(result);
4318}
4319
4320static VALUE str_casecmp(VALUE str1, VALUE str2);
4321static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4322
4323/*
4324 * call-seq:
4325 * casecmp(other_string) -> -1, 0, 1, or nil
4326 *
4327 * Ignoring case, compares +self+ and +other_string+; returns:
4328 *
4329 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4330 * - 0 if the two are equal.
4331 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4332 * - +nil+ if the two are incomparable.
4333 *
4334 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4335 *
4336 * Examples:
4337 *
4338 * 'foo'.casecmp('goo') # => -1
4339 * 'goo'.casecmp('foo') # => 1
4340 * 'foo'.casecmp('food') # => -1
4341 * 'food'.casecmp('foo') # => 1
4342 * 'FOO'.casecmp('foo') # => 0
4343 * 'foo'.casecmp('FOO') # => 0
4344 * 'foo'.casecmp(1) # => nil
4345 *
4346 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4347 */
4348
4349static VALUE
4350rb_str_casecmp(VALUE str1, VALUE str2)
4351{
4352 VALUE s = rb_check_string_type(str2);
4353 if (NIL_P(s)) {
4354 return Qnil;
4355 }
4356 return str_casecmp(str1, s);
4357}
4358
4359static VALUE
4360str_casecmp(VALUE str1, VALUE str2)
4361{
4362 long len;
4363 rb_encoding *enc;
4364 const char *p1, *p1end, *p2, *p2end;
4365
4366 enc = rb_enc_compatible(str1, str2);
4367 if (!enc) {
4368 return Qnil;
4369 }
4370
4371 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4372 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4373 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4374 while (p1 < p1end && p2 < p2end) {
4375 if (*p1 != *p2) {
4376 unsigned int c1 = TOLOWER(*p1 & 0xff);
4377 unsigned int c2 = TOLOWER(*p2 & 0xff);
4378 if (c1 != c2)
4379 return INT2FIX(c1 < c2 ? -1 : 1);
4380 }
4381 p1++;
4382 p2++;
4383 }
4384 }
4385 else {
4386 while (p1 < p1end && p2 < p2end) {
4387 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4388 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4389
4390 if (0 <= c1 && 0 <= c2) {
4391 c1 = TOLOWER(c1);
4392 c2 = TOLOWER(c2);
4393 if (c1 != c2)
4394 return INT2FIX(c1 < c2 ? -1 : 1);
4395 }
4396 else {
4397 int r;
4398 l1 = rb_enc_mbclen(p1, p1end, enc);
4399 l2 = rb_enc_mbclen(p2, p2end, enc);
4400 len = l1 < l2 ? l1 : l2;
4401 r = memcmp(p1, p2, len);
4402 if (r != 0)
4403 return INT2FIX(r < 0 ? -1 : 1);
4404 if (l1 != l2)
4405 return INT2FIX(l1 < l2 ? -1 : 1);
4406 }
4407 p1 += l1;
4408 p2 += l2;
4409 }
4410 }
4411 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4412 if (p1 == p1end) return INT2FIX(-1);
4413 return INT2FIX(1);
4414}
4415
4416/*
4417 * call-seq:
4418 * casecmp?(other_string) -> true, false, or nil
4419 *
4420 * Returns +true+ if +self+ and +other_string+ are equal after
4421 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4422 *
4423 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4424 *
4425 * Examples:
4426 *
4427 * 'foo'.casecmp?('goo') # => false
4428 * 'goo'.casecmp?('foo') # => false
4429 * 'foo'.casecmp?('food') # => false
4430 * 'food'.casecmp?('foo') # => false
4431 * 'FOO'.casecmp?('foo') # => true
4432 * 'foo'.casecmp?('FOO') # => true
4433 * 'foo'.casecmp?(1) # => nil
4434 *
4435 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4436 */
4437
4438static VALUE
4439rb_str_casecmp_p(VALUE str1, VALUE str2)
4440{
4441 VALUE s = rb_check_string_type(str2);
4442 if (NIL_P(s)) {
4443 return Qnil;
4444 }
4445 return str_casecmp_p(str1, s);
4446}
4447
4448static VALUE
4449str_casecmp_p(VALUE str1, VALUE str2)
4450{
4451 rb_encoding *enc;
4452 VALUE folded_str1, folded_str2;
4453 VALUE fold_opt = sym_fold;
4454
4455 enc = rb_enc_compatible(str1, str2);
4456 if (!enc) {
4457 return Qnil;
4458 }
4459
4460 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4461 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4462
4463 return rb_str_eql(folded_str1, folded_str2);
4464}
4465
4466static long
4467strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4468 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4469{
4470 const char *search_start = str_ptr;
4471 long pos, search_len = str_len - offset;
4472
4473 for (;;) {
4474 const char *t;
4475 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4476 if (pos < 0) return pos;
4477 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4478 if (t == search_start + pos) break;
4479 search_len -= t - search_start;
4480 if (search_len <= 0) return -1;
4481 offset += t - search_start;
4482 search_start = t;
4483 }
4484 return pos + offset;
4485}
4486
/*
 * Both return the found index in bytes.  rb_str_index takes +offset+ in
 * characters (in_byte = 0), rb_str_byteindex takes it in bytes (in_byte = 1).
 */
#define rb_str_index(str, sub, offset)     rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4490
4491static long
4492rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4493{
4494 const char *str_ptr, *str_ptr_end, *sub_ptr;
4495 long str_len, sub_len;
4496 rb_encoding *enc;
4497
4498 enc = rb_enc_check(str, sub);
4499 if (is_broken_string(sub)) return -1;
4500
4501 str_ptr = RSTRING_PTR(str);
4502 str_ptr_end = RSTRING_END(str);
4503 str_len = RSTRING_LEN(str);
4504 sub_ptr = RSTRING_PTR(sub);
4505 sub_len = RSTRING_LEN(sub);
4506
4507 if (str_len < sub_len) return -1;
4508
4509 if (offset != 0) {
4510 long str_len_char, sub_len_char;
4511 int single_byte = single_byte_optimizable(str);
4512 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4513 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4514 if (offset < 0) {
4515 offset += str_len_char;
4516 if (offset < 0) return -1;
4517 }
4518 if (str_len_char - offset < sub_len_char) return -1;
4519 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4520 str_ptr += offset;
4521 }
4522 if (sub_len == 0) return offset;
4523
4524 /* need proceed one character at a time */
4525 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4526}
4527
4528
4529/*
4530 * call-seq:
4531 * index(pattern, offset = 0) -> integer or nil
4532 *
4533 * :include: doc/string/index.rdoc
4534 *
4535 */
4536
4537static VALUE
4538rb_str_index_m(int argc, VALUE *argv, VALUE str)
4539{
4540 VALUE sub;
4541 VALUE initpos;
4542 rb_encoding *enc = STR_ENC_GET(str);
4543 long pos;
4544
4545 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4546 long slen = str_strlen(str, enc); /* str's enc */
4547 pos = NUM2LONG(initpos);
4548 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4549 if (RB_TYPE_P(sub, T_REGEXP)) {
4551 }
4552 return Qnil;
4553 }
4554 }
4555 else {
4556 pos = 0;
4557 }
4558
4559 if (RB_TYPE_P(sub, T_REGEXP)) {
4560 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4561 enc, single_byte_optimizable(str));
4562
4563 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4564 VALUE match = rb_backref_get();
4565 struct re_registers *regs = RMATCH_REGS(match);
4566 pos = rb_str_sublen(str, BEG(0));
4567 return LONG2NUM(pos);
4568 }
4569 }
4570 else {
4571 StringValue(sub);
4572 pos = rb_str_index(str, sub, pos);
4573 if (pos >= 0) {
4574 pos = rb_str_sublen(str, pos);
4575 return LONG2NUM(pos);
4576 }
4577 }
4578 return Qnil;
4579}
4580
4581/* Ensure that the given pos is a valid character boundary.
4582 * Note that in this function, "character" means a code point
4583 * (Unicode scalar value), not a grapheme cluster.
4584 */
4585static void
4586str_ensure_byte_pos(VALUE str, long pos)
4587{
4588 if (!single_byte_optimizable(str)) {
4589 const char *s = RSTRING_PTR(str);
4590 const char *e = RSTRING_END(str);
4591 const char *p = s + pos;
4592 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4593 rb_raise(rb_eIndexError,
4594 "offset %ld does not land on character boundary", pos);
4595 }
4596 }
4597}
4598
4599/*
4600 * call-seq:
4601 * byteindex(object, offset = 0) -> integer or nil
4602 *
4603 * Returns the 0-based integer index of a substring of +self+
4604 * specified by +object+ (a string or Regexp) and +offset+,
4605 * or +nil+ if there is no such substring;
4606 * the returned index is the count of _bytes_ (not characters).
4607 *
4608 * When +object+ is a string,
4609 * returns the index of the first found substring equal to +object+:
4610 *
4611 * s = 'foo' # => "foo"
4612 * s.size # => 3 # Three 1-byte characters.
4613 * s.bytesize # => 3 # Three bytes.
4614 * s.byteindex('f') # => 0
4615 * s.byteindex('o') # => 1
4616 * s.byteindex('oo') # => 1
4617 * s.byteindex('ooo') # => nil
4618 *
4619 * When +object+ is a Regexp,
4620 * returns the index of the first found substring matching +object+;
4621 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4622 *
4623 * s = 'foo'
4624 * s.byteindex(/f/) # => 0
4625 * $~ # => #<MatchData "f">
4626 * s.byteindex(/o/) # => 1
4627 * s.byteindex(/oo/) # => 1
4628 * s.byteindex(/ooo/) # => nil
4629 * $~ # => nil
4630 *
4631 * \Integer argument +offset+, if given, specifies the 0-based index
4632 * of the byte where searching is to begin.
4633 *
4634 * When +offset+ is non-negative,
4635 * searching begins at byte position +offset+:
4636 *
4637 * s = 'foo'
4638 * s.byteindex('o', 1) # => 1
4639 * s.byteindex('o', 2) # => 2
4640 * s.byteindex('o', 3) # => nil
4641 *
4642 * When +offset+ is negative, counts backward from the end of +self+:
4643 *
4644 * s = 'foo'
4645 * s.byteindex('o', -1) # => 2
4646 * s.byteindex('o', -2) # => 1
4647 * s.byteindex('o', -3) # => 1
4648 * s.byteindex('o', -4) # => nil
4649 *
4650 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4651 *
4652 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4653 * s.size # => 2 # Two 3-byte characters.
4654 * s.bytesize # => 6 # Six bytes.
4655 * s.byteindex("\uFFFF") # => 0
4656 * s.byteindex("\uFFFF", 1) # Raises IndexError
4657 * s.byteindex("\uFFFF", 2) # Raises IndexError
4658 * s.byteindex("\uFFFF", 3) # => 3
4659 * s.byteindex("\uFFFF", 4) # Raises IndexError
4660 * s.byteindex("\uFFFF", 5) # Raises IndexError
4661 * s.byteindex("\uFFFF", 6) # => nil
4662 *
4663 * Related: see {Querying}[rdoc-ref:String@Querying].
4664 */
4665
4666static VALUE
4667rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4668{
4669 VALUE sub;
4670 VALUE initpos;
4671 long pos;
4672
4673 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4674 long slen = RSTRING_LEN(str);
4675 pos = NUM2LONG(initpos);
4676 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4677 if (RB_TYPE_P(sub, T_REGEXP)) {
4679 }
4680 return Qnil;
4681 }
4682 }
4683 else {
4684 pos = 0;
4685 }
4686
4687 str_ensure_byte_pos(str, pos);
4688
4689 if (RB_TYPE_P(sub, T_REGEXP)) {
4690 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4691 VALUE match = rb_backref_get();
4692 struct re_registers *regs = RMATCH_REGS(match);
4693 pos = BEG(0);
4694 return LONG2NUM(pos);
4695 }
4696 }
4697 else {
4698 StringValue(sub);
4699 pos = rb_str_byteindex(str, sub, pos);
4700 if (pos >= 0) return LONG2NUM(pos);
4701 }
4702 return Qnil;
4703}
4704
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(): returns a pointer to the last
 * byte equal to +chr+ within the first +search_len+ bytes of +search_str+,
 * or NULL when there is none.  As with the libc function, +chr+ is converted
 * to unsigned char before comparing.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4717
4718static long
4719str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4720{
4721 char *hit, *adjusted;
4722 int c;
4723 long slen, searchlen;
4724 char *sbeg, *e, *t;
4725
4726 sbeg = RSTRING_PTR(str);
4727 slen = RSTRING_LEN(sub);
4728 if (slen == 0) return s - sbeg;
4729 e = RSTRING_END(str);
4730 t = RSTRING_PTR(sub);
4731 c = *t & 0xff;
4732 searchlen = s - sbeg + 1;
4733
4734 if (memcmp(s, t, slen) == 0) {
4735 return s - sbeg;
4736 }
4737
4738 do {
4739 hit = memrchr(sbeg, c, searchlen);
4740 if (!hit) break;
4741 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4742 if (hit != adjusted) {
4743 searchlen = adjusted - sbeg;
4744 continue;
4745 }
4746 if (memcmp(hit, t, slen) == 0)
4747 return hit - sbeg;
4748 searchlen = adjusted - sbeg;
4749 } while (searchlen > 0);
4750
4751 return -1;
4752}
4753
4754/* found index in byte */
4755static long
4756rb_str_rindex(VALUE str, VALUE sub, long pos)
4757{
4758 long len, slen;
4759 char *sbeg, *s;
4760 rb_encoding *enc;
4761 int singlebyte;
4762
4763 enc = rb_enc_check(str, sub);
4764 if (is_broken_string(sub)) return -1;
4765 singlebyte = single_byte_optimizable(str);
4766 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4767 slen = str_strlen(sub, enc); /* rb_enc_check */
4768
4769 /* substring longer than string */
4770 if (len < slen) return -1;
4771 if (len - pos < slen) pos = len - slen;
4772 if (len == 0) return pos;
4773
4774 sbeg = RSTRING_PTR(str);
4775
4776 if (pos == 0) {
4777 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4778 return 0;
4779 else
4780 return -1;
4781 }
4782
4783 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4784 return str_rindex(str, sub, s, enc);
4785}
4786
4787/*
4788 * call-seq:
4789 * rindex(pattern, offset = self.length) -> integer or nil
4790 *
4791 * :include:doc/string/rindex.rdoc
4792 *
4793 */
4794
4795static VALUE
4796rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4797{
4798 VALUE sub;
4799 VALUE initpos;
4800 rb_encoding *enc = STR_ENC_GET(str);
4801 long pos, len = str_strlen(str, enc); /* str's enc */
4802
4803 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4804 pos = NUM2LONG(initpos);
4805 if (pos < 0 && (pos += len) < 0) {
4806 if (RB_TYPE_P(sub, T_REGEXP)) {
4808 }
4809 return Qnil;
4810 }
4811 if (pos > len) pos = len;
4812 }
4813 else {
4814 pos = len;
4815 }
4816
4817 if (RB_TYPE_P(sub, T_REGEXP)) {
4818 /* enc = rb_enc_check(str, sub); */
4819 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4820 enc, single_byte_optimizable(str));
4821
4822 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4823 VALUE match = rb_backref_get();
4824 struct re_registers *regs = RMATCH_REGS(match);
4825 pos = rb_str_sublen(str, BEG(0));
4826 return LONG2NUM(pos);
4827 }
4828 }
4829 else {
4830 StringValue(sub);
4831 pos = rb_str_rindex(str, sub, pos);
4832 if (pos >= 0) {
4833 pos = rb_str_sublen(str, pos);
4834 return LONG2NUM(pos);
4835 }
4836 }
4837 return Qnil;
4838}
4839
4840static long
4841rb_str_byterindex(VALUE str, VALUE sub, long pos)
4842{
4843 long len, slen;
4844 char *sbeg, *s;
4845 rb_encoding *enc;
4846
4847 enc = rb_enc_check(str, sub);
4848 if (is_broken_string(sub)) return -1;
4849 len = RSTRING_LEN(str);
4850 slen = RSTRING_LEN(sub);
4851
4852 /* substring longer than string */
4853 if (len < slen) return -1;
4854 if (len - pos < slen) pos = len - slen;
4855 if (len == 0) return pos;
4856
4857 sbeg = RSTRING_PTR(str);
4858
4859 if (pos == 0) {
4860 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4861 return 0;
4862 else
4863 return -1;
4864 }
4865
4866 s = sbeg + pos;
4867 return str_rindex(str, sub, s, enc);
4868}
4869
4870/*
4871 * call-seq:
4872 * byterindex(object, offset = self.bytesize) -> integer or nil
4873 *
4874 * Returns the 0-based integer index of a substring of +self+
4875 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4876 * or +nil+ if there is no such substring;
4877 * the returned index is the count of _bytes_ (not characters).
4878 *
4879 * When +object+ is a string,
4880 * returns the index of the _last_ found substring equal to +object+:
4881 *
4882 * s = 'foo' # => "foo"
4883 * s.size # => 3 # Three 1-byte characters.
4884 * s.bytesize # => 3 # Three bytes.
4885 * s.byterindex('f') # => 0
 *    s.byterindex('o')   # => 2
 *    s.byterindex('oo')  # => 1
 *    s.byterindex('ooo') # => nil
4889 *
4890 * When +object+ is a Regexp,
4891 * returns the index of the last found substring matching +object+;
4892 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4893 *
4894 * s = 'foo'
4895 * s.byterindex(/f/) # => 0
4896 * $~ # => #<MatchData "f">
4897 * s.byterindex(/o/) # => 2
4898 * s.byterindex(/oo/) # => 1
4899 * s.byterindex(/ooo/) # => nil
4900 * $~ # => nil
4901 *
4902 * The last match means starting at the possible last position,
4903 * not the last of the longest matches:
4904 *
4905 * s = 'foo'
4906 * s.byterindex(/o+/) # => 2
4907 * $~ #=> #<MatchData "o">
4908 *
4909 * To get the last longest match, use a negative lookbehind:
4910 *
4911 * s = 'foo'
4912 * s.byterindex(/(?<!o)o+/) # => 1
4913 * $~ # => #<MatchData "oo">
4914 *
4915 * Or use method #byteindex with negative lookahead:
4916 *
4917 * s = 'foo'
4918 * s.byteindex(/o+(?!.*o)/) # => 1
4919 * $~ #=> #<MatchData "oo">
4920 *
4921 * \Integer argument +offset+, if given, specifies the 0-based index
4922 * of the byte where searching is to end.
4923 *
4924 * When +offset+ is non-negative,
4925 * searching ends at byte position +offset+:
4926 *
4927 * s = 'foo'
4928 * s.byterindex('o', 0) # => nil
4929 * s.byterindex('o', 1) # => 1
4930 * s.byterindex('o', 2) # => 2
4931 * s.byterindex('o', 3) # => 2
4932 *
4933 * When +offset+ is negative, counts backward from the end of +self+:
4934 *
4935 * s = 'foo'
4936 * s.byterindex('o', -1) # => 2
4937 * s.byterindex('o', -2) # => 1
4938 * s.byterindex('o', -3) # => nil
4939 *
4940 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4941 *
4942 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4943 * s.size # => 2 # Two 3-byte characters.
4944 * s.bytesize # => 6 # Six bytes.
4945 * s.byterindex("\uFFFF") # => 3
4946 * s.byterindex("\uFFFF", 1) # Raises IndexError
4947 * s.byterindex("\uFFFF", 2) # Raises IndexError
4948 * s.byterindex("\uFFFF", 3) # => 3
4949 * s.byterindex("\uFFFF", 4) # Raises IndexError
4950 * s.byterindex("\uFFFF", 5) # Raises IndexError
4951 * s.byterindex("\uFFFF", 6) # => nil
4952 *
4953 * Related: see {Querying}[rdoc-ref:String@Querying].
4954 */
4955
4956static VALUE
4957rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4958{
4959 VALUE sub;
4960 VALUE initpos;
4961 long pos, len = RSTRING_LEN(str);
4962
4963 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4964 pos = NUM2LONG(initpos);
4965 if (pos < 0 && (pos += len) < 0) {
4966 if (RB_TYPE_P(sub, T_REGEXP)) {
4968 }
4969 return Qnil;
4970 }
4971 if (pos > len) pos = len;
4972 }
4973 else {
4974 pos = len;
4975 }
4976
4977 str_ensure_byte_pos(str, pos);
4978
4979 if (RB_TYPE_P(sub, T_REGEXP)) {
4980 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4981 VALUE match = rb_backref_get();
4982 struct re_registers *regs = RMATCH_REGS(match);
4983 pos = BEG(0);
4984 return LONG2NUM(pos);
4985 }
4986 }
4987 else {
4988 StringValue(sub);
4989 pos = rb_str_byterindex(str, sub, pos);
4990 if (pos >= 0) return LONG2NUM(pos);
4991 }
4992 return Qnil;
4993}
4994
4995/*
4996 * call-seq:
4997 * self =~ object -> integer or nil
4998 *
4999 * When +object+ is a Regexp, returns the index of the first substring in +self+
5000 * matched by +object+,
5001 * or +nil+ if no match is found;
5002 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5003 *
5004 * 'foo' =~ /f/ # => 0
5005 * $~ # => #<MatchData "f">
5006 * 'foo' =~ /o/ # => 1
5007 * $~ # => #<MatchData "o">
5008 * 'foo' =~ /x/ # => nil
5009 * $~ # => nil
5010 *
5011 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5012 * (see Regexp#=~):
5013 *
5014 * number = nil
5015 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5016 * number # => nil # Not assigned.
5017 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5018 * number # => "9" # Assigned.
5019 *
5020 * If +object+ is not a Regexp, returns the value
5021 * returned by <tt>object =~ self</tt>.
5022 *
5023 * Related: see {Querying}[rdoc-ref:String@Querying].
5024 */
5025
5026static VALUE
5027rb_str_match(VALUE x, VALUE y)
5028{
5029 switch (OBJ_BUILTIN_TYPE(y)) {
5030 case T_STRING:
5031 rb_raise(rb_eTypeError, "type mismatch: String given");
5032
5033 case T_REGEXP:
5034 return rb_reg_match(y, x);
5035
5036 default:
5037 return rb_funcall(y, idEqTilde, 1, x);
5038 }
5039}
5040
5041
5042static VALUE get_pat(VALUE);
5043
5044
5045/*
5046 * call-seq:
5047 * match(pattern, offset = 0) -> matchdata or nil
5048 * match(pattern, offset = 0) {|matchdata| ... } -> object
5049 *
5050 * Creates a MatchData object based on +self+ and the given arguments;
5051 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5052 *
5053 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5054 *
5055 * regexp = Regexp.new(pattern)
5056 *
5057 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5058 * (see Regexp#match):
5059 *
5060 * matchdata = regexp.match(self[offset..])
5061 *
5062 * With no block given, returns the computed +matchdata+ or +nil+:
5063 *
5064 * 'foo'.match('f') # => #<MatchData "f">
5065 * 'foo'.match('o') # => #<MatchData "o">
5066 * 'foo'.match('x') # => nil
5067 * 'foo'.match('f', 1) # => nil
5068 * 'foo'.match('o', 1) # => #<MatchData "o">
5069 *
5070 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5071 * returns the block's return value:
5072 *
5073 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5074 *
5075 * With a block given and +nil+ +matchdata+, does not call the block:
5076 *
5077 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5078 *
5079 * Related: see {Querying}[rdoc-ref:String@Querying].
5080 */
5081
5082static VALUE
5083rb_str_match_m(int argc, VALUE *argv, VALUE str)
5084{
5085 VALUE re, result;
5086 if (argc < 1)
5087 rb_check_arity(argc, 1, 2);
5088 re = argv[0];
5089 argv[0] = str;
5090 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5091 if (!NIL_P(result) && rb_block_given_p()) {
5092 return rb_yield(result);
5093 }
5094 return result;
5095}
5096
5097/*
5098 * call-seq:
5099 * match?(pattern, offset = 0) -> true or false
5100 *
5101 * Returns whether a match is found for +self+ and the given arguments;
5102 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5103 *
5104 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5105 *
5106 * regexp = Regexp.new(pattern)
5107 *
5108 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5109 * +false+ otherwise:
5110 *
5111 * 'foo'.match?(/o/) # => true
5112 * 'foo'.match?('o') # => true
5113 * 'foo'.match?(/x/) # => false
5114 * 'foo'.match?('f', 1) # => false
5115 * 'foo'.match?('o', 1) # => true
5116 *
5117 * Related: see {Querying}[rdoc-ref:String@Querying].
5118 */
5119
5120static VALUE
5121rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5122{
5123 VALUE re;
5124 rb_check_arity(argc, 1, 2);
5125 re = get_pat(argv[0]);
5126 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5127}
5128
/* Result of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* no usable neighbouring character */
    NEIGHBOR_FOUND    = 1, /* the buffer now holds the neighbour */
    NEIGHBOR_WRAPPED  = 2  /* the step wrapped around */
};
5134
5135static enum neighbor_char
5136enc_succ_char(char *p, long len, rb_encoding *enc)
5137{
5138 long i;
5139 int l;
5140
5141 if (rb_enc_mbminlen(enc) > 1) {
5142 /* wchar, trivial case */
5143 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5144 if (!MBCLEN_CHARFOUND_P(r)) {
5145 return NEIGHBOR_NOT_CHAR;
5146 }
5147 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5148 l = rb_enc_code_to_mbclen(c, enc);
5149 if (!l) return NEIGHBOR_NOT_CHAR;
5150 if (l != len) return NEIGHBOR_WRAPPED;
5151 rb_enc_mbcput(c, p, enc);
5152 r = rb_enc_precise_mbclen(p, p + len, enc);
5153 if (!MBCLEN_CHARFOUND_P(r)) {
5154 return NEIGHBOR_NOT_CHAR;
5155 }
5156 return NEIGHBOR_FOUND;
5157 }
5158 while (1) {
5159 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5160 p[i] = '\0';
5161 if (i < 0)
5162 return NEIGHBOR_WRAPPED;
5163 ++((unsigned char*)p)[i];
5164 l = rb_enc_precise_mbclen(p, p+len, enc);
5165 if (MBCLEN_CHARFOUND_P(l)) {
5166 l = MBCLEN_CHARFOUND_LEN(l);
5167 if (l == len) {
5168 return NEIGHBOR_FOUND;
5169 }
5170 else {
5171 memset(p+l, 0xff, len-l);
5172 }
5173 }
5174 if (MBCLEN_INVALID_P(l) && i < len-1) {
5175 long len2;
5176 int l2;
5177 for (len2 = len-1; 0 < len2; len2--) {
5178 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5179 if (!MBCLEN_INVALID_P(l2))
5180 break;
5181 }
5182 memset(p+len2+1, 0xff, len-(len2+1));
5183 }
5184 }
5185}
5186
5187static enum neighbor_char
5188enc_pred_char(char *p, long len, rb_encoding *enc)
5189{
5190 long i;
5191 int l;
5192 if (rb_enc_mbminlen(enc) > 1) {
5193 /* wchar, trivial case */
5194 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5195 if (!MBCLEN_CHARFOUND_P(r)) {
5196 return NEIGHBOR_NOT_CHAR;
5197 }
5198 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5199 if (!c) return NEIGHBOR_NOT_CHAR;
5200 --c;
5201 l = rb_enc_code_to_mbclen(c, enc);
5202 if (!l) return NEIGHBOR_NOT_CHAR;
5203 if (l != len) return NEIGHBOR_WRAPPED;
5204 rb_enc_mbcput(c, p, enc);
5205 r = rb_enc_precise_mbclen(p, p + len, enc);
5206 if (!MBCLEN_CHARFOUND_P(r)) {
5207 return NEIGHBOR_NOT_CHAR;
5208 }
5209 return NEIGHBOR_FOUND;
5210 }
5211 while (1) {
5212 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5213 p[i] = '\xff';
5214 if (i < 0)
5215 return NEIGHBOR_WRAPPED;
5216 --((unsigned char*)p)[i];
5217 l = rb_enc_precise_mbclen(p, p+len, enc);
5218 if (MBCLEN_CHARFOUND_P(l)) {
5219 l = MBCLEN_CHARFOUND_LEN(l);
5220 if (l == len) {
5221 return NEIGHBOR_FOUND;
5222 }
5223 else {
5224 memset(p+l, 0, len-l);
5225 }
5226 }
5227 if (MBCLEN_INVALID_P(l) && i < len-1) {
5228 long len2;
5229 int l2;
5230 for (len2 = len-1; 0 < len2; len2--) {
5231 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5232 if (!MBCLEN_INVALID_P(l2))
5233 break;
5234 }
5235 memset(p+len2+1, 0, len-(len2+1));
5236 }
5237 }
5238}
5239
5240/*
5241 overwrite +p+ by succeeding letter in +enc+ and returns
5242 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5243 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5244 assuming each ranges are successive, and mbclen
5245 never change in each ranges.
5246 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5247 character.
5248 */
5249static enum neighbor_char
5250enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5251{
5252 enum neighbor_char ret;
5253 unsigned int c;
5254 int ctype;
5255 int range;
5256 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5257
5258 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5259 int try;
5260 const int max_gaps = 1;
5261
5262 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5263 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5264 ctype = ONIGENC_CTYPE_DIGIT;
5265 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5266 ctype = ONIGENC_CTYPE_ALPHA;
5267 else
5268 return NEIGHBOR_NOT_CHAR;
5269
5270 MEMCPY(save, p, char, len);
5271 for (try = 0; try <= max_gaps; ++try) {
5272 ret = enc_succ_char(p, len, enc);
5273 if (ret == NEIGHBOR_FOUND) {
5274 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5275 if (rb_enc_isctype(c, ctype, enc))
5276 return NEIGHBOR_FOUND;
5277 }
5278 }
5279 MEMCPY(p, save, char, len);
5280 range = 1;
5281 while (1) {
5282 MEMCPY(save, p, char, len);
5283 ret = enc_pred_char(p, len, enc);
5284 if (ret == NEIGHBOR_FOUND) {
5285 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5286 if (!rb_enc_isctype(c, ctype, enc)) {
5287 MEMCPY(p, save, char, len);
5288 break;
5289 }
5290 }
5291 else {
5292 MEMCPY(p, save, char, len);
5293 break;
5294 }
5295 range++;
5296 }
5297 if (range == 1) {
5298 return NEIGHBOR_NOT_CHAR;
5299 }
5300
5301 if (ctype != ONIGENC_CTYPE_DIGIT) {
5302 MEMCPY(carry, p, char, len);
5303 return NEIGHBOR_WRAPPED;
5304 }
5305
5306 MEMCPY(carry, p, char, len);
5307 enc_succ_char(carry, len, enc);
5308 return NEIGHBOR_WRAPPED;
5309}
5310
5311
5312static VALUE str_succ(VALUE str);
5313
5314/*
5315 * call-seq:
5316 * succ -> new_str
5317 *
5318 * :include: doc/string/succ.rdoc
5319 *
5320 */
5321
5322VALUE
5324{
5325 VALUE str;
5326 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5327 rb_enc_cr_str_copy_for_substr(str, orig);
5328 return str_succ(str);
5329}
5330
5331static VALUE
5332str_succ(VALUE str)
5333{
5334 rb_encoding *enc;
5335 char *sbeg, *s, *e, *last_alnum = 0;
5336 int found_alnum = 0;
5337 long l, slen;
5338 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5339 long carry_pos = 0, carry_len = 1;
5340 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5341
5342 slen = RSTRING_LEN(str);
5343 if (slen == 0) return str;
5344
5345 enc = STR_ENC_GET(str);
5346 sbeg = RSTRING_PTR(str);
5347 s = e = sbeg + slen;
5348
5349 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5350 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5351 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5352 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5353 break;
5354 }
5355 }
5356 l = rb_enc_precise_mbclen(s, e, enc);
5357 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5358 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5359 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5360 switch (neighbor) {
5361 case NEIGHBOR_NOT_CHAR:
5362 continue;
5363 case NEIGHBOR_FOUND:
5364 return str;
5365 case NEIGHBOR_WRAPPED:
5366 last_alnum = s;
5367 break;
5368 }
5369 found_alnum = 1;
5370 carry_pos = s - sbeg;
5371 carry_len = l;
5372 }
5373 if (!found_alnum) { /* str contains no alnum */
5374 s = e;
5375 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5376 enum neighbor_char neighbor;
5377 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5378 l = rb_enc_precise_mbclen(s, e, enc);
5379 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5380 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5381 MEMCPY(tmp, s, char, l);
5382 neighbor = enc_succ_char(tmp, l, enc);
5383 switch (neighbor) {
5384 case NEIGHBOR_FOUND:
5385 MEMCPY(s, tmp, char, l);
5386 return str;
5387 break;
5388 case NEIGHBOR_WRAPPED:
5389 MEMCPY(s, tmp, char, l);
5390 break;
5391 case NEIGHBOR_NOT_CHAR:
5392 break;
5393 }
5394 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5395 /* wrapped to \0...\0. search next valid char. */
5396 enc_succ_char(s, l, enc);
5397 }
5398 if (!rb_enc_asciicompat(enc)) {
5399 MEMCPY(carry, s, char, l);
5400 carry_len = l;
5401 }
5402 carry_pos = s - sbeg;
5403 }
5405 }
5406 RESIZE_CAPA(str, slen + carry_len);
5407 sbeg = RSTRING_PTR(str);
5408 s = sbeg + carry_pos;
5409 memmove(s + carry_len, s, slen - carry_pos);
5410 memmove(s, carry, carry_len);
5411 slen += carry_len;
5412 STR_SET_LEN(str, slen);
5413 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5415 return str;
5416}
5417
5418
5419/*
5420 * call-seq:
5421 * succ! -> self
5422 *
5423 * Like String#succ, but modifies +self+ in place; returns +self+.
5424 *
5425 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5426 */
5427
5428static VALUE
5429rb_str_succ_bang(VALUE str)
5430{
5431 rb_str_modify(str);
5432 str_succ(str);
5433 return str;
5434}
5435
/*
 * Returns 1 when each of the first +len+ bytes of +s+ satisfies
 * ISDIGIT(), 0 otherwise.  A +len+ of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5445
5446static int
5447str_upto_i(VALUE str, VALUE arg)
5448{
5449 rb_yield(str);
5450 return 0;
5451}
5452
5453/*
5454 * call-seq:
5455 * upto(other_string, exclusive = false) {|string| ... } -> self
5456 * upto(other_string, exclusive = false) -> new_enumerator
5457 *
5458 * With a block given, calls the block with each +String+ value
5459 * returned by successive calls to String#succ;
5460 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5461 * the sequence terminates when value +other_string+ is reached;
5462 * returns +self+:
5463 *
 *    'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
 *
 *  Output:
5466 *
5467 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5468 *
5469 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5470 *
5471 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5472 *
5473 * Output:
5474 *
5475 * a8 a9 b0 b1 b2 b3 b4 b5
5476 *
5477 * If +other_string+ would not be reached, does not call the block:
5478 *
5479 * '25'.upto('5') {|s| fail s }
5480 * 'aa'.upto('a') {|s| fail s }
5481 *
5482 * With no block given, returns a new Enumerator:
5483 *
5484 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5485 *
5486 */
5487
5488static VALUE
5489rb_str_upto(int argc, VALUE *argv, VALUE beg)
5490{
5491 VALUE end, exclusive;
5492
5493 rb_scan_args(argc, argv, "11", &end, &exclusive);
5494 RETURN_ENUMERATOR(beg, argc, argv);
5495 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5496}
5497
5498VALUE
5499rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5500{
5501 VALUE current, after_end;
5502 ID succ;
5503 int n, ascii;
5504 rb_encoding *enc;
5505
5506 CONST_ID(succ, "succ");
5507 StringValue(end);
5508 enc = rb_enc_check(beg, end);
5509 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5510 /* single character */
5511 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5512 char c = RSTRING_PTR(beg)[0];
5513 char e = RSTRING_PTR(end)[0];
5514
5515 if (c > e || (excl && c == e)) return beg;
5516 for (;;) {
5517 VALUE str = rb_enc_str_new(&c, 1, enc);
5519 if ((*each)(str, arg)) break;
5520 if (!excl && c == e) break;
5521 c++;
5522 if (excl && c == e) break;
5523 }
5524 return beg;
5525 }
5526 /* both edges are all digits */
5527 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5528 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5529 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5530 VALUE b, e;
5531 int width;
5532
5533 width = RSTRING_LENINT(beg);
5534 b = rb_str_to_inum(beg, 10, FALSE);
5535 e = rb_str_to_inum(end, 10, FALSE);
5536 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5537 long bi = FIX2LONG(b);
5538 long ei = FIX2LONG(e);
5539 rb_encoding *usascii = rb_usascii_encoding();
5540
5541 while (bi <= ei) {
5542 if (excl && bi == ei) break;
5543 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5544 bi++;
5545 }
5546 }
5547 else {
5548 ID op = excl ? '<' : idLE;
5549 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5550
5551 args[0] = INT2FIX(width);
5552 while (rb_funcall(b, op, 1, e)) {
5553 args[1] = b;
5554 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5555 b = rb_funcallv(b, succ, 0, 0);
5556 }
5557 }
5558 return beg;
5559 }
5560 /* normal case */
5561 n = rb_str_cmp(beg, end);
5562 if (n > 0 || (excl && n == 0)) return beg;
5563
5564 after_end = rb_funcallv(end, succ, 0, 0);
5565 current = str_duplicate(rb_cString, beg);
5566 while (!rb_str_equal(current, after_end)) {
5567 VALUE next = Qnil;
5568 if (excl || !rb_str_equal(current, end))
5569 next = rb_funcallv(current, succ, 0, 0);
5570 if ((*each)(current, arg)) break;
5571 if (NIL_P(next)) break;
5572 current = next;
5573 StringValue(current);
5574 if (excl && rb_str_equal(current, end)) break;
5575 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5576 break;
5577 }
5578
5579 return beg;
5580}
5581
5582VALUE
5583rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5584{
5585 VALUE current;
5586 ID succ;
5587
5588 CONST_ID(succ, "succ");
5589 /* both edges are all digits */
5590 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5591 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5592 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5593 int width = RSTRING_LENINT(beg);
5594 b = rb_str_to_inum(beg, 10, FALSE);
5595 if (FIXNUM_P(b)) {
5596 long bi = FIX2LONG(b);
5597 rb_encoding *usascii = rb_usascii_encoding();
5598
5599 while (FIXABLE(bi)) {
5600 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5601 bi++;
5602 }
5603 b = LONG2NUM(bi);
5604 }
5605 args[0] = INT2FIX(width);
5606 while (1) {
5607 args[1] = b;
5608 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5609 b = rb_funcallv(b, succ, 0, 0);
5610 }
5611 }
5612 /* normal case */
5613 current = str_duplicate(rb_cString, beg);
5614 while (1) {
5615 VALUE next = rb_funcallv(current, succ, 0, 0);
5616 if ((*each)(current, arg)) break;
5617 current = next;
5618 StringValue(current);
5619 if (RSTRING_LEN(current) == 0)
5620 break;
5621 }
5622
5623 return beg;
5624}
5625
5626static int
5627include_range_i(VALUE str, VALUE arg)
5628{
5629 VALUE *argp = (VALUE *)arg;
5630 if (!rb_equal(str, *argp)) return 0;
5631 *argp = Qnil;
5632 return 1;
5633}
5634
5635VALUE
5636rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5637{
5638 beg = rb_str_new_frozen(beg);
5639 StringValue(end);
5640 end = rb_str_new_frozen(end);
5641 if (NIL_P(val)) return Qfalse;
5642 val = rb_check_string_type(val);
5643 if (NIL_P(val)) return Qfalse;
5644 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5645 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5646 rb_enc_asciicompat(STR_ENC_GET(val))) {
5647 const char *bp = RSTRING_PTR(beg);
5648 const char *ep = RSTRING_PTR(end);
5649 const char *vp = RSTRING_PTR(val);
5650 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5651 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5652 return Qfalse;
5653 else {
5654 char b = *bp;
5655 char e = *ep;
5656 char v = *vp;
5657
5658 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5659 if (b <= v && v < e) return Qtrue;
5660 return RBOOL(!RTEST(exclusive) && v == e);
5661 }
5662 }
5663 }
5664#if 0
5665 /* both edges are all digits */
5666 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5667 all_digits_p(bp, RSTRING_LEN(beg)) &&
5668 all_digits_p(ep, RSTRING_LEN(end))) {
5669 /* TODO */
5670 }
5671#endif
5672 }
5673 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5674
5675 return RBOOL(NIL_P(val));
5676}
5677
5678static VALUE
5679rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5680{
5681 if (rb_reg_search(re, str, 0, 0) >= 0) {
5682 VALUE match = rb_backref_get();
5683 int nth = rb_reg_backref_number(match, backref);
5684 return rb_reg_nth_match(nth, match);
5685 }
5686 return Qnil;
5687}
5688
5689static VALUE
5690rb_str_aref(VALUE str, VALUE indx)
5691{
5692 long idx;
5693
5694 if (FIXNUM_P(indx)) {
5695 idx = FIX2LONG(indx);
5696 }
5697 else if (RB_TYPE_P(indx, T_REGEXP)) {
5698 return rb_str_subpat(str, indx, INT2FIX(0));
5699 }
5700 else if (RB_TYPE_P(indx, T_STRING)) {
5701 if (rb_str_index(str, indx, 0) != -1)
5702 return str_duplicate(rb_cString, indx);
5703 return Qnil;
5704 }
5705 else {
5706 /* check if indx is Range */
5707 long beg, len = str_strlen(str, NULL);
5708 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5709 case Qfalse:
5710 break;
5711 case Qnil:
5712 return Qnil;
5713 default:
5714 return rb_str_substr(str, beg, len);
5715 }
5716 idx = NUM2LONG(indx);
5717 }
5718
5719 return str_substr(str, idx, 1, FALSE);
5720}
5721
5722
5723/*
5724 * call-seq:
5725 * self[index] -> new_string or nil
5726 * self[start, length] -> new_string or nil
5727 * self[range] -> new_string or nil
5728 * self[regexp, capture = 0] -> new_string or nil
5729 * self[substring] -> new_string or nil
5730 *
5731 * :include: doc/string/aref.rdoc
5732 *
5733 */
5734
5735static VALUE
5736rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5737{
5738 if (argc == 2) {
5739 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5740 return rb_str_subpat(str, argv[0], argv[1]);
5741 }
5742 else {
5743 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5744 }
5745 }
5746 rb_check_arity(argc, 1, 2);
5747 return rb_str_aref(str, argv[0]);
5748}
5749
5750VALUE
5752{
5753 char *ptr = RSTRING_PTR(str);
5754 long olen = RSTRING_LEN(str), nlen;
5755
5756 str_modifiable(str);
5757 if (len > olen) len = olen;
5758 nlen = olen - len;
5759 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5760 char *oldptr = ptr;
5761 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5762 STR_SET_EMBED(str);
5763 ptr = RSTRING(str)->as.embed.ary;
5764 memmove(ptr, oldptr + len, nlen);
5765 if (fl == STR_NOEMBED) xfree(oldptr);
5766 }
5767 else {
5768 if (!STR_SHARED_P(str)) {
5769 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5770 rb_enc_cr_str_exact_copy(shared, str);
5771 OBJ_FREEZE(shared);
5772 }
5773 ptr = RSTRING(str)->as.heap.ptr += len;
5774 }
5775 STR_SET_LEN(str, nlen);
5776
5777 if (!SHARABLE_MIDDLE_SUBSTRING) {
5778 TERM_FILL(ptr + nlen, TERM_LEN(str));
5779 }
5781 return str;
5782}
5783
5784static void
5785rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5786{
5787 char *sptr;
5788 long slen;
5789 int cr;
5790
5791 if (beg == 0 && vlen == 0) {
5792 rb_str_drop_bytes(str, len);
5793 return;
5794 }
5795
5796 str_modify_keep_cr(str);
5797 RSTRING_GETMEM(str, sptr, slen);
5798 if (len < vlen) {
5799 /* expand string */
5800 RESIZE_CAPA(str, slen + vlen - len);
5801 sptr = RSTRING_PTR(str);
5802 }
5803
5805 cr = rb_enc_str_coderange(val);
5806 else
5808
5809 if (vlen != len) {
5810 memmove(sptr + beg + vlen,
5811 sptr + beg + len,
5812 slen - (beg + len));
5813 }
5814 if (vlen < beg && len < 0) {
5815 MEMZERO(sptr + slen, char, -len);
5816 }
5817 if (vlen > 0) {
5818 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5819 }
5820 slen += vlen - len;
5821 STR_SET_LEN(str, slen);
5822 TERM_FILL(&sptr[slen], TERM_LEN(str));
5823 ENC_CODERANGE_SET(str, cr);
5824}
5825
5826static inline void
5827rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5828{
5829 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5830}
5831
5832void
5833rb_str_update(VALUE str, long beg, long len, VALUE val)
5834{
5835 long slen;
5836 char *p, *e;
5837 rb_encoding *enc;
5838 int singlebyte = single_byte_optimizable(str);
5839 int cr;
5840
5841 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5842
5843 StringValue(val);
5844 enc = rb_enc_check(str, val);
5845 slen = str_strlen(str, enc); /* rb_enc_check */
5846
5847 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5848 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5849 }
5850 if (beg < 0) {
5851 beg += slen;
5852 }
5853 RUBY_ASSERT(beg >= 0);
5854 RUBY_ASSERT(beg <= slen);
5855
5856 if (len > slen - beg) {
5857 len = slen - beg;
5858 }
5859 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5860 if (!p) p = RSTRING_END(str);
5861 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5862 if (!e) e = RSTRING_END(str);
5863 /* error check */
5864 beg = p - RSTRING_PTR(str); /* physical position */
5865 len = e - p; /* physical length */
5866 rb_str_update_0(str, beg, len, val);
5867 rb_enc_associate(str, enc);
5869 if (cr != ENC_CODERANGE_BROKEN)
5870 ENC_CODERANGE_SET(str, cr);
5871}
5872
5873static void
5874rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5875{
5876 int nth;
5877 VALUE match;
5878 long start, end, len;
5879 rb_encoding *enc;
5880 struct re_registers *regs;
5881
5882 if (rb_reg_search(re, str, 0, 0) < 0) {
5883 rb_raise(rb_eIndexError, "regexp not matched");
5884 }
5885 match = rb_backref_get();
5886 nth = rb_reg_backref_number(match, backref);
5887 regs = RMATCH_REGS(match);
5888 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5889 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5890 }
5891 if (nth < 0) {
5892 nth += regs->num_regs;
5893 }
5894
5895 start = BEG(nth);
5896 if (start == -1) {
5897 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5898 }
5899 end = END(nth);
5900 len = end - start;
5901 StringValue(val);
5902 enc = rb_enc_check_str(str, val);
5903 rb_str_update_0(str, start, len, val);
5904 rb_enc_associate(str, enc);
5905}
5906
5907static VALUE
5908rb_str_aset(VALUE str, VALUE indx, VALUE val)
5909{
5910 long idx, beg;
5911
5912 switch (TYPE(indx)) {
5913 case T_REGEXP:
5914 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5915 return val;
5916
5917 case T_STRING:
5918 beg = rb_str_index(str, indx, 0);
5919 if (beg < 0) {
5920 rb_raise(rb_eIndexError, "string not matched");
5921 }
5922 beg = rb_str_sublen(str, beg);
5923 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5924 return val;
5925
5926 default:
5927 /* check if indx is Range */
5928 {
5929 long beg, len;
5930 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5931 rb_str_update(str, beg, len, val);
5932 return val;
5933 }
5934 }
5935 /* FALLTHROUGH */
5936
5937 case T_FIXNUM:
5938 idx = NUM2LONG(indx);
5939 rb_str_update(str, idx, 1, val);
5940 return val;
5941 }
5942}
5943
5944/*
5945 * call-seq:
5946 * self[index] = other_string -> new_string
5947 * self[start, length] = other_string -> new_string
5948 * self[range] = other_string -> new_string
5949 * self[regexp, capture = 0] = other_string -> new_string
5950 * self[substring] = other_string -> new_string
5951 *
5952 * :include: doc/string/aset.rdoc
5953 *
5954 */
5955
5956static VALUE
5957rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5958{
5959 if (argc == 3) {
5960 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5961 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5962 }
5963 else {
5964 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5965 }
5966 return argv[2];
5967 }
5968 rb_check_arity(argc, 2, 3);
5969 return rb_str_aset(str, argv[0], argv[1]);
5970}
5971
5972/*
5973 * call-seq:
5974 * insert(offset, other_string) -> self
5975 *
5976 * :include: doc/string/insert.rdoc
5977 *
5978 */
5979
5980static VALUE
5981rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5982{
5983 long pos = NUM2LONG(idx);
5984
5985 if (pos == -1) {
5986 return rb_str_append(str, str2);
5987 }
5988 else if (pos < 0) {
5989 pos++;
5990 }
5991 rb_str_update(str, pos, 0, str2);
5992 return str;
5993}
5994
5995
5996/*
5997 * call-seq:
5998 * slice!(index) -> new_string or nil
5999 * slice!(start, length) -> new_string or nil
6000 * slice!(range) -> new_string or nil
6001 * slice!(regexp, capture = 0) -> new_string or nil
6002 * slice!(substring) -> new_string or nil
6003 *
6004 * Like String#[] (and its alias String#slice), except that:
6005 *
6006 * - Performs substitutions in +self+ (not in a copy of +self+).
6007 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6008 *
6009 * A few examples:
6010 *
6011 * s = 'hello'
6012 * s.slice!('e') # => "e"
6013 * s # => "hllo"
6014 * s.slice!('e') # => nil
6015 * s # => "hllo"
6016 *
6017 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6018 */
6019
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1; /* len stays 1 for the single-index forms below */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp, capture = 0): select the span of one match register. */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* A negative register number is offset by the register count. */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): nil when the position is rejected or nothing is selected. */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument itself. */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* Try the argument as a range first; Qfalse means "not a range". */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* Not a range: convert to an integer index, as in the FIXNUM case. */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* Resolve beg/len through rb_str_subpos; beg becomes an offset from RSTRING_PTR(str). */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* Copy the selected bytes into the result before they are removed. */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    /* Remove len bytes at offset beg from str. */
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* Shift the tail down over the removed span, then shrink and re-terminate. */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6106
6107static VALUE
6108get_pat(VALUE pat)
6109{
6110 VALUE val;
6111
6112 switch (OBJ_BUILTIN_TYPE(pat)) {
6113 case T_REGEXP:
6114 return pat;
6115
6116 case T_STRING:
6117 break;
6118
6119 default:
6120 val = rb_check_string_type(pat);
6121 if (NIL_P(val)) {
6122 Check_Type(pat, T_REGEXP);
6123 }
6124 pat = val;
6125 }
6126
6127 return rb_reg_regcomp(pat);
6128}
6129
6130static VALUE
6131get_pat_quoted(VALUE pat, int check)
6132{
6133 VALUE val;
6134
6135 switch (OBJ_BUILTIN_TYPE(pat)) {
6136 case T_REGEXP:
6137 return pat;
6138
6139 case T_STRING:
6140 break;
6141
6142 default:
6143 val = rb_check_string_type(pat);
6144 if (NIL_P(val)) {
6145 Check_Type(pat, T_REGEXP);
6146 }
6147 pat = val;
6148 }
6149 if (check && is_broken_string(pat)) {
6150 rb_exc_raise(rb_reg_check_preprocess(pat));
6151 }
6152 return pat;
6153}
6154
6155static long
6156rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6157{
6158 if (BUILTIN_TYPE(pat) == T_STRING) {
6159 pos = rb_str_byteindex(str, pat, pos);
6160 if (set_backref_str) {
6161 if (pos >= 0) {
6162 str = rb_str_new_frozen_String(str);
6163 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6164 if (match) {
6165 *match = match_data;
6166 }
6167 }
6168 else {
6170 }
6171 }
6172 return pos;
6173 }
6174 else {
6175 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6176 }
6177}
6178
6179static long
6180rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6181{
6182 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6183}
6184
6185
6186/*
6187 * call-seq:
6188 * sub!(pattern, replacement) -> self or nil
6189 * sub!(pattern) {|match| ... } -> self or nil
6190 *
6191 * Like String#sub, except that:
6192 *
 * - Changes are made to +self+, not to a copy of +self+.
6194 * - Returns +self+ if any changes are made, +nil+ otherwise.
6195 *
6196 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6197 */
6198
6199static VALUE
6200rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6201{
6202 VALUE pat, repl, hash = Qnil;
6203 int iter = 0;
6204 long plen;
6205 int min_arity = rb_block_given_p() ? 1 : 2;
6206 long beg;
6207
6208 rb_check_arity(argc, min_arity, 2);
6209 if (argc == 1) {
6210 iter = 1;
6211 }
6212 else {
6213 repl = argv[1];
6214 hash = rb_check_hash_type(argv[1]);
6215 if (NIL_P(hash)) {
6216 StringValue(repl);
6217 }
6218 }
6219
6220 pat = get_pat_quoted(argv[0], 1);
6221
6222 str_modifiable(str);
6223 beg = rb_pat_search(pat, str, 0, 1);
6224 if (beg >= 0) {
6225 rb_encoding *enc;
6226 int cr = ENC_CODERANGE(str);
6227 long beg0, end0;
6228 VALUE match, match0 = Qnil;
6229 struct re_registers *regs;
6230 char *p, *rp;
6231 long len, rlen;
6232
6233 match = rb_backref_get();
6234 regs = RMATCH_REGS(match);
6235 if (RB_TYPE_P(pat, T_STRING)) {
6236 beg0 = beg;
6237 end0 = beg0 + RSTRING_LEN(pat);
6238 match0 = pat;
6239 }
6240 else {
6241 beg0 = BEG(0);
6242 end0 = END(0);
6243 if (iter) match0 = rb_reg_nth_match(0, match);
6244 }
6245
6246 if (iter || !NIL_P(hash)) {
6247 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6248
6249 if (iter) {
6250 repl = rb_obj_as_string(rb_yield(match0));
6251 }
6252 else {
6253 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6254 repl = rb_obj_as_string(repl);
6255 }
6256 str_mod_check(str, p, len);
6257 rb_check_frozen(str);
6258 }
6259 else {
6260 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6261 }
6262
6263 enc = rb_enc_compatible(str, repl);
6264 if (!enc) {
6265 rb_encoding *str_enc = STR_ENC_GET(str);
6266 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6267 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6268 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6269 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6270 rb_enc_inspect_name(str_enc),
6271 rb_enc_inspect_name(STR_ENC_GET(repl)));
6272 }
6273 enc = STR_ENC_GET(repl);
6274 }
6275 rb_str_modify(str);
6276 rb_enc_associate(str, enc);
6278 int cr2 = ENC_CODERANGE(repl);
6279 if (cr2 == ENC_CODERANGE_BROKEN ||
6280 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6282 else
6283 cr = cr2;
6284 }
6285 plen = end0 - beg0;
6286 rlen = RSTRING_LEN(repl);
6287 len = RSTRING_LEN(str);
6288 if (rlen > plen) {
6289 RESIZE_CAPA(str, len + rlen - plen);
6290 }
6291 p = RSTRING_PTR(str);
6292 if (rlen != plen) {
6293 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6294 }
6295 rp = RSTRING_PTR(repl);
6296 memmove(p + beg0, rp, rlen);
6297 len += rlen - plen;
6298 STR_SET_LEN(str, len);
6299 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6300 ENC_CODERANGE_SET(str, cr);
6301
6302 RB_GC_GUARD(match);
6303
6304 return str;
6305 }
6306 return Qnil;
6307}
6308
6309
6310/*
6311 * call-seq:
6312 * sub(pattern, replacement) -> new_string
6313 * sub(pattern) {|match| ... } -> new_string
6314 *
6315 * :include: doc/string/sub.rdoc
6316 */
6317
6318static VALUE
6319rb_str_sub(int argc, VALUE *argv, VALUE str)
6320{
6321 str = str_duplicate(rb_cString, str);
6322 rb_str_sub_bang(argc, argv, str);
6323 return str;
6324}
6325
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).  The result is built in a separate buffer (+dest+); with
 * +bang+ the buffer then replaces the contents of +str+, otherwise it is
 * returned as the new string.  Returns nil for gsub! when nothing matched,
 * and a duplicate of +str+ for gsub in that case.
 *
 * The replacement source is selected by +mode+:
 *   STR      - string replacement (argv[1])
 *   ITER     - block result
 *   FAST_MAP - hash lookup, chosen when rb_hash_default_unredefined(hash)
 *              holds and the hash has no RHASH_PROC_DEFAULT flag
 *   MAP      - hash lookup otherwise
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, FAST_MAP, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known; settled after the first rb_reg_regsub() call in STR mode. */
    int need_backref_str = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
            mode = FAST_MAP;
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);

    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot str so str_mod_check() can compare against it after callbacks. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        struct re_registers *regs = RMATCH_REGS(match);
        /* [beg0, end0) is the span of the current match. */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode != STR) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                struct RString fake_str = {RBASIC_INIT};
                VALUE key;
                if (mode == FAST_MAP) {
                    // It is safe to use a fake_str here because we established that it won't escape,
                    // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
                    // default proc.
                    key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
                }
                else {
                    key = rb_str_subseq(str, beg0, end0 - beg0);
                }
                val = rb_hash_aref(hash, key);
                val = rb_obj_as_string(val);
            }
            /* Re-validate str against the snapshot after the block / hash lookup. */
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref_str) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref_str < 0) {
                /* First pass: if regsub returned repl itself, later passes use repl directly. */
                need_backref_str = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* Remember where this search started; it is used for the final search below. */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;

        // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
        if (mode != FAST_MAP && mode != STR) {
            match = Qnil;
        }
        beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);

        RB_GC_GUARD(match);
    } while (beg >= 0);

    /* Append whatever follows the last match. */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /*
     * Search once more from +last+ with set_backref_str = 1.
     * NOTE(review): presumably this leaves the backref describing the final
     * successful match for the caller -- confirm.
     */
    rb_pat_search0(pat, str, last, 1, &match);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6468
6469
6470/*
6471 * call-seq:
6472 * gsub!(pattern, replacement) -> self or nil
6473 * gsub!(pattern) {|match| ... } -> self or nil
6474 * gsub!(pattern) -> an_enumerator
6475 *
6476 * Like String#gsub, except that:
6477 *
6478 * - Performs substitutions in +self+ (not in a copy of +self+).
 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6480 *
6481 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6482 */
6483
6484static VALUE
6485rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6486{
6487 str_modify_keep_cr(str);
6488 return str_gsub(argc, argv, str, 1);
6489}
6490
6491
6492/*
6493 * call-seq:
6494 * gsub(pattern, replacement) -> new_string
6495 * gsub(pattern) {|match| ... } -> new_string
6496 * gsub(pattern) -> enumerator
6497 *
6498 * Returns a copy of +self+ with zero or more substrings replaced.
6499 *
6500 * Argument +pattern+ may be a string or a Regexp;
6501 * argument +replacement+ may be a string or a Hash.
6502 * Varying types for the argument values makes this method very versatile.
6503 *
6504 * Below are some simple examples;
6505 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6506 *
6507 * With arguments +pattern+ and string +replacement+ given,
6508 * replaces each matching substring with the given +replacement+ string:
6509 *
6510 * s = 'abracadabra'
6511 * s.gsub('ab', 'AB') # => "ABracadABra"
6512 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6513 *
6514 * With arguments +pattern+ and hash +replacement+ given,
6515 * replaces each matching substring with a value from the given +replacement+ hash,
6516 * or removes it:
6517 *
6518 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6519 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6520 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6521 *
6522 * With argument +pattern+ and a block given,
6523 * calls the block with each matching substring;
6524 * replaces that substring with the block's return value:
6525 *
6526 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6527 * # => "ABrACADABrA"
6528 *
6529 * With argument +pattern+ and no block given,
6530 * returns a new Enumerator.
6531 *
6532 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6533 */
6534
6535static VALUE
6536rb_str_gsub(int argc, VALUE *argv, VALUE str)
6537{
6538 return str_gsub(argc, argv, str, 0);
6539}
6540
6541
6542/*
6543 * call-seq:
6544 * replace(other_string) -> self
6545 *
6546 * Replaces the contents of +self+ with the contents of +other_string+;
6547 * returns +self+:
6548 *
6549 * s = 'foo' # => "foo"
6550 * s.replace('bar') # => "bar"
6551 *
6552 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6553 */
6554
6555VALUE
6557{
6558 str_modifiable(str);
6559 if (str == str2) return str;
6560
6561 StringValue(str2);
6562 str_discard(str);
6563 return str_replace(str, str2);
6564}
6565
6566/*
6567 * call-seq:
6568 * clear -> self
6569 *
6570 * Removes the contents of +self+:
6571 *
6572 * s = 'foo'
6573 * s.clear # => ""
6574 * s # => ""
6575 *
6576 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6577 */
6578
6579static VALUE
6580rb_str_clear(VALUE str)
6581{
6582 str_discard(str);
6583 STR_SET_EMBED(str);
6584 STR_SET_LEN(str, 0);
6585 RSTRING_PTR(str)[0] = 0;
6586 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6588 else
6590 return str;
6591}
6592
6593/*
6594 * call-seq:
6595 * chr -> string
6596 *
6597 * :include: doc/string/chr.rdoc
6598 *
6599 */
6600
6601static VALUE
6602rb_str_chr(VALUE str)
6603{
6604 return rb_str_substr(str, 0, 1);
6605}
6606
6607/*
6608 * call-seq:
6609 * getbyte(index) -> integer or nil
6610 *
6611 * :include: doc/string/getbyte.rdoc
6612 *
6613 */
6614VALUE
6615rb_str_getbyte(VALUE str, VALUE index)
6616{
6617 long pos = NUM2LONG(index);
6618
6619 if (pos < 0)
6620 pos += RSTRING_LEN(str);
6621 if (pos < 0 || RSTRING_LEN(str) <= pos)
6622 return Qnil;
6623
6624 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6625}
6626
6627/*
6628 * call-seq:
6629 * setbyte(index, integer) -> integer
6630 *
6631 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6632 * returns +integer+:
6633 *
6634 * s = 'xyzzy'
6635 * s.setbyte(2, 129) # => 129
6636 * s # => "xy\x81zy"
6637 *
6638 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6639 */
6640VALUE
6641rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6642{
6643 long pos = NUM2LONG(index);
6644 long len = RSTRING_LEN(str);
6645 char *ptr, *head, *left = 0;
6646 rb_encoding *enc;
6647 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6648
6649 if (pos < -len || len <= pos)
6650 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6651 if (pos < 0)
6652 pos += len;
6653
6654 VALUE v = rb_to_int(value);
6655 VALUE w = rb_int_and(v, INT2FIX(0xff));
6656 char byte = (char)(NUM2INT(w) & 0xFF);
6657
6658 if (!str_independent(str))
6659 str_make_independent(str);
6660 enc = STR_ENC_GET(str);
6661 head = RSTRING_PTR(str);
6662 ptr = &head[pos];
6663 if (!STR_EMBED_P(str)) {
6664 cr = ENC_CODERANGE(str);
6665 switch (cr) {
6666 case ENC_CODERANGE_7BIT:
6667 left = ptr;
6668 *ptr = byte;
6669 if (ISASCII(byte)) goto end;
6670 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6671 if (!MBCLEN_CHARFOUND_P(nlen))
6673 else
6675 goto end;
6677 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6678 width = rb_enc_precise_mbclen(left, head+len, enc);
6679 *ptr = byte;
6680 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6681 if (!MBCLEN_CHARFOUND_P(nlen))
6683 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6685 goto end;
6686 }
6687 }
6689 *ptr = byte;
6690
6691 end:
6692 return value;
6693}
6694
6695static VALUE
6696str_byte_substr(VALUE str, long beg, long len, int empty)
6697{
6698 long n = RSTRING_LEN(str);
6699
6700 if (beg > n || len < 0) return Qnil;
6701 if (beg < 0) {
6702 beg += n;
6703 if (beg < 0) return Qnil;
6704 }
6705 if (len > n - beg)
6706 len = n - beg;
6707 if (len <= 0) {
6708 if (!empty) return Qnil;
6709 len = 0;
6710 }
6711
6712 VALUE str2 = str_subseq(str, beg, len);
6713
6714 str_enc_copy_direct(str2, str);
6715
6716 if (RSTRING_LEN(str2) == 0) {
6717 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6719 else
6721 }
6722 else {
6723 switch (ENC_CODERANGE(str)) {
6724 case ENC_CODERANGE_7BIT:
6726 break;
6727 default:
6729 break;
6730 }
6731 }
6732
6733 return str2;
6734}
6735
6736VALUE
6737rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6738{
6739 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6740}
6741
6742static VALUE
6743str_byte_aref(VALUE str, VALUE indx)
6744{
6745 long idx;
6746 if (FIXNUM_P(indx)) {
6747 idx = FIX2LONG(indx);
6748 }
6749 else {
6750 /* check if indx is Range */
6751 long beg, len = RSTRING_LEN(str);
6752
6753 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6754 case Qfalse:
6755 break;
6756 case Qnil:
6757 return Qnil;
6758 default:
6759 return str_byte_substr(str, beg, len, TRUE);
6760 }
6761
6762 idx = NUM2LONG(indx);
6763 }
6764 return str_byte_substr(str, idx, 1, FALSE);
6765}
6766
6767/*
6768 * call-seq:
6769 * byteslice(offset, length = 1) -> string or nil
6770 * byteslice(range) -> string or nil
6771 *
6772 * :include: doc/string/byteslice.rdoc
6773 */
6774
6775static VALUE
6776rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6777{
6778 if (argc == 2) {
6779 long beg = NUM2LONG(argv[0]);
6780 long len = NUM2LONG(argv[1]);
6781 return str_byte_substr(str, beg, len, TRUE);
6782 }
6783 rb_check_arity(argc, 1, 2);
6784 return str_byte_aref(str, argv[0]);
6785}
6786
6787static void
6788str_check_beg_len(VALUE str, long *beg, long *len)
6789{
6790 long end, slen = RSTRING_LEN(str);
6791
6792 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6793 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6794 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6795 }
6796 if (*beg < 0) {
6797 *beg += slen;
6798 }
6799 RUBY_ASSERT(*beg >= 0);
6800 RUBY_ASSERT(*beg <= slen);
6801
6802 if (*len > slen - *beg) {
6803 *len = slen - *beg;
6804 }
6805 end = *beg + *len;
6806 str_ensure_byte_pos(str, *beg);
6807 str_ensure_byte_pos(str, end);
6808}
6809
6810/*
6811 * call-seq:
6812 * bytesplice(offset, length, str) -> self
6813 * bytesplice(offset, length, str, str_offset, str_length) -> self
6814 * bytesplice(range, str) -> self
6815 * bytesplice(range, str, str_range) -> self
6816 *
6817 * :include: doc/string/bytesplice.rdoc
6818 */
6819
6820static VALUE
6821rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6822{
6823 long beg, len, vbeg, vlen;
6824 VALUE val;
6825 int cr;
6826
6827 rb_check_arity(argc, 2, 5);
6828 if (!(argc == 2 || argc == 3 || argc == 5)) {
6829 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6830 }
6831 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6832 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6833 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6834 rb_builtin_class_name(argv[0]));
6835 }
6836 val = argv[1];
6837 StringValue(val);
6838 if (argc == 2) {
6839 /* bytesplice(range, str) */
6840 vbeg = 0;
6841 vlen = RSTRING_LEN(val);
6842 }
6843 else {
6844 /* bytesplice(range, str, str_range) */
6845 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6846 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6847 rb_builtin_class_name(argv[2]));
6848 }
6849 }
6850 }
6851 else {
6852 beg = NUM2LONG(argv[0]);
6853 len = NUM2LONG(argv[1]);
6854 val = argv[2];
6855 StringValue(val);
6856 if (argc == 3) {
6857 /* bytesplice(index, length, str) */
6858 vbeg = 0;
6859 vlen = RSTRING_LEN(val);
6860 }
6861 else {
6862 /* bytesplice(index, length, str, str_index, str_length) */
6863 vbeg = NUM2LONG(argv[3]);
6864 vlen = NUM2LONG(argv[4]);
6865 }
6866 }
6867 str_check_beg_len(str, &beg, &len);
6868 str_check_beg_len(val, &vbeg, &vlen);
6869 str_modify_keep_cr(str);
6870
6871 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6872 rb_enc_associate(str, rb_enc_check(str, val));
6873 }
6874
6875 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6877 if (cr != ENC_CODERANGE_BROKEN)
6878 ENC_CODERANGE_SET(str, cr);
6879 return str;
6880}
6881
6882/*
6883 * call-seq:
6884 * reverse -> new_string
6885 *
6886 * Returns a new string with the characters from +self+ in reverse order.
6887 *
6888 * 'drawer'.reverse # => "reward"
6889 * 'reviled'.reverse # => "deliver"
6890 * 'stressed'.reverse # => "desserts"
6891 * 'semordnilaps'.reverse # => "spalindromes"
6892 *
6893 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6894 */
6895
6896static VALUE
6897rb_str_reverse(VALUE str)
6898{
6899 rb_encoding *enc;
6900 VALUE rev;
6901 char *s, *e, *p;
6902 int cr;
6903
6904 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6905 enc = STR_ENC_GET(str);
6906 rev = rb_str_new(0, RSTRING_LEN(str));
6907 s = RSTRING_PTR(str); e = RSTRING_END(str);
6908 p = RSTRING_END(rev);
6909 cr = ENC_CODERANGE(str);
6910
6911 if (RSTRING_LEN(str) > 1) {
6912 if (single_byte_optimizable(str)) {
6913 while (s < e) {
6914 *--p = *s++;
6915 }
6916 }
6917 else if (cr == ENC_CODERANGE_VALID) {
6918 while (s < e) {
6919 int clen = rb_enc_fast_mbclen(s, e, enc);
6920
6921 p -= clen;
6922 memcpy(p, s, clen);
6923 s += clen;
6924 }
6925 }
6926 else {
6927 cr = rb_enc_asciicompat(enc) ?
6929 while (s < e) {
6930 int clen = rb_enc_mbclen(s, e, enc);
6931
6932 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6933 p -= clen;
6934 memcpy(p, s, clen);
6935 s += clen;
6936 }
6937 }
6938 }
6939 STR_SET_LEN(rev, RSTRING_LEN(str));
6940 str_enc_copy_direct(rev, str);
6941 ENC_CODERANGE_SET(rev, cr);
6942
6943 return rev;
6944}
6945
6946
6947/*
6948 * call-seq:
6949 * reverse! -> self
6950 *
6951 * Returns +self+ with its characters reversed:
6952 *
6953 * 'drawer'.reverse! # => "reward"
6954 * 'reviled'.reverse! # => "deliver"
6955 * 'stressed'.reverse! # => "desserts"
6956 * 'semordnilaps'.reverse! # => "spalindromes"
6957 *
6958 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6959 */
6960
6961static VALUE
6962rb_str_reverse_bang(VALUE str)
6963{
6964 if (RSTRING_LEN(str) > 1) {
6965 if (single_byte_optimizable(str)) {
6966 char *s, *e, c;
6967
6968 str_modify_keep_cr(str);
6969 s = RSTRING_PTR(str);
6970 e = RSTRING_END(str) - 1;
6971 while (s < e) {
6972 c = *s;
6973 *s++ = *e;
6974 *e-- = c;
6975 }
6976 }
6977 else {
6978 str_shared_replace(str, rb_str_reverse(str));
6979 }
6980 }
6981 else {
6982 str_modify_keep_cr(str);
6983 }
6984 return str;
6985}
6986
6987
6988/*
6989 * call-seq:
6990 * include?(other_string) -> true or false
6991 *
6992 * Returns whether +self+ contains +other_string+:
6993 *
6994 * s = 'bar'
6995 * s.include?('ba') # => true
6996 * s.include?('ar') # => true
6997 * s.include?('bar') # => true
6998 * s.include?('a') # => true
6999 * s.include?('') # => true
7000 * s.include?('foo') # => false
7001 *
7002 * Related: see {Querying}[rdoc-ref:String@Querying].
7003 */
7004
7005VALUE
7006rb_str_include(VALUE str, VALUE arg)
7007{
7008 long i;
7009
7010 StringValue(arg);
7011 i = rb_str_index(str, arg, 0);
7012
7013 return RBOOL(i != -1);
7014}
7015
7016
7017/*
7018 * call-seq:
7019 * to_i(base = 10) -> integer
7020 *
7021 * Returns the result of interpreting leading characters in +self+
7022 * as an integer in the given +base+ (which must be in (0, 2..36)):
7023 *
7024 * '123456'.to_i # => 123456
7025 * '123def'.to_i(16) # => 1195503
7026 *
 * With +base+ zero, +self+ may contain leading characters
7028 * to specify the actual base:
7029 *
7030 * '123def'.to_i(0) # => 123
7031 * '0123def'.to_i(0) # => 83
7032 * '0b123def'.to_i(0) # => 1
7033 * '0o123def'.to_i(0) # => 83
7034 * '0d123def'.to_i(0) # => 123
7035 * '0x123def'.to_i(0) # => 1195503
7036 *
7037 * Characters past a leading valid number (in the given +base+) are ignored:
7038 *
7039 * '12.345'.to_i # => 12
7040 * '12345'.to_i(2) # => 1
7041 *
7042 * Returns zero if there is no leading valid number:
7043 *
7044 * 'abcdef'.to_i # => 0
7045 * '2'.to_i(2) # => 0
7046 *
7047 */
7048
7049static VALUE
7050rb_str_to_i(int argc, VALUE *argv, VALUE str)
7051{
7052 int base = 10;
7053
7054 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7055 rb_raise(rb_eArgError, "invalid radix %d", base);
7056 }
7057 return rb_str_to_inum(str, base, FALSE);
7058}
7059
7060
7061/*
7062 * call-seq:
7063 * to_f -> float
7064 *
7065 * Returns the result of interpreting leading characters in +self+ as a Float:
7066 *
7067 * '3.14159'.to_f # => 3.14159
7068 * '1.234e-2'.to_f # => 0.01234
7069 *
 * Characters past a leading valid number are ignored:
7071 *
7072 * '3.14 (pi to two places)'.to_f # => 3.14
7073 *
7074 * Returns zero if there is no leading valid number:
7075 *
7076 * 'abcdef'.to_f # => 0.0
7077 *
7078 */
7079
7080static VALUE
7081rb_str_to_f(VALUE str)
7082{
7083 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7084}
7085
7086
7087/*
7088 * call-seq:
7089 * to_s -> self or string
7090 *
7091 * Returns +self+ if +self+ is a +String+,
7092 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7093 */
7094
7095static VALUE
7096rb_str_to_s(VALUE str)
7097{
7098 if (rb_obj_class(str) != rb_cString) {
7099 return str_duplicate(rb_cString, str);
7100 }
7101 return str;
7102}
7103
#if 0
/*
 * Disabled by the surrounding "#if 0"; not compiled.
 * Encodes code point +c+ in +enc+ into a local buffer and appends the
 * resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7115
7116#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7117
7118int
7119rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7120{
7121 char buf[CHAR_ESC_LEN + 1];
7122 int l;
7123
7124#if SIZEOF_INT > 4
7125 c &= 0xffffffff;
7126#endif
7127 if (unicode_p) {
7128 if (c < 0x7F && ISPRINT(c)) {
7129 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7130 }
7131 else if (c < 0x10000) {
7132 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7133 }
7134 else {
7135 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7136 }
7137 }
7138 else {
7139 if (c < 0x100) {
7140 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7141 }
7142 else {
7143 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7144 }
7145 }
7146 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7147 rb_str_buf_cat(result, buf, l);
7148 return l;
7149}
7150
/*
 * Returns the backslash notation for the control characters that have one
 * ("\\0", "\\n", "\\r", "\\t", "\\f", "\\v", "\\b", "\\a", "\\e", "\\c?"),
 * or NULL for any other value of +c+.
 */
const char *
ruby_escaped_char(int c)
{
    const char *escaped = NULL;

    switch (c) {
      case '\0':   escaped = "\\0";  break;
      case '\n':   escaped = "\\n";  break;
      case '\r':   escaped = "\\r";  break;
      case '\t':   escaped = "\\t";  break;
      case '\f':   escaped = "\\f";  break;
      case '\013': escaped = "\\v";  break;
      case '\010': escaped = "\\b";  break;
      case '\007': escaped = "\\a";  break;
      case '\033': escaped = "\\e";  break;
      case '\x7f': escaped = "\\c?"; break;
      default:     break;
    }
    return escaped;
}
7168
/*
 * Returns a new US-ASCII string holding an escaped rendering of +str+.
 * Bytes that do not form a character in the string's encoding are written
 * as "\xXX"; characters known to ruby_escaped_char() use its notation;
 * printable ASCII (in ASCII-compatible encodings) is copied through; every
 * other character goes through rb_str_buf_cat_escaped_char().
 */
VALUE
rb_str_escape(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    /* prev marks the start of the pending run of bytes to copy verbatim. */
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* No character here: flush the pending run, then hex-escape up to mbminlen bytes. */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        cc = ruby_escaped_char(c);
        if (cc) {
            /* Flush the run that precedes this character, then emit its escape. */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* Printable ASCII stays in the pending run and is copied later. */
        }
        else {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    /* Flush whatever remains of the pending run. */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
7220
7221/*
7222 * call-seq:
7223 * inspect -> string
7224 *
7225 * :include: doc/string/inspect.rdoc
7226 *
7227 */
7228
7229VALUE
7231{
7232 int encidx = ENCODING_GET(str);
7233 rb_encoding *enc = rb_enc_from_index(encidx);
7234 const char *p, *pend, *prev;
7235 char buf[CHAR_ESC_LEN + 1];
7236 VALUE result = rb_str_buf_new(0);
7237 rb_encoding *resenc = rb_default_internal_encoding();
7238 int unicode_p = rb_enc_unicode_p(enc);
7239 int asciicompat = rb_enc_asciicompat(enc);
7240
7241 if (resenc == NULL) resenc = rb_default_external_encoding();
7242 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7243 rb_enc_associate(result, resenc);
7244 str_buf_cat2(result, "\"");
7245
7246 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7247 prev = p;
7248 while (p < pend) {
7249 unsigned int c, cc;
7250 int n;
7251
7252 n = rb_enc_precise_mbclen(p, pend, enc);
7253 if (!MBCLEN_CHARFOUND_P(n)) {
7254 if (p > prev) str_buf_cat(result, prev, p - prev);
7255 n = rb_enc_mbminlen(enc);
7256 if (pend < p + n)
7257 n = (int)(pend - p);
7258 while (n--) {
7259 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7260 str_buf_cat(result, buf, strlen(buf));
7261 prev = ++p;
7262 }
7263 continue;
7264 }
7265 n = MBCLEN_CHARFOUND_LEN(n);
7266 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7267 p += n;
7268 if ((asciicompat || unicode_p) &&
7269 (c == '"'|| c == '\\' ||
7270 (c == '#' &&
7271 p < pend &&
7272 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7273 (cc = rb_enc_codepoint(p,pend,enc),
7274 (cc == '$' || cc == '@' || cc == '{'))))) {
7275 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7276 str_buf_cat2(result, "\\");
7277 if (asciicompat || enc == resenc) {
7278 prev = p - n;
7279 continue;
7280 }
7281 }
7282 switch (c) {
7283 case '\n': cc = 'n'; break;
7284 case '\r': cc = 'r'; break;
7285 case '\t': cc = 't'; break;
7286 case '\f': cc = 'f'; break;
7287 case '\013': cc = 'v'; break;
7288 case '\010': cc = 'b'; break;
7289 case '\007': cc = 'a'; break;
7290 case 033: cc = 'e'; break;
7291 default: cc = 0; break;
7292 }
7293 if (cc) {
7294 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7295 buf[0] = '\\';
7296 buf[1] = (char)cc;
7297 str_buf_cat(result, buf, 2);
7298 prev = p;
7299 continue;
7300 }
7301 /* The special casing of 0x85 (NEXT_LINE) here is because
7302 * Oniguruma historically treats it as printable, but it
7303 * doesn't match the print POSIX bracket class or character
7304 * property in regexps.
7305 *
7306 * See Ruby Bug #16842 for details:
7307 * https://bugs.ruby-lang.org/issues/16842
7308 */
7309 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7310 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7311 continue;
7312 }
7313 else {
7314 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7315 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7316 prev = p;
7317 continue;
7318 }
7319 }
7320 if (p > prev) str_buf_cat(result, prev, p - prev);
7321 str_buf_cat2(result, "\"");
7322
7323 return result;
7324}
7325
/*
 * True when p lies before e and points at '$', '@' or '{'.
 * The dump code uses it to decide whether a preceding '#' must be
 * backslash-escaped.  Both arguments may be evaluated more than once.
 */
#define IS_EVSTR(p,e) \
    (((p) < (e)) && \
     ((*(p) == '$') || (*(p) == '@') || (*(p) == '{')))
7327
7328/*
7329 * call-seq:
7330 * dump -> new_string
7331 *
7332 * :include: doc/string/dump.rdoc
7333 *
7334 */
7335
7336VALUE
7338{
7339 int encidx = rb_enc_get_index(str);
7340 rb_encoding *enc = rb_enc_from_index(encidx);
7341 long len;
7342 const char *p, *pend;
7343 char *q, *qend;
7344 VALUE result;
7345 int u8 = (encidx == rb_utf8_encindex());
7346 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7347
7348 len = 2; /* "" */
7349 if (!rb_enc_asciicompat(enc)) {
7350 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7351 len += strlen(enc->name);
7352 }
7353
7354 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7355 while (p < pend) {
7356 int clen;
7357 unsigned char c = *p++;
7358
7359 switch (c) {
7360 case '"': case '\\':
7361 case '\n': case '\r':
7362 case '\t': case '\f':
7363 case '\013': case '\010': case '\007': case '\033':
7364 clen = 2;
7365 break;
7366
7367 case '#':
7368 clen = IS_EVSTR(p, pend) ? 2 : 1;
7369 break;
7370
7371 default:
7372 if (ISPRINT(c)) {
7373 clen = 1;
7374 }
7375 else {
7376 if (u8 && c > 0x7F) { /* \u notation */
7377 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7378 if (MBCLEN_CHARFOUND_P(n)) {
7379 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7380 if (cc <= 0xFFFF)
7381 clen = 6; /* \uXXXX */
7382 else if (cc <= 0xFFFFF)
7383 clen = 9; /* \u{XXXXX} */
7384 else
7385 clen = 10; /* \u{XXXXXX} */
7386 p += MBCLEN_CHARFOUND_LEN(n)-1;
7387 break;
7388 }
7389 }
7390 clen = 4; /* \xNN */
7391 }
7392 break;
7393 }
7394
7395 if (clen > LONG_MAX - len) {
7396 rb_raise(rb_eRuntimeError, "string size too big");
7397 }
7398 len += clen;
7399 }
7400
7401 result = rb_str_new(0, len);
7402 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7403 q = RSTRING_PTR(result); qend = q + len + 1;
7404
7405 *q++ = '"';
7406 while (p < pend) {
7407 unsigned char c = *p++;
7408
7409 if (c == '"' || c == '\\') {
7410 *q++ = '\\';
7411 *q++ = c;
7412 }
7413 else if (c == '#') {
7414 if (IS_EVSTR(p, pend)) *q++ = '\\';
7415 *q++ = '#';
7416 }
7417 else if (c == '\n') {
7418 *q++ = '\\';
7419 *q++ = 'n';
7420 }
7421 else if (c == '\r') {
7422 *q++ = '\\';
7423 *q++ = 'r';
7424 }
7425 else if (c == '\t') {
7426 *q++ = '\\';
7427 *q++ = 't';
7428 }
7429 else if (c == '\f') {
7430 *q++ = '\\';
7431 *q++ = 'f';
7432 }
7433 else if (c == '\013') {
7434 *q++ = '\\';
7435 *q++ = 'v';
7436 }
7437 else if (c == '\010') {
7438 *q++ = '\\';
7439 *q++ = 'b';
7440 }
7441 else if (c == '\007') {
7442 *q++ = '\\';
7443 *q++ = 'a';
7444 }
7445 else if (c == '\033') {
7446 *q++ = '\\';
7447 *q++ = 'e';
7448 }
7449 else if (ISPRINT(c)) {
7450 *q++ = c;
7451 }
7452 else {
7453 *q++ = '\\';
7454 if (u8) {
7455 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7456 if (MBCLEN_CHARFOUND_P(n)) {
7457 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7458 p += n;
7459 if (cc <= 0xFFFF)
7460 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7461 else
7462 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7463 q += strlen(q);
7464 continue;
7465 }
7466 }
7467 snprintf(q, qend-q, "x%02X", c);
7468 q += 3;
7469 }
7470 }
7471 *q++ = '"';
7472 *q = '\0';
7473 if (!rb_enc_asciicompat(enc)) {
7474 snprintf(q, qend-q, nonascii_suffix, enc->name);
7475 encidx = rb_ascii8bit_encindex();
7476 }
7477 /* result from dump is ASCII */
7478 rb_enc_associate_index(result, encidx);
7480 return result;
7481}
7482
/*
 * Maps the letter of a single-character backslash escape ('n', 'r', 't',
 * 'f', 'v', 'b', 'a', 'e') to the character it denotes.
 *
 * Returns -1 for any other input, so the function never falls off its end
 * without a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not an escape letter handled here */
        return -1;
    }
}
7506
7507static void
7508undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7509{
7510 const char *s = *ss;
7511 unsigned int c;
7512 int codelen;
7513 size_t hexlen;
7514 unsigned char buf[6];
7515 static rb_encoding *enc_utf8 = NULL;
7516
7517 switch (*s) {
7518 case '\\':
7519 case '"':
7520 case '#':
7521 rb_str_cat(undumped, s, 1); /* cat itself */
7522 s++;
7523 break;
7524 case 'n':
7525 case 'r':
7526 case 't':
7527 case 'f':
7528 case 'v':
7529 case 'b':
7530 case 'a':
7531 case 'e':
7532 *buf = unescape_ascii(*s);
7533 rb_str_cat(undumped, (char *)buf, 1);
7534 s++;
7535 break;
7536 case 'u':
7537 if (*binary) {
7538 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7539 }
7540 *utf8 = true;
7541 if (++s >= s_end) {
7542 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7543 }
7544 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7545 if (*penc != enc_utf8) {
7546 *penc = enc_utf8;
7547 rb_enc_associate(undumped, enc_utf8);
7548 }
7549 if (*s == '{') { /* handle \u{...} form */
7550 s++;
7551 for (;;) {
7552 if (s >= s_end) {
7553 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7554 }
7555 if (*s == '}') {
7556 s++;
7557 break;
7558 }
7559 if (ISSPACE(*s)) {
7560 s++;
7561 continue;
7562 }
7563 c = scan_hex(s, s_end-s, &hexlen);
7564 if (hexlen == 0 || hexlen > 6) {
7565 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7566 }
7567 if (c > 0x10ffff) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7569 }
7570 if (0xd800 <= c && c <= 0xdfff) {
7571 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7572 }
7573 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7574 rb_str_cat(undumped, (char *)buf, codelen);
7575 s += hexlen;
7576 }
7577 }
7578 else { /* handle \uXXXX form */
7579 c = scan_hex(s, 4, &hexlen);
7580 if (hexlen != 4) {
7581 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7582 }
7583 if (0xd800 <= c && c <= 0xdfff) {
7584 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7585 }
7586 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7587 rb_str_cat(undumped, (char *)buf, codelen);
7588 s += hexlen;
7589 }
7590 break;
7591 case 'x':
7592 if (*utf8) {
7593 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7594 }
7595 *binary = true;
7596 if (++s >= s_end) {
7597 rb_raise(rb_eRuntimeError, "invalid hex escape");
7598 }
7599 *buf = scan_hex(s, 2, &hexlen);
7600 if (hexlen != 2) {
7601 rb_raise(rb_eRuntimeError, "invalid hex escape");
7602 }
7603 rb_str_cat(undumped, (char *)buf, 1);
7604 s += hexlen;
7605 break;
7606 default:
7607 rb_str_cat(undumped, s-1, 2);
7608 s++;
7609 }
7610
7611 *ss = s;
7612}
7613
7614static VALUE rb_str_is_ascii_only_p(VALUE str);
7615
7616/*
7617 * call-seq:
7618 * undump -> string
7619 *
7620 * Returns an unescaped version of +self+:
7621 *
7622 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7623 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7624 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7625 * s_undumped == s_orig # => true
7626 *
7627 * Related: String#dump (inverse of String#undump).
7628 *
7629 */
7630
7631static VALUE
7632str_undump(VALUE str)
7633{
7634 const char *s = RSTRING_PTR(str);
7635 const char *s_end = RSTRING_END(str);
7636 rb_encoding *enc = rb_enc_get(str);
7637 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7638 bool utf8 = false;
7639 bool binary = false;
7640 int w;
7641
7643 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7644 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7645 }
7646 if (!str_null_check(str, &w)) {
7647 rb_raise(rb_eRuntimeError, "string contains null byte");
7648 }
7649 if (RSTRING_LEN(str) < 2) goto invalid_format;
7650 if (*s != '"') goto invalid_format;
7651
7652 /* strip '"' at the start */
7653 s++;
7654
7655 for (;;) {
7656 if (s >= s_end) {
7657 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7658 }
7659
7660 if (*s == '"') {
7661 /* epilogue */
7662 s++;
7663 if (s == s_end) {
7664 /* ascii compatible dumped string */
7665 break;
7666 }
7667 else {
7668 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7669 static const char dup_suffix[] = ".dup";
7670 const char *encname;
7671 int encidx;
7672 ptrdiff_t size;
7673
7674 /* check separately for strings dumped by older versions */
7675 size = sizeof(dup_suffix) - 1;
7676 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7677
7678 size = sizeof(force_encoding_suffix) - 1;
7679 if (s_end - s <= size) goto invalid_format;
7680 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7681 s += size;
7682
7683 if (utf8) {
7684 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7685 }
7686
7687 encname = s;
7688 s = memchr(s, '"', s_end-s);
7689 size = s - encname;
7690 if (!s) goto invalid_format;
7691 if (s_end - s != 2) goto invalid_format;
7692 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7693
7694 encidx = rb_enc_find_index2(encname, (long)size);
7695 if (encidx < 0) {
7696 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7697 }
7698 rb_enc_associate_index(undumped, encidx);
7699 }
7700 break;
7701 }
7702
7703 if (*s == '\\') {
7704 s++;
7705 if (s >= s_end) {
7706 rb_raise(rb_eRuntimeError, "invalid escape");
7707 }
7708 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7709 }
7710 else {
7711 rb_str_cat(undumped, s++, 1);
7712 }
7713 }
7714
7715 RB_GC_GUARD(str);
7716
7717 return undumped;
7718invalid_format:
7719 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7720}
7721
7722static void
7723rb_str_check_dummy_enc(rb_encoding *enc)
7724{
7725 if (rb_enc_dummy_p(enc)) {
7726 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7727 rb_enc_name(enc));
7728 }
7729}
7730
7731static rb_encoding *
7732str_true_enc(VALUE str)
7733{
7734 rb_encoding *enc = STR_ENC_GET(str);
7735 rb_str_check_dummy_enc(enc);
7736 return enc;
7737}
7738
7739static OnigCaseFoldType
7740check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7741{
7742 if (argc==0)
7743 return flags;
7744 if (argc>2)
7745 rb_raise(rb_eArgError, "too many options");
7746 if (argv[0]==sym_turkic) {
7747 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7748 if (argc==2) {
7749 if (argv[1]==sym_lithuanian)
7750 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7751 else
7752 rb_raise(rb_eArgError, "invalid second option");
7753 }
7754 }
7755 else if (argv[0]==sym_lithuanian) {
7756 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7757 if (argc==2) {
7758 if (argv[1]==sym_turkic)
7759 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7760 else
7761 rb_raise(rb_eArgError, "invalid second option");
7762 }
7763 }
7764 else if (argc>1)
7765 rb_raise(rb_eArgError, "too many options");
7766 else if (argv[0]==sym_ascii)
7767 flags |= ONIGENC_CASE_ASCII_ONLY;
7768 else if (argv[0]==sym_fold) {
7769 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7770 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7771 else
7772 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7773 }
7774 else
7775 rb_raise(rb_eArgError, "invalid option");
7776 return flags;
7777}
7778
7779static inline bool
7780case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7781{
7782 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7783 return true;
7784 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7785}
7786
/* Extra bytes of slack added to each case-mapping buffer: 16 should be long
 * enough to absorb any kind of single character length increase, and 20
 * leaves some margin on top of that. */
7788#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7789#ifndef CASEMAP_DEBUG
7790# define CASEMAP_DEBUG 0
7791#endif
7792
7793struct mapping_buffer;
7794typedef struct mapping_buffer {
7795 size_t capa;
7796 size_t used;
7797 struct mapping_buffer *next;
7798 OnigUChar space[FLEX_ARY_LEN];
7800
7801static void
7802mapping_buffer_free(void *p)
7803{
7804 mapping_buffer *previous_buffer;
7805 mapping_buffer *current_buffer = p;
7806 while (current_buffer) {
7807 previous_buffer = current_buffer;
7808 current_buffer = current_buffer->next;
7809 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7810 }
7811}
7812
7813static const rb_data_type_t mapping_buffer_type = {
7814 "mapping_buffer",
7815 {0, mapping_buffer_free,},
7816 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7817};
7818
7819static VALUE
7820rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7821{
7822 VALUE target;
7823
7824 const OnigUChar *source_current, *source_end;
7825 int target_length = 0;
7826 VALUE buffer_anchor;
7827 mapping_buffer *current_buffer = 0;
7828 mapping_buffer **pre_buffer;
7829 size_t buffer_count = 0;
7830 int buffer_length_or_invalid;
7831
7832 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7833
7834 source_current = (OnigUChar*)RSTRING_PTR(source);
7835 source_end = (OnigUChar*)RSTRING_END(source);
7836
7837 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7838 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7839 while (source_current < source_end) {
7840 /* increase multiplier using buffer count to converge quickly */
7841 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7842 if (CASEMAP_DEBUG) {
7843 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7844 }
7845 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7846 *pre_buffer = current_buffer;
7847 pre_buffer = &current_buffer->next;
7848 current_buffer->next = NULL;
7849 current_buffer->capa = capa;
7850 buffer_length_or_invalid = enc->case_map(flags,
7851 &source_current, source_end,
7852 current_buffer->space,
7853 current_buffer->space+current_buffer->capa,
7854 enc);
7855 if (buffer_length_or_invalid < 0) {
7856 current_buffer = DATA_PTR(buffer_anchor);
7857 DATA_PTR(buffer_anchor) = 0;
7858 mapping_buffer_free(current_buffer);
7859 rb_raise(rb_eArgError, "input string invalid");
7860 }
7861 target_length += current_buffer->used = buffer_length_or_invalid;
7862 }
7863 if (CASEMAP_DEBUG) {
7864 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7865 }
7866
7867 if (buffer_count==1) {
7868 target = rb_str_new((const char*)current_buffer->space, target_length);
7869 }
7870 else {
7871 char *target_current;
7872
7873 target = rb_str_new(0, target_length);
7874 target_current = RSTRING_PTR(target);
7875 current_buffer = DATA_PTR(buffer_anchor);
7876 while (current_buffer) {
7877 memcpy(target_current, current_buffer->space, current_buffer->used);
7878 target_current += current_buffer->used;
7879 current_buffer = current_buffer->next;
7880 }
7881 }
7882 current_buffer = DATA_PTR(buffer_anchor);
7883 DATA_PTR(buffer_anchor) = 0;
7884 mapping_buffer_free(current_buffer);
7885
7886 RB_GC_GUARD(buffer_anchor);
7887
7888 /* TODO: check about string terminator character */
7889 str_enc_copy_direct(target, source);
7890 /*ENC_CODERANGE_SET(mapped, cr);*/
7891
7892 return target;
7893}
7894
7895static VALUE
7896rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7897{
7898 const OnigUChar *source_current, *source_end;
7899 OnigUChar *target_current, *target_end;
7900 long old_length = RSTRING_LEN(source);
7901 int length_or_invalid;
7902
7903 if (old_length == 0) return Qnil;
7904
7905 source_current = (OnigUChar*)RSTRING_PTR(source);
7906 source_end = (OnigUChar*)RSTRING_END(source);
7907 if (source == target) {
7908 target_current = (OnigUChar*)source_current;
7909 target_end = (OnigUChar*)source_end;
7910 }
7911 else {
7912 target_current = (OnigUChar*)RSTRING_PTR(target);
7913 target_end = (OnigUChar*)RSTRING_END(target);
7914 }
7915
7916 length_or_invalid = onigenc_ascii_only_case_map(flags,
7917 &source_current, source_end,
7918 target_current, target_end, enc);
7919 if (length_or_invalid < 0)
7920 rb_raise(rb_eArgError, "input string invalid");
7921 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7922 fprintf(stderr, "problem with rb_str_ascii_casemap"
7923 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7924 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7925 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7926 }
7927
7928 str_enc_copy(target, source);
7929
7930 return target;
7931}
7932
7933static bool
7934upcase_single(VALUE str)
7935{
7936 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7937 bool modified = false;
7938
7939 while (s < send) {
7940 unsigned int c = *(unsigned char*)s;
7941
7942 if ('a' <= c && c <= 'z') {
7943 *s = 'A' + (c - 'a');
7944 modified = true;
7945 }
7946 s++;
7947 }
7948 return modified;
7949}
7950
7951/*
7952 * call-seq:
7953 * upcase!(mapping) -> self or nil
7954 *
7955 * Upcases the characters in +self+;
7956 * returns +self+ if any changes were made, +nil+ otherwise:
7957 *
7958 * s = 'Hello World!' # => "Hello World!"
7959 * s.upcase! # => "HELLO WORLD!"
7960 * s # => "HELLO WORLD!"
7961 * s.upcase! # => nil
7962 *
7963 * The casing may be affected by the given +mapping+;
7964 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7965 *
7966 * Related: String#upcase, String#downcase, String#downcase!.
7967 *
7968 */
7969
7970static VALUE
7971rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7972{
7973 rb_encoding *enc;
7974 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7975
7976 flags = check_case_options(argc, argv, flags);
7977 str_modify_keep_cr(str);
7978 enc = str_true_enc(str);
7979 if (case_option_single_p(flags, enc, str)) {
7980 if (upcase_single(str))
7981 flags |= ONIGENC_CASE_MODIFIED;
7982 }
7983 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7984 rb_str_ascii_casemap(str, str, &flags, enc);
7985 else
7986 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7987
7988 if (ONIGENC_CASE_MODIFIED&flags) return str;
7989 return Qnil;
7990}
7991
7992
7993/*
7994 * call-seq:
7995 * upcase(mapping) -> string
7996 *
7997 * Returns a string containing the upcased characters in +self+:
7998 *
7999 * s = 'Hello World!' # => "Hello World!"
8000 * s.upcase # => "HELLO WORLD!"
8001 *
8002 * The casing may be affected by the given +mapping+;
8003 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8004 *
8005 * Related: String#upcase!, String#downcase, String#downcase!.
8006 *
8007 */
8008
8009static VALUE
8010rb_str_upcase(int argc, VALUE *argv, VALUE str)
8011{
8012 rb_encoding *enc;
8013 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8014 VALUE ret;
8015
8016 flags = check_case_options(argc, argv, flags);
8017 enc = str_true_enc(str);
8018 if (case_option_single_p(flags, enc, str)) {
8019 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8020 str_enc_copy_direct(ret, str);
8021 upcase_single(ret);
8022 }
8023 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8024 ret = rb_str_new(0, RSTRING_LEN(str));
8025 rb_str_ascii_casemap(str, ret, &flags, enc);
8026 }
8027 else {
8028 ret = rb_str_casemap(str, &flags, enc);
8029 }
8030
8031 return ret;
8032}
8033
8034static bool
8035downcase_single(VALUE str)
8036{
8037 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8038 bool modified = false;
8039
8040 while (s < send) {
8041 unsigned int c = *(unsigned char*)s;
8042
8043 if ('A' <= c && c <= 'Z') {
8044 *s = 'a' + (c - 'A');
8045 modified = true;
8046 }
8047 s++;
8048 }
8049
8050 return modified;
8051}
8052
8053/*
8054 * call-seq:
8055 * downcase!(mapping) -> self or nil
8056 *
8057 * Like String#downcase, except that:
8058 *
8059 * - Changes character casings in +self+ (not in a copy of +self+).
8060 * - Returns +self+ if any changes are made, +nil+ otherwise.
8061 *
8062 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8063 */
8064
8065static VALUE
8066rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8067{
8068 rb_encoding *enc;
8069 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8070
8071 flags = check_case_options(argc, argv, flags);
8072 str_modify_keep_cr(str);
8073 enc = str_true_enc(str);
8074 if (case_option_single_p(flags, enc, str)) {
8075 if (downcase_single(str))
8076 flags |= ONIGENC_CASE_MODIFIED;
8077 }
8078 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8079 rb_str_ascii_casemap(str, str, &flags, enc);
8080 else
8081 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8082
8083 if (ONIGENC_CASE_MODIFIED&flags) return str;
8084 return Qnil;
8085}
8086
8087
8088/*
8089 * call-seq:
8090 * downcase(mapping) -> string
8091 *
8092 * :include: doc/string/downcase.rdoc
8093 *
8094 */
8095
8096static VALUE
8097rb_str_downcase(int argc, VALUE *argv, VALUE str)
8098{
8099 rb_encoding *enc;
8100 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8101 VALUE ret;
8102
8103 flags = check_case_options(argc, argv, flags);
8104 enc = str_true_enc(str);
8105 if (case_option_single_p(flags, enc, str)) {
8106 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8107 str_enc_copy_direct(ret, str);
8108 downcase_single(ret);
8109 }
8110 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8111 ret = rb_str_new(0, RSTRING_LEN(str));
8112 rb_str_ascii_casemap(str, ret, &flags, enc);
8113 }
8114 else {
8115 ret = rb_str_casemap(str, &flags, enc);
8116 }
8117
8118 return ret;
8119}
8120
8121
8122/*
8123 * call-seq:
8124 * capitalize!(mapping = :ascii) -> self or nil
8125 *
8126 * Like String#capitalize, except that:
8127 *
8128 * - Changes character casings in +self+ (not in a copy of +self+).
8129 * - Returns +self+ if any changes are made, +nil+ otherwise.
8130 *
8131 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8132 */
8133
8134static VALUE
8135rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8136{
8137 rb_encoding *enc;
8138 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8139
8140 flags = check_case_options(argc, argv, flags);
8141 str_modify_keep_cr(str);
8142 enc = str_true_enc(str);
8143 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8144 if (flags&ONIGENC_CASE_ASCII_ONLY)
8145 rb_str_ascii_casemap(str, str, &flags, enc);
8146 else
8147 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8148
8149 if (ONIGENC_CASE_MODIFIED&flags) return str;
8150 return Qnil;
8151}
8152
8153
8154/*
8155 * call-seq:
8156 * capitalize(mapping = :ascii) -> string
8157 *
8158 * Returns a string containing the characters in +self+,
8159 * each with possibly changed case:
8160 *
8161 * - The first character is upcased.
8162 * - All other characters are downcased.
8163 *
8164 * Examples:
8165 *
8166 * 'hello world'.capitalize # => "Hello world"
8167 * 'HELLO WORLD'.capitalize # => "Hello world"
8168 *
8169 * Some characters do not have upcase and downcase, and so are not changed;
8170 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8171 *
8172 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8173 *
8174 * The casing is affected by the given +mapping+,
8175 * which may be +:ascii+, +:fold+, or +:turkic+;
8176 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8177 *
8178 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8179 */
8180
8181static VALUE
8182rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8183{
8184 rb_encoding *enc;
8185 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8186 VALUE ret;
8187
8188 flags = check_case_options(argc, argv, flags);
8189 enc = str_true_enc(str);
8190 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8191 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8192 ret = rb_str_new(0, RSTRING_LEN(str));
8193 rb_str_ascii_casemap(str, ret, &flags, enc);
8194 }
8195 else {
8196 ret = rb_str_casemap(str, &flags, enc);
8197 }
8198 return ret;
8199}
8200
8201
8202/*
8203 * call-seq:
8204 * swapcase!(mapping) -> self or nil
8205 *
8206 * Like String#swapcase, except that:
8207 *
8208 * - Changes are made to +self+, not to copy of +self+.
8209 * - Returns +self+ if any changes are made, +nil+ otherwise.
8210 *
8211 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8212 */
8213
8214static VALUE
8215rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8216{
8217 rb_encoding *enc;
8218 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8219
8220 flags = check_case_options(argc, argv, flags);
8221 str_modify_keep_cr(str);
8222 enc = str_true_enc(str);
8223 if (flags&ONIGENC_CASE_ASCII_ONLY)
8224 rb_str_ascii_casemap(str, str, &flags, enc);
8225 else
8226 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8227
8228 if (ONIGENC_CASE_MODIFIED&flags) return str;
8229 return Qnil;
8230}
8231
8232
8233/*
8234 * call-seq:
8235 * swapcase(mapping) -> new_string
8236 *
8237 * :include: doc/string/swapcase.rdoc
8238 *
8239 */
8240
8241static VALUE
8242rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8243{
8244 rb_encoding *enc;
8245 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8246 VALUE ret;
8247
8248 flags = check_case_options(argc, argv, flags);
8249 enc = str_true_enc(str);
8250 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8251 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8252 ret = rb_str_new(0, RSTRING_LEN(str));
8253 rb_str_ascii_casemap(str, ret, &flags, enc);
8254 }
8255 else {
8256 ret = rb_str_casemap(str, &flags, enc);
8257 }
8258 return ret;
8259}
8260
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while a range "a-z" is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8268
8269static unsigned int
8270trnext(struct tr *t, rb_encoding *enc)
8271{
8272 int n;
8273
8274 for (;;) {
8275 nextpart:
8276 if (!t->gen) {
8277 if (t->p == t->pend) return -1;
8278 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8279 t->p += n;
8280 }
8281 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8282 t->p += n;
8283 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8284 t->p += n;
8285 if (t->p < t->pend) {
8286 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8287 t->p += n;
8288 if (t->now > c) {
8289 if (t->now < 0x80 && c < 0x80) {
8290 rb_raise(rb_eArgError,
8291 "invalid range \"%c-%c\" in string transliteration",
8292 t->now, c);
8293 }
8294 else {
8295 rb_raise(rb_eArgError, "invalid range in string transliteration");
8296 }
8297 continue; /* not reached */
8298 }
8299 else if (t->now < c) {
8300 t->gen = 1;
8301 t->max = c;
8302 }
8303 }
8304 }
8305 return t->now;
8306 }
8307 else {
8308 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8309 if (t->now == t->max) {
8310 t->gen = 0;
8311 goto nextpart;
8312 }
8313 }
8314 if (t->now < t->max) {
8315 return t->now;
8316 }
8317 else {
8318 t->gen = 0;
8319 return t->max;
8320 }
8321 }
8322 }
8323}
8324
8325static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8326
8327static VALUE
8328tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8329{
8330 const unsigned int errc = -1;
8331 unsigned int trans[256];
8332 rb_encoding *enc, *e1, *e2;
8333 struct tr trsrc, trrepl;
8334 int cflag = 0;
8335 unsigned int c, c0, last = 0;
8336 int modify = 0, i, l;
8337 unsigned char *s, *send;
8338 VALUE hash = 0;
8339 int singlebyte = single_byte_optimizable(str);
8340 int termlen;
8341 int cr;
8342
8343#define CHECK_IF_ASCII(c) \
8344 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8345 (cr = ENC_CODERANGE_VALID) : 0)
8346
8347 StringValue(src);
8348 StringValue(repl);
8349 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8350 if (RSTRING_LEN(repl) == 0) {
8351 return rb_str_delete_bang(1, &src, str);
8352 }
8353
8354 cr = ENC_CODERANGE(str);
8355 e1 = rb_enc_check(str, src);
8356 e2 = rb_enc_check(str, repl);
8357 if (e1 == e2) {
8358 enc = e1;
8359 }
8360 else {
8361 enc = rb_enc_check(src, repl);
8362 }
8363 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8364 if (RSTRING_LEN(src) > 1 &&
8365 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8366 trsrc.p + l < trsrc.pend) {
8367 cflag = 1;
8368 trsrc.p += l;
8369 }
8370 trrepl.p = RSTRING_PTR(repl);
8371 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8372 trsrc.gen = trrepl.gen = 0;
8373 trsrc.now = trrepl.now = 0;
8374 trsrc.max = trrepl.max = 0;
8375
8376 if (cflag) {
8377 for (i=0; i<256; i++) {
8378 trans[i] = 1;
8379 }
8380 while ((c = trnext(&trsrc, enc)) != errc) {
8381 if (c < 256) {
8382 trans[c] = errc;
8383 }
8384 else {
8385 if (!hash) hash = rb_hash_new();
8386 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8387 }
8388 }
8389 while ((c = trnext(&trrepl, enc)) != errc)
8390 /* retrieve last replacer */;
8391 last = trrepl.now;
8392 for (i=0; i<256; i++) {
8393 if (trans[i] != errc) {
8394 trans[i] = last;
8395 }
8396 }
8397 }
8398 else {
8399 unsigned int r;
8400
8401 for (i=0; i<256; i++) {
8402 trans[i] = errc;
8403 }
8404 while ((c = trnext(&trsrc, enc)) != errc) {
8405 r = trnext(&trrepl, enc);
8406 if (r == errc) r = trrepl.now;
8407 if (c < 256) {
8408 trans[c] = r;
8409 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8410 }
8411 else {
8412 if (!hash) hash = rb_hash_new();
8413 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8414 }
8415 }
8416 }
8417
8418 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8419 cr = ENC_CODERANGE_7BIT;
8420 str_modify_keep_cr(str);
8421 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8422 termlen = rb_enc_mbminlen(enc);
8423 if (sflag) {
8424 int clen, tlen;
8425 long offset, max = RSTRING_LEN(str);
8426 unsigned int save = -1;
8427 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8428
8429 while (s < send) {
8430 int may_modify = 0;
8431
8432 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8433 if (!MBCLEN_CHARFOUND_P(r)) {
8434 xfree(buf);
8435 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8436 }
8437 clen = MBCLEN_CHARFOUND_LEN(r);
8438 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8439
8440 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8441
8442 s += clen;
8443 if (c < 256) {
8444 c = trans[c];
8445 }
8446 else if (hash) {
8447 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8448 if (NIL_P(tmp)) {
8449 if (cflag) c = last;
8450 else c = errc;
8451 }
8452 else if (cflag) c = errc;
8453 else c = NUM2INT(tmp);
8454 }
8455 else {
8456 c = errc;
8457 }
8458 if (c != (unsigned int)-1) {
8459 if (save == c) {
8460 CHECK_IF_ASCII(c);
8461 continue;
8462 }
8463 save = c;
8464 tlen = rb_enc_codelen(c, enc);
8465 modify = 1;
8466 }
8467 else {
8468 save = -1;
8469 c = c0;
8470 if (enc != e1) may_modify = 1;
8471 }
8472 if ((offset = t - buf) + tlen > max) {
8473 size_t MAYBE_UNUSED(old) = max + termlen;
8474 max = offset + tlen + (send - s);
8475 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8476 t = buf + offset;
8477 }
8478 rb_enc_mbcput(c, t, enc);
8479 if (may_modify && memcmp(s, t, tlen) != 0) {
8480 modify = 1;
8481 }
8482 CHECK_IF_ASCII(c);
8483 t += tlen;
8484 }
8485 if (!STR_EMBED_P(str)) {
8486 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8487 }
8488 TERM_FILL((char *)t, termlen);
8489 RSTRING(str)->as.heap.ptr = (char *)buf;
8490 STR_SET_LEN(str, t - buf);
8491 STR_SET_NOEMBED(str);
8492 RSTRING(str)->as.heap.aux.capa = max;
8493 }
8494 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8495 while (s < send) {
8496 c = (unsigned char)*s;
8497 if (trans[c] != errc) {
8498 if (!cflag) {
8499 c = trans[c];
8500 *s = c;
8501 modify = 1;
8502 }
8503 else {
8504 *s = last;
8505 modify = 1;
8506 }
8507 }
8508 CHECK_IF_ASCII(c);
8509 s++;
8510 }
8511 }
8512 else {
8513 int clen, tlen;
8514 long offset, max = (long)((send - s) * 1.2);
8515 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8516
8517 while (s < send) {
8518 int may_modify = 0;
8519
8520 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8521 if (!MBCLEN_CHARFOUND_P(r)) {
8522 xfree(buf);
8523 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8524 }
8525 clen = MBCLEN_CHARFOUND_LEN(r);
8526 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8527
8528 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8529
8530 if (c < 256) {
8531 c = trans[c];
8532 }
8533 else if (hash) {
8534 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8535 if (NIL_P(tmp)) {
8536 if (cflag) c = last;
8537 else c = errc;
8538 }
8539 else if (cflag) c = errc;
8540 else c = NUM2INT(tmp);
8541 }
8542 else {
8543 c = cflag ? last : errc;
8544 }
8545 if (c != errc) {
8546 tlen = rb_enc_codelen(c, enc);
8547 modify = 1;
8548 }
8549 else {
8550 c = c0;
8551 if (enc != e1) may_modify = 1;
8552 }
8553 if ((offset = t - buf) + tlen > max) {
8554 size_t MAYBE_UNUSED(old) = max + termlen;
8555 max = offset + tlen + (long)((send - s) * 1.2);
8556 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8557 t = buf + offset;
8558 }
8559 if (s != t) {
8560 rb_enc_mbcput(c, t, enc);
8561 if (may_modify && memcmp(s, t, tlen) != 0) {
8562 modify = 1;
8563 }
8564 }
8565 CHECK_IF_ASCII(c);
8566 s += clen;
8567 t += tlen;
8568 }
8569 if (!STR_EMBED_P(str)) {
8570 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8571 }
8572 TERM_FILL((char *)t, termlen);
8573 RSTRING(str)->as.heap.ptr = (char *)buf;
8574 STR_SET_LEN(str, t - buf);
8575 STR_SET_NOEMBED(str);
8576 RSTRING(str)->as.heap.aux.capa = max;
8577 }
8578
8579 if (modify) {
8580 if (cr != ENC_CODERANGE_BROKEN)
8581 ENC_CODERANGE_SET(str, cr);
8582 rb_enc_associate(str, enc);
8583 return str;
8584 }
8585 return Qnil;
8586}
8587
8588
8589/*
8590 * call-seq:
8591 * tr!(selector, replacements) -> self or nil
8592 *
8593 * Like String#tr, but modifies +self+ in place.
8594 * Returns +self+ if any changes were made, +nil+ otherwise.
8595 *
8596 */
8597
8598static VALUE
8599rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8600{
8601 return tr_trans(str, src, repl, 0);
8602}
8603
8604
8605/*
8606 * call-seq:
8607 * tr(selector, replacements) -> new_string
8608 *
8609 * Returns a copy of +self+ with each character specified by string +selector+
8610 * translated to the corresponding character in string +replacements+.
8611 * The correspondence is _positional_:
8612 *
8613 * - Each occurrence of the first character specified by +selector+
8614 * is translated to the first character in +replacements+.
8615 * - Each occurrence of the second character specified by +selector+
8616 * is translated to the second character in +replacements+.
8617 * - And so on.
8618 *
8619 * Example:
8620 *
8621 * 'hello'.tr('el', 'ip') #=> "hippo"
8622 *
8623 * If +replacements+ is shorter than +selector+,
8624 * it is implicitly padded with its own last character:
8625 *
8626 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8627 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8628 *
8629 * Arguments +selector+ and +replacements+ must be valid character selectors
8630 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8631 * and may use any of its valid forms, including negation, ranges, and escaping:
8632 *
8633 * # Negation.
8634 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8635 * # Ranges.
8636 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8637 * # Escapes.
8638 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8639 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8640 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8641 *
8642 */
8643
8644static VALUE
8645rb_str_tr(VALUE str, VALUE src, VALUE repl)
8646{
8647 str = str_duplicate(rb_cString, str);
8648 tr_trans(str, src, repl, 0);
8649 return str;
8650}
8651
8652#define TR_TABLE_MAX (UCHAR_MAX+1)
8653#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8654static void
8655tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8656 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8657{
8658 const unsigned int errc = -1;
8659 char buf[TR_TABLE_MAX];
8660 struct tr tr;
8661 unsigned int c;
8662 VALUE table = 0, ptable = 0;
8663 int i, l, cflag = 0;
8664
8665 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8666 tr.gen = tr.now = tr.max = 0;
8667
8668 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8669 cflag = 1;
8670 tr.p += l;
8671 }
8672 if (first) {
8673 for (i=0; i<TR_TABLE_MAX; i++) {
8674 stable[i] = 1;
8675 }
8676 stable[TR_TABLE_MAX] = cflag;
8677 }
8678 else if (stable[TR_TABLE_MAX] && !cflag) {
8679 stable[TR_TABLE_MAX] = 0;
8680 }
8681 for (i=0; i<TR_TABLE_MAX; i++) {
8682 buf[i] = cflag;
8683 }
8684
8685 while ((c = trnext(&tr, enc)) != errc) {
8686 if (c < TR_TABLE_MAX) {
8687 buf[(unsigned char)c] = !cflag;
8688 }
8689 else {
8690 VALUE key = UINT2NUM(c);
8691
8692 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8693 if (cflag) {
8694 ptable = *ctablep;
8695 table = ptable ? ptable : rb_hash_new();
8696 *ctablep = table;
8697 }
8698 else {
8699 table = rb_hash_new();
8700 ptable = *tablep;
8701 *tablep = table;
8702 }
8703 }
8704 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8705 rb_hash_aset(table, key, Qtrue);
8706 }
8707 }
8708 }
8709 for (i=0; i<TR_TABLE_MAX; i++) {
8710 stable[i] = stable[i] && buf[i];
8711 }
8712 if (!table && !cflag) {
8713 *tablep = 0;
8714 }
8715}
8716
8717
8718static int
8719tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8720{
8721 if (c < TR_TABLE_MAX) {
8722 return table[c] != 0;
8723 }
8724 else {
8725 VALUE v = UINT2NUM(c);
8726
8727 if (del) {
8728 if (!NIL_P(rb_hash_lookup(del, v)) &&
8729 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8730 return TRUE;
8731 }
8732 }
8733 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8734 return FALSE;
8735 }
8736 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8737 }
8738}
8739
8740/*
8741 * call-seq:
8742 * delete!(*selectors) -> self or nil
8743 *
8744 * Like String#delete, but modifies +self+ in place;
8745 * returns +self+ if any characters were deleted, +nil+ otherwise.
8746 *
8747 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8748 */
8749
8750static VALUE
8751rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8752{
8753 char squeez[TR_TABLE_SIZE];
8754 rb_encoding *enc = 0;
8755 char *s, *send, *t;
8756 VALUE del = 0, nodel = 0;
8757 int modify = 0;
8758 int i, ascompat, cr;
8759
8760 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8762 for (i=0; i<argc; i++) {
8763 VALUE s = argv[i];
8764
8765 StringValue(s);
8766 enc = rb_enc_check(str, s);
8767 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8768 }
8769
8770 str_modify_keep_cr(str);
8771 ascompat = rb_enc_asciicompat(enc);
8772 s = t = RSTRING_PTR(str);
8773 send = RSTRING_END(str);
8774 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8775 while (s < send) {
8776 unsigned int c;
8777 int clen;
8778
8779 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8780 if (squeez[c]) {
8781 modify = 1;
8782 }
8783 else {
8784 if (t != s) *t = c;
8785 t++;
8786 }
8787 s++;
8788 }
8789 else {
8790 c = rb_enc_codepoint_len(s, send, &clen, enc);
8791
8792 if (tr_find(c, squeez, del, nodel)) {
8793 modify = 1;
8794 }
8795 else {
8796 if (t != s) rb_enc_mbcput(c, t, enc);
8797 t += clen;
8799 }
8800 s += clen;
8801 }
8802 }
8803 TERM_FILL(t, TERM_LEN(str));
8804 STR_SET_LEN(str, t - RSTRING_PTR(str));
8805 ENC_CODERANGE_SET(str, cr);
8806
8807 if (modify) return str;
8808 return Qnil;
8809}
8810
8811
8812/*
8813 * call-seq:
8814 * delete(*selectors) -> new_string
8815 *
8816 * :include: doc/string/delete.rdoc
8817 *
8818 */
8819
8820static VALUE
8821rb_str_delete(int argc, VALUE *argv, VALUE str)
8822{
8823 str = str_duplicate(rb_cString, str);
8824 rb_str_delete_bang(argc, argv, str);
8825 return str;
8826}
8827
8828
8829/*
8830 * call-seq:
8831 * squeeze!(*selectors) -> self or nil
8832 *
8833 * Like String#squeeze, except that:
8834 *
8835 * - Characters are squeezed in +self+ (not in a copy of +self+).
8836 * - Returns +self+ if any changes are made, +nil+ otherwise.
8837 *
8838 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8839 */
8840
8841static VALUE
8842rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8843{
8844 char squeez[TR_TABLE_SIZE];
8845 rb_encoding *enc = 0;
8846 VALUE del = 0, nodel = 0;
8847 unsigned char *s, *send, *t;
8848 int i, modify = 0;
8849 int ascompat, singlebyte = single_byte_optimizable(str);
8850 unsigned int save;
8851
8852 if (argc == 0) {
8853 enc = STR_ENC_GET(str);
8854 }
8855 else {
8856 for (i=0; i<argc; i++) {
8857 VALUE s = argv[i];
8858
8859 StringValue(s);
8860 enc = rb_enc_check(str, s);
8861 if (singlebyte && !single_byte_optimizable(s))
8862 singlebyte = 0;
8863 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8864 }
8865 }
8866
8867 str_modify_keep_cr(str);
8868 s = t = (unsigned char *)RSTRING_PTR(str);
8869 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8870 send = (unsigned char *)RSTRING_END(str);
8871 save = -1;
8872 ascompat = rb_enc_asciicompat(enc);
8873
8874 if (singlebyte) {
8875 while (s < send) {
8876 unsigned int c = *s++;
8877 if (c != save || (argc > 0 && !squeez[c])) {
8878 *t++ = save = c;
8879 }
8880 }
8881 }
8882 else {
8883 while (s < send) {
8884 unsigned int c;
8885 int clen;
8886
8887 if (ascompat && (c = *s) < 0x80) {
8888 if (c != save || (argc > 0 && !squeez[c])) {
8889 *t++ = save = c;
8890 }
8891 s++;
8892 }
8893 else {
8894 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8895
8896 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8897 if (t != s) rb_enc_mbcput(c, t, enc);
8898 save = c;
8899 t += clen;
8900 }
8901 s += clen;
8902 }
8903 }
8904 }
8905
8906 TERM_FILL((char *)t, TERM_LEN(str));
8907 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8908 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8909 modify = 1;
8910 }
8911
8912 if (modify) return str;
8913 return Qnil;
8914}
8915
8916
8917/*
8918 * call-seq:
8919 * squeeze(*selectors) -> new_string
8920 *
8921 * :include: doc/string/squeeze.rdoc
8922 *
8923 */
8924
8925static VALUE
8926rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8927{
8928 str = str_duplicate(rb_cString, str);
8929 rb_str_squeeze_bang(argc, argv, str);
8930 return str;
8931}
8932
8933
8934/*
8935 * call-seq:
8936 * tr_s!(selector, replacements) -> self or nil
8937 *
8938 * Like String#tr_s, but modifies +self+ in place.
8939 * Returns +self+ if any changes were made, +nil+ otherwise.
8940 *
8941 * Related: String#squeeze!.
8942 */
8943
8944static VALUE
8945rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8946{
8947 return tr_trans(str, src, repl, 1);
8948}
8949
8950
8951/*
8952 * call-seq:
8953 * tr_s(selector, replacements) -> string
8954 *
8955 * Like String#tr, but also squeezes the modified portions of the translated string;
8956 * returns a new string (translated and squeezed).
8957 *
8958 * 'hello'.tr_s('l', 'r') #=> "hero"
8959 * 'hello'.tr_s('el', '-') #=> "h-o"
8960 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8961 *
8962 * Related: String#squeeze.
8963 *
8964 */
8965
8966static VALUE
8967rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8968{
8969 str = str_duplicate(rb_cString, str);
8970 tr_trans(str, src, repl, 1);
8971 return str;
8972}
8973
8974
8975/*
8976 * call-seq:
8977 * count(*selectors) -> integer
8978 *
8979 * :include: doc/string/count.rdoc
8980 */
8981
8982static VALUE
8983rb_str_count(int argc, VALUE *argv, VALUE str)
8984{
8985 char table[TR_TABLE_SIZE];
8986 rb_encoding *enc = 0;
8987 VALUE del = 0, nodel = 0, tstr;
8988 char *s, *send;
8989 int i;
8990 int ascompat;
8991 size_t n = 0;
8992
8994
8995 tstr = argv[0];
8996 StringValue(tstr);
8997 enc = rb_enc_check(str, tstr);
8998 if (argc == 1) {
8999 const char *ptstr;
9000 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9001 (ptstr = RSTRING_PTR(tstr),
9002 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9003 !is_broken_string(str)) {
9004 int clen;
9005 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9006
9007 s = RSTRING_PTR(str);
9008 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9009 send = RSTRING_END(str);
9010 while (s < send) {
9011 if (*(unsigned char*)s++ == c) n++;
9012 }
9013 return SIZET2NUM(n);
9014 }
9015 }
9016
9017 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9018 for (i=1; i<argc; i++) {
9019 tstr = argv[i];
9020 StringValue(tstr);
9021 enc = rb_enc_check(str, tstr);
9022 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9023 }
9024
9025 s = RSTRING_PTR(str);
9026 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9027 send = RSTRING_END(str);
9028 ascompat = rb_enc_asciicompat(enc);
9029 while (s < send) {
9030 unsigned int c;
9031
9032 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9033 if (table[c]) {
9034 n++;
9035 }
9036 s++;
9037 }
9038 else {
9039 int clen;
9040 c = rb_enc_codepoint_len(s, send, &clen, enc);
9041 if (tr_find(c, table, del, nodel)) {
9042 n++;
9043 }
9044 s += clen;
9045 }
9046 }
9047
9048 return SIZET2NUM(n);
9049}
9050
9051static VALUE
9052rb_fs_check(VALUE val)
9053{
9054 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9055 val = rb_check_string_type(val);
9056 if (NIL_P(val)) return 0;
9057 }
9058 return val;
9059}
9060
/*
 * Byte-indexed whitespace lookup: 1 for TAB, LF, VT, FF, CR (0x09..0x0d)
 * and SPACE (0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9081
9082static long
9083split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9084{
9085 if (empty_count >= 0 && len == 0) {
9086 return empty_count + 1;
9087 }
9088 if (empty_count > 0) {
9089 /* make different substrings */
9090 if (result) {
9091 do {
9092 rb_ary_push(result, str_new_empty_String(str));
9093 } while (--empty_count > 0);
9094 }
9095 else {
9096 do {
9097 rb_yield(str_new_empty_String(str));
9098 } while (--empty_count > 0);
9099 }
9100 }
9101 str = rb_str_subseq(str, beg, len);
9102 if (result) {
9103 rb_ary_push(result, str);
9104 }
9105 else {
9106 rb_yield(str);
9107 }
9108 return empty_count;
9109}
9110
/* Strategy chosen by rb_str_split_m for cutting the string. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9114
9115static split_type_t
9116literal_split_pattern(VALUE spat, split_type_t default_type)
9117{
9118 rb_encoding *enc = STR_ENC_GET(spat);
9119 const char *ptr;
9120 long len;
9121 RSTRING_GETMEM(spat, ptr, len);
9122 if (len == 0) {
9123 /* Special case - split into chars */
9124 return SPLIT_TYPE_CHARS;
9125 }
9126 else if (rb_enc_asciicompat(enc)) {
9127 if (len == 1 && ptr[0] == ' ') {
9128 return SPLIT_TYPE_AWK;
9129 }
9130 }
9131 else {
9132 int l;
9133 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9134 return SPLIT_TYPE_AWK;
9135 }
9136 }
9137 return default_type;
9138}
9139
9140/*
9141 * call-seq:
9142 * split(field_sep = $;, limit = 0) -> array_of_substrings
9143 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9144 *
9145 * :include: doc/string/split.rdoc
9146 *
9147 */
9148
9149static VALUE
9150rb_str_split_m(int argc, VALUE *argv, VALUE str)
9151{
9152 rb_encoding *enc;
9153 VALUE spat;
9154 VALUE limit;
9155 split_type_t split_type;
9156 long beg, end, i = 0, empty_count = -1;
9157 int lim = 0;
9158 VALUE result, tmp;
9159
9160 result = rb_block_given_p() ? Qfalse : Qnil;
9161 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9162 lim = NUM2INT(limit);
9163 if (lim <= 0) limit = Qnil;
9164 else if (lim == 1) {
9165 if (RSTRING_LEN(str) == 0)
9166 return result ? rb_ary_new2(0) : str;
9167 tmp = str_duplicate(rb_cString, str);
9168 if (!result) {
9169 rb_yield(tmp);
9170 return str;
9171 }
9172 return rb_ary_new3(1, tmp);
9173 }
9174 i = 1;
9175 }
9176 if (NIL_P(limit) && !lim) empty_count = 0;
9177
9178 enc = STR_ENC_GET(str);
9179 split_type = SPLIT_TYPE_REGEXP;
9180 if (!NIL_P(spat)) {
9181 spat = get_pat_quoted(spat, 0);
9182 }
9183 else if (NIL_P(spat = rb_fs)) {
9184 split_type = SPLIT_TYPE_AWK;
9185 }
9186 else if (!(spat = rb_fs_check(spat))) {
9187 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9188 }
9189 else {
9190 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9191 }
9192 if (split_type != SPLIT_TYPE_AWK) {
9193 switch (BUILTIN_TYPE(spat)) {
9194 case T_REGEXP:
9195 rb_reg_options(spat); /* check if uninitialized */
9196 tmp = RREGEXP_SRC(spat);
9197 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9198 if (split_type == SPLIT_TYPE_AWK) {
9199 spat = tmp;
9200 split_type = SPLIT_TYPE_STRING;
9201 }
9202 break;
9203
9204 case T_STRING:
9205 mustnot_broken(spat);
9206 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9207 break;
9208
9209 default:
9211 }
9212 }
9213
9214#define SPLIT_STR(beg, len) ( \
9215 empty_count = split_string(result, str, beg, len, empty_count), \
9216 str_mod_check(str, str_start, str_len))
9217
9218 beg = 0;
9219 char *ptr = RSTRING_PTR(str);
9220 char *const str_start = ptr;
9221 const long str_len = RSTRING_LEN(str);
9222 char *const eptr = str_start + str_len;
9223 if (split_type == SPLIT_TYPE_AWK) {
9224 char *bptr = ptr;
9225 int skip = 1;
9226 unsigned int c;
9227
9228 if (result) result = rb_ary_new();
9229 end = beg;
9230 if (is_ascii_string(str)) {
9231 while (ptr < eptr) {
9232 c = (unsigned char)*ptr++;
9233 if (skip) {
9234 if (ascii_isspace(c)) {
9235 beg = ptr - bptr;
9236 }
9237 else {
9238 end = ptr - bptr;
9239 skip = 0;
9240 if (!NIL_P(limit) && lim <= i) break;
9241 }
9242 }
9243 else if (ascii_isspace(c)) {
9244 SPLIT_STR(beg, end-beg);
9245 skip = 1;
9246 beg = ptr - bptr;
9247 if (!NIL_P(limit)) ++i;
9248 }
9249 else {
9250 end = ptr - bptr;
9251 }
9252 }
9253 }
9254 else {
9255 while (ptr < eptr) {
9256 int n;
9257
9258 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9259 ptr += n;
9260 if (skip) {
9261 if (rb_isspace(c)) {
9262 beg = ptr - bptr;
9263 }
9264 else {
9265 end = ptr - bptr;
9266 skip = 0;
9267 if (!NIL_P(limit) && lim <= i) break;
9268 }
9269 }
9270 else if (rb_isspace(c)) {
9271 SPLIT_STR(beg, end-beg);
9272 skip = 1;
9273 beg = ptr - bptr;
9274 if (!NIL_P(limit)) ++i;
9275 }
9276 else {
9277 end = ptr - bptr;
9278 }
9279 }
9280 }
9281 }
9282 else if (split_type == SPLIT_TYPE_STRING) {
9283 char *substr_start = ptr;
9284 char *sptr = RSTRING_PTR(spat);
9285 long slen = RSTRING_LEN(spat);
9286
9287 if (result) result = rb_ary_new();
9288 mustnot_broken(str);
9289 enc = rb_enc_check(str, spat);
9290 while (ptr < eptr &&
9291 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9292 /* Check we are at the start of a char */
9293 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9294 if (t != ptr + end) {
9295 ptr = t;
9296 continue;
9297 }
9298 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9299 str_mod_check(spat, sptr, slen);
9300 ptr += end + slen;
9301 substr_start = ptr;
9302 if (!NIL_P(limit) && lim <= ++i) break;
9303 }
9304 beg = ptr - str_start;
9305 }
9306 else if (split_type == SPLIT_TYPE_CHARS) {
9307 int n;
9308
9309 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9310 mustnot_broken(str);
9311 enc = rb_enc_get(str);
9312 while (ptr < eptr &&
9313 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9314 SPLIT_STR(ptr - str_start, n);
9315 ptr += n;
9316 if (!NIL_P(limit) && lim <= ++i) break;
9317 }
9318 beg = ptr - str_start;
9319 }
9320 else {
9321 if (result) result = rb_ary_new();
9322 long len = RSTRING_LEN(str);
9323 long start = beg;
9324 long idx;
9325 int last_null = 0;
9326 struct re_registers *regs;
9327 VALUE match = 0;
9328
9329 for (; rb_reg_search(spat, str, start, 0) >= 0;
9330 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9331 match = rb_backref_get();
9332 if (!result) rb_match_busy(match);
9333 regs = RMATCH_REGS(match);
9334 end = BEG(0);
9335 if (start == end && BEG(0) == END(0)) {
9336 if (!ptr) {
9337 SPLIT_STR(0, 0);
9338 break;
9339 }
9340 else if (last_null == 1) {
9341 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9342 beg = start;
9343 }
9344 else {
9345 if (start == len)
9346 start++;
9347 else
9348 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9349 last_null = 1;
9350 continue;
9351 }
9352 }
9353 else {
9354 SPLIT_STR(beg, end-beg);
9355 beg = start = END(0);
9356 }
9357 last_null = 0;
9358
9359 for (idx=1; idx < regs->num_regs; idx++) {
9360 if (BEG(idx) == -1) continue;
9361 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9362 }
9363 if (!NIL_P(limit) && lim <= ++i) break;
9364 }
9365 if (match) rb_match_unbusy(match);
9366 }
9367 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9368 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9369 }
9370
9371 return result ? result : str;
9372}
9373
9374VALUE
9375rb_str_split(VALUE str, const char *sep0)
9376{
9377 VALUE sep;
9378
9379 StringValue(str);
9380 sep = rb_str_new_cstr(sep0);
9381 return rb_str_split_m(1, &sep, str);
9382}
9383
9384#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9385
9386static inline int
9387enumerator_element(VALUE ary, VALUE e)
9388{
9389 if (ary) {
9390 rb_ary_push(ary, e);
9391 return 0;
9392 }
9393 else {
9394 rb_yield(e);
9395 return 1;
9396 }
9397}
9398
9399#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9400
9401static const char *
9402chomp_newline(const char *p, const char *e, rb_encoding *enc)
9403{
9404 const char *prev = rb_enc_prev_char(p, e, e, enc);
9405 if (rb_enc_is_newline(prev, e, enc)) {
9406 e = prev;
9407 prev = rb_enc_prev_char(p, e, e, enc);
9408 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9409 e = prev;
9410 }
9411 return e;
9412}
9413
9414static VALUE
9415get_rs(void)
9416{
9417 VALUE rs = rb_rs;
9418 if (!NIL_P(rs) &&
9419 (!RB_TYPE_P(rs, T_STRING) ||
9420 RSTRING_LEN(rs) != 1 ||
9421 RSTRING_PTR(rs)[0] != '\n')) {
9422 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9423 }
9424 return rs;
9425}
9426
9427#define rb_rs get_rs()
9428
/*
 * Shared implementation behind each_line and lines.
 *
 * Cuts +str+ into records ending with the record separator (first optional
 * argument, defaulting to rb_rs) and delivers each one through ENUM_ELEM:
 * pushed onto +ary+ when it is non-zero, yielded otherwise.  The keyword
 * option chomp: strips the separator from each record.
 *
 * Returns +ary+ when collecting, the original receiver otherwise.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* From here on chomp holds a plain C truth value (0 or 1). */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* A nil separator means the whole string is a single record. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Iterate over a frozen version so the pointers taken below refer to
     * it rather than to the receiver. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            /* Negative length of the newline that ends the paragraph body;
             * added to the record length when chomping. */
            long chomp_rslen = 0;
            do {
                /* n is the width of a leading '\r' (0 when absent), so a
                 * "\r\n" pair is consumed as one unit. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the end of a line: leave the
                     * inner loop with rslen still holding its width. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* First non-newline character starts the paragraph. */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            /* rslen == 0 here means the inner loop ran to the end of the
             * string instead of breaking out on a second newline. */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                /* Only checked after yielding to a block. */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* Remember whether the separator is exactly one minimal-width
         * newline character; chomping then goes through chomp_newline. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* The default separator is re-encoded when the string's encoding is
     * not ASCII-compatible, so that the byte search below can match. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Ignore hits that do not start on a character boundary. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the record end. */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Trailing record without a terminating separator. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9568
9569/*
9570 * call-seq:
9571 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9572 * each_line(record_separator = $/, chomp: false) -> enumerator
9573 *
9574 * :include: doc/string/each_line.rdoc
9575 *
9576 */
9577
9578static VALUE
9579rb_str_each_line(int argc, VALUE *argv, VALUE str)
9580{
9581 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9582 return rb_str_enumerate_lines(argc, argv, str, 0);
9583}
9584
9585/*
9586 * call-seq:
9587 * lines(record_separator = $/, chomp: false) -> array_of_strings
9588 *
9589 * Returns substrings ("lines") of +self+
9590 * according to the given arguments:
9591 *
9592 * s = <<~EOT
9593 * This is the first line.
9594 * This is line two.
9595 *
9596 * This is line four.
9597 * This is line five.
9598 * EOT
9599 *
9600 * With the default argument values:
9601 *
9602 * $/ # => "\n"
9603 * s.lines
9604 * # =>
9605 * ["This is the first line.\n",
9606 * "This is line two.\n",
9607 * "\n",
9608 * "This is line four.\n",
9609 * "This is line five.\n"]
9610 *
9611 * With a different +record_separator+:
9612 *
9613 * record_separator = ' is '
9614 * s.lines(record_separator)
9615 * # =>
9616 * ["This is ",
9617 * "the first line.\nThis is ",
9618 * "line two.\n\nThis is ",
9619 * "line four.\nThis is ",
9620 * "line five.\n"]
9621 *
9622 * With keyword argument +chomp+ as +true+,
9623 * removes the trailing newline from each line:
9624 *
9625 * s.lines(chomp: true)
9626 * # =>
9627 * ["This is the first line.",
9628 * "This is line two.",
9629 * "",
9630 * "This is line four.",
9631 * "This is line five."]
9632 *
9633 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9634 */
9635
9636static VALUE
9637rb_str_lines(int argc, VALUE *argv, VALUE str)
9638{
9639 VALUE ary = WANTARRAY("lines", 0);
9640 return rb_str_enumerate_lines(argc, argv, str, ary);
9641}
9642
9643static VALUE
9644rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9645{
9646 return LONG2FIX(RSTRING_LEN(str));
9647}
9648
9649static VALUE
9650rb_str_enumerate_bytes(VALUE str, VALUE ary)
9651{
9652 long i;
9653
9654 for (i=0; i<RSTRING_LEN(str); i++) {
9655 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9656 }
9657 if (ary)
9658 return ary;
9659 else
9660 return str;
9661}
9662
9663/*
9664 * call-seq:
9665 * each_byte {|byte| ... } -> self
9666 * each_byte -> enumerator
9667 *
9668 * :include: doc/string/each_byte.rdoc
9669 *
9670 */
9671
9672static VALUE
9673rb_str_each_byte(VALUE str)
9674{
9675 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9676 return rb_str_enumerate_bytes(str, 0);
9677}
9678
9679/*
9680 * call-seq:
9681 * bytes -> array_of_bytes
9682 *
9683 * :include: doc/string/bytes.rdoc
9684 *
9685 */
9686
9687static VALUE
9688rb_str_bytes(VALUE str)
9689{
9690 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9691 return rb_str_enumerate_bytes(str, ary);
9692}
9693
9694static VALUE
9695rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9696{
9697 return rb_str_length(str);
9698}
9699
9700static VALUE
9701rb_str_enumerate_chars(VALUE str, VALUE ary)
9702{
9703 VALUE orig = str;
9704 long i, len, n;
9705 const char *ptr;
9706 rb_encoding *enc;
9707
9708 str = rb_str_new_frozen(str);
9709 ptr = RSTRING_PTR(str);
9710 len = RSTRING_LEN(str);
9711 enc = rb_enc_get(str);
9712
9714 for (i = 0; i < len; i += n) {
9715 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9716 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9717 }
9718 }
9719 else {
9720 for (i = 0; i < len; i += n) {
9721 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9722 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9723 }
9724 }
9725 RB_GC_GUARD(str);
9726 if (ary)
9727 return ary;
9728 else
9729 return orig;
9730}
9731
9732/*
9733 * call-seq:
9734 * each_char {|char| ... } -> self
9735 * each_char -> enumerator
9736 *
9737 * :include: doc/string/each_char.rdoc
9738 *
9739 */
9740
9741static VALUE
9742rb_str_each_char(VALUE str)
9743{
9744 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9745 return rb_str_enumerate_chars(str, 0);
9746}
9747
9748/*
9749 * call-seq:
9750 * chars -> array_of_characters
9751 *
9752 * :include: doc/string/chars.rdoc
9753 *
9754 */
9755
9756static VALUE
9757rb_str_chars(VALUE str)
9758{
9759 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9760 return rb_str_enumerate_chars(str, ary);
9761}
9762
9763static VALUE
9764rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9765{
9766 VALUE orig = str;
9767 int n;
9768 unsigned int c;
9769 const char *ptr, *end;
9770 rb_encoding *enc;
9771
9772 if (single_byte_optimizable(str))
9773 return rb_str_enumerate_bytes(str, ary);
9774
9775 str = rb_str_new_frozen(str);
9776 ptr = RSTRING_PTR(str);
9777 end = RSTRING_END(str);
9778 enc = STR_ENC_GET(str);
9779
9780 while (ptr < end) {
9781 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9782 ENUM_ELEM(ary, UINT2NUM(c));
9783 ptr += n;
9784 }
9785 RB_GC_GUARD(str);
9786 if (ary)
9787 return ary;
9788 else
9789 return orig;
9790}
9791
9792/*
9793 * call-seq:
9794 * each_codepoint {|codepoint| ... } -> self
9795 * each_codepoint -> enumerator
9796 *
9797 * :include: doc/string/each_codepoint.rdoc
9798 *
9799 */
9800
9801static VALUE
9802rb_str_each_codepoint(VALUE str)
9803{
9804 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9805 return rb_str_enumerate_codepoints(str, 0);
9806}
9807
9808/*
9809 * call-seq:
9810 * codepoints -> array_of_integers
9811 *
9812 * :include: doc/string/codepoints.rdoc
9813 *
9814 */
9815
9816static VALUE
9817rb_str_codepoints(VALUE str)
9818{
9819 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9820 return rb_str_enumerate_codepoints(str, ary);
9821}
9822
9823static regex_t *
9824get_reg_grapheme_cluster(rb_encoding *enc)
9825{
9826 int encidx = rb_enc_to_index(enc);
9827
9828 const OnigUChar source_ascii[] = "\\X";
9829 const OnigUChar *source = source_ascii;
9830 size_t source_len = sizeof(source_ascii) - 1;
9831
9832 switch (encidx) {
9833#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9834#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9835#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9836#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9837#define CASE_UTF(e) \
9838 case ENCINDEX_UTF_##e: { \
9839 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9840 source = source_UTF_##e; \
9841 source_len = sizeof(source_UTF_##e); \
9842 break; \
9843 }
9844 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9845#undef CASE_UTF
9846#undef CHARS_16BE
9847#undef CHARS_16LE
9848#undef CHARS_32BE
9849#undef CHARS_32LE
9850 }
9851
9852 regex_t *reg_grapheme_cluster;
9853 OnigErrorInfo einfo;
9854 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9855 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9856 if (r) {
9857 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9858 onig_error_code_to_str(message, r, &einfo);
9859 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9860 }
9861
9862 return reg_grapheme_cluster;
9863}
9864
9865static regex_t *
9866get_cached_reg_grapheme_cluster(rb_encoding *enc)
9867{
9868 int encidx = rb_enc_to_index(enc);
9869 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9870
9871 if (encidx == rb_utf8_encindex()) {
9872 if (!reg_grapheme_cluster_utf8) {
9873 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9874 }
9875
9876 return reg_grapheme_cluster_utf8;
9877 }
9878
9879 return NULL;
9880}
9881
9882static VALUE
9883rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9884{
9885 size_t grapheme_cluster_count = 0;
9886 rb_encoding *enc = get_encoding(str);
9887 const char *ptr, *end;
9888
9889 if (!rb_enc_unicode_p(enc)) {
9890 return rb_str_length(str);
9891 }
9892
9893 bool cached_reg_grapheme_cluster = true;
9894 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9895 if (!reg_grapheme_cluster) {
9896 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9897 cached_reg_grapheme_cluster = false;
9898 }
9899
9900 ptr = RSTRING_PTR(str);
9901 end = RSTRING_END(str);
9902
9903 while (ptr < end) {
9904 OnigPosition len = onig_match(reg_grapheme_cluster,
9905 (const OnigUChar *)ptr, (const OnigUChar *)end,
9906 (const OnigUChar *)ptr, NULL, 0);
9907 if (len <= 0) break;
9908 grapheme_cluster_count++;
9909 ptr += len;
9910 }
9911
9912 if (!cached_reg_grapheme_cluster) {
9913 onig_free(reg_grapheme_cluster);
9914 }
9915
9916 return SIZET2NUM(grapheme_cluster_count);
9917}
9918
9919static VALUE
9920rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9921{
9922 VALUE orig = str;
9923 rb_encoding *enc = get_encoding(str);
9924 const char *ptr0, *ptr, *end;
9925
9926 if (!rb_enc_unicode_p(enc)) {
9927 return rb_str_enumerate_chars(str, ary);
9928 }
9929
9930 if (!ary) str = rb_str_new_frozen(str);
9931
9932 bool cached_reg_grapheme_cluster = true;
9933 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9934 if (!reg_grapheme_cluster) {
9935 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9936 cached_reg_grapheme_cluster = false;
9937 }
9938
9939 ptr0 = ptr = RSTRING_PTR(str);
9940 end = RSTRING_END(str);
9941
9942 while (ptr < end) {
9943 OnigPosition len = onig_match(reg_grapheme_cluster,
9944 (const OnigUChar *)ptr, (const OnigUChar *)end,
9945 (const OnigUChar *)ptr, NULL, 0);
9946 if (len <= 0) break;
9947 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9948 ptr += len;
9949 }
9950
9951 if (!cached_reg_grapheme_cluster) {
9952 onig_free(reg_grapheme_cluster);
9953 }
9954
9955 RB_GC_GUARD(str);
9956 if (ary)
9957 return ary;
9958 else
9959 return orig;
9960}
9961
9962/*
9963 * call-seq:
9964 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9965 * each_grapheme_cluster -> enumerator
9966 *
9967 * :include: doc/string/each_grapheme_cluster.rdoc
9968 *
9969 */
9970
9971static VALUE
9972rb_str_each_grapheme_cluster(VALUE str)
9973{
9974 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9975 return rb_str_enumerate_grapheme_clusters(str, 0);
9976}
9977
9978/*
9979 * call-seq:
9980 * grapheme_clusters -> array_of_grapheme_clusters
9981 *
9982 * :include: doc/string/grapheme_clusters.rdoc
9983 *
9984 */
9985
9986static VALUE
9987rb_str_grapheme_clusters(VALUE str)
9988{
9989 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9990 return rb_str_enumerate_grapheme_clusters(str, ary);
9991}
9992
9993static long
9994chopped_length(VALUE str)
9995{
9996 rb_encoding *enc = STR_ENC_GET(str);
9997 const char *p, *p2, *beg, *end;
9998
9999 beg = RSTRING_PTR(str);
10000 end = beg + RSTRING_LEN(str);
10001 if (beg >= end) return 0;
10002 p = rb_enc_prev_char(beg, end, end, enc);
10003 if (!p) return 0;
10004 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10005 p2 = rb_enc_prev_char(beg, p, end, enc);
10006 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10007 }
10008 return p - beg;
10009}
10010
10011/*
10012 * call-seq:
10013 * chop! -> self or nil
10014 *
10015 * Like String#chop, except that:
10016 *
10017 * - Removes trailing characters from +self+ (not from a copy of +self+).
10018 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10019 *
10020 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10021 */
10022
10023static VALUE
10024rb_str_chop_bang(VALUE str)
10025{
10026 str_modify_keep_cr(str);
10027 if (RSTRING_LEN(str) > 0) {
10028 long len;
10029 len = chopped_length(str);
10030 STR_SET_LEN(str, len);
10031 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10032 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10034 }
10035 return str;
10036 }
10037 return Qnil;
10038}
10039
10040
10041/*
10042 * call-seq:
10043 * chop -> new_string
10044 *
10045 * :include: doc/string/chop.rdoc
10046 *
10047 */
10048
10049static VALUE
10050rb_str_chop(VALUE str)
10051{
10052 return rb_str_subseq(str, 0, chopped_length(str));
10053}
10054
10055static long
10056smart_chomp(VALUE str, const char *e, const char *p)
10057{
10058 rb_encoding *enc = rb_enc_get(str);
10059 if (rb_enc_mbminlen(enc) > 1) {
10060 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10061 if (rb_enc_is_newline(pp, e, enc)) {
10062 e = pp;
10063 }
10064 pp = e - rb_enc_mbminlen(enc);
10065 if (pp >= p) {
10066 pp = rb_enc_left_char_head(p, pp, e, enc);
10067 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10068 e = pp;
10069 }
10070 }
10071 }
10072 else {
10073 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10074 case '\n':
10075 if (--e > p && *(e-1) == '\r') {
10076 --e;
10077 }
10078 break;
10079 case '\r':
10080 --e;
10081 break;
10082 }
10083 }
10084 return e - p;
10085}
10086
10087static long
10088chompped_length(VALUE str, VALUE rs)
10089{
10090 rb_encoding *enc;
10091 int newline;
10092 char *pp, *e, *rsptr;
10093 long rslen;
10094 char *const p = RSTRING_PTR(str);
10095 long len = RSTRING_LEN(str);
10096
10097 if (len == 0) return 0;
10098 e = p + len;
10099 if (rs == rb_default_rs) {
10100 return smart_chomp(str, e, p);
10101 }
10102
10103 enc = rb_enc_get(str);
10104 RSTRING_GETMEM(rs, rsptr, rslen);
10105 if (rslen == 0) {
10106 if (rb_enc_mbminlen(enc) > 1) {
10107 while (e > p) {
10108 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10109 if (!rb_enc_is_newline(pp, e, enc)) break;
10110 e = pp;
10111 pp -= rb_enc_mbminlen(enc);
10112 if (pp >= p) {
10113 pp = rb_enc_left_char_head(p, pp, e, enc);
10114 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10115 e = pp;
10116 }
10117 }
10118 }
10119 }
10120 else {
10121 while (e > p && *(e-1) == '\n') {
10122 --e;
10123 if (e > p && *(e-1) == '\r')
10124 --e;
10125 }
10126 }
10127 return e - p;
10128 }
10129 if (rslen > len) return len;
10130
10131 enc = rb_enc_get(rs);
10132 newline = rsptr[rslen-1];
10133 if (rslen == rb_enc_mbminlen(enc)) {
10134 if (rslen == 1) {
10135 if (newline == '\n')
10136 return smart_chomp(str, e, p);
10137 }
10138 else {
10139 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10140 return smart_chomp(str, e, p);
10141 }
10142 }
10143
10144 enc = rb_enc_check(str, rs);
10145 if (is_broken_string(rs)) {
10146 return len;
10147 }
10148 pp = e - rslen;
10149 if (p[len-1] == newline &&
10150 (rslen <= 1 ||
10151 memcmp(rsptr, pp, rslen) == 0)) {
10152 if (at_char_boundary(p, pp, e, enc))
10153 return len - rslen;
10154 RB_GC_GUARD(rs);
10155 }
10156 return len;
10157}
10158
10164static VALUE
10165chomp_rs(int argc, const VALUE *argv)
10166{
10167 rb_check_arity(argc, 0, 1);
10168 if (argc > 0) {
10169 VALUE rs = argv[0];
10170 if (!NIL_P(rs)) StringValue(rs);
10171 return rs;
10172 }
10173 else {
10174 return rb_rs;
10175 }
10176}
10177
10178VALUE
10179rb_str_chomp_string(VALUE str, VALUE rs)
10180{
10181 long olen = RSTRING_LEN(str);
10182 long len = chompped_length(str, rs);
10183 if (len >= olen) return Qnil;
10184 str_modify_keep_cr(str);
10185 STR_SET_LEN(str, len);
10186 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10187 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10189 }
10190 return str;
10191}
10192
10193/*
10194 * call-seq:
10195 * chomp!(line_sep = $/) -> self or nil
10196 *
10197 * Like String#chomp, except that:
10198 *
10199 * - Removes trailing characters from +self+ (not from a copy of +self+).
10200 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10201 *
10202 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10203 */
10204
10205static VALUE
10206rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10207{
10208 VALUE rs;
10209 str_modifiable(str);
10210 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10211 rs = chomp_rs(argc, argv);
10212 if (NIL_P(rs)) return Qnil;
10213 return rb_str_chomp_string(str, rs);
10214}
10215
10216
10217/*
10218 * call-seq:
10219 * chomp(line_sep = $/) -> new_string
10220 *
10221 * :include: doc/string/chomp.rdoc
10222 *
10223 */
10224
10225static VALUE
10226rb_str_chomp(int argc, VALUE *argv, VALUE str)
10227{
10228 VALUE rs = chomp_rs(argc, argv);
10229 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10230 return rb_str_subseq(str, 0, chompped_length(str, rs));
10231}
10232
10233static long
10234lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10235{
10236 const char *const start = s;
10237
10238 if (!s || s >= e) return 0;
10239
10240 /* remove spaces at head */
10241 if (single_byte_optimizable(str)) {
10242 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10243 }
10244 else {
10245 while (s < e) {
10246 int n;
10247 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10248
10249 if (cc && !rb_isspace(cc)) break;
10250 s += n;
10251 }
10252 }
10253 return s - start;
10254}
10255
10256/*
10257 * call-seq:
10258 * lstrip! -> self or nil
10259 *
10260 * Like String#lstrip, except that:
10261 *
10262 * - Performs stripping in +self+ (not in a copy of +self+).
10263 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10264 *
10265 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10266 */
10267
10268static VALUE
10269rb_str_lstrip_bang(VALUE str)
10270{
10271 rb_encoding *enc;
10272 char *start, *s;
10273 long olen, loffset;
10274
10275 str_modify_keep_cr(str);
10276 enc = STR_ENC_GET(str);
10277 RSTRING_GETMEM(str, start, olen);
10278 loffset = lstrip_offset(str, start, start+olen, enc);
10279 if (loffset > 0) {
10280 long len = olen-loffset;
10281 s = start + loffset;
10282 memmove(start, s, len);
10283 STR_SET_LEN(str, len);
10284 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10285 return str;
10286 }
10287 return Qnil;
10288}
10289
10290
10291/*
10292 * call-seq:
10293 * lstrip -> new_string
10294 *
10295 * Returns a copy of +self+ with leading whitespace removed;
10296 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10297 *
10298 * whitespace = "\x00\t\n\v\f\r "
10299 * s = whitespace + 'abc' + whitespace
10300 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10301 * s.lstrip
10302 * # => "abc\u0000\t\n\v\f\r "
10303 *
10304 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10305 */
10306
10307static VALUE
10308rb_str_lstrip(VALUE str)
10309{
10310 char *start;
10311 long len, loffset;
10312 RSTRING_GETMEM(str, start, len);
10313 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10314 if (loffset <= 0) return str_duplicate(rb_cString, str);
10315 return rb_str_subseq(str, loffset, len - loffset);
10316}
10317
10318static long
10319rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10320{
10321 const char *t;
10322
10323 rb_str_check_dummy_enc(enc);
10325 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10326 }
10327 if (!s || s >= e) return 0;
10328 t = e;
10329
10330 /* remove trailing spaces or '\0's */
10331 if (single_byte_optimizable(str)) {
10332 unsigned char c;
10333 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10334 }
10335 else {
10336 char *tp;
10337
10338 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10339 unsigned int c = rb_enc_codepoint(tp, e, enc);
10340 if (c && !rb_isspace(c)) break;
10341 t = tp;
10342 }
10343 }
10344 return e - t;
10345}
10346
10347/*
10348 * call-seq:
10349 * rstrip! -> self or nil
10350 *
10351 * Like String#rstrip, except that:
10352 *
10353 * - Performs stripping in +self+ (not in a copy of +self+).
10354 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10355 *
10356 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10357 */
10358
10359static VALUE
10360rb_str_rstrip_bang(VALUE str)
10361{
10362 rb_encoding *enc;
10363 char *start;
10364 long olen, roffset;
10365
10366 str_modify_keep_cr(str);
10367 enc = STR_ENC_GET(str);
10368 RSTRING_GETMEM(str, start, olen);
10369 roffset = rstrip_offset(str, start, start+olen, enc);
10370 if (roffset > 0) {
10371 long len = olen - roffset;
10372
10373 STR_SET_LEN(str, len);
10374 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10375 return str;
10376 }
10377 return Qnil;
10378}
10379
10380
10381/*
10382 * call-seq:
10383 * rstrip -> new_string
10384 *
10385 * Returns a copy of +self+ with trailing whitespace removed;
10386 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10387 *
10388 * whitespace = "\x00\t\n\v\f\r "
10389 * s = whitespace + 'abc' + whitespace
10390 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10391 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10392 *
10393 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10394 */
10395
10396static VALUE
10397rb_str_rstrip(VALUE str)
10398{
10399 rb_encoding *enc;
10400 char *start;
10401 long olen, roffset;
10402
10403 enc = STR_ENC_GET(str);
10404 RSTRING_GETMEM(str, start, olen);
10405 roffset = rstrip_offset(str, start, start+olen, enc);
10406
10407 if (roffset <= 0) return str_duplicate(rb_cString, str);
10408 return rb_str_subseq(str, 0, olen-roffset);
10409}
10410
10411
10412/*
10413 * call-seq:
10414 * strip! -> self or nil
10415 *
10416 * Like String#strip, except that:
10417 *
10418 * - Any modifications are made to +self+.
10419 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10420 *
10421 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10422 */
10423
10424static VALUE
10425rb_str_strip_bang(VALUE str)
10426{
10427 char *start;
10428 long olen, loffset, roffset;
10429 rb_encoding *enc;
10430
10431 str_modify_keep_cr(str);
10432 enc = STR_ENC_GET(str);
10433 RSTRING_GETMEM(str, start, olen);
10434 loffset = lstrip_offset(str, start, start+olen, enc);
10435 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10436
10437 if (loffset > 0 || roffset > 0) {
10438 long len = olen-roffset;
10439 if (loffset > 0) {
10440 len -= loffset;
10441 memmove(start, start + loffset, len);
10442 }
10443 STR_SET_LEN(str, len);
10444 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10445 return str;
10446 }
10447 return Qnil;
10448}
10449
10450
10451/*
10452 * call-seq:
10453 * strip -> new_string
10454 *
10455 * Returns a copy of +self+ with leading and trailing whitespace removed;
10456 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10457 *
10458 * whitespace = "\x00\t\n\v\f\r "
10459 * s = whitespace + 'abc' + whitespace
10460 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10461 * s.strip # => "abc"
10462 *
10463 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10464 */
10465
10466static VALUE
10467rb_str_strip(VALUE str)
10468{
10469 char *start;
10470 long olen, loffset, roffset;
10471 rb_encoding *enc = STR_ENC_GET(str);
10472
10473 RSTRING_GETMEM(str, start, olen);
10474 loffset = lstrip_offset(str, start, start+olen, enc);
10475 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10476
10477 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10478 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10479}
10480
10481static VALUE
10482scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10483{
10484 VALUE result = Qnil;
10485 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10486 if (pos >= 0) {
10487 VALUE match;
10488 struct re_registers *regs;
10489 if (BUILTIN_TYPE(pat) == T_STRING) {
10490 regs = NULL;
10491 end = pos + RSTRING_LEN(pat);
10492 }
10493 else {
10494 match = rb_backref_get();
10495 regs = RMATCH_REGS(match);
10496 pos = BEG(0);
10497 end = END(0);
10498 }
10499
10500 if (pos == end) {
10501 rb_encoding *enc = STR_ENC_GET(str);
10502 /*
10503 * Always consume at least one character of the input string
10504 */
10505 if (RSTRING_LEN(str) > end)
10506 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10507 RSTRING_END(str), enc);
10508 else
10509 *start = end + 1;
10510 }
10511 else {
10512 *start = end;
10513 }
10514
10515 if (!regs || regs->num_regs == 1) {
10516 result = rb_str_subseq(str, pos, end - pos);
10517 return result;
10518 }
10519 else {
10520 result = rb_ary_new2(regs->num_regs);
10521 for (int i = 1; i < regs->num_regs; i++) {
10522 VALUE s = Qnil;
10523 if (BEG(i) >= 0) {
10524 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10525 }
10526
10527 rb_ary_push(result, s);
10528 }
10529 }
10530
10531 RB_GC_GUARD(match);
10532 }
10533
10534 return result;
10535}
10536
10537
10538/*
10539 * call-seq:
10540 * scan(pattern) -> array_of_results
10541 * scan(pattern) {|result| ... } -> self
10542 *
10543 * :include: doc/string/scan.rdoc
10544 *
10545 */
10546
10547static VALUE
10548rb_str_scan(VALUE str, VALUE pat)
10549{
10550 VALUE result;
10551 long start = 0;
10552 long last = -1, prev = 0;
10553 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10554
10555 pat = get_pat_quoted(pat, 1);
10556 mustnot_broken(str);
10557 if (!rb_block_given_p()) {
10558 VALUE ary = rb_ary_new();
10559
10560 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10561 last = prev;
10562 prev = start;
10563 rb_ary_push(ary, result);
10564 }
10565 if (last >= 0) rb_pat_search(pat, str, last, 1);
10566 else rb_backref_set(Qnil);
10567 return ary;
10568 }
10569
10570 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10571 last = prev;
10572 prev = start;
10573 rb_yield(result);
10574 str_mod_check(str, p, len);
10575 }
10576 if (last >= 0) rb_pat_search(pat, str, last, 1);
10577 return str;
10578}
10579
10580
10581/*
10582 * call-seq:
10583 * hex -> integer
10584 *
10585 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10586 * returns its value as an integer.
10587 *
10588 * The leading substring is interpreted as hexadecimal when it begins with:
10589 *
10590 * - One or more characters representing hexadecimal digits
10591 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10592 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10593 *
10594 * 'f'.hex # => 15
10595 * '11'.hex # => 17
10596 * 'FFF'.hex # => 4095
10597 * 'fffg'.hex # => 4095
10598 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10599 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10600 * 'deadbeef'.hex # => 3735928559
10601 *
10602 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10603 *
10604 * '0xfff'.hex # => 4095
10605 * '0xfffg'.hex # => 4095
10606 *
10607 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10608 *
10609 * '-fff'.hex # => -4095
10610 * '-0xFFF'.hex # => -4095
10611 *
10612 * For any substring not described above, returns zero:
10613 *
10614 * 'xxx'.hex # => 0
10615 * ''.hex # => 0
10616 *
10617 * Note that, unlike #oct, this method interprets only hexadecimal,
10618 * and not binary, octal, or decimal notations:
10619 *
10620 * '0b111'.hex # => 45329
10621 * '0o777'.hex # => 0
10622 * '0d999'.hex # => 55705
10623 *
10624 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10625 */
10626
10627static VALUE
10628rb_str_hex(VALUE str)
10629{
10630 return rb_str_to_inum(str, 16, FALSE);
10631}
10632
10633
10634/*
10635 * call-seq:
10636 * oct -> integer
10637 *
10638 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10639 * returns their value as an integer.
10640 *
10641 * In brief:
10642 *
10643 * # Interpreted as octal.
10644 * '777'.oct # => 511
10645 * '777x'.oct # => 511
10646 * '0777'.oct # => 511
10647 * '0o777'.oct # => 511
10648 * '-777'.oct # => -511
10649 * # Not interpreted as octal.
10650 * '0b111'.oct # => 7 # Interpreted as binary.
10651 * '0d999'.oct # => 999 # Interpreted as decimal.
10652 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10653 *
10654 * The leading substring is interpreted as octal when it begins with:
10655 *
10656 * - One or more characters representing octal digits
10657 * (each in the range <tt>'0'..'7'</tt>);
10658 * the string to be interpreted ends at the first character that does not represent an octal digit:
10659 *
10660 * '7'.oct # => 7
10661 * '11'.oct # => 9
10662 * '777'.oct # => 511
10663 * '0777'.oct # => 511
10664 * '7778'.oct # => 511
10665 * '777x'.oct # => 511
10666 *
10667 * - <tt>'0o'</tt>, followed by one or more octal digits:
10668 *
10669 * '0o777'.oct # => 511
10670 * '0o7778'.oct # => 511
10671 *
10672 * The leading substring is _not_ interpreted as octal when it begins with:
10673 *
10674 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10675 * (each in the range <tt>'0'..'1'</tt>);
10676 * the string to be interpreted ends at the first character that does not represent a binary digit;
10677 * the string is interpreted as binary digits (base 2):
10678 *
10679 * '0b111'.oct # => 7
10680 * '0b1112'.oct # => 7
10681 *
10682 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10683 * (each in the range <tt>'0'..'9'</tt>);
10684 * the string to be interpreted ends at the first character that does not represent a decimal digit;
10685 * the string is interpreted as decimal digits (base 10):
10686 *
10687 * '0d999'.oct # => 999
10688 * '0d999x'.oct # => 999
10689 *
10690 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10691 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10692 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit;
10693 * the string is interpreted as hexadecimal digits (base 16):
10694 *
10695 * '0xfff'.oct # => 4095
10696 * '0xfffg'.oct # => 4095
10697 *
10698 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10699 *
10700 * '-777'.oct # => -511
10701 * '-0777'.oct # => -511
10702 * '-0b111'.oct # => -7
10703 * '-0xfff'.oct # => -4095
10704 *
10705 * For any substring not described above, returns zero:
10706 *
10707 * 'foo'.oct # => 0
10708 * ''.oct # => 0
10709 *
10710 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10711 */
10712
10713static VALUE
10714rb_str_oct(VALUE str)
10715{
10716 return rb_str_to_inum(str, -8, FALSE);
10717}
10718
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/*
 * Process-wide native lock used only when crypt_r() is unavailable:
 * rb_str_crypt() holds it around the call to crypt() and while copying
 * the result out of crypt()'s static buffer.
 */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
#endif
10727
10728/*
10729 * call-seq:
10730 * crypt(salt_str) -> new_string
10731 *
10732 * Returns the string generated by calling <code>crypt(3)</code>
10733 * standard library function with <code>str</code> and
10734 * <code>salt_str</code>, in this order, as its arguments. Please do
10735 * not use this method any longer. It is legacy; provided only for
10736 * backward compatibility with ruby scripts in earlier days. It is
10737 * bad to use in contemporary programs for several reasons:
10738 *
10739 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10740 * run. The generated string lacks data portability.
10741 *
10742 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10743 * (i.e. silently ends up in unexpected results).
10744 *
10745 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10746 * thread safe.
10747 *
10748 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10749 * very very weak. According to its manpage, Linux's traditional
10750 * <code>crypt(3)</code> output has only 2**56 variations; too
10751 * easy to brute force today. And this is the default behaviour.
10752 *
10753 * * In order to make things robust some OSes implement so-called
10754 * "modular" usage. To go through, you have to do a complex
10755 * build-up of the <code>salt_str</code> parameter, by hand.
10756 * Failure in generation of a proper salt string tends not to
10757 * yield any errors; typos in parameters are normally not
10758 * detectable.
10759 *
10760 * * For instance, in the following example, the second invocation
10761 * of String#crypt is wrong; it has a typo in "round=" (lacks
10762 * "s"). However the call does not fail and something unexpected
10763 * is generated.
10764 *
10765 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10766 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10767 *
10768 * * Even in the "modular" mode, some hash functions are considered
10769 * archaic and no longer recommended at all; for instance module
10770 * <code>$1$</code> is officially abandoned by its author: see
10771 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10772 * instance module <code>$3$</code> is considered completely
10773 * broken: see the manpage of FreeBSD.
10774 *
10775 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10776 * written above, <code>crypt(3)</code> on Mac OS never fails.
10777 * This means even if you build up a proper salt string it
10778 * generates a traditional DES hash anyways, and there is no way
10779 * for you to be aware of it.
10780 *
10781 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10782 *
10783 * If for some reason you cannot migrate to other secure contemporary
10784 * password hashing algorithms, install the string-crypt gem and
10785 * <code>require 'string/crypt'</code> to continue using it.
10786 */
10787
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /*
     * Two build variants share this body:
     *  - HAVE_CRYPT_R: crypt_r() with a per-call crypt_data buffer;
     *    CRYPT_END() releases that buffer.
     *  - otherwise: plain crypt() serialized by crypt_mutex;
     *    CRYPT_END() unlocks the mutex.
     */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    char *tmp_buf;
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    /* Argument validation happens before any lock is taken or buffer is
     * allocated, so the raises below need no cleanup. */
    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* The first two salt bytes must exist and be non-NUL. */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* On platforms flagged BROKEN_CRYPT, a salt with non-ASCII bytes in
     * its first two positions is replaced by a 2-byte copy with the high
     * bit stripped. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* Save errno first: CRYPT_END() may overwrite it. */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
#ifdef HAVE_CRYPT_R
    /* res points into the crypt_data buffer, so copy it into a Ruby
     * string before that buffer is released. */
    result = rb_str_new_cstr(res);
    CRYPT_END();
#else
    // We need to copy this buffer because it's static and we need to unlock the mutex
    // before allocating a new object (the string to be returned). If we allocate while
    // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
    // if other ractors are waiting on this lock.
    size_t res_size = strlen(res)+1;
    tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
    memcpy(tmp_buf, res, res_size);
    res = tmp_buf;
    CRYPT_END();
    result = rb_str_new_cstr(res);
#endif
    return result;
}
10856
10857
10858/*
10859 * call-seq:
10860 * ord -> integer
10861 *
10862 * :include: doc/string/ord.rdoc
10863 *
10864 */
10865
10866static VALUE
10867rb_str_ord(VALUE s)
10868{
10869 unsigned int c;
10870
10871 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10872 return UINT2NUM(c);
10873}
10874/*
10875 * call-seq:
10876 * sum(n = 16) -> integer
10877 *
10878 * :include: doc/string/sum.rdoc
10879 *
10880 */
10881
10882static VALUE
10883rb_str_sum(int argc, VALUE *argv, VALUE str)
10884{
10885 int bits = 16;
10886 char *ptr, *p, *pend;
10887 long len;
10888 VALUE sum = INT2FIX(0);
10889 unsigned long sum0 = 0;
10890
10891 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10892 bits = 0;
10893 }
10894 ptr = p = RSTRING_PTR(str);
10895 len = RSTRING_LEN(str);
10896 pend = p + len;
10897
10898 while (p < pend) {
10899 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10900 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10901 str_mod_check(str, ptr, len);
10902 sum0 = 0;
10903 }
10904 sum0 += (unsigned char)*p;
10905 p++;
10906 }
10907
10908 if (bits == 0) {
10909 if (sum0) {
10910 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10911 }
10912 }
10913 else {
10914 if (sum == INT2FIX(0)) {
10915 if (bits < (int)sizeof(long)*CHAR_BIT) {
10916 sum0 &= (((unsigned long)1)<<bits)-1;
10917 }
10918 sum = LONG2FIX(sum0);
10919 }
10920 else {
10921 VALUE mod;
10922
10923 if (sum0) {
10924 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10925 }
10926
10927 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10928 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10929 sum = rb_funcall(sum, '&', 1, mod);
10930 }
10931 }
10932 return sum;
10933}
10934
/*
 * Shared padding routine: builds a new string of +width+ characters
 * from +str+ by repeating a pad string (a single space unless a second
 * argument is given).
 *
 * argv[0] is the target width, argv[1] the optional pad string.
 * jflag selects where the padding goes: 'l' puts all of it on the right
 * of the text, 'r' all of it on the left, and any other value splits it
 * with the smaller half on the left.
 *
 * Returns a duplicate of +str+ when width is negative or not larger than
 * the current character length.  Raises ArgumentError for an empty pad
 * string or when the resulting byte size would overflow.
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    /* flen: pad length in bytes, fclen: pad length in characters */
    long width, len, flen = 1, fclen = 1;
    VALUE res;
    char *p;
    const char *f = " ";
    /* llen/rlen: pad characters on each side; llen2/rlen2: byte length
     * of the partial pad copy that completes each side */
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    VALUE pad;
    int singlebyte = 1, cr;
    int termlen;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    termlen = rb_enc_mbminlen(enc);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc); /* rb_enc_check */
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc); /* rb_enc_check */
    if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
    n = width - len; /* total number of pad characters needed */
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* byte offsets inside the pad string for the leftover characters
         * that do not make up a whole pad repetition */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* Compute the padding size in bytes step by step, checking each step
     * against LONG_MAX before performing it; len is reused as the
     * running byte count. */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = str_enc_new(rb_cString, 0, len, enc);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* the original text */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    TERM_FILL(p, termlen);
    STR_SET_LEN(res, p-RSTRING_PTR(res));

    /* Combine the code ranges of text and pad; a broken result is left
     * without a cached code range. */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);

    RB_GC_GUARD(pad);
    return res;
}
11026
11027
11028/*
11029 * call-seq:
11030 * ljust(width, pad_string = ' ') -> new_string
11031 *
11032 * :include: doc/string/ljust.rdoc
11033 *
11034 */
11035
11036static VALUE
11037rb_str_ljust(int argc, VALUE *argv, VALUE str)
11038{
11039 return rb_str_justify(argc, argv, str, 'l');
11040}
11041
11042/*
11043 * call-seq:
11044 * rjust(width, pad_string = ' ') -> new_string
11045 *
11046 * :include: doc/string/rjust.rdoc
11047 *
11048 */
11049
11050static VALUE
11051rb_str_rjust(int argc, VALUE *argv, VALUE str)
11052{
11053 return rb_str_justify(argc, argv, str, 'r');
11054}
11055
11056
11057/*
11058 * call-seq:
11059 * center(size, pad_string = ' ') -> new_string
11060 *
11061 * :include: doc/string/center.rdoc
11062 *
11063 */
11064
11065static VALUE
11066rb_str_center(int argc, VALUE *argv, VALUE str)
11067{
11068 return rb_str_justify(argc, argv, str, 'c');
11069}
11070
11071/*
11072 * call-seq:
11073 * partition(pattern) -> [pre_match, first_match, post_match]
11074 *
11075 * :include: doc/string/partition.rdoc
11076 *
11077 */
11078
11079static VALUE
11080rb_str_partition(VALUE str, VALUE sep)
11081{
11082 long pos;
11083
11084 sep = get_pat_quoted(sep, 0);
11085 if (RB_TYPE_P(sep, T_REGEXP)) {
11086 if (rb_reg_search(sep, str, 0, 0) < 0) {
11087 goto failed;
11088 }
11089 VALUE match = rb_backref_get();
11090 struct re_registers *regs = RMATCH_REGS(match);
11091
11092 pos = BEG(0);
11093 sep = rb_str_subseq(str, pos, END(0) - pos);
11094 }
11095 else {
11096 pos = rb_str_index(str, sep, 0);
11097 if (pos < 0) goto failed;
11098 }
11099 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11100 sep,
11101 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11102 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11103
11104 failed:
11105 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11106}
11107
11108/*
11109 * call-seq:
11110 * rpartition(pattern) -> [pre_match, last_match, post_match]
11111 *
11112 * :include: doc/string/rpartition.rdoc
11113 *
11114 */
11115
11116static VALUE
11117rb_str_rpartition(VALUE str, VALUE sep)
11118{
11119 long pos = RSTRING_LEN(str);
11120
11121 sep = get_pat_quoted(sep, 0);
11122 if (RB_TYPE_P(sep, T_REGEXP)) {
11123 if (rb_reg_search(sep, str, pos, 1) < 0) {
11124 goto failed;
11125 }
11126 VALUE match = rb_backref_get();
11127 struct re_registers *regs = RMATCH_REGS(match);
11128
11129 pos = BEG(0);
11130 sep = rb_str_subseq(str, pos, END(0) - pos);
11131 }
11132 else {
11133 pos = rb_str_sublen(str, pos);
11134 pos = rb_str_rindex(str, sep, pos);
11135 if (pos < 0) {
11136 goto failed;
11137 }
11138 }
11139
11140 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11141 sep,
11142 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11143 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11144 failed:
11145 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11146}
11147
11148/*
11149 * call-seq:
11150 * start_with?(*patterns) -> true or false
11151 *
11152 * :include: doc/string/start_with_p.rdoc
11153 *
11154 */
11155
11156static VALUE
11157rb_str_start_with(int argc, VALUE *argv, VALUE str)
11158{
11159 int i;
11160
11161 for (i=0; i<argc; i++) {
11162 VALUE tmp = argv[i];
11163 if (RB_TYPE_P(tmp, T_REGEXP)) {
11164 if (rb_reg_start_with_p(tmp, str))
11165 return Qtrue;
11166 }
11167 else {
11168 const char *p, *s, *e;
11169 long slen, tlen;
11170 rb_encoding *enc;
11171
11172 StringValue(tmp);
11173 enc = rb_enc_check(str, tmp);
11174 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11175 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11176 p = RSTRING_PTR(str);
11177 e = p + slen;
11178 s = p + tlen;
11179 if (!at_char_right_boundary(p, s, e, enc))
11180 continue;
11181 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11182 return Qtrue;
11183 }
11184 }
11185 return Qfalse;
11186}
11187
11188/*
11189 * call-seq:
11190 * end_with?(*strings) -> true or false
11191 *
11192 * :include: doc/string/end_with_p.rdoc
11193 *
11194 */
11195
11196static VALUE
11197rb_str_end_with(int argc, VALUE *argv, VALUE str)
11198{
11199 int i;
11200
11201 for (i=0; i<argc; i++) {
11202 VALUE tmp = argv[i];
11203 const char *p, *s, *e;
11204 long slen, tlen;
11205 rb_encoding *enc;
11206
11207 StringValue(tmp);
11208 enc = rb_enc_check(str, tmp);
11209 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11210 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11211 p = RSTRING_PTR(str);
11212 e = p + slen;
11213 s = e - tlen;
11214 if (!at_char_boundary(p, s, e, enc))
11215 continue;
11216 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11217 return Qtrue;
11218 }
11219 return Qfalse;
11220}
11221
11231static long
11232deleted_prefix_length(VALUE str, VALUE prefix)
11233{
11234 const char *strptr, *prefixptr;
11235 long olen, prefixlen;
11236 rb_encoding *enc = rb_enc_get(str);
11237
11238 StringValue(prefix);
11239
11240 if (!is_broken_string(prefix) ||
11241 !rb_enc_asciicompat(enc) ||
11242 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11243 enc = rb_enc_check(str, prefix);
11244 }
11245
11246 /* return 0 if not start with prefix */
11247 prefixlen = RSTRING_LEN(prefix);
11248 if (prefixlen <= 0) return 0;
11249 olen = RSTRING_LEN(str);
11250 if (olen < prefixlen) return 0;
11251 strptr = RSTRING_PTR(str);
11252 prefixptr = RSTRING_PTR(prefix);
11253 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11254 if (is_broken_string(prefix)) {
11255 if (!is_broken_string(str)) {
11256 /* prefix in a valid string cannot be broken */
11257 return 0;
11258 }
11259 const char *strend = strptr + olen;
11260 const char *after_prefix = strptr + prefixlen;
11261 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11262 /* prefix does not end at char-boundary */
11263 return 0;
11264 }
11265 }
11266 /* prefix part in `str` also should be valid. */
11267
11268 return prefixlen;
11269}
11270
11271/*
11272 * call-seq:
11273 * delete_prefix!(prefix) -> self or nil
11274 *
11275 * Like String#delete_prefix, except that +self+ is modified in place;
11276 * returns +self+ if the prefix is removed, +nil+ otherwise.
11277 *
11278 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11279 */
11280
11281static VALUE
11282rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11283{
11284 long prefixlen;
11285 str_modify_keep_cr(str);
11286
11287 prefixlen = deleted_prefix_length(str, prefix);
11288 if (prefixlen <= 0) return Qnil;
11289
11290 return rb_str_drop_bytes(str, prefixlen);
11291}
11292
11293/*
11294 * call-seq:
11295 * delete_prefix(prefix) -> new_string
11296 *
11297 * :include: doc/string/delete_prefix.rdoc
11298 *
11299 */
11300
11301static VALUE
11302rb_str_delete_prefix(VALUE str, VALUE prefix)
11303{
11304 long prefixlen;
11305
11306 prefixlen = deleted_prefix_length(str, prefix);
11307 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11308
11309 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11310}
11311
11321static long
11322deleted_suffix_length(VALUE str, VALUE suffix)
11323{
11324 const char *strptr, *suffixptr;
11325 long olen, suffixlen;
11326 rb_encoding *enc;
11327
11328 StringValue(suffix);
11329 if (is_broken_string(suffix)) return 0;
11330 enc = rb_enc_check(str, suffix);
11331
11332 /* return 0 if not start with suffix */
11333 suffixlen = RSTRING_LEN(suffix);
11334 if (suffixlen <= 0) return 0;
11335 olen = RSTRING_LEN(str);
11336 if (olen < suffixlen) return 0;
11337 strptr = RSTRING_PTR(str);
11338 suffixptr = RSTRING_PTR(suffix);
11339 const char *strend = strptr + olen;
11340 const char *before_suffix = strend - suffixlen;
11341 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11342 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11343
11344 return suffixlen;
11345}
11346
11347/*
11348 * call-seq:
11349 * delete_suffix!(suffix) -> self or nil
11350 *
11351 * Like String#delete_suffix, except that +self+ is modified in place;
11352 * returns +self+ if the suffix is removed, +nil+ otherwise.
11353 *
11354 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11355 */
11356
11357static VALUE
11358rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11359{
11360 long olen, suffixlen, len;
11361 str_modifiable(str);
11362
11363 suffixlen = deleted_suffix_length(str, suffix);
11364 if (suffixlen <= 0) return Qnil;
11365
11366 olen = RSTRING_LEN(str);
11367 str_modify_keep_cr(str);
11368 len = olen - suffixlen;
11369 STR_SET_LEN(str, len);
11370 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11371 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11373 }
11374 return str;
11375}
11376
11377/*
11378 * call-seq:
11379 * delete_suffix(suffix) -> new_string
11380 *
11381 * :include: doc/string/delete_suffix.rdoc
11382 *
11383 */
11384
11385static VALUE
11386rb_str_delete_suffix(VALUE str, VALUE suffix)
11387{
11388 long suffixlen;
11389
11390 suffixlen = deleted_suffix_length(str, suffix);
11391 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11392
11393 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11394}
11395
11396void
11397rb_str_setter(VALUE val, ID id, VALUE *var)
11398{
11399 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11400 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11401 }
11402 *var = val;
11403}
11404
11405static void
11406rb_fs_setter(VALUE val, ID id, VALUE *var)
11407{
11408 val = rb_fs_check(val);
11409 if (!val) {
11410 rb_raise(rb_eTypeError,
11411 "value of %"PRIsVALUE" must be String or Regexp",
11412 rb_id2str(id));
11413 }
11414 if (!NIL_P(val)) {
11415 rb_warn_deprecated("'$;'", NULL);
11416 }
11417 *var = val;
11418}
11419
11420
11421/*
11422 * call-seq:
11423 * force_encoding(encoding) -> self
11424 *
11425 * :include: doc/string/force_encoding.rdoc
11426 *
11427 */
11428
11429static VALUE
11430rb_str_force_encoding(VALUE str, VALUE enc)
11431{
11432 str_modifiable(str);
11433
11434 rb_encoding *encoding = rb_to_encoding(enc);
11435 int idx = rb_enc_to_index(encoding);
11436
11437 // If the encoding is unchanged, we do nothing.
11438 if (ENCODING_GET(str) == idx) {
11439 return str;
11440 }
11441
11442 rb_enc_associate_index(str, idx);
11443
11444 // If the coderange was 7bit and the new encoding is ASCII-compatible
11445 // we can keep the coderange.
11446 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11447 return str;
11448 }
11449
11451 return str;
11452}
11453
11454/*
11455 * call-seq:
11456 * b -> new_string
11457 *
11458 * :include: doc/string/b.rdoc
11459 *
11460 */
11461
11462static VALUE
11463rb_str_b(VALUE str)
11464{
11465 VALUE str2;
11466 if (STR_EMBED_P(str)) {
11467 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11468 }
11469 else {
11470 str2 = str_alloc_heap(rb_cString);
11471 }
11472 str_replace_shared_without_enc(str2, str);
11473
11474 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11475 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11476 // If we know the receiver's code range then we know the result's code range.
11477 int cr = ENC_CODERANGE(str);
11478 switch (cr) {
11479 case ENC_CODERANGE_7BIT:
11481 break;
11485 break;
11486 default:
11487 ENC_CODERANGE_CLEAR(str2);
11488 break;
11489 }
11490 }
11491
11492 return str2;
11493}
11494
11495/*
11496 * call-seq:
11497 * valid_encoding? -> true or false
11498 *
11499 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11500 *
11501 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11502 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11503 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11504 */
11505
11506static VALUE
11507rb_str_valid_encoding_p(VALUE str)
11508{
11509 int cr = rb_enc_str_coderange(str);
11510
11511 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11512}
11513
11514/*
11515 * call-seq:
11516 * ascii_only? -> true or false
11517 *
11518 * Returns whether +self+ contains only ASCII characters:
11519 *
11520 * 'abc'.ascii_only? # => true
11521 * "abc\u{6666}".ascii_only? # => false
11522 *
11523 * Related: see {Querying}[rdoc-ref:String@Querying].
11524 */
11525
11526static VALUE
11527rb_str_is_ascii_only_p(VALUE str)
11528{
11529 int cr = rb_enc_str_coderange(str);
11530
11531 return RBOOL(cr == ENC_CODERANGE_7BIT);
11532}
11533
11534VALUE
11536{
11537 static const char ellipsis[] = "...";
11538 const long ellipsislen = sizeof(ellipsis) - 1;
11539 rb_encoding *const enc = rb_enc_get(str);
11540 const long blen = RSTRING_LEN(str);
11541 const char *const p = RSTRING_PTR(str), *e = p + blen;
11542 VALUE estr, ret = 0;
11543
11544 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11545 if (len * rb_enc_mbminlen(enc) >= blen ||
11546 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11547 ret = str;
11548 }
11549 else if (len <= ellipsislen ||
11550 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11551 if (rb_enc_asciicompat(enc)) {
11552 ret = rb_str_new(ellipsis, len);
11553 rb_enc_associate(ret, enc);
11554 }
11555 else {
11556 estr = rb_usascii_str_new(ellipsis, len);
11557 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11558 }
11559 }
11560 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11561 rb_str_cat(ret, ellipsis, ellipsislen);
11562 }
11563 else {
11564 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11565 rb_enc_from_encoding(enc), 0, Qnil);
11566 rb_str_append(ret, estr);
11567 }
11568 return ret;
11569}
11570
11571static VALUE
11572str_compat_and_valid(VALUE str, rb_encoding *enc)
11573{
11574 int cr;
11575 str = StringValue(str);
11576 cr = rb_enc_str_coderange(str);
11577 if (cr == ENC_CODERANGE_BROKEN) {
11578 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11579 }
11580 else {
11581 rb_encoding *e = STR_ENC_GET(str);
11582 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11583 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11584 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11585 }
11586 }
11587 return str;
11588}
11589
11590static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11591
11592VALUE
11594{
11595 rb_encoding *enc = STR_ENC_GET(str);
11596 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11597}
11598
11599VALUE
11600rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11601{
11602 int cr = ENC_CODERANGE_UNKNOWN;
11603 if (enc == STR_ENC_GET(str)) {
11604 /* cached coderange makes sense only when enc equals the
11605 * actual encoding of str */
11606 cr = ENC_CODERANGE(str);
11607 }
11608 return enc_str_scrub(enc, str, repl, cr);
11609}
11610
11611static VALUE
11612enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11613{
11614 int encidx;
11615 VALUE buf = Qnil;
11616 const char *rep, *p, *e, *p1, *sp;
11617 long replen = -1;
11618 long slen;
11619
11620 if (rb_block_given_p()) {
11621 if (!NIL_P(repl))
11622 rb_raise(rb_eArgError, "both of block and replacement given");
11623 replen = 0;
11624 }
11625
11626 if (ENC_CODERANGE_CLEAN_P(cr))
11627 return Qnil;
11628
11629 if (!NIL_P(repl)) {
11630 repl = str_compat_and_valid(repl, enc);
11631 }
11632
11633 if (rb_enc_dummy_p(enc)) {
11634 return Qnil;
11635 }
11636 encidx = rb_enc_to_index(enc);
11637
11638#define DEFAULT_REPLACE_CHAR(str) do { \
11639 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11640 rep = replace; replen = (int)sizeof(replace); \
11641 } while (0)
11642
11643 slen = RSTRING_LEN(str);
11644 p = RSTRING_PTR(str);
11645 e = RSTRING_END(str);
11646 p1 = p;
11647 sp = p;
11648
11649 if (rb_enc_asciicompat(enc)) {
11650 int rep7bit_p;
11651 if (!replen) {
11652 rep = NULL;
11653 rep7bit_p = FALSE;
11654 }
11655 else if (!NIL_P(repl)) {
11656 rep = RSTRING_PTR(repl);
11657 replen = RSTRING_LEN(repl);
11658 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11659 }
11660 else if (encidx == rb_utf8_encindex()) {
11661 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11662 rep7bit_p = FALSE;
11663 }
11664 else {
11665 DEFAULT_REPLACE_CHAR("?");
11666 rep7bit_p = TRUE;
11667 }
11668 cr = ENC_CODERANGE_7BIT;
11669
11670 p = search_nonascii(p, e);
11671 if (!p) {
11672 p = e;
11673 }
11674 while (p < e) {
11675 int ret = rb_enc_precise_mbclen(p, e, enc);
11676 if (MBCLEN_NEEDMORE_P(ret)) {
11677 break;
11678 }
11679 else if (MBCLEN_CHARFOUND_P(ret)) {
11681 p += MBCLEN_CHARFOUND_LEN(ret);
11682 }
11683 else if (MBCLEN_INVALID_P(ret)) {
11684 /*
11685 * p1~p: valid ascii/multibyte chars
11686 * p ~e: invalid bytes + unknown bytes
11687 */
11688 long clen = rb_enc_mbmaxlen(enc);
11689 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11690 if (p > p1) {
11691 rb_str_buf_cat(buf, p1, p - p1);
11692 }
11693
11694 if (e - p < clen) clen = e - p;
11695 if (clen <= 2) {
11696 clen = 1;
11697 }
11698 else {
11699 const char *q = p;
11700 clen--;
11701 for (; clen > 1; clen--) {
11702 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11703 if (MBCLEN_NEEDMORE_P(ret)) break;
11704 if (MBCLEN_INVALID_P(ret)) continue;
11706 }
11707 }
11708 if (rep) {
11709 rb_str_buf_cat(buf, rep, replen);
11710 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11711 }
11712 else {
11713 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11714 str_mod_check(str, sp, slen);
11715 repl = str_compat_and_valid(repl, enc);
11716 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11719 }
11720 p += clen;
11721 p1 = p;
11722 p = search_nonascii(p, e);
11723 if (!p) {
11724 p = e;
11725 break;
11726 }
11727 }
11728 else {
11730 }
11731 }
11732 if (NIL_P(buf)) {
11733 if (p == e) {
11734 ENC_CODERANGE_SET(str, cr);
11735 return Qnil;
11736 }
11737 buf = rb_str_buf_new(RSTRING_LEN(str));
11738 }
11739 if (p1 < p) {
11740 rb_str_buf_cat(buf, p1, p - p1);
11741 }
11742 if (p < e) {
11743 if (rep) {
11744 rb_str_buf_cat(buf, rep, replen);
11745 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11746 }
11747 else {
11748 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11749 str_mod_check(str, sp, slen);
11750 repl = str_compat_and_valid(repl, enc);
11751 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11754 }
11755 }
11756 }
11757 else {
11758 /* ASCII incompatible */
11759 long mbminlen = rb_enc_mbminlen(enc);
11760 if (!replen) {
11761 rep = NULL;
11762 }
11763 else if (!NIL_P(repl)) {
11764 rep = RSTRING_PTR(repl);
11765 replen = RSTRING_LEN(repl);
11766 }
11767 else if (encidx == ENCINDEX_UTF_16BE) {
11768 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11769 }
11770 else if (encidx == ENCINDEX_UTF_16LE) {
11771 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11772 }
11773 else if (encidx == ENCINDEX_UTF_32BE) {
11774 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11775 }
11776 else if (encidx == ENCINDEX_UTF_32LE) {
11777 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11778 }
11779 else {
11780 DEFAULT_REPLACE_CHAR("?");
11781 }
11782
11783 while (p < e) {
11784 int ret = rb_enc_precise_mbclen(p, e, enc);
11785 if (MBCLEN_NEEDMORE_P(ret)) {
11786 break;
11787 }
11788 else if (MBCLEN_CHARFOUND_P(ret)) {
11789 p += MBCLEN_CHARFOUND_LEN(ret);
11790 }
11791 else if (MBCLEN_INVALID_P(ret)) {
11792 const char *q = p;
11793 long clen = rb_enc_mbmaxlen(enc);
11794 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11795 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11796
11797 if (e - p < clen) clen = e - p;
11798 if (clen <= mbminlen * 2) {
11799 clen = mbminlen;
11800 }
11801 else {
11802 clen -= mbminlen;
11803 for (; clen > mbminlen; clen-=mbminlen) {
11804 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11805 if (MBCLEN_NEEDMORE_P(ret)) break;
11806 if (MBCLEN_INVALID_P(ret)) continue;
11808 }
11809 }
11810 if (rep) {
11811 rb_str_buf_cat(buf, rep, replen);
11812 }
11813 else {
11814 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11815 str_mod_check(str, sp, slen);
11816 repl = str_compat_and_valid(repl, enc);
11817 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11818 }
11819 p += clen;
11820 p1 = p;
11821 }
11822 else {
11824 }
11825 }
11826 if (NIL_P(buf)) {
11827 if (p == e) {
11829 return Qnil;
11830 }
11831 buf = rb_str_buf_new(RSTRING_LEN(str));
11832 }
11833 if (p1 < p) {
11834 rb_str_buf_cat(buf, p1, p - p1);
11835 }
11836 if (p < e) {
11837 if (rep) {
11838 rb_str_buf_cat(buf, rep, replen);
11839 }
11840 else {
11841 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11842 str_mod_check(str, sp, slen);
11843 repl = str_compat_and_valid(repl, enc);
11844 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11845 }
11846 }
11848 }
11849 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11850 return buf;
11851}
11852
11853/*
11854 * call-seq:
11855 * scrub(replacement_string = default_replacement_string) -> new_string
11856 * scrub{|sequence| ... } -> new_string
11857 *
11858 * :include: doc/string/scrub.rdoc
11859 *
11860 */
11861static VALUE
11862str_scrub(int argc, VALUE *argv, VALUE str)
11863{
11864 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11865 VALUE new = rb_str_scrub(str, repl);
11866 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11867}
11868
11869/*
11870 * call-seq:
11871 * scrub!(replacement_string = default_replacement_string) -> self
11872 * scrub!{|sequence| ... } -> self
11873 *
11874 * Like String#scrub, except that:
11875 *
11876 * - Any replacements are made in +self+.
11877 * - Returns +self+.
11878 *
11879 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11880 *
11881 */
11882static VALUE
11883str_scrub_bang(int argc, VALUE *argv, VALUE str)
11884{
11885 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11886 VALUE new = rb_str_scrub(str, repl);
11887 if (!NIL_P(new)) rb_str_replace(str, new);
11888 return str;
11889}
11890
11891static ID id_normalize;
11892static ID id_normalized_p;
11893static VALUE mUnicodeNormalize;
11894
11895static VALUE
11896unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11897{
11898 static int UnicodeNormalizeRequired = 0;
11899 VALUE argv2[2];
11900
11901 if (!UnicodeNormalizeRequired) {
11902 rb_require("unicode_normalize/normalize.rb");
11903 UnicodeNormalizeRequired = 1;
11904 }
11905 argv2[0] = str;
11906 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11907 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11908}
11909
11910/*
11911 * call-seq:
11912 * unicode_normalize(form = :nfc) -> string
11913 *
11914 * Returns a copy of +self+ with
11915 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11916 *
11917 * Argument +form+ must be one of the following symbols
11918 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11919 *
11920 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11921 * - +:nfd+: Canonical decomposition.
11922 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11923 * - +:nfkd+: Compatibility decomposition.
11924 *
11925 * The encoding of +self+ must be one of:
11926 *
11927 * - Encoding::UTF_8
11928 * - Encoding::UTF_16BE
11929 * - Encoding::UTF_16LE
11930 * - Encoding::UTF_32BE
11931 * - Encoding::UTF_32LE
11932 * - Encoding::GB18030
11933 * - Encoding::UCS_2BE
11934 * - Encoding::UCS_4BE
11935 *
11936 * Examples:
11937 *
 *    "a\u0300".unicode_normalize       # => "à"  (precomposed: U+00E0)
 *    "\u00E0".unicode_normalize(:nfd) # => "à"  (decomposed: U+0061 U+0300)
11940 *
11941 * Related: String#unicode_normalize!, String#unicode_normalized?.
11942 */
11943static VALUE
11944rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11945{
11946 return unicode_normalize_common(argc, argv, str, id_normalize);
11947}
11948
11949/*
11950 * call-seq:
11951 * unicode_normalize!(form = :nfc) -> self
11952 *
11953 * Like String#unicode_normalize, except that the normalization
11954 * is performed on +self+.
11955 *
 *  Related: String#unicode_normalized?.
11957 *
11958 */
11959static VALUE
11960rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11961{
11962 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11963}
11964
11965/* call-seq:
11966 * unicode_normalized?(form = :nfc) -> true or false
11967 *
11968 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11969 * +false+ otherwise.
11970 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11971 *
11972 * Examples:
11973 *
11974 * "a\u0300".unicode_normalized? # => false
11975 * "a\u0300".unicode_normalized?(:nfd) # => true
11976 * "\u00E0".unicode_normalized? # => true
11977 * "\u00E0".unicode_normalized?(:nfd) # => false
11978 *
11979 *
11980 * Raises an exception if +self+ is not in a Unicode encoding:
11981 *
11982 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11983 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11984 *
11985 * Related: String#unicode_normalize, String#unicode_normalize!.
11986 *
11987 */
11988static VALUE
11989rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11990{
11991 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11992}
11993
11994/**********************************************************************
11995 * Document-class: Symbol
11996 *
11997 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11998 *
11999 * You can create a +Symbol+ object explicitly with:
12000 *
12001 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12002 *
12003 * The same +Symbol+ object will be
12004 * created for a given name or string for the duration of a program's
12005 * execution, regardless of the context or meaning of that name. Thus
12006 * if <code>Fred</code> is a constant in one context, a method in
12007 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12008 * will be the same object in all three contexts.
12009 *
12010 * module One
12011 * class Fred
12012 * end
12013 * $f1 = :Fred
12014 * end
12015 * module Two
12016 * Fred = 1
12017 * $f2 = :Fred
12018 * end
12019 * def Fred()
12020 * end
12021 * $f3 = :Fred
12022 * $f1.object_id #=> 2514190
12023 * $f2.object_id #=> 2514190
12024 * $f3.object_id #=> 2514190
12025 *
12026 * Constant, method, and variable names are returned as symbols:
12027 *
12028 * module One
12029 * Two = 2
12030 * def three; 3 end
12031 * @four = 4
12032 * @@five = 5
12033 * $six = 6
12034 * end
12035 * seven = 7
12036 *
12037 * One.constants
12038 * # => [:Two]
12039 * One.instance_methods(true)
12040 * # => [:three]
12041 * One.instance_variables
12042 * # => [:@four]
12043 * One.class_variables
12044 * # => [:@@five]
12045 * global_variables.grep(/six/)
12046 * # => [:$six]
12047 * local_variables
12048 * # => [:seven]
12049 *
12050 * A +Symbol+ object differs from a String object in that
12051 * a +Symbol+ object represents an identifier, while a String object
12052 * represents text or data.
12053 *
12054 * == What's Here
12055 *
12056 * First, what's elsewhere. Class +Symbol+:
12057 *
12058 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12059 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12060 *
12061 * Here, class +Symbol+ provides methods that are useful for:
12062 *
12063 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12064 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12065 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12066 *
12067 * === Methods for Querying
12068 *
12069 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12070 * - #=~: Returns the index of the first substring in symbol that matches a
12071 * given Regexp or other object; returns +nil+ if no match is found.
12072 * - #[], #slice : Returns a substring of symbol
12073 * determined by a given index, start/length, or range, or string.
12074 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12075 * - #encoding: Returns the Encoding object that represents the encoding
12076 * of symbol.
12077 * - #end_with?: Returns +true+ if symbol ends with
12078 * any of the given strings.
12079 * - #match: Returns a MatchData object if symbol
12080 * matches a given Regexp; +nil+ otherwise.
12081 * - #match?: Returns +true+ if symbol
12082 * matches a given Regexp; +false+ otherwise.
12083 * - #length, #size: Returns the number of characters in symbol.
12084 * - #start_with?: Returns +true+ if symbol starts with
12085 * any of the given strings.
12086 *
12087 * === Methods for Comparing
12088 *
12089 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12090 * or larger than symbol.
12091 * - #==, #===: Returns +true+ if a given symbol has the same content and
12092 * encoding.
12093 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12094 * symbol is smaller than, equal to, or larger than symbol.
12095 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12096 * after Unicode case folding; +false+ otherwise.
12097 *
12098 * === Methods for Converting
12099 *
12100 * - #capitalize: Returns symbol with the first character upcased
12101 * and all other characters downcased.
12102 * - #downcase: Returns symbol with all characters downcased.
12103 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12104 * - #name: Returns the frozen string corresponding to symbol.
12105 * - #succ, #next: Returns the symbol that is the successor to symbol.
12106 * - #swapcase: Returns symbol with all upcase characters downcased
12107 * and all downcase characters upcased.
12108 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12109 * - #to_s, #id2name: Returns the string corresponding to +self+.
12110 * - #to_sym, #intern: Returns +self+.
12111 * - #upcase: Returns symbol with all characters upcased.
12112 *
12113 */
12114
12115
12116/*
12117 * call-seq:
12118 * symbol == object -> true or false
12119 *
12120 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12121 */
12122
12123#define sym_equal rb_obj_equal
12124
12125static int
12126sym_printable(const char *s, const char *send, rb_encoding *enc)
12127{
12128 while (s < send) {
12129 int n;
12130 int c = rb_enc_precise_mbclen(s, send, enc);
12131
12132 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12133 n = MBCLEN_CHARFOUND_LEN(c);
12134 c = rb_enc_mbc_to_codepoint(s, send, enc);
12135 if (!rb_enc_isprint(c, enc)) return FALSE;
12136 s += n;
12137 }
12138 return TRUE;
12139}
12140
12141int
12142rb_str_symname_p(VALUE sym)
12143{
12144 rb_encoding *enc;
12145 const char *ptr;
12146 long len;
12147 rb_encoding *resenc = rb_default_internal_encoding();
12148
12149 if (resenc == NULL) resenc = rb_default_external_encoding();
12150 enc = STR_ENC_GET(sym);
12151 ptr = RSTRING_PTR(sym);
12152 len = RSTRING_LEN(sym);
12153 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12154 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12155 return FALSE;
12156 }
12157 return TRUE;
12158}
12159
12160VALUE
12161rb_str_quote_unprintable(VALUE str)
12162{
12163 rb_encoding *enc;
12164 const char *ptr;
12165 long len;
12166 rb_encoding *resenc;
12167
12168 Check_Type(str, T_STRING);
12169 resenc = rb_default_internal_encoding();
12170 if (resenc == NULL) resenc = rb_default_external_encoding();
12171 enc = STR_ENC_GET(str);
12172 ptr = RSTRING_PTR(str);
12173 len = RSTRING_LEN(str);
12174 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12175 !sym_printable(ptr, ptr + len, enc)) {
12176 return rb_str_escape(str);
12177 }
12178 return str;
12179}
12180
12181VALUE
12182rb_id_quote_unprintable(ID id)
12183{
12184 VALUE str = rb_id2str(id);
12185 if (!rb_str_symname_p(str)) {
12186 return rb_str_escape(str);
12187 }
12188 return str;
12189}
12190
12191/*
12192 * call-seq:
12193 * inspect -> string
12194 *
12195 * Returns a string representation of +self+ (including the leading colon):
12196 *
12197 * :foo.inspect # => ":foo"
12198 *
12199 * Related: Symbol#to_s, Symbol#name.
12200 *
12201 */
12202
12203static VALUE
12204sym_inspect(VALUE sym)
12205{
12206 VALUE str = rb_sym2str(sym);
12207 const char *ptr;
12208 long len;
12209 char *dest;
12210
12211 if (!rb_str_symname_p(str)) {
12212 str = rb_str_inspect(str);
12213 len = RSTRING_LEN(str);
12214 rb_str_resize(str, len + 1);
12215 dest = RSTRING_PTR(str);
12216 memmove(dest + 1, dest, len);
12217 }
12218 else {
12219 rb_encoding *enc = STR_ENC_GET(str);
12220 VALUE orig_str = str;
12221
12222 len = RSTRING_LEN(orig_str);
12223 str = rb_enc_str_new(0, len + 1, enc);
12224
12225 // Get data pointer after allocation
12226 ptr = RSTRING_PTR(orig_str);
12227 dest = RSTRING_PTR(str);
12228 memcpy(dest + 1, ptr, len);
12229
12230 RB_GC_GUARD(orig_str);
12231 }
12232 dest[0] = ':';
12233
12235
12236 return str;
12237}
12238
12239VALUE
12241{
12242 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12243 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12244 return str;
12245}
12246
12247VALUE
12248rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12249{
12250 VALUE obj;
12251
12252 if (argc < 1) {
12253 rb_raise(rb_eArgError, "no receiver given");
12254 }
12255 obj = argv[0];
12256 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12257}
12258
12259/*
12260 * call-seq:
12261 * succ
12262 *
12263 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12264 *
12265 * :foo.succ # => :fop
12266 *
12267 * Related: String#succ.
12268 */
12269
12270static VALUE
12271sym_succ(VALUE sym)
12272{
12273 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12274}
12275
12276/*
12277 * call-seq:
12278 * symbol <=> object -> -1, 0, +1, or nil
12279 *
12280 * If +object+ is a symbol,
12281 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12282 *
12283 * :bar <=> :foo # => -1
12284 * :foo <=> :foo # => 0
12285 * :foo <=> :bar # => 1
12286 *
12287 * Otherwise, returns +nil+:
12288 *
12289 * :foo <=> 'bar' # => nil
12290 *
12291 * Related: String#<=>.
12292 */
12293
12294static VALUE
12295sym_cmp(VALUE sym, VALUE other)
12296{
12297 if (!SYMBOL_P(other)) {
12298 return Qnil;
12299 }
12300 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12301}
12302
12303/*
12304 * call-seq:
12305 * casecmp(object) -> -1, 0, 1, or nil
12306 *
12307 * :include: doc/symbol/casecmp.rdoc
12308 *
12309 */
12310
12311static VALUE
12312sym_casecmp(VALUE sym, VALUE other)
12313{
12314 if (!SYMBOL_P(other)) {
12315 return Qnil;
12316 }
12317 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12318}
12319
12320/*
12321 * call-seq:
12322 * casecmp?(object) -> true, false, or nil
12323 *
12324 * :include: doc/symbol/casecmp_p.rdoc
12325 *
12326 */
12327
12328static VALUE
12329sym_casecmp_p(VALUE sym, VALUE other)
12330{
12331 if (!SYMBOL_P(other)) {
12332 return Qnil;
12333 }
12334 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12335}
12336
12337/*
12338 * call-seq:
12339 * symbol =~ object -> integer or nil
12340 *
12341 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12342 * including possible updates to global variables;
12343 * see String#=~.
12344 *
12345 */
12346
12347static VALUE
12348sym_match(VALUE sym, VALUE other)
12349{
12350 return rb_str_match(rb_sym2str(sym), other);
12351}
12352
12353/*
12354 * call-seq:
12355 * match(pattern, offset = 0) -> matchdata or nil
12356 * match(pattern, offset = 0) {|matchdata| } -> object
12357 *
12358 * Equivalent to <tt>self.to_s.match</tt>,
12359 * including possible updates to global variables;
12360 * see String#match.
12361 *
12362 */
12363
12364static VALUE
12365sym_match_m(int argc, VALUE *argv, VALUE sym)
12366{
12367 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12368}
12369
12370/*
12371 * call-seq:
12372 * match?(pattern, offset) -> true or false
12373 *
12374 * Equivalent to <tt>sym.to_s.match?</tt>;
12375 * see String#match.
12376 *
12377 */
12378
12379static VALUE
12380sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12381{
12382 return rb_str_match_m_p(argc, argv, sym);
12383}
12384
12385/*
12386 * call-seq:
12387 * symbol[index] -> string or nil
12388 * symbol[start, length] -> string or nil
12389 * symbol[range] -> string or nil
12390 * symbol[regexp, capture = 0] -> string or nil
12391 * symbol[substring] -> string or nil
12392 *
12393 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12394 *
12395 */
12396
12397static VALUE
12398sym_aref(int argc, VALUE *argv, VALUE sym)
12399{
12400 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12401}
12402
12403/*
12404 * call-seq:
12405 * length -> integer
12406 *
12407 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12408 */
12409
12410static VALUE
12411sym_length(VALUE sym)
12412{
12413 return rb_str_length(rb_sym2str(sym));
12414}
12415
12416/*
12417 * call-seq:
12418 * empty? -> true or false
12419 *
12420 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12421 *
12422 */
12423
12424static VALUE
12425sym_empty(VALUE sym)
12426{
12427 return rb_str_empty(rb_sym2str(sym));
12428}
12429
12430/*
12431 * call-seq:
12432 * upcase(mapping) -> symbol
12433 *
12434 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12435 *
12436 * See String#upcase.
12437 *
12438 */
12439
12440static VALUE
12441sym_upcase(int argc, VALUE *argv, VALUE sym)
12442{
12443 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12444}
12445
12446/*
12447 * call-seq:
12448 * downcase(mapping) -> symbol
12449 *
12450 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12451 *
12452 * See String#downcase.
12453 *
12454 * Related: Symbol#upcase.
12455 *
12456 */
12457
12458static VALUE
12459sym_downcase(int argc, VALUE *argv, VALUE sym)
12460{
12461 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12462}
12463
12464/*
12465 * call-seq:
12466 * capitalize(mapping) -> symbol
12467 *
12468 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12469 *
12470 * See String#capitalize.
12471 *
12472 */
12473
12474static VALUE
12475sym_capitalize(int argc, VALUE *argv, VALUE sym)
12476{
12477 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12478}
12479
12480/*
12481 * call-seq:
12482 * swapcase(mapping) -> symbol
12483 *
12484 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12485 *
12486 * See String#swapcase.
12487 *
12488 */
12489
12490static VALUE
12491sym_swapcase(int argc, VALUE *argv, VALUE sym)
12492{
12493 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12494}
12495
12496/*
12497 * call-seq:
12498 * start_with?(*string_or_regexp) -> true or false
12499 *
12500 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12501 *
12502 */
12503
12504static VALUE
12505sym_start_with(int argc, VALUE *argv, VALUE sym)
12506{
12507 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12508}
12509
12510/*
12511 * call-seq:
12512 * end_with?(*strings) -> true or false
12513 *
12514 *
12515 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12516 *
12517 */
12518
12519static VALUE
12520sym_end_with(int argc, VALUE *argv, VALUE sym)
12521{
12522 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12523}
12524
12525/*
12526 * call-seq:
12527 * encoding -> encoding
12528 *
12529 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12530 *
12531 */
12532
12533static VALUE
12534sym_encoding(VALUE sym)
12535{
12536 return rb_obj_encoding(rb_sym2str(sym));
12537}
12538
12539static VALUE
12540string_for_symbol(VALUE name)
12541{
12542 if (!RB_TYPE_P(name, T_STRING)) {
12543 VALUE tmp = rb_check_string_type(name);
12544 if (NIL_P(tmp)) {
12545 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12546 name);
12547 }
12548 name = tmp;
12549 }
12550 return name;
12551}
12552
12553ID
12555{
12556 if (SYMBOL_P(name)) {
12557 return SYM2ID(name);
12558 }
12559 name = string_for_symbol(name);
12560 return rb_intern_str(name);
12561}
12562
12563VALUE
12565{
12566 if (SYMBOL_P(name)) {
12567 return name;
12568 }
12569 name = string_for_symbol(name);
12570 return rb_str_intern(name);
12571}
12572
12573/*
12574 * call-seq:
12575 * Symbol.all_symbols -> array_of_symbols
12576 *
12577 * Returns an array of all symbols currently in Ruby's symbol table:
12578 *
12579 * Symbol.all_symbols.size # => 9334
12580 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12581 *
12582 */
12583
12584static VALUE
12585sym_all_symbols(VALUE _)
12586{
12587 return rb_sym_all_symbols();
12588}
12589
12590VALUE
12591rb_str_to_interned_str(VALUE str)
12592{
12593 return rb_fstring(str);
12594}
12595
12596VALUE
12597rb_interned_str(const char *ptr, long len)
12598{
12599 struct RString fake_str = {RBASIC_INIT};
12600 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12601}
12602
12603VALUE
12605{
12606 return rb_interned_str(ptr, strlen(ptr));
12607}
12608
12609VALUE
12610rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12611{
12612 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12613 rb_enc_autoload(enc);
12614 }
12615
12616 struct RString fake_str = {RBASIC_INIT};
12617 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12618}
12619
12620VALUE
12621rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12622{
12623 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12624 rb_enc_autoload(enc);
12625 }
12626
12627 struct RString fake_str = {RBASIC_INIT};
12628 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12629 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12630 return str;
12631}
12632
12633VALUE
12635{
12636 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12637}
12638
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when the inlined encoding index of +str+ is the ASCII-8BIT
 * index and the code is in 0...0xff, the single byte is appended with
 * rb_str_buf_cat_byte().  Everything else, including the value 0xff
 * itself, goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary_p = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(binary_p)) {
        ssize_t code = RB_NUM2SSIZE(codepoint);
        const int single_byte_p = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte_p)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12655
12656static int
12657fstring_set_class_i(VALUE *str, void *data)
12658{
12659 RBASIC_SET_CLASS(*str, rb_cString);
12660
12661 return ST_CONTINUE;
12662}
12663
/*
 * Init_String: sets up the String and Symbol classes.
 *
 * Visible steps, in order:
 *  - define the String class under rb_cObject;
 *  - walk fstring_table_obj with fstring_set_class_i, which sets the
 *    class of each visited string to rb_cString;
 *  - register String's allocator, singleton methods and instance methods;
 *  - intern the symbols :ascii, :turkic, :lithuanian and :fold;
 *  - define the UnicodeNormalize module and the IDs of its methods;
 *  - hook the $; and $-F global variables to rb_fs;
 *  - define the Symbol class and its methods.
 *
 * NOTE(review): the line numbers embedded in this listing skip in places
 * (e.g. 12670 -> 12672, 12675 -> 12677, 12838 -> 12842), so some
 * statements of the original function are presumably missing from this
 * copy -- confirm against the upstream file before relying on it.
 */
12664void
12665Init_String(void)
12666{
12667 rb_cString = rb_define_class("String", rb_cObject);
12668
 /* Strings already present in the fstring table get rb_cString as their class. */
12669 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12670
12672 rb_define_alloc_func(rb_cString, empty_str_alloc);
12673 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12674 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12675 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12677 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12678 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12681 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12682 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12683 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12684 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12687 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12688 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12689 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12690 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12693 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12694 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12695 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12696 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12697 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12699 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12701 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12702 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12703 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12704 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12705 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12706 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12707 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12708 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12709 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12710 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12711 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12712 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12713 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12714 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12716 rb_define_method(rb_cString, "+@", str_uplus, 0);
12717 rb_define_method(rb_cString, "-@", str_uminus, 0);
12718 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12719 rb_define_alias(rb_cString, "dedup", "-@");
12720
12721 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12722 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12723 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12724 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12727 rb_define_method(rb_cString, "undump", str_undump, 0);
12728
 /* Symbols interned once here and kept in file-level variables. */
12729 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12730 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12731 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12732 sym_fold = ID2SYM(rb_intern_const("fold"));
12733
12734 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12735 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12736 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12737 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12738
12739 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12740 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12741 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12742 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12743
12744 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12745 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12746 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12747 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12748 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12749 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12750 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12751 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12752 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12753 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12754 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12755 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12757 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12758 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12759 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12760 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12761 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12762
12763 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12764 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12765 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12766
12767 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12768
12769 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12770 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12771 rb_define_method(rb_cString, "center", rb_str_center, -1);
12772
12773 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12774 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12775 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12776 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12777 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12778 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12779 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12780 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12781 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12782
12783 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12784 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12785 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12786 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12787 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12788 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12789 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12790 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12791 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12792
12793 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12794 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12795 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12796 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12797 rb_define_method(rb_cString, "count", rb_str_count, -1);
12798
12799 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12800 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12801 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12802 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12803
12804 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12805 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12806 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12807 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12808 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12809
12810 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12811
12812 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12813 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12814
12815 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12816 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12817
12818 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12819 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12820 rb_define_method(rb_cString, "b", rb_str_b, 0);
12821 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12822 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12823
12824 /* define UnicodeNormalize module here so that we don't have to look it up */
12825 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12826 id_normalize = rb_intern_const("normalize");
12827 id_normalized_p = rb_intern_const("normalized?");
12828
12829 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12830 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12831 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12832
 /* $; and $-F are both backed by rb_fs, written through rb_fs_setter; rb_fs starts as nil and its address is registered with the GC. */
12833 rb_fs = Qnil;
12834 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12835 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12836 rb_gc_register_address(&rb_fs);
12837
 /* Symbol class: most instance methods below are the sym_* wrappers defined earlier in this file. */
12838 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12842 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12843
12844 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12845 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12846 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12847 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12848 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12849 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12850
12851 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12852 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12853 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12854 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12855
12856 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12857 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12858 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12859 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12860 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12861 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12862 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12863
12864 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12865 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12866 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12867 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12868
12869 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12870 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12871
12872 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12873}
12874
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1820
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1603
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1721
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2972
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2792
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3262
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1037
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3051
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:676
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2164
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2182
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1341
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3578
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:583
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:177
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1329
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1332
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:937
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1197
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3018
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1216
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12610
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2324
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3722
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1145
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1437
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1338
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:956
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12634
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:821
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2705
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2968
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:706
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1982
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1988
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1927
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1742
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1502
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2477
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3787
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1413
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12240
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2550
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1389
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1736
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3046
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5323
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4150
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3143
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11535
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1778
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1179
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:991
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1508
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1986
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4136
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3555
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2413
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2004
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6556
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3151
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12604
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1419
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3753
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3093
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4257
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3377
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7230
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2780
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12597
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4204
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4024
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4179
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3729
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3268
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5833
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11593
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1692
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2940
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3240
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3359
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1191
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2734
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7337
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1401
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1708
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2427
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5751
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9375
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1185
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1840
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1986
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2065
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3361
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1624
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12564
ID rb_to_id(VALUE str)
Definition string.c:12554
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1431
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2917
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2799
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1425
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2812
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1769
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:455
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1477
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition string.c:8263
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113