Ruby 4.1.0dev (2026-01-12 revision 61c372a1b7fe045adc9b67196503f29b79bff376)
string.c (61c372a1b7fe045adc9b67196503f29b79bff376)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): RUBY_MAX_CHAR_LEN looks like an upper bound on the byte
 * length of one character; it is not used in the visible chunk -- confirm. */
131#define RUBY_MAX_CHAR_LEN 16
/* Embedded string whose precomputed hash is stored after the terminator. */
132#define STR_PRECOMPUTED_HASH FL_USER4
/* Other strings may point into this string's buffer; never set together
 * with STR_SHARED. */
133#define STR_SHARED_ROOT FL_USER5
/* With RSTRING_NOEMBED set and klass == 0: unsafe to unshare in
 * rb_str_tmp_frozen_release. */
134#define STR_BORROWED FL_USER6
/* Buffer pointer was handed to a system call; modification and realloc
 * are prohibited. */
135#define STR_TMPLOCK FL_USER7
/* The buffer must not be freed when the string is reclaimed (e.g. it is a
 * C string literal). */
136#define STR_NOFREE FL_USER18
/* The string header is not managed by the GC (typically on the C stack). */
137#define STR_FAKESTR FL_USER19
138
/* Mark `str` as heap-allocated and drop every sharing-related flag. */
#define STR_SET_NOEMBED(str) do {\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    FL_SET((str), STR_NOEMBED);\
} while (0)

/* Mark `str` as embedded: clears the heap, shared and no-free flags. */
#define STR_SET_EMBED(str) \
    FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` as the byte length of `str`; no bounds or frozen checks. */
#define STR_SET_LEN(str, n) do { RSTRING(str)->len = (n); } while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length in bytes for `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero-fill a terminator at `ptr`. At least one byte is always written;
 * `termlen` bytes are written when `termlen` is greater than 1. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
    else {\
        *term_fill_ptr = '\0';\
    }\
} while (0)
177
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * implied by the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents are copied to a freshly allocated heap buffer
 *   and the string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place. The string must not
 *   be shared (asserted).
 *
 * Every use of the macro arguments is parenthesized so that expression
 * arguments (e.g. `a ? b : c`) keep their intended meaning. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Does nothing for fake strings. Otherwise it records `shared_str` in
 * `str` through the write barrier, flags `str` as shared, registers it as
 * a pinning object, and flags `shared_str` as a shared root (also as
 * borrowed when the root has no class). The assertions require `str`'s
 * pointer to lie within the root's buffer. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST((str), STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR((shared_str)) <= RSTRING_PTR((str))); \
        RUBY_ASSERT(RSTRING_PTR((str)) <= RSTRING_PTR((shared_str)) + RSTRING_LEN((shared_str))); \
        RB_OBJ_WRITE((str), &RSTRING((str))->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        rb_gc_register_pinning_obj((str)); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
214
/* Heap buffer pointer of a non-embedded string. */
215#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator
 * length (capa itself does not include the terminator). */
216#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
217/* TODO: include the terminator size in capa. */
218
/* Encoding object of `str`, looked up via get_encoding(). */
219#define STR_ENC_GET(str) get_encoding(str)
220
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0. With that default a substring
 * is sharable only when it ends exactly at `end`; when the macro is
 * nonzero, every substring is considered sharable. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
229
230
231static inline long
232str_embed_capa(VALUE str)
233{
234 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
235}
236
237bool
238rb_str_reembeddable_p(VALUE str)
239{
240 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
241}
242
243static inline size_t
244rb_str_embed_size(long capa, long termlen)
245{
246 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
247 if (size < sizeof(struct RString)) size = sizeof(struct RString);
248 return size;
249}
250
251size_t
252rb_str_size_as_embedded(VALUE str)
253{
254 size_t real_size;
255 if (STR_EMBED_P(str)) {
256 size_t capa = RSTRING(str)->len;
257 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
258
259 real_size = rb_str_embed_size(capa, TERM_LEN(str));
260 }
261 /* if the string is not currently embedded, but it can be embedded, how
262 * much space would it require */
263 else if (rb_str_reembeddable_p(str)) {
264 size_t capa = RSTRING(str)->as.heap.aux.capa;
265 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
266
267 real_size = rb_str_embed_size(capa, TERM_LEN(str));
268 }
269 else {
270 real_size = sizeof(struct RString);
271 }
272
273 return real_size;
274}
275
/* Returns true when a string of `len` bytes plus a `termlen`-byte
 * terminator fits in an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t slot_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(slot_size);
}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
291
292static inline void
293str_make_independent(VALUE str)
294{
295 long len = RSTRING_LEN(str);
296 int termlen = TERM_LEN(str);
297 str_make_independent_expand((str), len, 0L, termlen);
298}
299
300static inline int str_dependent_p(VALUE str);
301
302void
303rb_str_make_independent(VALUE str)
304{
305 if (str_dependent_p(str)) {
306 str_make_independent(str);
307 }
308}
309
310void
311rb_str_make_embedded(VALUE str)
312{
313 RUBY_ASSERT(rb_str_reembeddable_p(str));
314 RUBY_ASSERT(!STR_EMBED_P(str));
315
316 char *buf = RSTRING(str)->as.heap.ptr;
317 long len = RSTRING(str)->len;
318
319 STR_SET_EMBED(str);
320 STR_SET_LEN(str, len);
321
322 if (len > 0) {
323 memcpy(RSTRING_PTR(str), buf, len);
324 ruby_xfree(buf);
325 }
326
327 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
328}
329
/* Print a diagnostic to stderr saying that the function named `func` is
 * about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char message[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, message, func);
}
339
340/* symbols for [up|down|swap]case/capitalize options */
341static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
342
343static rb_encoding *
344get_encoding(VALUE str)
345{
346 return rb_enc_from_index(ENCODING_GET(str));
347}
348
349static void
350mustnot_broken(VALUE str)
351{
352 if (is_broken_string(str)) {
353 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
354 }
355}
356
357static void
358mustnot_wchar(VALUE str)
359{
360 rb_encoding *enc = STR_ENC_GET(str);
361 if (rb_enc_mbminlen(enc) > 1) {
362 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
363 }
364}
365
366static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
367
/* Defined only when `long` and pointers have the same size; the fake
 * string hash is then stashed in the `capa` field (see register_fstring). */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
372
373static inline bool
374BARE_STRING_P(VALUE str)
375{
376 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
377}
378
379static inline st_index_t
380str_do_hash(VALUE str)
381{
382 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
383 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
384 if (e && !is_ascii_string(str)) {
385 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
386 }
387 return h;
388}
389
390static VALUE
391str_store_precomputed_hash(VALUE str, st_index_t hash)
392{
393 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
394 RUBY_ASSERT(STR_EMBED_P(str));
395
396#if RUBY_DEBUG
397 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
398 size_t free_bytes = str_embed_capa(str) - used_bytes;
399 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
400#endif
401
402 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
403
404 FL_SET(str, STR_PRECOMPUTED_HASH);
405
406 return str;
407}
408
409VALUE
410rb_fstring(VALUE str)
411{
412 VALUE fstr;
413 int bare;
414
415 Check_Type(str, T_STRING);
416
417 if (FL_TEST(str, RSTRING_FSTR))
418 return str;
419
420 bare = BARE_STRING_P(str);
421 if (!bare) {
422 if (STR_EMBED_P(str)) {
423 OBJ_FREEZE(str);
424 return str;
425 }
426
427 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
429 return str;
430 }
431 }
432
433 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
434 rb_str_resize(str, RSTRING_LEN(str));
435
436 fstr = register_fstring(str, false, false);
437
438 if (!bare) {
439 str_replace_shared_without_enc(str, fstr);
440 OBJ_FREEZE(str);
441 return str;
442 }
443 return fstr;
444}
445
446static VALUE fstring_table_obj;
447
448static VALUE
449fstring_concurrent_set_hash(VALUE str)
450{
451#ifdef PRECOMPUTED_FAKESTR_HASH
452 st_index_t h;
453 if (FL_TEST_RAW(str, STR_FAKESTR)) {
454 // register_fstring precomputes the hash and stores it in capa for fake strings
455 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
456 }
457 else {
458 h = rb_str_hash(str);
459 }
460 // rb_str_hash doesn't include the encoding for ascii only strings, so
461 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
462 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
463#else
464 return (VALUE)rb_str_hash(str);
465#endif
466}
467
468static bool
469fstring_concurrent_set_cmp(VALUE a, VALUE b)
470{
471 long alen, blen;
472 const char *aptr, *bptr;
473
476
477 RSTRING_GETMEM(a, aptr, alen);
478 RSTRING_GETMEM(b, bptr, blen);
479 return (alen == blen &&
480 ENCODING_GET(a) == ENCODING_GET(b) &&
481 memcmp(aptr, bptr, alen) == 0);
482}
483
/* Options handed from register_fstring() to fstring_concurrent_set_create()
 * through the `data` pointer. The `struct fstr_create_arg {` header was
 * missing, leaving only the members and the closing brace. */
struct fstr_create_arg {
    /* For fake strings: copy the bytes into a new string instead of
     * referencing the caller's buffer as a static string. */
    bool copy;
    /* When copying: prefer an embedded string that has room for the
     * precomputed hash and store the hash in it. */
    bool force_precompute_hash;
};
488
489static VALUE
490fstring_concurrent_set_create(VALUE str, void *data)
491{
492 struct fstr_create_arg *arg = data;
493
494 // Unless the string is empty or binary, its coderange has been precomputed.
495 int coderange = ENC_CODERANGE(str);
496
497 if (FL_TEST_RAW(str, STR_FAKESTR)) {
498 if (arg->copy) {
499 VALUE new_str;
500 long len = RSTRING_LEN(str);
501 long capa = len + sizeof(st_index_t);
502 int term_len = TERM_LEN(str);
503
504 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
505 new_str = str_alloc_embed(rb_cString, capa + term_len);
506 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
507 STR_SET_LEN(new_str, RSTRING_LEN(str));
508 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
509 rb_enc_copy(new_str, str);
510 str_store_precomputed_hash(new_str, str_do_hash(str));
511 }
512 else {
513 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
514 rb_enc_copy(new_str, str);
515#ifdef PRECOMPUTED_FAKESTR_HASH
516 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
517 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
518 }
519#endif
520 }
521 str = new_str;
522 }
523 else {
524 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
525 RSTRING(str)->len,
526 ENCODING_GET(str));
527 }
528 OBJ_FREEZE(str);
529 }
530 else {
531 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
532 str = str_new_frozen(rb_cString, str);
533 }
534 if (STR_SHARED_P(str)) { /* str should not be shared */
535 /* shared substring */
536 str_make_independent(str);
538 }
539 if (!BARE_STRING_P(str)) {
540 str = str_new_frozen(rb_cString, str);
541 }
542 }
543
544 ENC_CODERANGE_SET(str, coderange);
545 RBASIC(str)->flags |= RSTRING_FSTR;
546 if (!RB_OBJ_SHAREABLE_P(str)) {
547 RB_OBJ_SET_SHAREABLE(str);
548 }
549 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
552 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
553 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
555 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
556
557 return str;
558}
559
560static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
561 .hash = fstring_concurrent_set_hash,
562 .cmp = fstring_concurrent_set_cmp,
563 .create = fstring_concurrent_set_create,
564 .free = NULL,
565};
566
567void
568Init_fstring_table(void)
569{
570 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
571 rb_gc_register_address(&fstring_table_obj);
572}
573
574static VALUE
575register_fstring(VALUE str, bool copy, bool force_precompute_hash)
576{
577 struct fstr_create_arg args = {
578 .copy = copy,
579 .force_precompute_hash = force_precompute_hash
580 };
581
582#if SIZEOF_VOIDP == SIZEOF_LONG
583 if (FL_TEST_RAW(str, STR_FAKESTR)) {
584 // if the string hasn't been interned, we'll need the hash twice, so we
585 // compute it once and store it in capa
586 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
587 }
588#endif
589
590 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
591
592 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
594 RUBY_ASSERT(OBJ_FROZEN(result));
596 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
597 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
599
600 return result;
601}
602
603bool
604rb_obj_is_fstring_table(VALUE obj)
605{
606 ASSERT_vm_locking();
607
608 return obj == fstring_table_obj;
609}
610
611void
612rb_gc_free_fstring(VALUE obj)
613{
614 ASSERT_vm_locking_with_barrier();
615
616 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
618 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
619
620 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
621
622 RB_DEBUG_COUNTER_INC(obj_str_fstr);
623
624 FL_UNSET(obj, RSTRING_FSTR);
625}
626
627void
628rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
629{
630 if (fstring_table_obj) {
631 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
632 }
633}
634
635static VALUE
636setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
637{
638 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
639 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
640
641 if (!name) {
643 name = "";
644 }
645
646 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
647
648 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
649 fake_str->len = len;
650 fake_str->as.heap.ptr = (char *)name;
651 fake_str->as.heap.aux.capa = len;
652 return (VALUE)fake_str;
653}
654
655/*
656 * set up a fake string which refers a static string literal.
657 */
658VALUE
659rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
660{
661 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
662}
663
664/*
665 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
666 * shared string which refers a static string literal. `ptr` must
667 * point a constant string.
668 */
669VALUE
670rb_fstring_new(const char *ptr, long len)
671{
672 struct RString fake_str = {RBASIC_INIT};
673 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
674}
675
676VALUE
677rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
678{
679 struct RString fake_str = {RBASIC_INIT};
680 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
681}
682
683VALUE
684rb_fstring_cstr(const char *ptr)
685{
686 return rb_fstring_new(ptr, strlen(ptr));
687}
688
689static inline bool
690single_byte_optimizable(VALUE str)
691{
692 int encindex = ENCODING_GET(str);
693 switch (encindex) {
694 case ENCINDEX_ASCII_8BIT:
695 case ENCINDEX_US_ASCII:
696 return true;
697 case ENCINDEX_UTF_8:
698 // For UTF-8 it's worth scanning the string coderange when unknown.
700 }
701 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
702 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
703 return true;
704 }
705
706 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
707 return true;
708 }
709
710 /* Conservative. Possibly single byte.
711 * "\xa1" in Shift_JIS for example. */
712 return false;
713}
714
716
/* Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when every byte is 7-bit.
 *
 * The scan has up to three phases: an optional byte-wise head that brings
 * `p` to a word boundary (only when unaligned word access is unavailable),
 * a word-at-a-time loop using NONASCII_MASK, and a byte-wise tail for the
 * remaining bytes (fewer than one word). */
717static inline const char *
718search_nonascii(const char *p, const char *e)
719{
720 const char *s, *t;
721
/* NONASCII_MASK: a uintptr_t-sized constant with 0x80 in every byte. */
722#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
723# if SIZEOF_UINTPTR_T == 8
724# define NONASCII_MASK UINT64_C(0x8080808080808080)
725# elif SIZEOF_UINTPTR_T == 4
726# define NONASCII_MASK UINT32_C(0x80808080)
727# else
728# error "don't know what to do."
729# endif
730#else
731# if SIZEOF_UINTPTR_T == 8
732# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
733# elif SIZEOF_UINTPTR_T == 4
734# define NONASCII_MASK 0x80808080UL /* or...? */
735# else
736# error "don't know what to do."
737# endif
738#endif
739
/* Word-wise path: taken when unaligned access is allowed, or when at
 * least one full word remains. */
740 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
741#if !UNALIGNED_WORD_ACCESS
/* Head: advance p to the next word boundary, then check the `l` bytes
 * that were skipped. The cases fall through on purpose, from the
 * earliest skipped byte to the latest. */
742 if ((uintptr_t)p % SIZEOF_VOIDP) {
743 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
744 p += l;
745 switch (l) {
746 default: UNREACHABLE;
747#if SIZEOF_VOIDP > 4
748 case 7: if (p[-7]&0x80) return p-7;
749 case 6: if (p[-6]&0x80) return p-6;
750 case 5: if (p[-5]&0x80) return p-5;
751 case 4: if (p[-4]&0x80) return p-4;
752#endif
753 case 3: if (p[-3]&0x80) return p-3;
754 case 2: if (p[-2]&0x80) return p-2;
755 case 1: if (p[-1]&0x80) return p-1;
756 case 0: break;
757 }
758 }
759#endif
760#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
761#define aligned_ptr(value) \
762 __builtin_assume_aligned((value), sizeof(uintptr_t))
763#else
764#define aligned_ptr(value) (value)
765#endif
766 s = aligned_ptr(p);
/* t bounds the loop so that a whole word is always readable at s. */
767 t = (e - (SIZEOF_VOIDP-1));
768#undef aligned_ptr
769 for (;s < t; s += sizeof(uintptr_t)) {
770 uintptr_t word;
/* memcpy is used to load the word from the byte buffer. */
771 memcpy(&word, s, sizeof(word));
772 if (word & NONASCII_MASK) {
/* Turn the bit position of the first set mask bit into a byte offset
 * (>>3): counted from the most significant end on big-endian targets
 * and from the least significant end otherwise. */
773#ifdef WORDS_BIGENDIAN
774 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
775#else
776 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
777#endif
778 }
779 }
780 p = (const char *)s;
781 }
782
/* Tail: fewer than one word remains; check the bytes one by one,
 * indexing backwards from e and falling through from earliest to latest. */
783 switch (e - p) {
784 default: UNREACHABLE;
785#if SIZEOF_VOIDP > 4
786 case 7: if (e[-7]&0x80) return e-7;
787 case 6: if (e[-6]&0x80) return e-6;
788 case 5: if (e[-5]&0x80) return e-5;
789 case 4: if (e[-4]&0x80) return e-4;
790#endif
791 case 3: if (e[-3]&0x80) return e-3;
792 case 2: if (e[-2]&0x80) return e-2;
793 case 1: if (e[-1]&0x80) return e-1;
794 case 0: return NULL;
795 }
796}
797
798static int
799coderange_scan(const char *p, long len, rb_encoding *enc)
800{
801 const char *e = p + len;
802
803 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
804 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
805 p = search_nonascii(p, e);
807 }
808
809 if (rb_enc_asciicompat(enc)) {
810 p = search_nonascii(p, e);
811 if (!p) return ENC_CODERANGE_7BIT;
812 for (;;) {
813 int ret = rb_enc_precise_mbclen(p, e, enc);
815 p += MBCLEN_CHARFOUND_LEN(ret);
816 if (p == e) break;
817 p = search_nonascii(p, e);
818 if (!p) break;
819 }
820 }
821 else {
822 while (p < e) {
823 int ret = rb_enc_precise_mbclen(p, e, enc);
825 p += MBCLEN_CHARFOUND_LEN(ret);
826 }
827 }
828 return ENC_CODERANGE_VALID;
829}
830
831long
832rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
833{
834 const char *p = s;
835
836 if (*cr == ENC_CODERANGE_BROKEN)
837 return e - s;
838
839 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
840 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
841 if (*cr == ENC_CODERANGE_VALID) return e - s;
842 p = search_nonascii(p, e);
844 return e - s;
845 }
846 else if (rb_enc_asciicompat(enc)) {
847 p = search_nonascii(p, e);
848 if (!p) {
849 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
850 return e - s;
851 }
852 for (;;) {
853 int ret = rb_enc_precise_mbclen(p, e, enc);
854 if (!MBCLEN_CHARFOUND_P(ret)) {
856 return p - s;
857 }
858 p += MBCLEN_CHARFOUND_LEN(ret);
859 if (p == e) break;
860 p = search_nonascii(p, e);
861 if (!p) break;
862 }
863 }
864 else {
865 while (p < e) {
866 int ret = rb_enc_precise_mbclen(p, e, enc);
867 if (!MBCLEN_CHARFOUND_P(ret)) {
869 return p - s;
870 }
871 p += MBCLEN_CHARFOUND_LEN(ret);
872 }
873 }
875 return e - s;
876}
877
878static inline void
879str_enc_copy(VALUE str1, VALUE str2)
880{
881 rb_enc_set_index(str1, ENCODING_GET(str2));
882}
883
884/* Like str_enc_copy, but does not check frozen status of str1.
885 * You should use this only if you're certain that str1 is not frozen. */
886static inline void
887str_enc_copy_direct(VALUE str1, VALUE str2)
888{
889 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
890 if (inlined_encoding == ENCODING_INLINE_MAX) {
891 rb_enc_set_index(str1, rb_enc_get_index(str2));
892 }
893 else {
894 ENCODING_SET_INLINED(str1, inlined_encoding);
895 }
896}
897
898static void
899rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
900{
901 /* this function is designed for copying encoding and coderange
902 * from src to new string "dest" which is made from the part of src.
903 */
904 str_enc_copy(dest, src);
905 if (RSTRING_LEN(dest) == 0) {
906 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
908 else
910 return;
911 }
912 switch (ENC_CODERANGE(src)) {
915 break;
917 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
918 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
920 else
922 break;
923 default:
924 break;
925 }
926}
927
928static void
929rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
930{
931 str_enc_copy(dest, src);
933}
934
935static int
936enc_coderange_scan(VALUE str, rb_encoding *enc)
937{
938 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
939}
940
941int
942rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
943{
944 return enc_coderange_scan(str, enc);
945}
946
947int
949{
950 int cr = ENC_CODERANGE(str);
951
952 if (cr == ENC_CODERANGE_UNKNOWN) {
953 cr = enc_coderange_scan(str, get_encoding(str));
954 ENC_CODERANGE_SET(str, cr);
955 }
956 return cr;
957}
958
959static inline bool
960rb_enc_str_asciicompat(VALUE str)
961{
962 int encindex = ENCODING_GET_INLINED(str);
963 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
964}
965
966int
968{
969 switch(ENC_CODERANGE(str)) {
971 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
973 return true;
974 default:
975 return false;
976 }
977}
978
979static inline void
980str_mod_check(VALUE s, const char *p, long len)
981{
982 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
983 rb_raise(rb_eRuntimeError, "string modified");
984 }
985}
986
987static size_t
988str_capacity(VALUE str, const int termlen)
989{
990 if (STR_EMBED_P(str)) {
991 return str_embed_capa(str) - termlen;
992 }
993 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
994 return RSTRING(str)->len;
995 }
996 else {
997 return RSTRING(str)->as.heap.aux.capa;
998 }
999}
1000
1001size_t
1003{
1004 return str_capacity(str, TERM_LEN(str));
1005}
1006
1007static inline void
1008must_not_null(const char *ptr)
1009{
1010 if (!ptr) {
1011 rb_raise(rb_eArgError, "NULL pointer given");
1012 }
1013}
1014
1015static inline VALUE
1016str_alloc_embed(VALUE klass, size_t capa)
1017{
1018 size_t size = rb_str_embed_size(capa, 0);
1019 RUBY_ASSERT(size > 0);
1020 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1021
1022 NEWOBJ_OF(str, struct RString, klass,
1024
1025 str->len = 0;
1026 str->as.embed.ary[0] = 0;
1027
1028 return (VALUE)str;
1029}
1030
1031static inline VALUE
1032str_alloc_heap(VALUE klass)
1033{
1034 NEWOBJ_OF(str, struct RString, klass,
1035 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1036
1037 str->len = 0;
1038 str->as.heap.aux.capa = 0;
1039 str->as.heap.ptr = NULL;
1040
1041 return (VALUE)str;
1042}
1043
1044static inline VALUE
1045empty_str_alloc(VALUE klass)
1046{
1047 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1048 VALUE str = str_alloc_embed(klass, 0);
1049 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1051 return str;
1052}
1053
1054static VALUE
1055str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1056{
1057 VALUE str;
1058
1059 if (len < 0) {
1060 rb_raise(rb_eArgError, "negative string size (or size too big)");
1061 }
1062
1063 if (enc == NULL) {
1064 enc = rb_ascii8bit_encoding();
1065 }
1066
1067 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1068
1069 int termlen = rb_enc_mbminlen(enc);
1070
1071 if (STR_EMBEDDABLE_P(len, termlen)) {
1072 str = str_alloc_embed(klass, len + termlen);
1073 if (len == 0) {
1074 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1075 }
1076 }
1077 else {
1078 str = str_alloc_heap(klass);
1079 RSTRING(str)->as.heap.aux.capa = len;
1080 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1081 * integer overflow. If we can STATIC_ASSERT that, the following
1082 * mul_add_mul can be reverted to a simple ALLOC_N. */
1083 RSTRING(str)->as.heap.ptr =
1084 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1085 }
1086
1087 rb_enc_raw_set(str, enc);
1088
1089 if (ptr) {
1090 memcpy(RSTRING_PTR(str), ptr, len);
1091 }
1092 else {
1093 memset(RSTRING_PTR(str), 0, len);
1094 }
1095
1096 STR_SET_LEN(str, len);
1097 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1098 return str;
1099}
1100
1101static VALUE
1102str_new(VALUE klass, const char *ptr, long len)
1103{
1104 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1105}
1106
1107VALUE
1108rb_str_new(const char *ptr, long len)
1109{
1110 return str_new(rb_cString, ptr, len);
1111}
1112
1113VALUE
1114rb_usascii_str_new(const char *ptr, long len)
1115{
1116 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1117}
1118
1119VALUE
1120rb_utf8_str_new(const char *ptr, long len)
1121{
1122 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1123}
1124
1125VALUE
1126rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1127{
1128 return str_enc_new(rb_cString, ptr, len, enc);
1129}
1130
1131VALUE
1133{
1134 must_not_null(ptr);
1135 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1136 * memory regions, and that cannot be detected by the MSAN. Just
1137 * trust the programmer that the argument passed here is a sane C
1138 * string. */
1139 __msan_unpoison_string(ptr);
1140 return rb_str_new(ptr, strlen(ptr));
1141}
1142
1143VALUE
1145{
1146 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1147}
1148
1149VALUE
1151{
1152 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1153}
1154
1155VALUE
1157{
1158 must_not_null(ptr);
1159 if (rb_enc_mbminlen(enc) != 1) {
1160 rb_raise(rb_eArgError, "wchar encoding given");
1161 }
1162 return rb_enc_str_new(ptr, strlen(ptr), enc);
1163}
1164
1165static VALUE
1166str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1167{
1168 VALUE str;
1169
1170 if (len < 0) {
1171 rb_raise(rb_eArgError, "negative string size (or size too big)");
1172 }
1173
1174 if (!ptr) {
1175 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1176 }
1177 else {
1178 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1179 str = str_alloc_heap(klass);
1180 RSTRING(str)->len = len;
1181 RSTRING(str)->as.heap.ptr = (char *)ptr;
1182 RSTRING(str)->as.heap.aux.capa = len;
1183 RBASIC(str)->flags |= STR_NOFREE;
1184 rb_enc_associate_index(str, encindex);
1185 }
1186 return str;
1187}
1188
1189VALUE
1190rb_str_new_static(const char *ptr, long len)
1191{
1192 return str_new_static(rb_cString, ptr, len, 0);
1193}
1194
1195VALUE
1197{
1198 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1199}
1200
1201VALUE
1203{
1204 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1205}
1206
/*
 * Returns str_new_static() for rb_cString, converting +enc+ to its index
 * with rb_enc_to_index().
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_enc_str_new_static(const char *ptr, long len,
 * rb_encoding *enc)).
 */
1207VALUE
1209{
1210 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1211}
1212
1213static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1214 rb_encoding *from, rb_encoding *to,
1215 int ecflags, VALUE ecopts);
1216
1217static inline bool
1218is_enc_ascii_string(VALUE str, rb_encoding *enc)
1219{
1220 int encidx = rb_enc_to_index(enc);
1221 if (rb_enc_get_index(str) == encidx)
1222 return is_ascii_string(str);
1223 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1224}
1225
1226VALUE
1227rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1228{
1229 long len;
1230 const char *ptr;
1231 VALUE newstr;
1232
1233 if (!to) return str;
1234 if (!from) from = rb_enc_get(str);
1235 if (from == to) return str;
1236 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1237 rb_is_ascii8bit_enc(to)) {
1238 if (STR_ENC_GET(str) != to) {
1239 str = rb_str_dup(str);
1240 rb_enc_associate(str, to);
1241 }
1242 return str;
1243 }
1244
1245 RSTRING_GETMEM(str, ptr, len);
1246 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1247 from, to, ecflags, ecopts);
1248 if (NIL_P(newstr)) {
1249 /* some error, return original */
1250 return str;
1251 }
1252 return newstr;
1253}
1254
1255VALUE
1256rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1257 rb_encoding *from, int ecflags, VALUE ecopts)
1258{
1259 long olen;
1260
1261 olen = RSTRING_LEN(newstr);
1262 if (ofs < -olen || olen < ofs)
1263 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1264 if (ofs < 0) ofs += olen;
1265 if (!from) {
1266 STR_SET_LEN(newstr, ofs);
1267 return rb_str_cat(newstr, ptr, len);
1268 }
1269
1270 rb_str_modify(newstr);
1271 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1272 rb_enc_get(newstr),
1273 ecflags, ecopts);
1274}
1275
1276VALUE
1277rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1278{
1279 STR_SET_LEN(str, 0);
1280 rb_enc_associate(str, enc);
1281 rb_str_cat(str, ptr, len);
1282 return str;
1283}
1284
/*
 * Converts +len+ bytes at +ptr+ from encoding +from+ to encoding +to+ and
 * writes the output into +newstr+ starting at byte offset +ofs+, growing
 * +newstr+ whenever the loop body below runs.
 *
 * Returns +newstr+ (length set, +to+ associated) when the converter reports
 * econv_finished.  Returns Qnil when the converter cannot be opened or when
 * the final result is anything else.
 *
 * NOTE(review): two lines are absent from this listing -- the declaration of
 * `ret` (between the `ec` and `olen` declarations) and the end of the while
 * condition (before the "destination buffer short" comment).  Restore both
 * from upstream before compiling.
 */
1285static VALUE
1286str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1287 rb_encoding *from, rb_encoding *to,
1288 int ecflags, VALUE ecopts)
1289{
1290 rb_econv_t *ec;
1292 long olen;
1293 VALUE econv_wrapper;
1294 const unsigned char *start, *sp;
1295 unsigned char *dest, *dp;
1296 size_t converted_output = (size_t)ofs;
1297
1298 olen = rb_str_capacity(newstr);
1299
 /* The wrapper object holds `ec` in DATA_PTR while the loop runs; presumably
  * so the converter is reclaimed if an exception unwinds -- TODO confirm. */
1300 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1301 RBASIC_CLEAR_CLASS(econv_wrapper);
1302 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1303 if (!ec) return Qnil;
1304 DATA_PTR(econv_wrapper) = ec;
1305
1306 sp = (unsigned char*)ptr;
1307 start = sp;
 /* Each pass re-reads RSTRING_PTR because rb_str_resize() is called on
  * newstr in the loop body. */
1308 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1309 (dp = dest + converted_output),
1310 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1312 /* destination buffer short */
1313 size_t converted_input = sp - start;
1314 size_t rest = len - converted_input;
1315 converted_output = dp - dest;
1316 rb_str_set_len(newstr, converted_output);
 /* Estimate the remaining output from the output/input ratio seen so far;
  * fall back to adding the current size when no ratio is available or the
  * multiplication could exceed LONG_MAX. */
1317 if (converted_input && converted_output &&
1318 rest < (LONG_MAX / converted_output)) {
1319 rest = (rest * converted_output) / converted_input;
1320 }
1321 else {
1322 rest = olen;
1323 }
 /* Always grow by at least 2 bytes. */
1324 olen += rest < 2 ? 2 : rest;
1325 rb_str_resize(newstr, olen);
1326 }
 /* Detach the converter from the wrapper before closing it explicitly. */
1327 DATA_PTR(econv_wrapper) = 0;
1328 RB_GC_GUARD(econv_wrapper);
1329 rb_econv_close(ec);
1330 switch (ret) {
1331 case econv_finished:
1332 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1333 rb_str_set_len(newstr, len);
1334 rb_enc_associate(newstr, to);
1335 return newstr;
1336
1337 default:
1338 return Qnil;
1339 }
1340}
1341
/*
 * Returns rb_str_conv_enc_opts(str, from, to, 0, Qnil), i.e. a conversion
 * with no flags and no options.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_conv_enc(VALUE str, rb_encoding *from,
 * rb_encoding *to)).
 */
1342VALUE
1344{
1345 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1346}
1347
/*
 * Builds a string from +len+ bytes at +ptr+ that arrive in encoding +eenc+,
 * converting to the default internal encoding when one is set and a
 * conversion is needed.  The early returns below cover the cases that need
 * no conversion; if the conversion itself fails the bytes are kept as-is in
 * +eenc+.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_external_str_new_with_enc(const char *ptr, long len,
 * rb_encoding *eenc), going by the callers in this file).
 */
1348VALUE
1350{
1351 rb_encoding *ienc;
1352 VALUE str;
1353 const int eidx = rb_enc_to_index(eenc);
1354
1355 if (!ptr) {
1356 return rb_enc_str_new(ptr, len, eenc);
1357 }
1358
1359 /* ASCII-8BIT case, no conversion */
1360 if ((eidx == rb_ascii8bit_encindex()) ||
1361 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1362 return rb_str_new(ptr, len);
1363 }
1364 /* no default_internal or same encoding, no conversion */
1365 ienc = rb_default_internal_encoding();
1366 if (!ienc || eenc == ienc) {
1367 return rb_enc_str_new(ptr, len, eenc);
1368 }
1369 /* ASCII compatible, and ASCII only string, no conversion in
1370 * default_internal */
 /* NOTE(review): the ASCII-8BIT test on the next line cannot be true here,
  * since that case already returned above. */
1371 if ((eidx == rb_ascii8bit_encindex()) ||
1372 (eidx == rb_usascii_encindex()) ||
1373 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1374 return rb_enc_str_new(ptr, len, ienc);
1375 }
1376 /* convert from the given encoding to default_internal */
1377 str = rb_enc_str_new(NULL, 0, ienc);
1378 /* when the conversion failed for some reason, just ignore the
1379 * default_internal and result in the given encoding as-is. */
1380 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1381 rb_str_initialize(str, ptr, len, eenc);
1382 }
1383 return str;
1384}
1385
1386VALUE
1387rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1388{
1389 int eidx = rb_enc_to_index(eenc);
1390 if (eidx == rb_usascii_encindex() &&
1391 !is_ascii_string(str)) {
1392 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1393 return str;
1394 }
1395 rb_enc_associate_index(str, eidx);
1396 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1397}
1398
1399VALUE
1400rb_external_str_new(const char *ptr, long len)
1401{
1402 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1403}
1404
/*
 * Calls rb_external_str_new_with_enc() with the length taken from strlen()
 * and the default external encoding.  +ptr+ is passed to strlen() without a
 * NULL check here.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_external_str_new_cstr(const char *ptr)).
 */
1405VALUE
1407{
1408 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1409}
1410
1411VALUE
1412rb_locale_str_new(const char *ptr, long len)
1413{
1414 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1415}
1416
/*
 * Calls rb_external_str_new_with_enc() with the length taken from strlen()
 * and the locale encoding.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_locale_str_new_cstr(const char *ptr)).
 */
1417VALUE
1419{
1420 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1421}
1422
/*
 * Calls rb_external_str_new_with_enc() with the filesystem encoding.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_filesystem_str_new(const char *ptr, long len)).
 */
1423VALUE
1425{
1426 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1427}
1428
/*
 * Calls rb_external_str_new_with_enc() with the length taken from strlen()
 * and the filesystem encoding.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_filesystem_str_new_cstr(const char *ptr)).
 */
1429VALUE
1431{
1432 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1433}
1434
/*
 * Returns rb_str_export_to_enc(str, rb_default_external_encoding()).
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_export(VALUE str)).
 */
1435VALUE
1437{
1438 return rb_str_export_to_enc(str, rb_default_external_encoding());
1439}
1440
/*
 * Returns rb_str_export_to_enc(str, rb_locale_encoding()).
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_export_locale(VALUE str)).
 */
1441VALUE
1443{
1444 return rb_str_export_to_enc(str, rb_locale_encoding());
1445}
1446
/*
 * Converts +str+ from its own encoding (STR_ENC_GET) to +enc+ through
 * rb_str_conv_enc().
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_export_to_enc(VALUE str, rb_encoding *enc), going by
 * the callers in this file).
 */
1447VALUE
1449{
1450 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1451}
1452
/*
 * Makes +str2+ hold the same bytes as +str+; nothing in this function sets
 * encoding information on +str2+.  Returns +str2+.
 *
 * When the bytes plus terminator fit in +str2+'s embedded capacity they are
 * copied in.  Otherwise +str2+ is pointed at the buffer of a frozen root
 * string and marked as sharing it.
 */
1453static VALUE
1454str_replace_shared_without_enc(VALUE str2, VALUE str)
1455{
1456 const int termlen = TERM_LEN(str);
1457 char *ptr;
1458 long len;
1459
1460 RSTRING_GETMEM(str, ptr, len);
1461 if (str_embed_capa(str2) >= len + termlen) {
1462 char *ptr2 = RSTRING(str2)->as.embed.ary;
1463 STR_SET_EMBED(str2);
1464 memcpy(ptr2, RSTRING_PTR(str), len);
1465 TERM_FILL(ptr2+len, termlen);
1466 }
1467 else {
1468 VALUE root;
 /* Pick the root: the one str already shares, or a new frozen string made
  * from str.  In the second case ptr/len are re-read from the root. */
1469 if (STR_SHARED_P(str)) {
1470 root = RSTRING(str)->as.heap.aux.shared;
1471 RSTRING_GETMEM(str, ptr, len);
1472 }
1473 else {
1474 root = rb_str_new_frozen(str);
1475 RSTRING_GETMEM(root, ptr, len);
1476 }
1477 RUBY_ASSERT(OBJ_FROZEN(root));
1478
 /* If str2 has a heap buffer that is neither shared nor STR_NOFREE, free
  * it, unless it is the very buffer about to be installed. */
1479 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1480 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1481 rb_fatal("about to free a possible shared root");
1482 }
1483 char *ptr2 = STR_HEAP_PTR(str2);
1484 if (ptr2 != ptr) {
1485 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1486 }
1487 }
1488 FL_SET(str2, STR_NOEMBED);
1489 RSTRING(str2)->as.heap.ptr = ptr;
1490 STR_SET_SHARED(str2, root);
1491 }
1492
1493 STR_SET_LEN(str2, len);
1494
1495 return str2;
1496}
1497
1498static VALUE
1499str_replace_shared(VALUE str2, VALUE str)
1500{
1501 str_replace_shared_without_enc(str2, str);
1502 rb_enc_cr_str_exact_copy(str2, str);
1503 return str2;
1504}
1505
1506static VALUE
1507str_new_shared(VALUE klass, VALUE str)
1508{
1509 return str_replace_shared(str_alloc_heap(klass), str);
1510}
1511
/*
 * Returns str_new_shared() for +str+, using the class of +str+.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_new_shared(VALUE str)).
 */
1512VALUE
1514{
1515 return str_new_shared(rb_obj_class(str), str);
1516}
1517
/*
 * Returns +orig+ itself when FL_FREEZE is set and no STR_CHILLED bit is set;
 * otherwise returns a new frozen string of the same class made by
 * str_new_frozen().
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_new_frozen(VALUE orig)).
 */
1518VALUE
1520{
1521 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1522 return str_new_frozen(rb_obj_class(orig), orig);
1523}
1524
1525static VALUE
1526rb_str_new_frozen_String(VALUE orig)
1527{
1528 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1529 return str_new_frozen(rb_cString, orig);
1530}
1531
1532
1533VALUE
1534rb_str_frozen_bare_string(VALUE orig)
1535{
1536 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1537 return str_new_frozen(rb_cString, orig);
1538}
1539
1540VALUE
1541rb_str_tmp_frozen_acquire(VALUE orig)
1542{
1543 if (OBJ_FROZEN_RAW(orig)) return orig;
1544 return str_new_frozen_buffer(0, orig, FALSE);
1545}
1546
/*
 * Returns a frozen string carrying +orig+'s bytes that is not embedded.
 *
 * +orig+ itself is returned when it is frozen, not embedded and not
 * re-embeddable.  A shared +orig+ whose root is not embedded is delegated to
 * rb_str_tmp_frozen_acquire().  Otherwise a frozen heap string with class 0
 * and STR_SHARED_ROOT set is built, either from a fresh copy of the bytes or
 * by taking over +orig+'s heap pointer and making +orig+ share it.
 */
1547VALUE
1548rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1549{
1550 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1551 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1552
1553 VALUE str = str_alloc_heap(0);
1554 OBJ_FREEZE(str);
1555 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1556 FL_SET(str, STR_SHARED_ROOT);
1557
1558 size_t capa = str_capacity(orig, TERM_LEN(orig));
1559
1560 /* If the string is embedded then we want to create a copy that is heap
1561 * allocated. If the string is shared then the shared root must be
1562 * embedded, so we want to create a copy. If the string is a shared root
1563 * then it must be embedded, so we want to create a copy. */
1564 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
 /* NOTE(review): capa + TERM_LEN bytes are allocated but only capa bytes
  * are copied; the trailing terminator bytes are not written in this
  * function -- confirm that is intended. */
1565 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1566 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1567 }
1568 else {
1569 /* orig must be heap allocated and not shared, so we can safely transfer
1570 * the pointer to str. */
1571 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
 /* STR_NOFREE moves from orig to str along with the pointer. */
1572 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1573 RBASIC(orig)->flags &= ~STR_NOFREE;
1574 STR_SET_SHARED(orig, str);
1575 if (RB_OBJ_SHAREABLE_P(orig)) {
1576 RB_OBJ_SET_SHAREABLE(str);
1577 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1578 }
1579 }
1580
1581 RSTRING(str)->len = RSTRING(orig)->len;
1582 RSTRING(str)->as.heap.aux.capa = capa;
1583
1584 return str;
1585}
1586
/*
 * Releases a temporary frozen string +tmp+ that was obtained for +orig+.
 *
 * Nothing happens unless +tmp+ has class 0.  When +orig+ is unfrozen, has
 * STR_TMPLOCK set without STR_SHARED (as tested below), its aux.shared slot
 * equals +tmp+, and +tmp+ is not marked STR_BORROWED, the buffer is handed
 * back to +orig+ and +tmp+ is turned into an empty embedded string.
 *
 * NOTE(review): two lines are absent from this listing -- the body of the
 * STR_EMBED_P(tmp) branch and one statement after the STR_NOFREE transfer.
 */
1587void
1588rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1589{
1590 if (RBASIC_CLASS(tmp) != 0)
1591 return;
1592
1593 if (STR_EMBED_P(tmp)) {
1595 }
1596 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1597 !OBJ_FROZEN_RAW(orig)) {
1598 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1599
1600 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1601 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1602 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1603
1604 /* Unshare orig since the root (tmp) only has this one child. */
1605 FL_UNSET_RAW(orig, STR_SHARED);
1606 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1607 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1609
1610 /* Make tmp embedded and empty so it is safe for sweeping. */
1611 STR_SET_EMBED(tmp);
1612 STR_SET_LEN(tmp, 0);
1613 }
1614 }
1615}
1616
1617static VALUE
1618str_new_frozen(VALUE klass, VALUE orig)
1619{
1620 return str_new_frozen_buffer(klass, orig, TRUE);
1621}
1622
/*
 * Creates a new heap string of class +klass+ that takes over +orig+'s
 * buffer pointer, capacity and STR_NOFREE flag, then makes +orig+ share the
 * new string.  +orig+ must be neither embedded nor already shared (asserted).
 * Returns the new string.
 * NOTE(review): one line after the two assertions is absent from this
 * listing.
 */
1623static VALUE
1624heap_str_make_shared(VALUE klass, VALUE orig)
1625{
1626 RUBY_ASSERT(!STR_EMBED_P(orig));
1627 RUBY_ASSERT(!STR_SHARED_P(orig));
1629
1630 VALUE str = str_alloc_heap(klass);
1631 STR_SET_LEN(str, RSTRING_LEN(orig));
1632 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1633 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
 /* STR_NOFREE moves from orig to the new string along with the buffer. */
1634 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1635 RBASIC(orig)->flags &= ~STR_NOFREE;
1636 STR_SET_SHARED(orig, str);
1637 if (klass == 0)
1638 FL_UNSET_RAW(str, STR_BORROWED);
1639 return str;
1640}
1641
/*
 * Returns a frozen string of class +klass+ carrying +orig+'s bytes.
 *
 * With +copy_encoding+ false the result is built as ASCII-8BIT with a 1-byte
 * terminator and +orig+'s encoding information is not copied.
 *
 * Paths, in order:
 *  - embedded or embeddable contents: a fresh copy via str_enc_new();
 *  - +orig+ is shared: either the existing root is returned as-is (same
 *    span, class and encoding), or a new string sharing the root is created
 *    and narrowed to +orig+'s span;
 *  - otherwise: a copy when +orig+ is shareable (RB_OBJ_SHAREABLE_P), else
 *    +orig+'s buffer is moved into a new root via heap_str_make_shared().
 *
 * NOTE(review): one line after the three assertions in the shared branch is
 * absent from this listing.
 */
1642static VALUE
1643str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1644{
1645 VALUE str;
1646
1647 long len = RSTRING_LEN(orig);
1648 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1649 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1650
1651 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1652 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1653 RUBY_ASSERT(STR_EMBED_P(str));
1654 }
1655 else {
1656 if (FL_TEST_RAW(orig, STR_SHARED)) {
1657 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
 /* ofs: bytes of the root before orig's start; rest: bytes after its end. */
1658 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1659 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1660 RUBY_ASSERT(ofs >= 0);
1661 RUBY_ASSERT(rest >= 0);
1662 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1664
1665 if ((ofs > 0) || (rest > 0) ||
1666 (klass != RBASIC(shared)->klass) ||
1667 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1668 str = str_new_shared(klass, shared);
1669 RUBY_ASSERT(!STR_EMBED_P(str));
1670 RSTRING(str)->as.heap.ptr += ofs;
1671 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1672 }
1673 else {
 /* The root already matches; mark a class-less root as borrowed. */
1674 if (RBASIC_CLASS(shared) == 0)
1675 FL_SET_RAW(shared, STR_BORROWED);
1676 return shared;
1677 }
1678 }
1679 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1680 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1681 STR_SET_EMBED(str);
1682 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1683 STR_SET_LEN(str, RSTRING_LEN(orig));
1684 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1685 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1686 }
1687 else {
1688 if (RB_OBJ_SHAREABLE_P(orig)) {
1689 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1690 }
1691 else {
1692 str = heap_str_make_shared(klass, orig);
1693 }
1694 }
1695 }
1696
1697 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1698 OBJ_FREEZE(str);
1699 return str;
1700}
1701
1702VALUE
1703rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1704{
1705 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1706}
1707
1708static VALUE
1709str_new_empty_String(VALUE str)
1710{
1711 VALUE v = rb_str_new(0, 0);
1712 rb_enc_copy(v, str);
1713 return v;
1714}
1715
1716#define STR_BUF_MIN_SIZE 63
1717
/*
 * Allocates an rb_cString able to hold +capa+ bytes plus a 1-byte
 * terminator.  An embedded object is returned when that size is embeddable;
 * otherwise a heap buffer of capa + 1 bytes is allocated, its capacity is
 * recorded as +capa+ and its first byte is set to '\0'.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_buf_new(long capa)).
 */
1718VALUE
1720{
1721 if (STR_EMBEDDABLE_P(capa, 1)) {
1722 return str_alloc_embed(rb_cString, capa + 1);
1723 }
1724
1725 VALUE str = str_alloc_heap(rb_cString);
1726
1727 RSTRING(str)->as.heap.aux.capa = capa;
1728 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1729 RSTRING(str)->as.heap.ptr[0] = '\0';
1730
1731 return str;
1732}
1733
/*
 * Creates a buffer string sized to strlen(ptr) with rb_str_buf_new() and
 * appends the bytes of +ptr+ with rb_str_buf_cat().
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_buf_new_cstr(const char *ptr)).
 */
1734VALUE
1736{
1737 VALUE str;
1738 long len = strlen(ptr);
1739
1740 str = rb_str_buf_new(len);
1741 rb_str_buf_cat(str, ptr, len);
1742
1743 return str;
1744}
1745
/*
 * Returns str_new(0, 0, len): class 0, no source pointer, length +len+.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_tmp_new(long len)).
 */
1746VALUE
1748{
1749 return str_new(0, 0, len);
1750}
1751
/*
 * Releases the storage of +str+.  Only a heap buffer that is neither shared
 * nor STR_NOFREE is actually freed; the other cases just bump debug
 * counters.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_free(VALUE str)).
 */
1752void
1754{
1755 if (STR_EMBED_P(str)) {
1756 RB_DEBUG_COUNTER_INC(obj_str_embed);
1757 }
1758 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
 /* NOTE(review): both increments target obj_str_shared; confirm the
  * STR_NOFREE case is meant to use the same counter. */
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1760 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1761 }
1762 else {
1763 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1764 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1765 }
1766}
1767
1768size_t
1769rb_str_memsize(VALUE str)
1770{
1771 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1772 return STR_HEAP_SIZE(str);
1773 }
1774 else {
1775 return 0;
1776 }
1777}
1778
/*
 * Converts +str+ to T_STRING through rb_convert_type_with_id(), using the
 * method id idTo_str.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_to_str(VALUE str)).
 */
1779VALUE
1781{
1782 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1783}
1784
1785static inline void str_discard(VALUE str);
1786static void str_shared_replace(VALUE str, VALUE str2);
1787
/*
 * Calls str_shared_replace(str, str2) unless both arguments are the same
 * object.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_shared_replace(VALUE str, VALUE str2)).
 */
1788void
1790{
1791 if (str != str2) str_shared_replace(str, str2);
1792}
1793
/*
 * Gives +str+ the contents, encoding and code range of +str2+.  The two must
 * be different objects (asserted).
 *
 * If the bytes plus terminator fit in +str+'s embedded capacity they are
 * copied and +str2+ is left untouched.  Otherwise +str+ takes over +str2+'s
 * heap buffer (or its shared root) and +str2+ is left as an empty embedded
 * string.
 */
1794static void
1795str_shared_replace(VALUE str, VALUE str2)
1796{
1797 rb_encoding *enc;
1798 int cr;
1799 int termlen;
1800
1801 RUBY_ASSERT(str2 != str);
 /* Read str2's encoding and code range before anything is modified. */
1802 enc = STR_ENC_GET(str2);
1803 cr = ENC_CODERANGE(str2);
1804 str_discard(str);
1805 termlen = rb_enc_mbminlen(enc);
1806
1807 STR_SET_LEN(str, RSTRING_LEN(str2));
1808
1809 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1810 STR_SET_EMBED(str);
1811 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1812 rb_enc_associate(str, enc);
1813 ENC_CODERANGE_SET(str, cr);
1814 }
1815 else {
 /* An embedded str2 has no heap buffer to hand over, so move its bytes
  * into a newly allocated one first. */
1816 if (STR_EMBED_P(str2)) {
1817 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1818 long len = RSTRING_LEN(str2);
1819 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1820
1821 char *new_ptr = ALLOC_N(char, len + termlen);
1822 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1823 RSTRING(str2)->as.heap.ptr = new_ptr;
1824 STR_SET_LEN(str2, len);
1825 RSTRING(str2)->as.heap.aux.capa = len;
1826 STR_SET_NOEMBED(str2);
1827 }
1828
1829 STR_SET_NOEMBED(str);
1830 FL_UNSET(str, STR_SHARED);
1831 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1832
 /* Inherit either the shared root or the capacity from str2. */
1833 if (FL_TEST(str2, STR_SHARED)) {
1834 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1835 STR_SET_SHARED(str, shared);
1836 }
1837 else {
1838 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1839 }
1840
1841 /* abandon str2 */
1842 STR_SET_EMBED(str2);
1843 RSTRING_PTR(str2)[0] = 0;
1844 STR_SET_LEN(str2, 0);
1845 rb_enc_associate(str, enc);
1846 ENC_CODERANGE_SET(str, cr);
1847 }
1848}
1849
/*
 * Returns +obj+ unchanged when it is already a T_STRING.  Otherwise calls
 * the method idTo_s on it with no arguments and passes the result through
 * rb_obj_as_string_result().
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_obj_as_string(VALUE obj)).
 */
1850VALUE
1852{
1853 VALUE str;
1854
1855 if (RB_TYPE_P(obj, T_STRING)) {
1856 return obj;
1857 }
1858 str = rb_funcall(obj, idTo_s, 0);
1859 return rb_obj_as_string_result(str, obj);
1860}
1861
1862VALUE
1863rb_obj_as_string_result(VALUE str, VALUE obj)
1864{
1865 if (!RB_TYPE_P(str, T_STRING))
1866 return rb_any_to_s(obj);
1867 return str;
1868}
1869
/*
 * Replaces the contents of +str+ with those of +str2+ and returns +str+.
 * When +str2+ is already shared, +str+ is pointed at the same buffer and the
 * same root; otherwise the work is delegated to str_replace_shared().
 * NOTE(review): one line after the `shared` declaration is absent from this
 * listing.
 */
1870static VALUE
1871str_replace(VALUE str, VALUE str2)
1872{
1873 long len;
1874
1875 len = RSTRING_LEN(str2);
1876 if (STR_SHARED_P(str2)) {
1877 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1879 STR_SET_NOEMBED(str);
1880 STR_SET_LEN(str, len);
1881 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1882 STR_SET_SHARED(str, shared);
1883 rb_enc_cr_str_exact_copy(str, str2);
1884 }
1885 else {
1886 str_replace_shared(str, str2);
1887 }
1888
1889 return str;
1890}
1891
/*
 * Allocates a string object of class +klass+ through NEWOBJ_OF, sized by
 * rb_str_embed_size(capa, 0), and sets its length to 0.
 * NOTE(review): the continuation line of the NEWOBJ_OF(...) call is absent
 * from this listing, so the statement is incomplete as shown -- restore it
 * from upstream.
 */
1892static inline VALUE
1893ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1894{
1895 size_t size = rb_str_embed_size(capa, 0);
1896 RUBY_ASSERT(size > 0);
1897 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1898
1899 NEWOBJ_OF(str, struct RString, klass,
1901
1902 str->len = 0;
1903
1904 return (VALUE)str;
1905}
1906
1907static inline VALUE
1908ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1909{
1910 NEWOBJ_OF(str, struct RString, klass,
1911 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1912
1913 str->as.heap.aux.capa = 0;
1914 str->as.heap.ptr = NULL;
1915
1916 return (VALUE)str;
1917}
1918
1919static inline VALUE
1920str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1921{
1922 int encidx = 0;
1923 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1924 encidx = rb_enc_get_index(str);
1925 flags &= ~ENCODING_MASK;
1926 }
1927 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1928 if (encidx) rb_enc_associate_index(dup, encidx);
1929 return dup;
1930}
1931
/* Flag bits read from the source string by str_duplicate_setup_embed() and
 * str_duplicate_setup_heap() when building a duplicate. */
1932static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1933
1934static inline VALUE
1935str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1936{
1937 VALUE flags = FL_TEST_RAW(str, flag_mask);
1938 long len = RSTRING_LEN(str);
1939
1940 RUBY_ASSERT(STR_EMBED_P(dup));
1941 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1942 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1943 STR_SET_LEN(dup, RSTRING_LEN(str));
1944 return str_duplicate_setup_encoding(str, dup, flags);
1945}
1946
/*
 * Fills the already-allocated heap string +dup+ so that it shares +str+'s
 * buffer.  The shared root is +str+'s existing root if it has one, +str+
 * itself if it is frozen, or else a new frozen string made from +str+ (in
 * which case the flags are re-read from that string).
 * NOTE(review): one line after the RUBY_ASSERT is absent from this listing.
 */
1947static inline VALUE
1948str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1949{
1950 VALUE flags = FL_TEST_RAW(str, flag_mask);
1951 VALUE root = str;
1952 if (FL_TEST_RAW(str, STR_SHARED)) {
1953 root = RSTRING(str)->as.heap.aux.shared;
1954 }
1955 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1956 root = str = str_new_frozen(klass, str);
1957 flags = FL_TEST_RAW(str, flag_mask);
1958 }
1959 RUBY_ASSERT(!STR_SHARED_P(root));
1961
1962 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1963 FL_SET_RAW(dup, RSTRING_NOEMBED);
1964 STR_SET_SHARED(dup, root);
1965 flags |= RSTRING_NOEMBED | STR_SHARED;
1966
1967 STR_SET_LEN(dup, RSTRING_LEN(str));
1968 return str_duplicate_setup_encoding(str, dup, flags);
1969}
1970
1971static inline VALUE
1972str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1973{
1974 if (STR_EMBED_P(str)) {
1975 return str_duplicate_setup_embed(klass, str, dup);
1976 }
1977 else {
1978 return str_duplicate_setup_heap(klass, str, dup);
1979 }
1980}
1981
1982static inline VALUE
1983str_duplicate(VALUE klass, VALUE str)
1984{
1985 VALUE dup;
1986 if (STR_EMBED_P(str)) {
1987 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 }
1989 else {
1990 dup = str_alloc_heap(klass);
1991 }
1992
1993 return str_duplicate_setup(klass, str, dup);
1994}
1995
/*
 * Returns str_duplicate() of +str+, keeping the class of +str+.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_dup(VALUE str)).
 */
1996VALUE
1998{
1999 return str_duplicate(rb_obj_class(str), str);
2000}
2001
2002/* :nodoc: */
2003VALUE
2004rb_str_dup_m(VALUE str)
2005{
2006 if (LIKELY(BARE_STRING_P(str))) {
2007 return str_duplicate(rb_cString, str);
2008 }
2009 else {
2010 return rb_obj_dup(str);
2011 }
2012}
2013
/*
 * Fires the string-creation DTrace hook with +str+'s length and returns
 * str_duplicate() of +str+ as an rb_cString.
 * NOTE(review): the name/parameter line is absent from this listing
 * (presumably rb_str_resurrect(VALUE str)).
 */
2014VALUE
2016{
2017 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2018 return str_duplicate(rb_cString, str);
2019}
2020
2021VALUE
2022rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2023{
2024 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2025 VALUE new_str, klass = rb_cString;
2026
2027 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2028 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2029 str_duplicate_setup_embed(klass, str, new_str);
2030 }
2031 else {
2032 new_str = ec_str_alloc_heap(ec, klass);
2033 str_duplicate_setup_heap(klass, str, new_str);
2034 }
2035 if (chilled) {
2036 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2037 }
2038 return new_str;
2039}
2040
2041VALUE
2042rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2043{
2044 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2045 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2046 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2047 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2048 return rb_str_freeze(str);
2049}
2050
2051/*
2052 * The documentation block below uses an include (instead of inline text)
2053 * because the included text has non-ASCII characters (which are not allowed in a C file).
2054 */
2055
2056/*
2057 *
2058 * call-seq:
2059 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2060 *
2061 * :include: doc/string/new.rdoc
2062 *
2063 */
2064
2065static VALUE
2066rb_str_init(int argc, VALUE *argv, VALUE str)
2067{
 /* Accepts one optional positional argument (orig) plus the keywords
  * encoding: and capacity:.  With capacity: given, +str+ is forced onto a
  * heap buffer of that capacity before orig's bytes are copied in; without
  * it, orig simply replaces +str+.  Returns +str+.
  * NOTE(review): one line inside the final `if (enc)` block is absent from
  * this listing. */
2068 static ID keyword_ids[2];
2069 VALUE orig, opt, venc, vcapa;
2070 VALUE kwargs[2];
2071 rb_encoding *enc = 0;
2072 int n;
2073
2074 if (!keyword_ids[0]) {
2075 keyword_ids[0] = rb_id_encoding();
2076 CONST_ID(keyword_ids[1], "capacity");
2077 }
2078
2079 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2080 if (!NIL_P(opt)) {
2081 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2082 venc = kwargs[0];
2083 vcapa = kwargs[1];
2084 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2085 enc = rb_to_encoding(venc);
2086 }
2087 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2088 long capa = NUM2LONG(vcapa);
2089 long len = 0;
2090 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2091
 /* The effective capacity is at least STR_BUF_MIN_SIZE and at least the
  * length of orig. */
2092 if (capa < STR_BUF_MIN_SIZE) {
2093 capa = STR_BUF_MIN_SIZE;
2094 }
2095 if (n == 1) {
2096 StringValue(orig);
2097 len = RSTRING_LEN(orig);
2098 if (capa < len) {
2099 capa = len;
2100 }
 /* Initializing from itself: skip the copy step further down. */
2101 if (orig == str) n = 0;
2102 }
2103 str_modifiable(str);
2104 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2105 /* make noembed always */
2106 const size_t size = (size_t)capa + termlen;
2107 const char *const old_ptr = RSTRING_PTR(str);
2108 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2109 char *new_ptr = ALLOC_N(char, size);
2110 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2111 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2112 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2113 RSTRING(str)->as.heap.ptr = new_ptr;
2114 }
2115 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2116 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2117 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2118 }
2119 STR_SET_LEN(str, len);
2120 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2121 if (n == 1) {
2122 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2123 rb_enc_cr_str_exact_copy(str, orig);
2124 }
2125 FL_SET(str, STR_NOEMBED);
2126 RSTRING(str)->as.heap.aux.capa = capa;
2127 }
2128 else if (n == 1) {
2129 rb_str_replace(str, orig);
2130 }
2131 if (enc) {
2132 rb_enc_associate(str, enc);
2134 }
2135 }
2136 else if (n == 1) {
2137 rb_str_replace(str, orig);
2138 }
2139 return str;
2140}
2141
2142/* :nodoc: */
2143static VALUE
2144rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2145{
 /* Fast path for String.new on rb_cString itself when keywords are given.
  * Subclasses, and calls without an options hash, go through
  * rb_class_new_instance_pass_kw(). */
2146 if (klass != rb_cString) {
2147 return rb_class_new_instance_pass_kw(argc, argv, klass);
2148 }
2149
2150 static ID keyword_ids[2];
2151 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2152 VALUE kwargs[2];
2153 rb_encoding *enc = NULL;
2154
2155 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2156 if (NIL_P(opt)) {
2157 return rb_class_new_instance_pass_kw(argc, argv, klass);
2158 }
2159
2160 keyword_ids[0] = rb_id_encoding();
2161 CONST_ID(keyword_ids[1], "capacity");
2162 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2163 encoding = kwargs[0];
2164 capacity = kwargs[1];
2165
2166 if (n == 1) {
2167 orig = StringValue(orig);
2168 }
2169 else {
2170 orig = Qnil;
2171 }
2172
 /* Without an explicit encoding: keyword, inherit the encoding of orig. */
2173 if (UNDEF_P(encoding)) {
2174 if (!NIL_P(orig)) {
2175 encoding = rb_obj_encoding(orig);
2176 }
2177 }
2178
2179 if (!UNDEF_P(encoding)) {
2180 enc = rb_to_encoding(encoding);
2181 }
2182
2183 // If capacity was not given, we're basically just duping `orig`.
2184 if (UNDEF_P(capacity)) {
2185 if (NIL_P(orig)) {
2186 VALUE empty_str = str_new(klass, "", 0);
2187 if (enc) {
2188 rb_enc_associate(empty_str, enc);
2189 }
2190 return empty_str;
2191 }
2192 VALUE copy = str_duplicate(klass, orig);
2193 rb_enc_associate(copy, enc);
2194 ENC_CODERANGE_CLEAR(copy);
2195 return copy;
2196 }
2197
 /* A negative capacity is treated as 0; the capacity is never smaller than
  * that of orig. */
2198 long capa = 0;
2199 capa = NUM2LONG(capacity);
2200 if (capa < 0) {
2201 capa = 0;
2202 }
2203
2204 if (!NIL_P(orig)) {
2205 long orig_capa = rb_str_capacity(orig);
2206 if (orig_capa > capa) {
2207 capa = orig_capa;
2208 }
2209 }
2210
2211 VALUE str = str_enc_new(klass, NULL, capa, enc);
2212 STR_SET_LEN(str, 0);
 /* NOTE(review): the terminator width here is rb_enc_mbmaxlen(enc), while
  * rb_str_init and str_shared_replace in this file use rb_enc_mbminlen --
  * confirm this is intended. */
2213 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2214
2215 if (!NIL_P(orig)) {
2216 rb_str_buf_append(str, orig);
2217 }
2218
2219 return str;
2220}
2221
2222#ifdef NONASCII_MASK
2223#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2224
2225/*
2226 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2227 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2228 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2229 *
2230 * if (!(byte & 0x80))
2231 * byte |= 0x40; // turn on bit6
2232 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2233 *
2234 * This function calculates whether a byte is leading or not for all bytes
2235 * in the argument word by concurrently using the above logic, and then
2236 * adds up the number of leading bytes in the word.
2237 */
2238static inline uintptr_t
2239count_utf8_lead_bytes_with_word(const uintptr_t *s)
2240{
2241 uintptr_t d = *s;
2242
2243 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2244 d = (d>>6) | (~d>>7);
2245 d &= NONASCII_MASK >> 7;
2246
2247 /* Gather all bytes. */
2248#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2249 /* use only if it can use POPCNT */
2250 return rb_popcount_intptr(d);
2251#else
2252 d += (d>>8);
2253 d += (d>>16);
2254# if SIZEOF_VOIDP == 8
2255 d += (d>>32);
2256# endif
2257 return (d&0xF);
2258#endif
2259}
2260#endif
2261
/*
 * Counts the characters in the byte range [p, e) for encoding +enc+, using
 * the code range +cr+ to pick a strategy:
 *  - fixed-width encodings: computed from the byte count alone;
 *  - UTF-8 with ENC_CODERANGE_VALID (only when NONASCII_MASK is defined):
 *    counts the bytes that satisfy is_utf8_lead_byte();
 *  - other ASCII-compatible encodings: ASCII runs are skipped with
 *    search_nonascii() and the rest is stepped character by character;
 *  - everything else: stepped character by character with rb_enc_mbclen().
 */
2262static inline long
2263enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2264{
2265 long c;
2266 const char *q;
2267
2268 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 /* A trailing partial character counts as one character. */
2269 long diff = (long)(e - p);
2270 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2271 }
2272#ifdef NONASCII_MASK
2273 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2274 uintptr_t len = 0;
2275 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2276 const uintptr_t *s, *t;
2277 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
 /* s: p rounded up to a word boundary; t: e rounded down to one. */
2278 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2279 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2280 while (p < (const char *)s) {
2281 if (is_utf8_lead_byte(*p)) len++;
2282 p++;
2283 }
2284 while (s < t) {
2285 len += count_utf8_lead_bytes_with_word(s);
2286 s++;
2287 }
2288 p = (const char *)s;
2289 }
 /* Remaining bytes (or the whole range when it is short) one at a time. */
2290 while (p < e) {
2291 if (is_utf8_lead_byte(*p)) len++;
2292 p++;
2293 }
2294 return (long)len;
2295 }
2296#endif
2297 else if (rb_enc_asciicompat(enc)) {
2298 c = 0;
 /* The two loops differ only in the length function:
  * rb_enc_fast_mbclen() when ENC_CODERANGE_CLEAN_P(cr) holds,
  * rb_enc_mbclen() otherwise. */
2299 if (ENC_CODERANGE_CLEAN_P(cr)) {
2300 while (p < e) {
2301 if (ISASCII(*p)) {
2302 q = search_nonascii(p, e);
2303 if (!q)
2304 return c + (e - p);
2305 c += q - p;
2306 p = q;
2307 }
2308 p += rb_enc_fast_mbclen(p, e, enc);
2309 c++;
2310 }
2311 }
2312 else {
2313 while (p < e) {
2314 if (ISASCII(*p)) {
2315 q = search_nonascii(p, e);
2316 if (!q)
2317 return c + (e - p);
2318 c += q - p;
2319 p = q;
2320 }
2321 p += rb_enc_mbclen(p, e, enc);
2322 c++;
2323 }
2324 }
2325 return c;
2326 }
2327
2328 for (c=0; p<e; c++) {
2329 p += rb_enc_mbclen(p, e, enc);
2330 }
2331 return c;
2332}
2333
2334long
2335rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2336{
2337 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2338}
2339
2340/* To get strlen with cr
2341 * Note that given cr is not used.
2342 */
2343long
2344rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2345{
 /* Counts the characters in [p, e) and reports a code range through *cr.
  * *cr is reset to 0 on entry, so any value the caller stored is ignored.
  * NOTE(review): one line is absent from each of the two "character not
  * found" else-branches in this listing -- restore them from upstream. */
2346 long c;
2347 const char *q;
2348 int ret;
2349
2350 *cr = 0;
2351 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 /* Fixed-width: *cr is left at 0 on this path. */
2352 long diff = (long)(e - p);
2353 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2354 }
2355 else if (rb_enc_asciicompat(enc)) {
2356 c = 0;
2357 while (p < e) {
2358 if (ISASCII(*p)) {
2359 q = search_nonascii(p, e);
2360 if (!q) {
2361 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2362 return c + (e - p);
2363 }
2364 c += q - p;
2365 p = q;
2366 }
2367 ret = rb_enc_precise_mbclen(p, e, enc);
2368 if (MBCLEN_CHARFOUND_P(ret)) {
2369 *cr |= ENC_CODERANGE_VALID;
2370 p += MBCLEN_CHARFOUND_LEN(ret);
2371 }
2372 else {
2374 p++;
2375 }
2376 c++;
2377 }
2378 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2379 return c;
2380 }
2381
2382 for (c=0; p<e; c++) {
2383 ret = rb_enc_precise_mbclen(p, e, enc);
2384 if (MBCLEN_CHARFOUND_P(ret)) {
2385 *cr |= ENC_CODERANGE_VALID;
2386 p += MBCLEN_CHARFOUND_LEN(ret);
2387 }
2388 else {
 /* Skip one minimum-width unit, or stop at e if less than that remains. */
2390 if (p + rb_enc_mbminlen(enc) <= e)
2391 p += rb_enc_mbminlen(enc);
2392 else
2393 p = e;
2394 }
2395 }
2396 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2397 return c;
2398}
2399
2400/* enc must be str's enc or rb_enc_check(str, str2) */
2401static long
2402str_strlen(VALUE str, rb_encoding *enc)
2403{
2404 const char *p, *e;
2405 int cr;
2406
2407 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2408 if (!enc) enc = STR_ENC_GET(str);
2409 p = RSTRING_PTR(str);
2410 e = RSTRING_END(str);
2411 cr = ENC_CODERANGE(str);
2412
2413 if (cr == ENC_CODERANGE_UNKNOWN) {
2414 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2415 if (cr) ENC_CODERANGE_SET(str, cr);
2416 return n;
2417 }
2418 else {
2419 return enc_strlen(p, e, enc, cr);
2420 }
2421}
2422
2423long
2425{
2426 return str_strlen(str, NULL);
2427}
2428
2429/*
2430 * call-seq:
2431 * length -> integer
2432 *
2433 * :include: doc/string/length.rdoc
2434 *
2435 */
2436
2437VALUE
2439{
2440 return LONG2NUM(str_strlen(str, NULL));
2441}
2442
2443/*
2444 * call-seq:
2445 * bytesize -> integer
2446 *
2447 * :include: doc/string/bytesize.rdoc
2448 *
2449 */
2450
2451VALUE
2452rb_str_bytesize(VALUE str)
2453{
2454 return LONG2NUM(RSTRING_LEN(str));
2455}
2456
2457/*
2458 * call-seq:
2459 * empty? -> true or false
2460 *
2461 * Returns whether the length of +self+ is zero:
2462 *
2463 * 'hello'.empty? # => false
2464 * ' '.empty? # => false
2465 * ''.empty? # => true
2466 *
2467 * Related: see {Querying}[rdoc-ref:String@Querying].
2468 */
2469
2470static VALUE
2471rb_str_empty(VALUE str)
2472{
2473 return RBOOL(RSTRING_LEN(str) == 0);
2474}
2475
2476/*
2477 * call-seq:
2478 * self + other_string -> new_string
2479 *
2480 * Returns a new string containing +other_string+ concatenated to +self+:
2481 *
2482 * 'Hello from ' + self.to_s # => "Hello from main"
2483 *
2484 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2485 */
2486
2487VALUE
2489{
2490 VALUE str3;
2491 rb_encoding *enc;
2492 char *ptr1, *ptr2, *ptr3;
2493 long len1, len2;
2494 int termlen;
2495
2496 StringValue(str2);
2497 enc = rb_enc_check_str(str1, str2);
2498 RSTRING_GETMEM(str1, ptr1, len1);
2499 RSTRING_GETMEM(str2, ptr2, len2);
2500 termlen = rb_enc_mbminlen(enc);
2501 if (len1 > LONG_MAX - len2) {
2502 rb_raise(rb_eArgError, "string size too big");
2503 }
2504 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2505 ptr3 = RSTRING_PTR(str3);
2506 memcpy(ptr3, ptr1, len1);
2507 memcpy(ptr3+len1, ptr2, len2);
2508 TERM_FILL(&ptr3[len1+len2], termlen);
2509
2510 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2512 RB_GC_GUARD(str1);
2513 RB_GC_GUARD(str2);
2514 return str3;
2515}
2516
2517/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2518VALUE
2519rb_str_opt_plus(VALUE str1, VALUE str2)
2520{
2523 long len1, len2;
2524 MAYBE_UNUSED(char) *ptr1, *ptr2;
2525 RSTRING_GETMEM(str1, ptr1, len1);
2526 RSTRING_GETMEM(str2, ptr2, len2);
2527 int enc1 = rb_enc_get_index(str1);
2528 int enc2 = rb_enc_get_index(str2);
2529
2530 if (enc1 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc2 < 0) {
2534 return Qundef;
2535 }
2536 else if (enc1 != enc2) {
2537 return Qundef;
2538 }
2539 else if (len1 > LONG_MAX - len2) {
2540 return Qundef;
2541 }
2542 else {
2543 return rb_str_plus(str1, str2);
2544 }
2545
2546}
2547
2548/*
2549 * call-seq:
2550 * self * n -> new_string
2551 *
2552 * Returns a new string containing +n+ copies of +self+:
2553 *
2554 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2555 * 'No!' * 0 # => ""
2556 *
2557 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2558 */
2559
2560VALUE
2562{
2563 VALUE str2;
2564 long n, len;
2565 char *ptr2;
2566 int termlen;
2567
2568 if (times == INT2FIX(1)) {
2569 return str_duplicate(rb_cString, str);
2570 }
2571 if (times == INT2FIX(0)) {
2572 str2 = str_alloc_embed(rb_cString, 0);
2573 rb_enc_copy(str2, str);
2574 return str2;
2575 }
2576 len = NUM2LONG(times);
2577 if (len < 0) {
2578 rb_raise(rb_eArgError, "negative argument");
2579 }
2580 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2581 if (STR_EMBEDDABLE_P(len, 1)) {
2582 str2 = str_alloc_embed(rb_cString, len + 1);
2583 memset(RSTRING_PTR(str2), 0, len + 1);
2584 }
2585 else {
2586 str2 = str_alloc_heap(rb_cString);
2587 RSTRING(str2)->as.heap.aux.capa = len;
2588 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2589 }
2590 STR_SET_LEN(str2, len);
2591 rb_enc_copy(str2, str);
2592 return str2;
2593 }
2594 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2595 rb_raise(rb_eArgError, "argument too big");
2596 }
2597
2598 len *= RSTRING_LEN(str);
2599 termlen = TERM_LEN(str);
2600 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2601 ptr2 = RSTRING_PTR(str2);
2602 if (len) {
2603 n = RSTRING_LEN(str);
2604 memcpy(ptr2, RSTRING_PTR(str), n);
2605 while (n <= len/2) {
2606 memcpy(ptr2 + n, ptr2, n);
2607 n *= 2;
2608 }
2609 memcpy(ptr2 + n, ptr2, len-n);
2610 }
2611 STR_SET_LEN(str2, len);
2612 TERM_FILL(&ptr2[len], termlen);
2613 rb_enc_cr_str_copy_for_substr(str2, str);
2614
2615 return str2;
2616}
2617
2618/*
2619 * call-seq:
2620 * self % object -> new_string
2621 *
2622 * Returns the result of formatting +object+ into the format specifications
2623 * contained in +self+
2624 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2625 *
2626 * '%05d' % 123 # => "00123"
2627 *
2628 * If +self+ contains multiple format specifications,
2629 * +object+ must be an array or hash containing the objects to be formatted:
2630 *
2631 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2632 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2633 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2634 *
2635 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2636 */
2637
2638static VALUE
2639rb_str_format_m(VALUE str, VALUE arg)
2640{
2641 VALUE tmp = rb_check_array_type(arg);
2642
2643 if (!NIL_P(tmp)) {
2644 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2645 }
2646 return rb_str_format(1, &arg, str);
2647}
2648
2649static inline void
2650rb_check_lockedtmp(VALUE str)
2651{
2652 if (FL_TEST(str, STR_TMPLOCK)) {
2653 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2654 }
2655}
2656
2657// If none of these flags are set, we know we have a modifiable string.
2658// If any is set, we need to do more detailed checks.
2659#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2660static inline void
2661str_modifiable(VALUE str)
2662{
2663 RUBY_ASSERT(ruby_thread_has_gvl_p());
2664
2665 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2666 if (CHILLED_STRING_P(str)) {
2667 CHILLED_STRING_MUTATED(str);
2668 }
2669 rb_check_lockedtmp(str);
2670 rb_check_frozen(str);
2671 }
2672}
2673
2674static inline int
2675str_dependent_p(VALUE str)
2676{
2677 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2678 return FALSE;
2679 }
2680 else {
2681 return TRUE;
2682 }
2683}
2684
2685// If none of these flags are set, we know we have an independent string.
2686// If any is set, we need to do more detailed checks.
2687#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2688static inline int
2689str_independent(VALUE str)
2690{
2691 RUBY_ASSERT(ruby_thread_has_gvl_p());
2692
2693 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2694 str_modifiable(str);
2695 return !str_dependent_p(str);
2696 }
2697 return TRUE;
2698}
2699
2700static void
2701str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2702{
2703 RUBY_ASSERT(ruby_thread_has_gvl_p());
2704
2705 char *ptr;
2706 char *oldptr;
2707 long capa = len + expand;
2708
2709 if (len > capa) len = capa;
2710
2711 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2712 ptr = RSTRING(str)->as.heap.ptr;
2713 STR_SET_EMBED(str);
2714 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2715 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2716 STR_SET_LEN(str, len);
2717 return;
2718 }
2719
2720 ptr = ALLOC_N(char, (size_t)capa + termlen);
2721 oldptr = RSTRING_PTR(str);
2722 if (oldptr) {
2723 memcpy(ptr, oldptr, len);
2724 }
2725 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2726 xfree(oldptr);
2727 }
2728 STR_SET_NOEMBED(str);
2729 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2730 TERM_FILL(ptr + len, termlen);
2731 RSTRING(str)->as.heap.ptr = ptr;
2732 STR_SET_LEN(str, len);
2733 RSTRING(str)->as.heap.aux.capa = capa;
2734}
2735
2736void
2737rb_str_modify(VALUE str)
2738{
2739 if (!str_independent(str))
2740 str_make_independent(str);
2742}
2743
2744void
2746{
2747 RUBY_ASSERT(ruby_thread_has_gvl_p());
2748
2749 int termlen = TERM_LEN(str);
2750 long len = RSTRING_LEN(str);
2751
2752 if (expand < 0) {
2753 rb_raise(rb_eArgError, "negative expanding string size");
2754 }
2755 if (expand >= LONG_MAX - len) {
2756 rb_raise(rb_eArgError, "string size too big");
2757 }
2758
2759 if (!str_independent(str)) {
2760 str_make_independent_expand(str, len, expand, termlen);
2761 }
2762 else if (expand > 0) {
2763 RESIZE_CAPA_TERM(str, len + expand, termlen);
2764 }
2766}
2767
2768/* As rb_str_modify(), but don't clear coderange */
2769static void
2770str_modify_keep_cr(VALUE str)
2771{
2772 if (!str_independent(str))
2773 str_make_independent(str);
2775 /* Force re-scan later */
2777}
2778
2779static inline void
2780str_discard(VALUE str)
2781{
2782 str_modifiable(str);
2783 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2784 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2785 RSTRING(str)->as.heap.ptr = 0;
2786 STR_SET_LEN(str, 0);
2787 }
2788}
2789
2790void
2792{
2793 int encindex = rb_enc_get_index(str);
2794
2795 if (RB_UNLIKELY(encindex == -1)) {
2796 rb_raise(rb_eTypeError, "not encoding capable object");
2797 }
2798
2799 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2800 return;
2801 }
2802
2803 rb_encoding *enc = rb_enc_from_index(encindex);
2804 if (!rb_enc_asciicompat(enc)) {
2805 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2806 }
2807}
2808
2809VALUE
2811{
2812 RUBY_ASSERT(ruby_thread_has_gvl_p());
2813
2814 VALUE s = *ptr;
2815 if (!RB_TYPE_P(s, T_STRING)) {
2816 s = rb_str_to_str(s);
2817 *ptr = s;
2818 }
2819 return s;
2820}
2821
2822char *
2824{
2825 VALUE str = rb_string_value(ptr);
2826 return RSTRING_PTR(str);
2827}
2828
/* Returns 1 when the first n bytes at s are all zero (also when n <= 0),
 * and 0 as soon as a non-zero byte is found.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2837
2838static const char *
2839str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2840{
2841 const char *e = s + len;
2842
2843 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2844 if (zero_filled(s, minlen)) return s;
2845 }
2846 return 0;
2847}
2848
2849static char *
2850str_fill_term(VALUE str, char *s, long len, int termlen)
2851{
2852 /* This function assumes that (capa + termlen) bytes of memory
2853 * is allocated, like many other functions in this file.
2854 */
2855 if (str_dependent_p(str)) {
2856 if (!zero_filled(s + len, termlen))
2857 str_make_independent_expand(str, len, 0L, termlen);
2858 }
2859 else {
2860 TERM_FILL(s + len, termlen);
2861 return s;
2862 }
2863 return RSTRING_PTR(str);
2864}
2865
2866void
2867rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2868{
2869 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2870 long len = RSTRING_LEN(str);
2871
2872 RUBY_ASSERT(capa >= len);
2873 if (capa - len < termlen) {
2874 rb_check_lockedtmp(str);
2875 str_make_independent_expand(str, len, 0L, termlen);
2876 }
2877 else if (str_dependent_p(str)) {
2878 if (termlen > oldtermlen)
2879 str_make_independent_expand(str, len, 0L, termlen);
2880 }
2881 else {
2882 if (!STR_EMBED_P(str)) {
2883 /* modify capa instead of realloc */
2884 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2885 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2886 }
2887 if (termlen > oldtermlen) {
2888 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2889 }
2890 }
2891
2892 return;
2893}
2894
2895static char *
2896str_null_check(VALUE str, int *w)
2897{
2898 char *s = RSTRING_PTR(str);
2899 long len = RSTRING_LEN(str);
2900 rb_encoding *enc = rb_enc_get(str);
2901 const int minlen = rb_enc_mbminlen(enc);
2902
2903 if (minlen > 1) {
2904 *w = 1;
2905 if (str_null_char(s, len, minlen, enc)) {
2906 return NULL;
2907 }
2908 return str_fill_term(str, s, len, minlen);
2909 }
2910 *w = 0;
2911 if (!s || memchr(s, 0, len)) {
2912 return NULL;
2913 }
2914 if (s[len]) {
2915 s = str_fill_term(str, s, len, minlen);
2916 }
2917 return s;
2918}
2919
2920char *
2921rb_str_to_cstr(VALUE str)
2922{
2923 int w;
2924 return str_null_check(str, &w);
2925}
2926
2927char *
2929{
2930 VALUE str = rb_string_value(ptr);
2931 int w;
2932 char *s = str_null_check(str, &w);
2933 if (!s) {
2934 if (w) {
2935 rb_raise(rb_eArgError, "string contains null char");
2936 }
2937 rb_raise(rb_eArgError, "string contains null byte");
2938 }
2939 return s;
2940}
2941
2942char *
2943rb_str_fill_terminator(VALUE str, const int newminlen)
2944{
2945 char *s = RSTRING_PTR(str);
2946 long len = RSTRING_LEN(str);
2947 return str_fill_term(str, s, len, newminlen);
2948}
2949
2950VALUE
2952{
2953 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2954 return str;
2955}
2956
2957/*
2958 * call-seq:
2959 * String.try_convert(object) -> object, new_string, or nil
2960 *
2961 * Attempts to convert the given +object+ to a string.
2962 *
2963 * If +object+ is already a string, returns +object+, unmodified.
2964 *
2965 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2966 * calls <tt>object.to_str</tt> and returns the result.
2967 *
2968 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2969 *
2970 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2971 */
2972static VALUE
2973rb_str_s_try_convert(VALUE dummy, VALUE str)
2974{
2975 return rb_check_string_type(str);
2976}
2977
2978static char*
2979str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2980{
2981 long nth = *nthp;
2982 if (rb_enc_mbmaxlen(enc) == 1) {
2983 p += nth;
2984 }
2985 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2986 p += nth * rb_enc_mbmaxlen(enc);
2987 }
2988 else if (rb_enc_asciicompat(enc)) {
2989 const char *p2, *e2;
2990 int n;
2991
2992 while (p < e && 0 < nth) {
2993 e2 = p + nth;
2994 if (e < e2) {
2995 *nthp = nth;
2996 return (char *)e;
2997 }
2998 if (ISASCII(*p)) {
2999 p2 = search_nonascii(p, e2);
3000 if (!p2) {
3001 nth -= e2 - p;
3002 *nthp = nth;
3003 return (char *)e2;
3004 }
3005 nth -= p2 - p;
3006 p = p2;
3007 }
3008 n = rb_enc_mbclen(p, e, enc);
3009 p += n;
3010 nth--;
3011 }
3012 *nthp = nth;
3013 if (nth != 0) {
3014 return (char *)e;
3015 }
3016 return (char *)p;
3017 }
3018 else {
3019 while (p < e && nth--) {
3020 p += rb_enc_mbclen(p, e, enc);
3021 }
3022 }
3023 if (p > e) p = e;
3024 *nthp = nth;
3025 return (char*)p;
3026}
3027
3028char*
3029rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3030{
3031 return str_nth_len(p, e, &nth, enc);
3032}
3033
3034static char*
3035str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3036{
3037 if (singlebyte)
3038 p += nth;
3039 else {
3040 p = str_nth_len(p, e, &nth, enc);
3041 }
3042 if (!p) return 0;
3043 if (p > e) p = e;
3044 return (char *)p;
3045}
3046
3047/* char offset to byte offset */
3048static long
3049str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3050{
3051 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3052 if (!pp) return e - p;
3053 return pp - p;
3054}
3055
3056long
3057rb_str_offset(VALUE str, long pos)
3058{
3059 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3060 STR_ENC_GET(str), single_byte_optimizable(str));
3061}
3062
3063#ifdef NONASCII_MASK
3064static char *
3065str_utf8_nth(const char *p, const char *e, long *nthp)
3066{
3067 long nth = *nthp;
3068 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3069 const uintptr_t *s, *t;
3070 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3071 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3072 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3073 while (p < (const char *)s) {
3074 if (is_utf8_lead_byte(*p)) nth--;
3075 p++;
3076 }
3077 do {
3078 nth -= count_utf8_lead_bytes_with_word(s);
3079 s++;
3080 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3081 p = (char *)s;
3082 }
3083 while (p < e) {
3084 if (is_utf8_lead_byte(*p)) {
3085 if (nth == 0) break;
3086 nth--;
3087 }
3088 p++;
3089 }
3090 *nthp = nth;
3091 return (char *)p;
3092}
3093
/* Byte distance from p to the position str_utf8_nth reaches for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const hit = str_utf8_nth(p, e, &remaining);

    return hit - p;
}
3100#endif
3101
3102/* byte offset to char offset */
3103long
3104rb_str_sublen(VALUE str, long pos)
3105{
3106 if (single_byte_optimizable(str) || pos < 0)
3107 return pos;
3108 else {
3109 char *p = RSTRING_PTR(str);
3110 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3111 }
3112}
3113
3114static VALUE
3115str_subseq(VALUE str, long beg, long len)
3116{
3117 VALUE str2;
3118
3119 RUBY_ASSERT(beg >= 0);
3120 RUBY_ASSERT(len >= 0);
3121 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3122
3123 const int termlen = TERM_LEN(str);
3124 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3125 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3126 RB_GC_GUARD(str);
3127 return str2;
3128 }
3129
3130 str2 = str_alloc_heap(rb_cString);
3131 if (str_embed_capa(str2) >= len + termlen) {
3132 char *ptr2 = RSTRING(str2)->as.embed.ary;
3133 STR_SET_EMBED(str2);
3134 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3135 TERM_FILL(ptr2+len, termlen);
3136
3137 STR_SET_LEN(str2, len);
3138 RB_GC_GUARD(str);
3139 }
3140 else {
3141 str_replace_shared(str2, str);
3142 RUBY_ASSERT(!STR_EMBED_P(str2));
3143 ENC_CODERANGE_CLEAR(str2);
3144 RSTRING(str2)->as.heap.ptr += beg;
3145 if (RSTRING_LEN(str2) > len) {
3146 STR_SET_LEN(str2, len);
3147 }
3148 }
3149
3150 return str2;
3151}
3152
3153VALUE
3154rb_str_subseq(VALUE str, long beg, long len)
3155{
3156 VALUE str2 = str_subseq(str, beg, len);
3157 rb_enc_cr_str_copy_for_substr(str2, str);
3158 return str2;
3159}
3160
3161char *
3162rb_str_subpos(VALUE str, long beg, long *lenp)
3163{
3164 long len = *lenp;
3165 long slen = -1L;
3166 const long blen = RSTRING_LEN(str);
3167 rb_encoding *enc = STR_ENC_GET(str);
3168 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3169
3170 if (len < 0) return 0;
3171 if (beg < 0 && -beg < 0) return 0;
3172 if (!blen) {
3173 len = 0;
3174 }
3175 if (single_byte_optimizable(str)) {
3176 if (beg > blen) return 0;
3177 if (beg < 0) {
3178 beg += blen;
3179 if (beg < 0) return 0;
3180 }
3181 if (len > blen - beg)
3182 len = blen - beg;
3183 if (len < 0) return 0;
3184 p = s + beg;
3185 goto end;
3186 }
3187 if (beg < 0) {
3188 if (len > -beg) len = -beg;
3189 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3190 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3191 beg = -beg;
3192 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3193 p = e;
3194 if (!p) return 0;
3195 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3196 if (!p) return 0;
3197 len = e - p;
3198 goto end;
3199 }
3200 else {
3201 slen = str_strlen(str, enc);
3202 beg += slen;
3203 if (beg < 0) return 0;
3204 p = s + beg;
3205 if (len == 0) goto end;
3206 }
3207 }
3208 else if (beg > 0 && beg > blen) {
3209 return 0;
3210 }
3211 if (len == 0) {
3212 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3213 p = s + beg;
3214 }
3215#ifdef NONASCII_MASK
3216 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3217 enc == rb_utf8_encoding()) {
3218 p = str_utf8_nth(s, e, &beg);
3219 if (beg > 0) return 0;
3220 len = str_utf8_offset(p, e, len);
3221 }
3222#endif
3223 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3224 int char_sz = rb_enc_mbmaxlen(enc);
3225
3226 p = s + beg * char_sz;
3227 if (p > e) {
3228 return 0;
3229 }
3230 else if (len * char_sz > e - p)
3231 len = e - p;
3232 else
3233 len *= char_sz;
3234 }
3235 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3236 if (beg > 0) return 0;
3237 len = 0;
3238 }
3239 else {
3240 len = str_offset(p, e, len, enc, 0);
3241 }
3242 end:
3243 *lenp = len;
3244 RB_GC_GUARD(str);
3245 return p;
3246}
3247
3248static VALUE str_substr(VALUE str, long beg, long len, int empty);
3249
3250VALUE
3251rb_str_substr(VALUE str, long beg, long len)
3252{
3253 return str_substr(str, beg, len, TRUE);
3254}
3255
3256VALUE
3257rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3258{
3259 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3260}
3261
3262static VALUE
3263str_substr(VALUE str, long beg, long len, int empty)
3264{
3265 char *p = rb_str_subpos(str, beg, &len);
3266
3267 if (!p) return Qnil;
3268 if (!len && !empty) return Qnil;
3269
3270 beg = p - RSTRING_PTR(str);
3271
3272 VALUE str2 = str_subseq(str, beg, len);
3273 rb_enc_cr_str_copy_for_substr(str2, str);
3274 return str2;
3275}
3276
3277/* :nodoc: */
3278VALUE
3280{
3281 if (CHILLED_STRING_P(str)) {
3282 FL_UNSET_RAW(str, STR_CHILLED);
3283 }
3284
3285 if (OBJ_FROZEN(str)) return str;
3286 rb_str_resize(str, RSTRING_LEN(str));
3287 return rb_obj_freeze(str);
3288}
3289
3290/*
3291 * call-seq:
3292 * +string -> new_string or self
3293 *
3294 * Returns +self+ if +self+ is not frozen and can be mutated
3295 * without warning issuance.
3296 *
3297 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3298 *
3299 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3300 */
3301static VALUE
3302str_uplus(VALUE str)
3303{
3304 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3305 return rb_str_dup(str);
3306 }
3307 else {
3308 return str;
3309 }
3310}
3311
3312/*
3313 * call-seq:
3314 * -self -> frozen_string
3315 *
3316 * Returns a frozen string equal to +self+.
3317 *
3318 * The returned string is +self+ if and only if all of the following are true:
3319 *
3320 * - +self+ is already frozen.
3321 * - +self+ is an instance of \String (rather than of a subclass of \String)
3322 * - +self+ has no instance variables set on it.
3323 *
3324 * Otherwise, the returned string is a frozen copy of +self+.
3325 *
3326 * Returning +self+, when possible, saves duplicating +self+;
3327 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3328 *
3329 * It may also save duplicating other, already-existing, strings:
3330 *
3331 * s0 = 'foo'
3332 * s1 = 'foo'
3333 * s0.object_id == s1.object_id # => false
3334 * (-s0).object_id == (-s1).object_id # => true
3335 *
3336 * Note that method #-@ is convenient for defining a constant:
3337 *
3338 * FileName = -'config/database.yml'
3339 *
3340 * While its alias #dedup is better suited for chaining:
3341 *
3342 * 'foo'.dedup.gsub!('o')
3343 *
3344 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3345 */
3346static VALUE
3347str_uminus(VALUE str)
3348{
3349 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3350 str = rb_str_dup(str);
3351 }
3352 return rb_fstring(str);
3353}
3354
3355RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3356#define rb_str_dup_frozen rb_str_new_frozen
3357
3358VALUE
3360{
3361 rb_check_frozen(str);
3362 if (FL_TEST(str, STR_TMPLOCK)) {
3363 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3364 }
3365 FL_SET(str, STR_TMPLOCK);
3366 return str;
3367}
3368
3369VALUE
3371{
3372 rb_check_frozen(str);
3373 if (!FL_TEST(str, STR_TMPLOCK)) {
3374 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3375 }
3376 FL_UNSET(str, STR_TMPLOCK);
3377 return str;
3378}
3379
3380VALUE
3381rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3382{
3383 rb_str_locktmp(str);
3384 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3385}
3386
3387void
3389{
3390 RUBY_ASSERT(ruby_thread_has_gvl_p());
3391
3392 long capa;
3393 const int termlen = TERM_LEN(str);
3394
3395 str_modifiable(str);
3396 if (STR_SHARED_P(str)) {
3397 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3398 }
3399 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3400 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3401 }
3402
3403 int cr = ENC_CODERANGE(str);
3404 if (len == 0) {
3405 /* Empty string does not contain non-ASCII */
3407 }
3408 else if (cr == ENC_CODERANGE_UNKNOWN) {
3409 /* Leave unknown. */
3410 }
3411 else if (len > RSTRING_LEN(str)) {
3412 if (ENC_CODERANGE_CLEAN_P(cr)) {
3413 /* Update the coderange regarding the extended part. */
3414 const char *const prev_end = RSTRING_END(str);
3415 const char *const new_end = RSTRING_PTR(str) + len;
3416 rb_encoding *enc = rb_enc_get(str);
3417 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3418 ENC_CODERANGE_SET(str, cr);
3419 }
3420 else if (cr == ENC_CODERANGE_BROKEN) {
3421 /* May be valid now, by appended part. */
3423 }
3424 }
3425 else if (len < RSTRING_LEN(str)) {
3426 if (cr != ENC_CODERANGE_7BIT) {
3427 /* ASCII-only string is keeping after truncated. Valid
3428 * and broken may be invalid or valid, leave unknown. */
3430 }
3431 }
3432
3433 STR_SET_LEN(str, len);
3434 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3435}
3436
3437VALUE
3438rb_str_resize(VALUE str, long len)
3439{
3440 if (len < 0) {
3441 rb_raise(rb_eArgError, "negative string size (or size too big)");
3442 }
3443
3444 int independent = str_independent(str);
3445 long slen = RSTRING_LEN(str);
3446 const int termlen = TERM_LEN(str);
3447
3448 if (slen > len || (termlen != 1 && slen < len)) {
3450 }
3451
3452 {
3453 long capa;
3454 if (STR_EMBED_P(str)) {
3455 if (len == slen) return str;
3456 if (str_embed_capa(str) >= len + termlen) {
3457 STR_SET_LEN(str, len);
3458 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3459 return str;
3460 }
3461 str_make_independent_expand(str, slen, len - slen, termlen);
3462 }
3463 else if (str_embed_capa(str) >= len + termlen) {
3464 char *ptr = STR_HEAP_PTR(str);
3465 STR_SET_EMBED(str);
3466 if (slen > len) slen = len;
3467 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3468 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3469 STR_SET_LEN(str, len);
3470 if (independent) ruby_xfree(ptr);
3471 return str;
3472 }
3473 else if (!independent) {
3474 if (len == slen) return str;
3475 str_make_independent_expand(str, slen, len - slen, termlen);
3476 }
3477 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3478 (capa - len) > (len < 1024 ? len : 1024)) {
3479 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3480 (size_t)len + termlen, STR_HEAP_SIZE(str));
3481 RSTRING(str)->as.heap.aux.capa = len;
3482 }
3483 else if (len == slen) return str;
3484 STR_SET_LEN(str, len);
3485 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3486 }
3487 return str;
3488}
3489
3490static void
3491str_ensure_available_capa(VALUE str, long len)
3492{
3493 str_modify_keep_cr(str);
3494
3495 const int termlen = TERM_LEN(str);
3496 long olen = RSTRING_LEN(str);
3497
3498 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3499 rb_raise(rb_eArgError, "string sizes too big");
3500 }
3501
3502 long total = olen + len;
3503 long capa = str_capacity(str, termlen);
3504
3505 if (capa < total) {
3506 if (total >= LONG_MAX / 2) {
3507 capa = total;
3508 }
3509 while (total > capa) {
3510 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3511 }
3512 RESIZE_CAPA_TERM(str, capa, termlen);
3513 }
3514}
3515
3516static VALUE
3517str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3518{
3519 if (keep_cr) {
3520 str_modify_keep_cr(str);
3521 }
3522 else {
3523 rb_str_modify(str);
3524 }
3525 if (len == 0) return 0;
3526
3527 long total, olen, off = -1;
3528 char *sptr;
3529 const int termlen = TERM_LEN(str);
3530
3531 RSTRING_GETMEM(str, sptr, olen);
3532 if (ptr >= sptr && ptr <= sptr + olen) {
3533 off = ptr - sptr;
3534 }
3535
3536 long capa = str_capacity(str, termlen);
3537
3538 if (olen > LONG_MAX - len) {
3539 rb_raise(rb_eArgError, "string sizes too big");
3540 }
3541 total = olen + len;
3542 if (capa < total) {
3543 if (total >= LONG_MAX / 2) {
3544 capa = total;
3545 }
3546 while (total > capa) {
3547 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3548 }
3549 RESIZE_CAPA_TERM(str, capa, termlen);
3550 sptr = RSTRING_PTR(str);
3551 }
3552 if (off != -1) {
3553 ptr = sptr + off;
3554 }
3555 memcpy(sptr + olen, ptr, len);
3556 STR_SET_LEN(str, total);
3557 TERM_FILL(sptr + total, termlen); /* sentinel */
3558
3559 return str;
3560}
3561
3562#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3563#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3564
3565VALUE
3566rb_str_cat(VALUE str, const char *ptr, long len)
3567{
3568 if (len == 0) return str;
3569 if (len < 0) {
3570 rb_raise(rb_eArgError, "negative string size (or size too big)");
3571 }
3572 return str_buf_cat(str, ptr, len);
3573}
3574
3575VALUE
3576rb_str_cat_cstr(VALUE str, const char *ptr)
3577{
3578 must_not_null(ptr);
3579 return rb_str_buf_cat(str, ptr, strlen(ptr));
3580}
3581
3582static void
3583rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3584{
3585 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3586
3587 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3588 if (UNLIKELY(!str_independent(str))) {
3589 str_make_independent(str);
3590 }
3591
3592 long string_length = -1;
3593 const int null_terminator_length = 1;
3594 char *sptr;
3595 RSTRING_GETMEM(str, sptr, string_length);
3596
3597 // Ensure the resulting string wouldn't be too long.
3598 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3599 rb_raise(rb_eArgError, "string sizes too big");
3600 }
3601
3602 long string_capacity = str_capacity(str, null_terminator_length);
3603
3604 // Get the code range before any modifications since those might clear the code range.
3605 int cr = ENC_CODERANGE(str);
3606
3607 // Check if the string has spare string_capacity to write the new byte.
3608 if (LIKELY(string_capacity >= string_length + 1)) {
3609 // In fast path we can write the new byte and note the string's new length.
3610 sptr[string_length] = byte;
3611 STR_SET_LEN(str, string_length + 1);
3612 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3613 }
3614 else {
3615 // If there's not enough string_capacity, make a call into the general string concatenation function.
3616 str_buf_cat(str, (char *)&byte, 1);
3617 }
3618
3619 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3620 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3621 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3622 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3623 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3624 if (ISASCII(byte)) {
3626 }
3627 else {
3629
3630 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3631 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3632 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3633 }
3634 }
3635 }
3636}
3637
3638RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3639RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3640RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3641
3642static VALUE
3643rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3644 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3645{
3646 int str_encindex = ENCODING_GET(str);
3647 int res_encindex;
3648 int str_cr, res_cr;
3649 rb_encoding *str_enc, *ptr_enc;
3650
3651 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3652
3653 if (str_encindex == ptr_encindex) {
3654 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3655 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3656 }
3657 }
3658 else {
3659 str_enc = rb_enc_from_index(str_encindex);
3660 ptr_enc = rb_enc_from_index(ptr_encindex);
3661 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3662 if (len == 0)
3663 return str;
3664 if (RSTRING_LEN(str) == 0) {
3665 rb_str_buf_cat(str, ptr, len);
3666 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3667 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3668 return str;
3669 }
3670 goto incompatible;
3671 }
3672 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3673 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3674 }
3675 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3676 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3677 str_cr = rb_enc_str_coderange(str);
3678 }
3679 }
3680 }
3681 if (ptr_cr_ret)
3682 *ptr_cr_ret = ptr_cr;
3683
3684 if (str_encindex != ptr_encindex &&
3685 str_cr != ENC_CODERANGE_7BIT &&
3686 ptr_cr != ENC_CODERANGE_7BIT) {
3687 str_enc = rb_enc_from_index(str_encindex);
3688 ptr_enc = rb_enc_from_index(ptr_encindex);
3689 goto incompatible;
3690 }
3691
3692 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3693 res_encindex = str_encindex;
3694 res_cr = ENC_CODERANGE_UNKNOWN;
3695 }
3696 else if (str_cr == ENC_CODERANGE_7BIT) {
3697 if (ptr_cr == ENC_CODERANGE_7BIT) {
3698 res_encindex = str_encindex;
3699 res_cr = ENC_CODERANGE_7BIT;
3700 }
3701 else {
3702 res_encindex = ptr_encindex;
3703 res_cr = ptr_cr;
3704 }
3705 }
3706 else if (str_cr == ENC_CODERANGE_VALID) {
3707 res_encindex = str_encindex;
3708 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3709 res_cr = str_cr;
3710 else
3711 res_cr = ptr_cr;
3712 }
3713 else { /* str_cr == ENC_CODERANGE_BROKEN */
3714 res_encindex = str_encindex;
3715 res_cr = str_cr;
3716 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3717 }
3718
3719 if (len < 0) {
3720 rb_raise(rb_eArgError, "negative string size (or size too big)");
3721 }
3722 str_buf_cat(str, ptr, len);
3723 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3724 return str;
3725
3726 incompatible:
3727 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3728 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3730}
3731
3732VALUE
3733rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3734{
3735 return rb_enc_cr_str_buf_cat(str, ptr, len,
3736 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3737}
3738
3739VALUE
3741{
3742 /* ptr must reference NUL terminated ASCII string. */
3743 int encindex = ENCODING_GET(str);
3744 rb_encoding *enc = rb_enc_from_index(encindex);
3745 if (rb_enc_asciicompat(enc)) {
3746 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3747 encindex, ENC_CODERANGE_7BIT, 0);
3748 }
3749 else {
3750 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3751 while (*ptr) {
3752 unsigned int c = (unsigned char)*ptr;
3753 int len = rb_enc_codelen(c, enc);
3754 rb_enc_mbcput(c, buf, enc);
3755 rb_enc_cr_str_buf_cat(str, buf, len,
3756 encindex, ENC_CODERANGE_VALID, 0);
3757 ptr++;
3758 }
3759 return str;
3760 }
3761}
3762
3763VALUE
3765{
3766 int str2_cr = rb_enc_str_coderange(str2);
3767
3768 if (str_enc_fastpath(str)) {
3769 switch (str2_cr) {
3770 case ENC_CODERANGE_7BIT:
3771 // If RHS is 7bit we can do simple concatenation
3772 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3773 RB_GC_GUARD(str2);
3774 return str;
3776 // If RHS is valid, we can do simple concatenation if encodings are the same
3777 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3778 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3779 int str_cr = ENC_CODERANGE(str);
3780 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3781 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3782 }
3783 RB_GC_GUARD(str2);
3784 return str;
3785 }
3786 }
3787 }
3788
3789 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3790 ENCODING_GET(str2), str2_cr, &str2_cr);
3791
3792 ENC_CODERANGE_SET(str2, str2_cr);
3793
3794 return str;
3795}
3796
3797VALUE
3799{
3800 StringValue(str2);
3801 return rb_str_buf_append(str, str2);
3802}
3803
3804VALUE
3805rb_str_concat_literals(size_t num, const VALUE *strary)
3806{
3807 VALUE str;
3808 size_t i, s = 0;
3809 unsigned long len = 1;
3810
3811 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3812 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3813
3814 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3815 str = rb_str_buf_new(len);
3816 str_enc_copy_direct(str, strary[0]);
3817
3818 for (i = s; i < num; ++i) {
3819 const VALUE v = strary[i];
3820 int encidx = ENCODING_GET(v);
3821
3822 rb_str_buf_append(str, v);
3823 if (encidx != ENCINDEX_US_ASCII) {
3824 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3825 rb_enc_set_index(str, encidx);
3826 }
3827 }
3828 return str;
3829}
3830
3831/*
3832 * call-seq:
3833 * concat(*objects) -> string
3834 *
3835 * :include: doc/string/concat.rdoc
3836 */
3837static VALUE
3838rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3839{
3840 str_modifiable(str);
3841
3842 if (argc == 1) {
3843 return rb_str_concat(str, argv[0]);
3844 }
3845 else if (argc > 1) {
3846 int i;
3847 VALUE arg_str = rb_str_tmp_new(0);
3848 rb_enc_copy(arg_str, str);
3849 for (i = 0; i < argc; i++) {
3850 rb_str_concat(arg_str, argv[i]);
3851 }
3852 rb_str_buf_append(str, arg_str);
3853 }
3854
3855 return str;
3856}
3857
3858/*
3859 * call-seq:
3860 * append_as_bytes(*objects) -> self
3861 *
3862 * Concatenates each object in +objects+ into +self+; returns +self+;
3863 * performs no encoding validation or conversion:
3864 *
3865 * s = 'foo'
3866 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3867 * s.valid_encoding? # => false
3868 * s.append_as_bytes("\xAC 12")
3869 * s.valid_encoding? # => true
3870 *
3871 * When a given object is an integer,
3872 * the value is considered an 8-bit byte;
3873 * if the integer occupies more than one byte (i.e., is greater than 255),
3874 * appends only the low-order byte (similar to String#setbyte):
3875 *
3876 * s = ""
3877 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3878 * s.bytesize # => 2
3879 *
3880 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3881 */
3882
3883VALUE
3884rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3885{
3886 long needed_capacity = 0;
3887 volatile VALUE t0;
3888 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3889
3890 for (int index = 0; index < argc; index++) {
3891 VALUE obj = argv[index];
3892 enum ruby_value_type type = types[index] = rb_type(obj);
3893 switch (type) {
3894 case T_FIXNUM:
3895 case T_BIGNUM:
3896 needed_capacity++;
3897 break;
3898 case T_STRING:
3899 needed_capacity += RSTRING_LEN(obj);
3900 break;
3901 default:
3902 rb_raise(
3904 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3905 rb_obj_class(obj)
3906 );
3907 break;
3908 }
3909 }
3910
3911 str_ensure_available_capa(str, needed_capacity);
3912 char *sptr = RSTRING_END(str);
3913
3914 for (int index = 0; index < argc; index++) {
3915 VALUE obj = argv[index];
3916 enum ruby_value_type type = types[index];
3917 switch (type) {
3918 case T_FIXNUM:
3919 case T_BIGNUM: {
3920 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3921 char byte = (char)(NUM2INT(obj) & 0xFF);
3922 *sptr = byte;
3923 sptr++;
3924 break;
3925 }
3926 case T_STRING: {
3927 const char *ptr;
3928 long len;
3929 RSTRING_GETMEM(obj, ptr, len);
3930 memcpy(sptr, ptr, len);
3931 sptr += len;
3932 break;
3933 }
3934 default:
3935 rb_bug("append_as_bytes arguments should have been validated");
3936 }
3937 }
3938
3939 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3940 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3941
3942 int cr = ENC_CODERANGE(str);
3943 switch (cr) {
3944 case ENC_CODERANGE_7BIT: {
3945 for (int index = 0; index < argc; index++) {
3946 VALUE obj = argv[index];
3947 enum ruby_value_type type = types[index];
3948 switch (type) {
3949 case T_FIXNUM:
3950 case T_BIGNUM: {
3951 if (!ISASCII(NUM2INT(obj))) {
3952 goto clear_cr;
3953 }
3954 break;
3955 }
3956 case T_STRING: {
3957 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3958 goto clear_cr;
3959 }
3960 break;
3961 }
3962 default:
3963 rb_bug("append_as_bytes arguments should have been validated");
3964 }
3965 }
3966 break;
3967 }
3969 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3970 goto keep_cr;
3971 }
3972 else {
3973 goto clear_cr;
3974 }
3975 break;
3976 default:
3977 goto clear_cr;
3978 break;
3979 }
3980
3981 RB_GC_GUARD(t0);
3982
3983 clear_cr:
3984 // If no fast path was hit, we clear the coderange.
3985 // append_as_bytes is predominantly meant to be used in
3986 // buffering situation, hence it's likely the coderange
3987 // will never be scanned, so it's not worth spending time
3988 // precomputing the coderange except for simple and common
3989 // situations.
3991 keep_cr:
3992 return str;
3993}
3994
3995/*
3996 * call-seq:
3997 * self << object -> self
3998 *
3999 * Appends a string representation of +object+ to +self+;
4000 * returns +self+.
4001 *
4002 * If +object+ is a string, appends it to +self+:
4003 *
4004 * s = 'foo'
4005 * s << 'bar' # => "foobar"
4006 * s # => "foobar"
4007 *
4008 * If +object+ is an integer,
4009 * its value is considered a codepoint;
4010 * converts the value to a character before concatenating:
4011 *
4012 * s = 'foo'
4013 * s << 33 # => "foo!"
4014 *
4015 * Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4016 * and the encoding of +self+ is Encoding::US_ASCII,
4017 * changes the encoding to Encoding::ASCII_8BIT:
4018 *
4019 * s = 'foo'.encode(Encoding::US_ASCII)
4020 * s.encoding # => #<Encoding:US-ASCII>
4021 * s << 0xff # => "foo\xFF"
4022 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4023 *
4024 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4025 *
4026 * s = 'foo'
4027 * s.encoding # => #<Encoding:UTF-8>
4028 * s << 0x00110000 # 1114112 out of char range (RangeError)
4029 * s = 'foo'.encode(Encoding::EUC_JP)
4030 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4031 *
4032 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4033 */
4034VALUE
4036{
4037 unsigned int code;
4038 rb_encoding *enc = STR_ENC_GET(str1);
4039 int encidx;
4040
4041 if (RB_INTEGER_TYPE_P(str2)) {
4042 if (rb_num_to_uint(str2, &code) == 0) {
4043 }
4044 else if (FIXNUM_P(str2)) {
4045 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4046 }
4047 else {
4048 rb_raise(rb_eRangeError, "bignum out of char range");
4049 }
4050 }
4051 else {
4052 return rb_str_append(str1, str2);
4053 }
4054
4055 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4056
4057 if (encidx >= 0) {
4058 rb_str_buf_cat_byte(str1, (unsigned char)code);
4059 }
4060 else {
4061 long pos = RSTRING_LEN(str1);
4062 int cr = ENC_CODERANGE(str1);
4063 int len;
4064 char *buf;
4065
4066 switch (len = rb_enc_codelen(code, enc)) {
4067 case ONIGERR_INVALID_CODE_POINT_VALUE:
4068 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4069 break;
4070 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4071 case 0:
4072 rb_raise(rb_eRangeError, "%u out of char range", code);
4073 break;
4074 }
4075 buf = ALLOCA_N(char, len + 1);
4076 rb_enc_mbcput(code, buf, enc);
4077 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4078 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4079 }
4080 rb_str_resize(str1, pos+len);
4081 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4082 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4084 }
4085 else if (cr == ENC_CODERANGE_BROKEN) {
4087 }
4088 ENC_CODERANGE_SET(str1, cr);
4089 }
4090 return str1;
4091}
4092
4093int
4094rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4095{
4096 int encidx = rb_enc_to_index(enc);
4097
4098 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4099 /* US-ASCII automatically extended to ASCII-8BIT */
4100 if (code > 0xFF) {
4101 rb_raise(rb_eRangeError, "%u out of char range", code);
4102 }
4103 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4104 return ENCINDEX_ASCII_8BIT;
4105 }
4106 return encidx;
4107 }
4108 else {
4109 return -1;
4110 }
4111}
4112
4113/*
4114 * call-seq:
4115 * prepend(*other_strings) -> self
4116 *
4117 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4118 *
4119 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4120 *
4121 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4122 *
4123 */
4124
4125static VALUE
4126rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4127{
4128 str_modifiable(str);
4129
4130 if (argc == 1) {
4131 rb_str_update(str, 0L, 0L, argv[0]);
4132 }
4133 else if (argc > 1) {
4134 int i;
4135 VALUE arg_str = rb_str_tmp_new(0);
4136 rb_enc_copy(arg_str, str);
4137 for (i = 0; i < argc; i++) {
4138 rb_str_append(arg_str, argv[i]);
4139 }
4140 rb_str_update(str, 0L, 0L, arg_str);
4141 }
4142
4143 return str;
4144}
4145
4146st_index_t
4148{
4149 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4150 st_index_t precomputed_hash;
4151 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4152
4153 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4154 return precomputed_hash;
4155 }
4156
4157 return str_do_hash(str);
4158}
4159
4160int
4162{
4163 long len1, len2;
4164 const char *ptr1, *ptr2;
4165 RSTRING_GETMEM(str1, ptr1, len1);
4166 RSTRING_GETMEM(str2, ptr2, len2);
4167 return (len1 != len2 ||
4168 !rb_str_comparable(str1, str2) ||
4169 memcmp(ptr1, ptr2, len1) != 0);
4170}
4171
4172/*
4173 * call-seq:
4174 * hash -> integer
4175 *
4176 * :include: doc/string/hash.rdoc
4177 *
4178 */
4179
4180static VALUE
4181rb_str_hash_m(VALUE str)
4182{
4183 st_index_t hval = rb_str_hash(str);
4184 return ST2FIX(hval);
4185}
4186
/* Yields the smaller of a and b.  Each argument can be evaluated twice, so
 * only pass expressions without side effects. */
4187#define lesser(a,b) (((a)>(b))?(b):(a))
4188
4189int
4191{
4192 int idx1, idx2;
4193 int rc1, rc2;
4194
4195 if (RSTRING_LEN(str1) == 0) return TRUE;
4196 if (RSTRING_LEN(str2) == 0) return TRUE;
4197 idx1 = ENCODING_GET(str1);
4198 idx2 = ENCODING_GET(str2);
4199 if (idx1 == idx2) return TRUE;
4200 rc1 = rb_enc_str_coderange(str1);
4201 rc2 = rb_enc_str_coderange(str2);
4202 if (rc1 == ENC_CODERANGE_7BIT) {
4203 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4204 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4205 return TRUE;
4206 }
4207 if (rc2 == ENC_CODERANGE_7BIT) {
4208 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4209 return TRUE;
4210 }
4211 return FALSE;
4212}
4213
4214int
4216{
4217 long len1, len2;
4218 const char *ptr1, *ptr2;
4219 int retval;
4220
4221 if (str1 == str2) return 0;
4222 RSTRING_GETMEM(str1, ptr1, len1);
4223 RSTRING_GETMEM(str2, ptr2, len2);
4224 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4225 if (len1 == len2) {
4226 if (!rb_str_comparable(str1, str2)) {
4227 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4228 return 1;
4229 return -1;
4230 }
4231 return 0;
4232 }
4233 if (len1 > len2) return 1;
4234 return -1;
4235 }
4236 if (retval > 0) return 1;
4237 return -1;
4238}
4239
4240/*
4241 * call-seq:
4242 * self == other -> true or false
4243 *
4244 * Returns whether +other+ is equal to +self+.
4245 *
4246 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4247 *
4248 * s = 'foo'
4249 * s == 'foo' # => true
4250 * s == 'food' # => false
4251 * s == 'FOO' # => false
4252 *
4253 * Returns +false+ if the two strings' encodings are not compatible:
4254 *
4255 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4256 *
4257 * When +other+ is not a string:
4258 *
4259 * - If +other+ responds to method <tt>to_str</tt>,
4260 * <tt>other == self</tt> is called and its return value is returned.
4261 * - If +other+ does not respond to <tt>to_str</tt>,
4262 * +false+ is returned.
4263 *
4264 * Related: {Comparing}[rdoc-ref:String@Comparing].
4265 */
4266
4267VALUE
4269{
4270 if (str1 == str2) return Qtrue;
4271 if (!RB_TYPE_P(str2, T_STRING)) {
4272 if (!rb_respond_to(str2, idTo_str)) {
4273 return Qfalse;
4274 }
4275 return rb_equal(str2, str1);
4276 }
4277 return rb_str_eql_internal(str1, str2);
4278}
4279
4280/*
4281 * call-seq:
4282 * eql?(object) -> true or false
4283 *
4284 * :include: doc/string/eql_p.rdoc
4285 *
4286 */
4287
4288VALUE
4289rb_str_eql(VALUE str1, VALUE str2)
4290{
4291 if (str1 == str2) return Qtrue;
4292 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4293 return rb_str_eql_internal(str1, str2);
4294}
4295
4296/*
4297 * call-seq:
4298 * self <=> other -> -1, 0, 1, or nil
4299 *
4300 * Compares +self+ and +other+,
4301 * evaluating their _contents_, not their _lengths_.
4302 *
4303 * Returns:
4304 *
4305 * - +-1+, if +self+ is smaller.
4306 * - +0+, if the two are equal.
4307 * - +1+, if +self+ is larger.
4308 * - +nil+, if the two are incomparable.
4309 *
4310 * Examples:
4311 *
4312 * 'a' <=> 'b' # => -1
4313 * 'a' <=> 'ab' # => -1
4314 * 'a' <=> 'a' # => 0
4315 * 'b' <=> 'a' # => 1
4316 * 'ab' <=> 'a' # => 1
4317 * 'a' <=> :a # => nil
4318 *
4319 * \Class \String includes module Comparable,
4320 * each of whose methods uses String#<=> for comparison.
4321 *
4322 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4323 */
4324
4325static VALUE
4326rb_str_cmp_m(VALUE str1, VALUE str2)
4327{
4328 int result;
4329 VALUE s = rb_check_string_type(str2);
4330 if (NIL_P(s)) {
4331 return rb_invcmp(str1, str2);
4332 }
4333 result = rb_str_cmp(str1, s);
4334 return INT2FIX(result);
4335}
4336
4337static VALUE str_casecmp(VALUE str1, VALUE str2);
4338static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4339
4340/*
4341 * call-seq:
4342 * casecmp(other_string) -> -1, 0, 1, or nil
4343 *
4344 * Ignoring case, compares +self+ and +other_string+; returns:
4345 *
4346 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4347 * - 0 if the two are equal.
4348 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4349 * - +nil+ if the two are incomparable.
4350 *
4351 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4352 *
4353 * Examples:
4354 *
4355 * 'foo'.casecmp('goo') # => -1
4356 * 'goo'.casecmp('foo') # => 1
4357 * 'foo'.casecmp('food') # => -1
4358 * 'food'.casecmp('foo') # => 1
4359 * 'FOO'.casecmp('foo') # => 0
4360 * 'foo'.casecmp('FOO') # => 0
4361 * 'foo'.casecmp(1) # => nil
4362 *
4363 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4364 */
4365
4366static VALUE
4367rb_str_casecmp(VALUE str1, VALUE str2)
4368{
4369 VALUE s = rb_check_string_type(str2);
4370 if (NIL_P(s)) {
4371 return Qnil;
4372 }
4373 return str_casecmp(str1, s);
4374}
4375
4376static VALUE
4377str_casecmp(VALUE str1, VALUE str2)
4378{
4379 long len;
4380 rb_encoding *enc;
4381 const char *p1, *p1end, *p2, *p2end;
4382
4383 enc = rb_enc_compatible(str1, str2);
4384 if (!enc) {
4385 return Qnil;
4386 }
4387
4388 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4389 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4390 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4391 while (p1 < p1end && p2 < p2end) {
4392 if (*p1 != *p2) {
4393 unsigned int c1 = TOLOWER(*p1 & 0xff);
4394 unsigned int c2 = TOLOWER(*p2 & 0xff);
4395 if (c1 != c2)
4396 return INT2FIX(c1 < c2 ? -1 : 1);
4397 }
4398 p1++;
4399 p2++;
4400 }
4401 }
4402 else {
4403 while (p1 < p1end && p2 < p2end) {
4404 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4405 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4406
4407 if (0 <= c1 && 0 <= c2) {
4408 c1 = TOLOWER(c1);
4409 c2 = TOLOWER(c2);
4410 if (c1 != c2)
4411 return INT2FIX(c1 < c2 ? -1 : 1);
4412 }
4413 else {
4414 int r;
4415 l1 = rb_enc_mbclen(p1, p1end, enc);
4416 l2 = rb_enc_mbclen(p2, p2end, enc);
4417 len = l1 < l2 ? l1 : l2;
4418 r = memcmp(p1, p2, len);
4419 if (r != 0)
4420 return INT2FIX(r < 0 ? -1 : 1);
4421 if (l1 != l2)
4422 return INT2FIX(l1 < l2 ? -1 : 1);
4423 }
4424 p1 += l1;
4425 p2 += l2;
4426 }
4427 }
4428 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4429 if (p1 == p1end) return INT2FIX(-1);
4430 return INT2FIX(1);
4431}
4432
4433/*
4434 * call-seq:
4435 * casecmp?(other_string) -> true, false, or nil
4436 *
4437 * Returns +true+ if +self+ and +other_string+ are equal after
4438 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4439 *
4440 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4441 *
4442 * Examples:
4443 *
4444 * 'foo'.casecmp?('goo') # => false
4445 * 'goo'.casecmp?('foo') # => false
4446 * 'foo'.casecmp?('food') # => false
4447 * 'food'.casecmp?('foo') # => false
4448 * 'FOO'.casecmp?('foo') # => true
4449 * 'foo'.casecmp?('FOO') # => true
4450 * 'foo'.casecmp?(1) # => nil
4451 *
4452 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4453 */
4454
4455static VALUE
4456rb_str_casecmp_p(VALUE str1, VALUE str2)
4457{
4458 VALUE s = rb_check_string_type(str2);
4459 if (NIL_P(s)) {
4460 return Qnil;
4461 }
4462 return str_casecmp_p(str1, s);
4463}
4464
4465static VALUE
4466str_casecmp_p(VALUE str1, VALUE str2)
4467{
4468 rb_encoding *enc;
4469 VALUE folded_str1, folded_str2;
4470 VALUE fold_opt = sym_fold;
4471
4472 enc = rb_enc_compatible(str1, str2);
4473 if (!enc) {
4474 return Qnil;
4475 }
4476
4477 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4478 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4479
4480 return rb_str_eql(folded_str1, folded_str2);
4481}
4482
4483static long
4484strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4485 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4486{
4487 const char *search_start = str_ptr;
4488 long pos, search_len = str_len - offset;
4489
4490 for (;;) {
4491 const char *t;
4492 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4493 if (pos < 0) return pos;
4494 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4495 if (t == search_start + pos) break;
4496 search_len -= t - search_start;
4497 if (search_len <= 0) return -1;
4498 offset += t - search_start;
4499 search_start = t;
4500 }
4501 return pos + offset;
4502}
4503
4504/* found index in byte */
/* Both expand to rb_strseq_index(); the last argument is its in_byte flag:
 * 0 makes offset count characters, 1 makes it count bytes. */
4505#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4506#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4507
4508static long
4509rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4510{
4511 const char *str_ptr, *str_ptr_end, *sub_ptr;
4512 long str_len, sub_len;
4513 rb_encoding *enc;
4514
4515 enc = rb_enc_check(str, sub);
4516 if (is_broken_string(sub)) return -1;
4517
4518 str_ptr = RSTRING_PTR(str);
4519 str_ptr_end = RSTRING_END(str);
4520 str_len = RSTRING_LEN(str);
4521 sub_ptr = RSTRING_PTR(sub);
4522 sub_len = RSTRING_LEN(sub);
4523
4524 if (str_len < sub_len) return -1;
4525
4526 if (offset != 0) {
4527 long str_len_char, sub_len_char;
4528 int single_byte = single_byte_optimizable(str);
4529 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4530 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4531 if (offset < 0) {
4532 offset += str_len_char;
4533 if (offset < 0) return -1;
4534 }
4535 if (str_len_char - offset < sub_len_char) return -1;
4536 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4537 str_ptr += offset;
4538 }
4539 if (sub_len == 0) return offset;
4540
4541 /* need proceed one character at a time */
4542 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4543}
4544
4545
4546/*
4547 * call-seq:
4548 * index(pattern, offset = 0) -> integer or nil
4549 *
4550 * :include: doc/string/index.rdoc
4551 *
4552 */
4553
4554static VALUE
4555rb_str_index_m(int argc, VALUE *argv, VALUE str)
4556{
4557 VALUE sub;
4558 VALUE initpos;
4559 rb_encoding *enc = STR_ENC_GET(str);
4560 long pos;
4561
4562 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4563 long slen = str_strlen(str, enc); /* str's enc */
4564 pos = NUM2LONG(initpos);
4565 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4566 if (RB_TYPE_P(sub, T_REGEXP)) {
4568 }
4569 return Qnil;
4570 }
4571 }
4572 else {
4573 pos = 0;
4574 }
4575
4576 if (RB_TYPE_P(sub, T_REGEXP)) {
4577 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4578 enc, single_byte_optimizable(str));
4579
4580 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4581 VALUE match = rb_backref_get();
4582 struct re_registers *regs = RMATCH_REGS(match);
4583 pos = rb_str_sublen(str, BEG(0));
4584 return LONG2NUM(pos);
4585 }
4586 }
4587 else {
4588 StringValue(sub);
4589 pos = rb_str_index(str, sub, pos);
4590 if (pos >= 0) {
4591 pos = rb_str_sublen(str, pos);
4592 return LONG2NUM(pos);
4593 }
4594 }
4595 return Qnil;
4596}
4597
4598/* Ensure that the given pos is a valid character boundary.
4599 * Note that in this function, "character" means a code point
4600 * (Unicode scalar value), not a grapheme cluster.
4601 */
4602static void
4603str_ensure_byte_pos(VALUE str, long pos)
4604{
4605 if (!single_byte_optimizable(str)) {
4606 const char *s = RSTRING_PTR(str);
4607 const char *e = RSTRING_END(str);
4608 const char *p = s + pos;
4609 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4610 rb_raise(rb_eIndexError,
4611 "offset %ld does not land on character boundary", pos);
4612 }
4613 }
4614}
4615
4616/*
4617 * call-seq:
4618 * byteindex(object, offset = 0) -> integer or nil
4619 *
4620 * Returns the 0-based integer index of a substring of +self+
4621 * specified by +object+ (a string or Regexp) and +offset+,
4622 * or +nil+ if there is no such substring;
4623 * the returned index is the count of _bytes_ (not characters).
4624 *
4625 * When +object+ is a string,
4626 * returns the index of the first found substring equal to +object+:
4627 *
4628 * s = 'foo' # => "foo"
4629 * s.size # => 3 # Three 1-byte characters.
4630 * s.bytesize # => 3 # Three bytes.
4631 * s.byteindex('f') # => 0
4632 * s.byteindex('o') # => 1
4633 * s.byteindex('oo') # => 1
4634 * s.byteindex('ooo') # => nil
4635 *
4636 * When +object+ is a Regexp,
4637 * returns the index of the first found substring matching +object+;
4638 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4639 *
4640 * s = 'foo'
4641 * s.byteindex(/f/) # => 0
4642 * $~ # => #<MatchData "f">
4643 * s.byteindex(/o/) # => 1
4644 * s.byteindex(/oo/) # => 1
4645 * s.byteindex(/ooo/) # => nil
4646 * $~ # => nil
4647 *
4648 * \Integer argument +offset+, if given, specifies the 0-based index
4649 * of the byte where searching is to begin.
4650 *
4651 * When +offset+ is non-negative,
4652 * searching begins at byte position +offset+:
4653 *
4654 * s = 'foo'
4655 * s.byteindex('o', 1) # => 1
4656 * s.byteindex('o', 2) # => 2
4657 * s.byteindex('o', 3) # => nil
4658 *
4659 * When +offset+ is negative, counts backward from the end of +self+:
4660 *
4661 * s = 'foo'
4662 * s.byteindex('o', -1) # => 2
4663 * s.byteindex('o', -2) # => 1
4664 * s.byteindex('o', -3) # => 1
4665 * s.byteindex('o', -4) # => nil
4666 *
4667 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4668 *
4669 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4670 * s.size # => 2 # Two 3-byte characters.
4671 * s.bytesize # => 6 # Six bytes.
4672 * s.byteindex("\uFFFF") # => 0
4673 * s.byteindex("\uFFFF", 1) # Raises IndexError
4674 * s.byteindex("\uFFFF", 2) # Raises IndexError
4675 * s.byteindex("\uFFFF", 3) # => 3
4676 * s.byteindex("\uFFFF", 4) # Raises IndexError
4677 * s.byteindex("\uFFFF", 5) # Raises IndexError
4678 * s.byteindex("\uFFFF", 6) # => nil
4679 *
4680 * Related: see {Querying}[rdoc-ref:String@Querying].
4681 */
4682
4683static VALUE
4684rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4685{
4686 VALUE sub;
4687 VALUE initpos;
4688 long pos;
4689
4690 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4691 long slen = RSTRING_LEN(str);
4692 pos = NUM2LONG(initpos);
4693 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4694 if (RB_TYPE_P(sub, T_REGEXP)) {
4696 }
4697 return Qnil;
4698 }
4699 }
4700 else {
4701 pos = 0;
4702 }
4703
4704 str_ensure_byte_pos(str, pos);
4705
4706 if (RB_TYPE_P(sub, T_REGEXP)) {
4707 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4708 VALUE match = rb_backref_get();
4709 struct re_registers *regs = RMATCH_REGS(match);
4710 pos = BEG(0);
4711 return LONG2NUM(pos);
4712 }
4713 }
4714 else {
4715 StringValue(sub);
4716 pos = rb_str_byteindex(str, sub, pos);
4717 if (pos >= 0) return LONG2NUM(pos);
4718 }
4719 return Qnil;
4720}
4721
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(): scans the search_len bytes at
 * search_str from the end towards the start and returns a pointer to the
 * last byte equal to chr, or NULL when there is none.  Bytes are compared as
 * unsigned char values.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const char *cursor = search_str + search_len;

    while (cursor > search_str) {
        --cursor;
        if ((unsigned char)*cursor == chr) return (void *)cursor;
    }

    return NULL;
}
#endif
4734
4735static long
4736str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4737{
4738 char *hit, *adjusted;
4739 int c;
4740 long slen, searchlen;
4741 char *sbeg, *e, *t;
4742
4743 sbeg = RSTRING_PTR(str);
4744 slen = RSTRING_LEN(sub);
4745 if (slen == 0) return s - sbeg;
4746 e = RSTRING_END(str);
4747 t = RSTRING_PTR(sub);
4748 c = *t & 0xff;
4749 searchlen = s - sbeg + 1;
4750
4751 if (memcmp(s, t, slen) == 0) {
4752 return s - sbeg;
4753 }
4754
4755 do {
4756 hit = memrchr(sbeg, c, searchlen);
4757 if (!hit) break;
4758 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4759 if (hit != adjusted) {
4760 searchlen = adjusted - sbeg;
4761 continue;
4762 }
4763 if (memcmp(hit, t, slen) == 0)
4764 return hit - sbeg;
4765 searchlen = adjusted - sbeg;
4766 } while (searchlen > 0);
4767
4768 return -1;
4769}
4770
4771/* found index in byte */
4772static long
4773rb_str_rindex(VALUE str, VALUE sub, long pos)
4774{
4775 long len, slen;
4776 char *sbeg, *s;
4777 rb_encoding *enc;
4778 int singlebyte;
4779
4780 enc = rb_enc_check(str, sub);
4781 if (is_broken_string(sub)) return -1;
4782 singlebyte = single_byte_optimizable(str);
4783 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4784 slen = str_strlen(sub, enc); /* rb_enc_check */
4785
4786 /* substring longer than string */
4787 if (len < slen) return -1;
4788 if (len - pos < slen) pos = len - slen;
4789 if (len == 0) return pos;
4790
4791 sbeg = RSTRING_PTR(str);
4792
4793 if (pos == 0) {
4794 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4795 return 0;
4796 else
4797 return -1;
4798 }
4799
4800 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4801 return str_rindex(str, sub, s, enc);
4802}
4803
4804/*
4805 * call-seq:
4806 * rindex(pattern, offset = self.length) -> integer or nil
4807 *
4808 * :include:doc/string/rindex.rdoc
4809 *
4810 */
4811
4812static VALUE
4813rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4814{
4815 VALUE sub;
4816 VALUE initpos;
4817 rb_encoding *enc = STR_ENC_GET(str);
4818 long pos, len = str_strlen(str, enc); /* str's enc */
4819
4820 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4821 pos = NUM2LONG(initpos);
4822 if (pos < 0 && (pos += len) < 0) {
4823 if (RB_TYPE_P(sub, T_REGEXP)) {
4825 }
4826 return Qnil;
4827 }
4828 if (pos > len) pos = len;
4829 }
4830 else {
4831 pos = len;
4832 }
4833
4834 if (RB_TYPE_P(sub, T_REGEXP)) {
4835 /* enc = rb_enc_check(str, sub); */
4836 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4837 enc, single_byte_optimizable(str));
4838
4839 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4840 VALUE match = rb_backref_get();
4841 struct re_registers *regs = RMATCH_REGS(match);
4842 pos = rb_str_sublen(str, BEG(0));
4843 return LONG2NUM(pos);
4844 }
4845 }
4846 else {
4847 StringValue(sub);
4848 pos = rb_str_rindex(str, sub, pos);
4849 if (pos >= 0) {
4850 pos = rb_str_sublen(str, pos);
4851 return LONG2NUM(pos);
4852 }
4853 }
4854 return Qnil;
4855}
4856
4857static long
4858rb_str_byterindex(VALUE str, VALUE sub, long pos)
4859{
4860 long len, slen;
4861 char *sbeg, *s;
4862 rb_encoding *enc;
4863
4864 enc = rb_enc_check(str, sub);
4865 if (is_broken_string(sub)) return -1;
4866 len = RSTRING_LEN(str);
4867 slen = RSTRING_LEN(sub);
4868
4869 /* substring longer than string */
4870 if (len < slen) return -1;
4871 if (len - pos < slen) pos = len - slen;
4872 if (len == 0) return pos;
4873
4874 sbeg = RSTRING_PTR(str);
4875
4876 if (pos == 0) {
4877 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4878 return 0;
4879 else
4880 return -1;
4881 }
4882
4883 s = sbeg + pos;
4884 return str_rindex(str, sub, s, enc);
4885}
4886
4887/*
4888 * call-seq:
4889 * byterindex(object, offset = self.bytesize) -> integer or nil
4890 *
4891 * Returns the 0-based integer index of a substring of +self+
4892 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4893 * or +nil+ if there is no such substring;
4894 * the returned index is the count of _bytes_ (not characters).
4895 *
4896 * When +object+ is a string,
4897 * returns the index of the _last_ found substring equal to +object+:
4898 *
4899 * s = 'foo' # => "foo"
4900 * s.size # => 3 # Three 1-byte characters.
4901 * s.bytesize # => 3 # Three bytes.
4902 * s.byterindex('f') # => 0
4903 * s.byterindex('o') # => 2
4904 * s.byterindex('oo') # => 1
4905 * s.byterindex('ooo') # => nil
4906 *
4907 * When +object+ is a Regexp,
4908 * returns the index of the last found substring matching +object+;
4909 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4910 *
4911 * s = 'foo'
4912 * s.byterindex(/f/) # => 0
4913 * $~ # => #<MatchData "f">
4914 * s.byterindex(/o/) # => 2
4915 * s.byterindex(/oo/) # => 1
4916 * s.byterindex(/ooo/) # => nil
4917 * $~ # => nil
4918 *
4919 * The last match means starting at the possible last position,
4920 * not the last of the longest matches:
4921 *
4922 * s = 'foo'
4923 * s.byterindex(/o+/) # => 2
4924 * $~ #=> #<MatchData "o">
4925 *
4926 * To get the last longest match, use a negative lookbehind:
4927 *
4928 * s = 'foo'
4929 * s.byterindex(/(?<!o)o+/) # => 1
4930 * $~ # => #<MatchData "oo">
4931 *
4932 * Or use method #byteindex with negative lookahead:
4933 *
4934 * s = 'foo'
4935 * s.byteindex(/o+(?!.*o)/) # => 1
4936 * $~ #=> #<MatchData "oo">
4937 *
4938 * \Integer argument +offset+, if given, specifies the 0-based index
4939 * of the byte where searching is to end.
4940 *
4941 * When +offset+ is non-negative,
4942 * searching ends at byte position +offset+:
4943 *
4944 * s = 'foo'
4945 * s.byterindex('o', 0) # => nil
4946 * s.byterindex('o', 1) # => 1
4947 * s.byterindex('o', 2) # => 2
4948 * s.byterindex('o', 3) # => 2
4949 *
4950 * When +offset+ is negative, counts backward from the end of +self+:
4951 *
4952 * s = 'foo'
4953 * s.byterindex('o', -1) # => 2
4954 * s.byterindex('o', -2) # => 1
4955 * s.byterindex('o', -3) # => nil
4956 *
4957 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4958 *
4959 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4960 * s.size # => 2 # Two 3-byte characters.
4961 * s.bytesize # => 6 # Six bytes.
4962 * s.byterindex("\uFFFF") # => 3
4963 * s.byterindex("\uFFFF", 1) # Raises IndexError
4964 * s.byterindex("\uFFFF", 2) # Raises IndexError
4965 * s.byterindex("\uFFFF", 3) # => 3
4966 * s.byterindex("\uFFFF", 4) # Raises IndexError
4967 * s.byterindex("\uFFFF", 5) # Raises IndexError
4968 * s.byterindex("\uFFFF", 6) # => 3
4969 *
4970 * Related: see {Querying}[rdoc-ref:String@Querying].
4971 */
4972
4973static VALUE
4974rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4975{
4976 VALUE sub;
4977 VALUE initpos;
4978 long pos, len = RSTRING_LEN(str);
4979
4980 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4981 pos = NUM2LONG(initpos);
4982 if (pos < 0 && (pos += len) < 0) {
4983 if (RB_TYPE_P(sub, T_REGEXP)) {
4985 }
4986 return Qnil;
4987 }
4988 if (pos > len) pos = len;
4989 }
4990 else {
4991 pos = len;
4992 }
4993
4994 str_ensure_byte_pos(str, pos);
4995
4996 if (RB_TYPE_P(sub, T_REGEXP)) {
4997 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4998 VALUE match = rb_backref_get();
4999 struct re_registers *regs = RMATCH_REGS(match);
5000 pos = BEG(0);
5001 return LONG2NUM(pos);
5002 }
5003 }
5004 else {
5005 StringValue(sub);
5006 pos = rb_str_byterindex(str, sub, pos);
5007 if (pos >= 0) return LONG2NUM(pos);
5008 }
5009 return Qnil;
5010}
5011
5012/*
5013 * call-seq:
5014 * self =~ other -> integer or nil
5015 *
5016 * When +other+ is a Regexp:
5017 *
5018 * - Returns the integer index (in characters) of the first match
5019 * for +self+ and +other+, or +nil+ if none;
5020 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5021 *
5022 * Examples:
5023 *
5024 * 'foo' =~ /f/ # => 0
5025 * $~ # => #<MatchData "f">
5026 * 'foo' =~ /o/ # => 1
5027 * $~ # => #<MatchData "o">
5028 * 'foo' =~ /x/ # => nil
5029 * $~ # => nil
5030 *
5031 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5032 * (see Regexp#=~):
5033 *
5034 * number = nil
5035 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5036 * number # => nil # Not assigned.
5037 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5038 * number # => "9" # Assigned.
5039 *
5040 * When +other+ is not a Regexp, returns the value
5041 * returned by <tt>other =~ self</tt>.
5042 *
5043 * Related: see {Querying}[rdoc-ref:String@Querying].
5044 */
5045
5046static VALUE
5047rb_str_match(VALUE x, VALUE y)
5048{
5049 switch (OBJ_BUILTIN_TYPE(y)) {
5050 case T_STRING:
5051 rb_raise(rb_eTypeError, "type mismatch: String given");
5052
5053 case T_REGEXP:
5054 return rb_reg_match(y, x);
5055
5056 default:
5057 return rb_funcall(y, idEqTilde, 1, x);
5058 }
5059}
5060
5061
5062static VALUE get_pat(VALUE);
5063
5064
5065/*
5066 * call-seq:
5067 * match(pattern, offset = 0) -> matchdata or nil
5068 * match(pattern, offset = 0) {|matchdata| ... } -> object
5069 *
5070 * Creates a MatchData object based on +self+ and the given arguments;
5071 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5072 *
5073 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5074 *
5075 * regexp = Regexp.new(pattern)
5076 *
5077 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5078 * (see Regexp#match):
5079 *
5080 * matchdata = regexp.match(self[offset..])
5081 *
5082 * With no block given, returns the computed +matchdata+ or +nil+:
5083 *
5084 * 'foo'.match('f') # => #<MatchData "f">
5085 * 'foo'.match('o') # => #<MatchData "o">
5086 * 'foo'.match('x') # => nil
5087 * 'foo'.match('f', 1) # => nil
5088 * 'foo'.match('o', 1) # => #<MatchData "o">
5089 *
5090 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5091 * returns the block's return value:
5092 *
5093 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5094 *
5095 * With a block given and +nil+ +matchdata+, does not call the block:
5096 *
5097 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5098 *
5099 * Related: see {Querying}[rdoc-ref:String@Querying].
5100 */
5101
5102static VALUE
5103rb_str_match_m(int argc, VALUE *argv, VALUE str)
5104{
5105 VALUE re, result;
5106 if (argc < 1)
5107 rb_check_arity(argc, 1, 2);
5108 re = argv[0];
5109 argv[0] = str;
5110 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5111 if (!NIL_P(result) && rb_block_given_p()) {
5112 return rb_yield(result);
5113 }
5114 return result;
5115}
5116
5117/*
5118 * call-seq:
5119 * match?(pattern, offset = 0) -> true or false
5120 *
5121 * Returns whether a match is found for +self+ and the given arguments;
5122 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5123 *
5124 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5125 *
5126 * regexp = Regexp.new(pattern)
5127 *
5128 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5129 * +false+ otherwise:
5130 *
5131 * 'foo'.match?(/o/) # => true
5132 * 'foo'.match?('o') # => true
5133 * 'foo'.match?(/x/) # => false
5134 * 'foo'.match?('f', 1) # => false
5135 * 'foo'.match?('o', 1) # => true
5136 *
5137 * Related: see {Querying}[rdoc-ref:String@Querying].
5138 */
5139
5140static VALUE
5141rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5142{
5143 VALUE re;
5144 rb_check_arity(argc, 1, 2);
5145 re = get_pat(argv[0]);
5146 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5147}
5148
/* Result of stepping a character to the next or previous one
 * (returned by enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* the bytes do not form a usable character */
    NEIGHBOR_FOUND    = 1, /* the neighbouring character was written */
    NEIGHBOR_WRAPPED  = 2  /* no neighbour of that length: wrapped around */
};
5154
5155static enum neighbor_char
5156enc_succ_char(char *p, long len, rb_encoding *enc)
5157{
5158 long i;
5159 int l;
5160
5161 if (rb_enc_mbminlen(enc) > 1) {
5162 /* wchar, trivial case */
5163 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5164 if (!MBCLEN_CHARFOUND_P(r)) {
5165 return NEIGHBOR_NOT_CHAR;
5166 }
5167 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5168 l = rb_enc_code_to_mbclen(c, enc);
5169 if (!l) return NEIGHBOR_NOT_CHAR;
5170 if (l != len) return NEIGHBOR_WRAPPED;
5171 rb_enc_mbcput(c, p, enc);
5172 r = rb_enc_precise_mbclen(p, p + len, enc);
5173 if (!MBCLEN_CHARFOUND_P(r)) {
5174 return NEIGHBOR_NOT_CHAR;
5175 }
5176 return NEIGHBOR_FOUND;
5177 }
5178 while (1) {
5179 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5180 p[i] = '\0';
5181 if (i < 0)
5182 return NEIGHBOR_WRAPPED;
5183 ++((unsigned char*)p)[i];
5184 l = rb_enc_precise_mbclen(p, p+len, enc);
5185 if (MBCLEN_CHARFOUND_P(l)) {
5186 l = MBCLEN_CHARFOUND_LEN(l);
5187 if (l == len) {
5188 return NEIGHBOR_FOUND;
5189 }
5190 else {
5191 memset(p+l, 0xff, len-l);
5192 }
5193 }
5194 if (MBCLEN_INVALID_P(l) && i < len-1) {
5195 long len2;
5196 int l2;
5197 for (len2 = len-1; 0 < len2; len2--) {
5198 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5199 if (!MBCLEN_INVALID_P(l2))
5200 break;
5201 }
5202 memset(p+len2+1, 0xff, len-(len2+1));
5203 }
5204 }
5205}
5206
5207static enum neighbor_char
5208enc_pred_char(char *p, long len, rb_encoding *enc)
5209{
5210 long i;
5211 int l;
5212 if (rb_enc_mbminlen(enc) > 1) {
5213 /* wchar, trivial case */
5214 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5215 if (!MBCLEN_CHARFOUND_P(r)) {
5216 return NEIGHBOR_NOT_CHAR;
5217 }
5218 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5219 if (!c) return NEIGHBOR_NOT_CHAR;
5220 --c;
5221 l = rb_enc_code_to_mbclen(c, enc);
5222 if (!l) return NEIGHBOR_NOT_CHAR;
5223 if (l != len) return NEIGHBOR_WRAPPED;
5224 rb_enc_mbcput(c, p, enc);
5225 r = rb_enc_precise_mbclen(p, p + len, enc);
5226 if (!MBCLEN_CHARFOUND_P(r)) {
5227 return NEIGHBOR_NOT_CHAR;
5228 }
5229 return NEIGHBOR_FOUND;
5230 }
5231 while (1) {
5232 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5233 p[i] = '\xff';
5234 if (i < 0)
5235 return NEIGHBOR_WRAPPED;
5236 --((unsigned char*)p)[i];
5237 l = rb_enc_precise_mbclen(p, p+len, enc);
5238 if (MBCLEN_CHARFOUND_P(l)) {
5239 l = MBCLEN_CHARFOUND_LEN(l);
5240 if (l == len) {
5241 return NEIGHBOR_FOUND;
5242 }
5243 else {
5244 memset(p+l, 0, len-l);
5245 }
5246 }
5247 if (MBCLEN_INVALID_P(l) && i < len-1) {
5248 long len2;
5249 int l2;
5250 for (len2 = len-1; 0 < len2; len2--) {
5251 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5252 if (!MBCLEN_INVALID_P(l2))
5253 break;
5254 }
5255 memset(p+len2+1, 0, len-(len2+1));
5256 }
5257 }
5258}
5259
5260/*
5261 overwrite +p+ by succeeding letter in +enc+ and returns
5262 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5263 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5264 assuming each ranges are successive, and mbclen
5265 never change in each ranges.
5266 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5267 character.
5268 */
5269static enum neighbor_char
5270enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5271{
5272 enum neighbor_char ret;
5273 unsigned int c;
5274 int ctype;
5275 int range;
5276 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5277
5278 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5279 int try;
5280 const int max_gaps = 1;
5281
5282 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5283 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5284 ctype = ONIGENC_CTYPE_DIGIT;
5285 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5286 ctype = ONIGENC_CTYPE_ALPHA;
5287 else
5288 return NEIGHBOR_NOT_CHAR;
5289
5290 MEMCPY(save, p, char, len);
5291 for (try = 0; try <= max_gaps; ++try) {
5292 ret = enc_succ_char(p, len, enc);
5293 if (ret == NEIGHBOR_FOUND) {
5294 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5295 if (rb_enc_isctype(c, ctype, enc))
5296 return NEIGHBOR_FOUND;
5297 }
5298 }
5299 MEMCPY(p, save, char, len);
5300 range = 1;
5301 while (1) {
5302 MEMCPY(save, p, char, len);
5303 ret = enc_pred_char(p, len, enc);
5304 if (ret == NEIGHBOR_FOUND) {
5305 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5306 if (!rb_enc_isctype(c, ctype, enc)) {
5307 MEMCPY(p, save, char, len);
5308 break;
5309 }
5310 }
5311 else {
5312 MEMCPY(p, save, char, len);
5313 break;
5314 }
5315 range++;
5316 }
5317 if (range == 1) {
5318 return NEIGHBOR_NOT_CHAR;
5319 }
5320
5321 if (ctype != ONIGENC_CTYPE_DIGIT) {
5322 MEMCPY(carry, p, char, len);
5323 return NEIGHBOR_WRAPPED;
5324 }
5325
5326 MEMCPY(carry, p, char, len);
5327 enc_succ_char(carry, len, enc);
5328 return NEIGHBOR_WRAPPED;
5329}
5330
5331
5332static VALUE str_succ(VALUE str);
5333
5334/*
5335 * call-seq:
5336 * succ -> new_str
5337 *
5338 * :include: doc/string/succ.rdoc
5339 *
5340 */
5341
5342VALUE
5344{
5345 VALUE str;
5346 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5347 rb_enc_cr_str_copy_for_substr(str, orig);
5348 return str_succ(str);
5349}
5350
5351static VALUE
5352str_succ(VALUE str)
5353{
5354 rb_encoding *enc;
5355 char *sbeg, *s, *e, *last_alnum = 0;
5356 int found_alnum = 0;
5357 long l, slen;
5358 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5359 long carry_pos = 0, carry_len = 1;
5360 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5361
5362 slen = RSTRING_LEN(str);
5363 if (slen == 0) return str;
5364
5365 enc = STR_ENC_GET(str);
5366 sbeg = RSTRING_PTR(str);
5367 s = e = sbeg + slen;
5368
5369 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5370 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5371 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5372 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5373 break;
5374 }
5375 }
5376 l = rb_enc_precise_mbclen(s, e, enc);
5377 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5378 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5379 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5380 switch (neighbor) {
5381 case NEIGHBOR_NOT_CHAR:
5382 continue;
5383 case NEIGHBOR_FOUND:
5384 return str;
5385 case NEIGHBOR_WRAPPED:
5386 last_alnum = s;
5387 break;
5388 }
5389 found_alnum = 1;
5390 carry_pos = s - sbeg;
5391 carry_len = l;
5392 }
5393 if (!found_alnum) { /* str contains no alnum */
5394 s = e;
5395 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5396 enum neighbor_char neighbor;
5397 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5398 l = rb_enc_precise_mbclen(s, e, enc);
5399 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5400 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5401 MEMCPY(tmp, s, char, l);
5402 neighbor = enc_succ_char(tmp, l, enc);
5403 switch (neighbor) {
5404 case NEIGHBOR_FOUND:
5405 MEMCPY(s, tmp, char, l);
5406 return str;
5407 break;
5408 case NEIGHBOR_WRAPPED:
5409 MEMCPY(s, tmp, char, l);
5410 break;
5411 case NEIGHBOR_NOT_CHAR:
5412 break;
5413 }
5414 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5415 /* wrapped to \0...\0. search next valid char. */
5416 enc_succ_char(s, l, enc);
5417 }
5418 if (!rb_enc_asciicompat(enc)) {
5419 MEMCPY(carry, s, char, l);
5420 carry_len = l;
5421 }
5422 carry_pos = s - sbeg;
5423 }
5425 }
5426 RESIZE_CAPA(str, slen + carry_len);
5427 sbeg = RSTRING_PTR(str);
5428 s = sbeg + carry_pos;
5429 memmove(s + carry_len, s, slen - carry_pos);
5430 memmove(s, carry, carry_len);
5431 slen += carry_len;
5432 STR_SET_LEN(str, slen);
5433 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5435 return str;
5436}
5437
5438
5439/*
5440 * call-seq:
5441 * succ! -> self
5442 *
5443 * Like String#succ, but modifies +self+ in place; returns +self+.
5444 *
5445 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5446 */
5447
5448static VALUE
5449rb_str_succ_bang(VALUE str)
5450{
5451 rb_str_modify(str);
5452 str_succ(str);
5453 return str;
5454}
5455
/* Returns 1 if each of the +len+ bytes at +s+ satisfies ISDIGIT (also when
 * +len+ is not positive), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5465
5466static int
5467str_upto_i(VALUE str, VALUE arg)
5468{
5469 rb_yield(str);
5470 return 0;
5471}
5472
5473/*
5474 * call-seq:
5475 * upto(other_string, exclusive = false) {|string| ... } -> self
5476 * upto(other_string, exclusive = false) -> new_enumerator
5477 *
5478 * :include: doc/string/upto.rdoc
5479 *
5480 */
5481
5482static VALUE
5483rb_str_upto(int argc, VALUE *argv, VALUE beg)
5484{
5485 VALUE end, exclusive;
5486
5487 rb_scan_args(argc, argv, "11", &end, &exclusive);
5488 RETURN_ENUMERATOR(beg, argc, argv);
5489 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5490}
5491
5492VALUE
5493rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5494{
5495 VALUE current, after_end;
5496 ID succ;
5497 int n, ascii;
5498 rb_encoding *enc;
5499
5500 CONST_ID(succ, "succ");
5501 StringValue(end);
5502 enc = rb_enc_check(beg, end);
5503 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5504 /* single character */
5505 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5506 char c = RSTRING_PTR(beg)[0];
5507 char e = RSTRING_PTR(end)[0];
5508
5509 if (c > e || (excl && c == e)) return beg;
5510 for (;;) {
5511 VALUE str = rb_enc_str_new(&c, 1, enc);
5513 if ((*each)(str, arg)) break;
5514 if (!excl && c == e) break;
5515 c++;
5516 if (excl && c == e) break;
5517 }
5518 return beg;
5519 }
5520 /* both edges are all digits */
5521 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5522 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5523 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5524 VALUE b, e;
5525 int width;
5526
5527 width = RSTRING_LENINT(beg);
5528 b = rb_str_to_inum(beg, 10, FALSE);
5529 e = rb_str_to_inum(end, 10, FALSE);
5530 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5531 long bi = FIX2LONG(b);
5532 long ei = FIX2LONG(e);
5533 rb_encoding *usascii = rb_usascii_encoding();
5534
5535 while (bi <= ei) {
5536 if (excl && bi == ei) break;
5537 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5538 bi++;
5539 }
5540 }
5541 else {
5542 ID op = excl ? '<' : idLE;
5543 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5544
5545 args[0] = INT2FIX(width);
5546 while (rb_funcall(b, op, 1, e)) {
5547 args[1] = b;
5548 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5549 b = rb_funcallv(b, succ, 0, 0);
5550 }
5551 }
5552 return beg;
5553 }
5554 /* normal case */
5555 n = rb_str_cmp(beg, end);
5556 if (n > 0 || (excl && n == 0)) return beg;
5557
5558 after_end = rb_funcallv(end, succ, 0, 0);
5559 current = str_duplicate(rb_cString, beg);
5560 while (!rb_str_equal(current, after_end)) {
5561 VALUE next = Qnil;
5562 if (excl || !rb_str_equal(current, end))
5563 next = rb_funcallv(current, succ, 0, 0);
5564 if ((*each)(current, arg)) break;
5565 if (NIL_P(next)) break;
5566 current = next;
5567 StringValue(current);
5568 if (excl && rb_str_equal(current, end)) break;
5569 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5570 break;
5571 }
5572
5573 return beg;
5574}
5575
5576VALUE
5577rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5578{
5579 VALUE current;
5580 ID succ;
5581
5582 CONST_ID(succ, "succ");
5583 /* both edges are all digits */
5584 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5585 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5586 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5587 int width = RSTRING_LENINT(beg);
5588 b = rb_str_to_inum(beg, 10, FALSE);
5589 if (FIXNUM_P(b)) {
5590 long bi = FIX2LONG(b);
5591 rb_encoding *usascii = rb_usascii_encoding();
5592
5593 while (FIXABLE(bi)) {
5594 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5595 bi++;
5596 }
5597 b = LONG2NUM(bi);
5598 }
5599 args[0] = INT2FIX(width);
5600 while (1) {
5601 args[1] = b;
5602 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5603 b = rb_funcallv(b, succ, 0, 0);
5604 }
5605 }
5606 /* normal case */
5607 current = str_duplicate(rb_cString, beg);
5608 while (1) {
5609 VALUE next = rb_funcallv(current, succ, 0, 0);
5610 if ((*each)(current, arg)) break;
5611 current = next;
5612 StringValue(current);
5613 if (RSTRING_LEN(current) == 0)
5614 break;
5615 }
5616
5617 return beg;
5618}
5619
5620static int
5621include_range_i(VALUE str, VALUE arg)
5622{
5623 VALUE *argp = (VALUE *)arg;
5624 if (!rb_equal(str, *argp)) return 0;
5625 *argp = Qnil;
5626 return 1;
5627}
5628
5629VALUE
5630rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5631{
5632 beg = rb_str_new_frozen(beg);
5633 StringValue(end);
5634 end = rb_str_new_frozen(end);
5635 if (NIL_P(val)) return Qfalse;
5636 val = rb_check_string_type(val);
5637 if (NIL_P(val)) return Qfalse;
5638 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5639 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5640 rb_enc_asciicompat(STR_ENC_GET(val))) {
5641 const char *bp = RSTRING_PTR(beg);
5642 const char *ep = RSTRING_PTR(end);
5643 const char *vp = RSTRING_PTR(val);
5644 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5645 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5646 return Qfalse;
5647 else {
5648 char b = *bp;
5649 char e = *ep;
5650 char v = *vp;
5651
5652 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5653 if (b <= v && v < e) return Qtrue;
5654 return RBOOL(!RTEST(exclusive) && v == e);
5655 }
5656 }
5657 }
5658#if 0
5659 /* both edges are all digits */
5660 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5661 all_digits_p(bp, RSTRING_LEN(beg)) &&
5662 all_digits_p(ep, RSTRING_LEN(end))) {
5663 /* TODO */
5664 }
5665#endif
5666 }
5667 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5668
5669 return RBOOL(NIL_P(val));
5670}
5671
5672static VALUE
5673rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5674{
5675 if (rb_reg_search(re, str, 0, 0) >= 0) {
5676 VALUE match = rb_backref_get();
5677 int nth = rb_reg_backref_number(match, backref);
5678 return rb_reg_nth_match(nth, match);
5679 }
5680 return Qnil;
5681}
5682
5683static VALUE
5684rb_str_aref(VALUE str, VALUE indx)
5685{
5686 long idx;
5687
5688 if (FIXNUM_P(indx)) {
5689 idx = FIX2LONG(indx);
5690 }
5691 else if (RB_TYPE_P(indx, T_REGEXP)) {
5692 return rb_str_subpat(str, indx, INT2FIX(0));
5693 }
5694 else if (RB_TYPE_P(indx, T_STRING)) {
5695 if (rb_str_index(str, indx, 0) != -1)
5696 return str_duplicate(rb_cString, indx);
5697 return Qnil;
5698 }
5699 else {
5700 /* check if indx is Range */
5701 long beg, len = str_strlen(str, NULL);
5702 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5703 case Qfalse:
5704 break;
5705 case Qnil:
5706 return Qnil;
5707 default:
5708 return rb_str_substr(str, beg, len);
5709 }
5710 idx = NUM2LONG(indx);
5711 }
5712
5713 return str_substr(str, idx, 1, FALSE);
5714}
5715
5716
5717/*
5718 * call-seq:
5719 * self[offset] -> new_string or nil
5720 * self[offset, size] -> new_string or nil
5721 * self[range] -> new_string or nil
5722 * self[regexp, capture = 0] -> new_string or nil
5723 * self[substring] -> new_string or nil
5724 *
5725 * :include: doc/string/aref.rdoc
5726 *
5727 */
5728
5729static VALUE
5730rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5731{
5732 if (argc == 2) {
5733 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5734 return rb_str_subpat(str, argv[0], argv[1]);
5735 }
5736 else {
5737 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5738 }
5739 }
5740 rb_check_arity(argc, 1, 2);
5741 return rb_str_aref(str, argv[0]);
5742}
5743
5744VALUE
5746{
5747 char *ptr = RSTRING_PTR(str);
5748 long olen = RSTRING_LEN(str), nlen;
5749
5750 str_modifiable(str);
5751 if (len > olen) len = olen;
5752 nlen = olen - len;
5753 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5754 char *oldptr = ptr;
5755 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5756 STR_SET_EMBED(str);
5757 ptr = RSTRING(str)->as.embed.ary;
5758 memmove(ptr, oldptr + len, nlen);
5759 if (fl == STR_NOEMBED) xfree(oldptr);
5760 }
5761 else {
5762 if (!STR_SHARED_P(str)) {
5763 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5764 rb_enc_cr_str_exact_copy(shared, str);
5765 OBJ_FREEZE(shared);
5766 }
5767 ptr = RSTRING(str)->as.heap.ptr += len;
5768 }
5769 STR_SET_LEN(str, nlen);
5770
5771 if (!SHARABLE_MIDDLE_SUBSTRING) {
5772 TERM_FILL(ptr + nlen, TERM_LEN(str));
5773 }
5775 return str;
5776}
5777
5778static void
5779rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5780{
5781 char *sptr;
5782 long slen;
5783 int cr;
5784
5785 if (beg == 0 && vlen == 0) {
5786 rb_str_drop_bytes(str, len);
5787 return;
5788 }
5789
5790 str_modify_keep_cr(str);
5791 RSTRING_GETMEM(str, sptr, slen);
5792 if (len < vlen) {
5793 /* expand string */
5794 RESIZE_CAPA(str, slen + vlen - len);
5795 sptr = RSTRING_PTR(str);
5796 }
5797
5799 cr = rb_enc_str_coderange(val);
5800 else
5802
5803 if (vlen != len) {
5804 memmove(sptr + beg + vlen,
5805 sptr + beg + len,
5806 slen - (beg + len));
5807 }
5808 if (vlen < beg && len < 0) {
5809 MEMZERO(sptr + slen, char, -len);
5810 }
5811 if (vlen > 0) {
5812 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5813 }
5814 slen += vlen - len;
5815 STR_SET_LEN(str, slen);
5816 TERM_FILL(&sptr[slen], TERM_LEN(str));
5817 ENC_CODERANGE_SET(str, cr);
5818}
5819
5820static inline void
5821rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5822{
5823 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5824}
5825
5826void
5827rb_str_update(VALUE str, long beg, long len, VALUE val)
5828{
5829 long slen;
5830 char *p, *e;
5831 rb_encoding *enc;
5832 int singlebyte = single_byte_optimizable(str);
5833 int cr;
5834
5835 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5836
5837 StringValue(val);
5838 enc = rb_enc_check(str, val);
5839 slen = str_strlen(str, enc); /* rb_enc_check */
5840
5841 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5842 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5843 }
5844 if (beg < 0) {
5845 beg += slen;
5846 }
5847 RUBY_ASSERT(beg >= 0);
5848 RUBY_ASSERT(beg <= slen);
5849
5850 if (len > slen - beg) {
5851 len = slen - beg;
5852 }
5853 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5854 if (!p) p = RSTRING_END(str);
5855 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5856 if (!e) e = RSTRING_END(str);
5857 /* error check */
5858 beg = p - RSTRING_PTR(str); /* physical position */
5859 len = e - p; /* physical length */
5860 rb_str_update_0(str, beg, len, val);
5861 rb_enc_associate(str, enc);
5863 if (cr != ENC_CODERANGE_BROKEN)
5864 ENC_CODERANGE_SET(str, cr);
5865}
5866
5867static void
5868rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5869{
5870 int nth;
5871 VALUE match;
5872 long start, end, len;
5873 rb_encoding *enc;
5874 struct re_registers *regs;
5875
5876 if (rb_reg_search(re, str, 0, 0) < 0) {
5877 rb_raise(rb_eIndexError, "regexp not matched");
5878 }
5879 match = rb_backref_get();
5880 nth = rb_reg_backref_number(match, backref);
5881 regs = RMATCH_REGS(match);
5882 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5883 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5884 }
5885 if (nth < 0) {
5886 nth += regs->num_regs;
5887 }
5888
5889 start = BEG(nth);
5890 if (start == -1) {
5891 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5892 }
5893 end = END(nth);
5894 len = end - start;
5895 StringValue(val);
5896 enc = rb_enc_check_str(str, val);
5897 rb_str_update_0(str, start, len, val);
5898 rb_enc_associate(str, enc);
5899}
5900
5901static VALUE
5902rb_str_aset(VALUE str, VALUE indx, VALUE val)
5903{
5904 long idx, beg;
5905
5906 switch (TYPE(indx)) {
5907 case T_REGEXP:
5908 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5909 return val;
5910
5911 case T_STRING:
5912 beg = rb_str_index(str, indx, 0);
5913 if (beg < 0) {
5914 rb_raise(rb_eIndexError, "string not matched");
5915 }
5916 beg = rb_str_sublen(str, beg);
5917 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5918 return val;
5919
5920 default:
5921 /* check if indx is Range */
5922 {
5923 long beg, len;
5924 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5925 rb_str_update(str, beg, len, val);
5926 return val;
5927 }
5928 }
5929 /* FALLTHROUGH */
5930
5931 case T_FIXNUM:
5932 idx = NUM2LONG(indx);
5933 rb_str_update(str, idx, 1, val);
5934 return val;
5935 }
5936}
5937
5938/*
5939 * call-seq:
5940 * self[index] = other_string -> new_string
5941 * self[start, length] = other_string -> new_string
5942 * self[range] = other_string -> new_string
5943 * self[regexp, capture = 0] = other_string -> new_string
5944 * self[substring] = other_string -> new_string
5945 *
5946 * :include: doc/string/aset.rdoc
5947 *
5948 */
5949
5950static VALUE
5951rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5952{
5953 if (argc == 3) {
5954 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5955 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5956 }
5957 else {
5958 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5959 }
5960 return argv[2];
5961 }
5962 rb_check_arity(argc, 2, 3);
5963 return rb_str_aset(str, argv[0], argv[1]);
5964}
5965
5966/*
5967 * call-seq:
5968 * insert(offset, other_string) -> self
5969 *
5970 * :include: doc/string/insert.rdoc
5971 *
5972 */
5973
5974static VALUE
5975rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5976{
5977 long pos = NUM2LONG(idx);
5978
5979 if (pos == -1) {
5980 return rb_str_append(str, str2);
5981 }
5982 else if (pos < 0) {
5983 pos++;
5984 }
5985 rb_str_update(str, pos, 0, str2);
5986 return str;
5987}
5988
5989
5990/*
5991 * call-seq:
5992 * slice!(index) -> new_string or nil
5993 * slice!(start, length) -> new_string or nil
5994 * slice!(range) -> new_string or nil
5995 * slice!(regexp, capture = 0) -> new_string or nil
5996 * slice!(substring) -> new_string or nil
5997 *
5998 * Like String#[] (and its alias String#slice), except that:
5999 *
6000 * - Performs substitutions in +self+ (not in a copy of +self+).
6001 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6002 *
6003 * A few examples:
6004 *
6005 * s = 'hello'
6006 * s.slice!('e') # => "e"
6007 * s # => "hllo"
6008 * s.slice!('e') # => nil
6009 * s # => "hllo"
6010 *
6011 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6012 */
6013
6014static VALUE
6015rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6016{
6017 VALUE result = Qnil;
6018 VALUE indx;
6019 long beg, len = 1;
6020 char *p;
6021
6022 rb_check_arity(argc, 1, 2);
6023 str_modify_keep_cr(str);
6024 indx = argv[0];
6025 if (RB_TYPE_P(indx, T_REGEXP)) {
6026 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6027 VALUE match = rb_backref_get();
6028 struct re_registers *regs = RMATCH_REGS(match);
6029 int nth = 0;
6030 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6031 if ((nth += regs->num_regs) <= 0) return Qnil;
6032 }
6033 else if (nth >= regs->num_regs) return Qnil;
6034 beg = BEG(nth);
6035 len = END(nth) - beg;
6036 goto subseq;
6037 }
6038 else if (argc == 2) {
6039 beg = NUM2LONG(indx);
6040 len = NUM2LONG(argv[1]);
6041 goto num_index;
6042 }
6043 else if (FIXNUM_P(indx)) {
6044 beg = FIX2LONG(indx);
6045 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6046 if (!len) return Qnil;
6047 beg = p - RSTRING_PTR(str);
6048 goto subseq;
6049 }
6050 else if (RB_TYPE_P(indx, T_STRING)) {
6051 beg = rb_str_index(str, indx, 0);
6052 if (beg == -1) return Qnil;
6053 len = RSTRING_LEN(indx);
6054 result = str_duplicate(rb_cString, indx);
6055 goto squash;
6056 }
6057 else {
6058 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6059 case Qnil:
6060 return Qnil;
6061 case Qfalse:
6062 beg = NUM2LONG(indx);
6063 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6064 if (!len) return Qnil;
6065 beg = p - RSTRING_PTR(str);
6066 goto subseq;
6067 default:
6068 goto num_index;
6069 }
6070 }
6071
6072 num_index:
6073 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6074 beg = p - RSTRING_PTR(str);
6075
6076 subseq:
6077 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6078 rb_enc_cr_str_copy_for_substr(result, str);
6079
6080 squash:
6081 if (len > 0) {
6082 if (beg == 0) {
6083 rb_str_drop_bytes(str, len);
6084 }
6085 else {
6086 char *sptr = RSTRING_PTR(str);
6087 long slen = RSTRING_LEN(str);
6088 if (beg + len > slen) /* pathological check */
6089 len = slen - beg;
6090 memmove(sptr + beg,
6091 sptr + beg + len,
6092 slen - (beg + len));
6093 slen -= len;
6094 STR_SET_LEN(str, slen);
6095 TERM_FILL(&sptr[slen], TERM_LEN(str));
6096 }
6097 }
6098 return result;
6099}
6100
6101static VALUE
6102get_pat(VALUE pat)
6103{
6104 VALUE val;
6105
6106 switch (OBJ_BUILTIN_TYPE(pat)) {
6107 case T_REGEXP:
6108 return pat;
6109
6110 case T_STRING:
6111 break;
6112
6113 default:
6114 val = rb_check_string_type(pat);
6115 if (NIL_P(val)) {
6116 Check_Type(pat, T_REGEXP);
6117 }
6118 pat = val;
6119 }
6120
6121 return rb_reg_regcomp(pat);
6122}
6123
6124static VALUE
6125get_pat_quoted(VALUE pat, int check)
6126{
6127 VALUE val;
6128
6129 switch (OBJ_BUILTIN_TYPE(pat)) {
6130 case T_REGEXP:
6131 return pat;
6132
6133 case T_STRING:
6134 break;
6135
6136 default:
6137 val = rb_check_string_type(pat);
6138 if (NIL_P(val)) {
6139 Check_Type(pat, T_REGEXP);
6140 }
6141 pat = val;
6142 }
6143 if (check && is_broken_string(pat)) {
6144 rb_exc_raise(rb_reg_check_preprocess(pat));
6145 }
6146 return pat;
6147}
6148
6149static long
6150rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6151{
6152 if (BUILTIN_TYPE(pat) == T_STRING) {
6153 pos = rb_str_byteindex(str, pat, pos);
6154 if (set_backref_str) {
6155 if (pos >= 0) {
6156 str = rb_str_new_frozen_String(str);
6157 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6158 if (match) {
6159 *match = match_data;
6160 }
6161 }
6162 else {
6164 }
6165 }
6166 return pos;
6167 }
6168 else {
6169 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6170 }
6171}
6172
6173static long
6174rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6175{
6176 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6177}
6178
6179
6180/*
6181 * call-seq:
6182 * sub!(pattern, replacement) -> self or nil
6183 * sub!(pattern) {|match| ... } -> self or nil
6184 *
6185 * Like String#sub, except that:
6186 *
6187 * - Changes are made to +self+, not to a copy of +self+.
6188 * - Returns +self+ if any changes are made, +nil+ otherwise.
6189 *
6190 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6191 */
6192
6193static VALUE
6194rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6195{
6196 VALUE pat, repl, hash = Qnil;
6197 int iter = 0;
6198 long plen;
6199 int min_arity = rb_block_given_p() ? 1 : 2;
6200 long beg;
6201
6202 rb_check_arity(argc, min_arity, 2);
6203 if (argc == 1) {
6204 iter = 1;
6205 }
6206 else {
6207 repl = argv[1];
6208 hash = rb_check_hash_type(argv[1]);
6209 if (NIL_P(hash)) {
6210 StringValue(repl);
6211 }
6212 }
6213
6214 pat = get_pat_quoted(argv[0], 1);
6215
6216 str_modifiable(str);
6217 beg = rb_pat_search(pat, str, 0, 1);
6218 if (beg >= 0) {
6219 rb_encoding *enc;
6220 int cr = ENC_CODERANGE(str);
6221 long beg0, end0;
6222 VALUE match, match0 = Qnil;
6223 struct re_registers *regs;
6224 char *p, *rp;
6225 long len, rlen;
6226
6227 match = rb_backref_get();
6228 regs = RMATCH_REGS(match);
6229 if (RB_TYPE_P(pat, T_STRING)) {
6230 beg0 = beg;
6231 end0 = beg0 + RSTRING_LEN(pat);
6232 match0 = pat;
6233 }
6234 else {
6235 beg0 = BEG(0);
6236 end0 = END(0);
6237 if (iter) match0 = rb_reg_nth_match(0, match);
6238 }
6239
6240 if (iter || !NIL_P(hash)) {
6241 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6242
6243 if (iter) {
6244 repl = rb_obj_as_string(rb_yield(match0));
6245 }
6246 else {
6247 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6248 repl = rb_obj_as_string(repl);
6249 }
6250 str_mod_check(str, p, len);
6251 rb_check_frozen(str);
6252 }
6253 else {
6254 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6255 }
6256
6257 enc = rb_enc_compatible(str, repl);
6258 if (!enc) {
6259 rb_encoding *str_enc = STR_ENC_GET(str);
6260 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6261 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6262 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6263 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6264 rb_enc_inspect_name(str_enc),
6265 rb_enc_inspect_name(STR_ENC_GET(repl)));
6266 }
6267 enc = STR_ENC_GET(repl);
6268 }
6269 rb_str_modify(str);
6270 rb_enc_associate(str, enc);
6272 int cr2 = ENC_CODERANGE(repl);
6273 if (cr2 == ENC_CODERANGE_BROKEN ||
6274 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6276 else
6277 cr = cr2;
6278 }
6279 plen = end0 - beg0;
6280 rlen = RSTRING_LEN(repl);
6281 len = RSTRING_LEN(str);
6282 if (rlen > plen) {
6283 RESIZE_CAPA(str, len + rlen - plen);
6284 }
6285 p = RSTRING_PTR(str);
6286 if (rlen != plen) {
6287 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6288 }
6289 rp = RSTRING_PTR(repl);
6290 memmove(p + beg0, rp, rlen);
6291 len += rlen - plen;
6292 STR_SET_LEN(str, len);
6293 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6294 ENC_CODERANGE_SET(str, cr);
6295
6296 RB_GC_GUARD(match);
6297
6298 return str;
6299 }
6300 return Qnil;
6301}
6302
6303
6304/*
6305 * call-seq:
6306 * sub(pattern, replacement) -> new_string
6307 * sub(pattern) {|match| ... } -> new_string
6308 *
6309 * :include: doc/string/sub.rdoc
6310 */
6311
6312static VALUE
6313rb_str_sub(int argc, VALUE *argv, VALUE str)
6314{
6315 str = str_duplicate(rb_cString, str);
6316 rb_str_sub_bang(argc, argv, str);
6317 return str;
6318}
6319
6320static VALUE
6321str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6322{
6323 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6324 long beg, beg0, end0;
6325 long offset, blen, slen, len, last;
6326 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6327 char *sp, *cp;
6328 int need_backref_str = -1;
6329 rb_encoding *str_enc;
6330
6331 switch (argc) {
6332 case 1:
6333 RETURN_ENUMERATOR(str, argc, argv);
6334 mode = ITER;
6335 break;
6336 case 2:
6337 repl = argv[1];
6338 hash = rb_check_hash_type(argv[1]);
6339 if (NIL_P(hash)) {
6340 StringValue(repl);
6341 }
6342 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6343 mode = FAST_MAP;
6344 }
6345 else {
6346 mode = MAP;
6347 }
6348 break;
6349 default:
6350 rb_error_arity(argc, 1, 2);
6351 }
6352
6353 pat = get_pat_quoted(argv[0], 1);
6354 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6355
6356 if (beg < 0) {
6357 if (bang) return Qnil; /* no match, no substitution */
6358 return str_duplicate(rb_cString, str);
6359 }
6360
6361 offset = 0;
6362 blen = RSTRING_LEN(str) + 30; /* len + margin */
6363 dest = rb_str_buf_new(blen);
6364 sp = RSTRING_PTR(str);
6365 slen = RSTRING_LEN(str);
6366 cp = sp;
6367 str_enc = STR_ENC_GET(str);
6368 rb_enc_associate(dest, str_enc);
6369 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6370
6371 do {
6372 struct re_registers *regs = RMATCH_REGS(match);
6373 if (RB_TYPE_P(pat, T_STRING)) {
6374 beg0 = beg;
6375 end0 = beg0 + RSTRING_LEN(pat);
6376 match0 = pat;
6377 }
6378 else {
6379 beg0 = BEG(0);
6380 end0 = END(0);
6381 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6382 }
6383
6384 if (mode != STR) {
6385 if (mode == ITER) {
6386 val = rb_obj_as_string(rb_yield(match0));
6387 }
6388 else {
6389 struct RString fake_str = {RBASIC_INIT};
6390 VALUE key;
6391 if (mode == FAST_MAP) {
6392 // It is safe to use a fake_str here because we established that it won't escape,
6393 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6394 // default proc.
6395 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6396 }
6397 else {
6398 key = rb_str_subseq(str, beg0, end0 - beg0);
6399 }
6400 val = rb_hash_aref(hash, key);
6401 val = rb_obj_as_string(val);
6402 }
6403 str_mod_check(str, sp, slen);
6404 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6405 rb_raise(rb_eRuntimeError, "block should not cheat");
6406 }
6407 }
6408 else if (need_backref_str) {
6409 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6410 if (need_backref_str < 0) {
6411 need_backref_str = val != repl;
6412 }
6413 }
6414 else {
6415 val = repl;
6416 }
6417
6418 len = beg0 - offset; /* copy pre-match substr */
6419 if (len) {
6420 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6421 }
6422
6423 rb_str_buf_append(dest, val);
6424
6425 last = offset;
6426 offset = end0;
6427 if (beg0 == end0) {
6428 /*
6429 * Always consume at least one character of the input string
6430 * in order to prevent infinite loops.
6431 */
6432 if (RSTRING_LEN(str) <= end0) break;
6433 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6434 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6435 offset = end0 + len;
6436 }
6437 cp = RSTRING_PTR(str) + offset;
6438 if (offset > RSTRING_LEN(str)) break;
6439
6440 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6441 if (mode != FAST_MAP && mode != STR) {
6442 match = Qnil;
6443 }
6444 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6445
6446 RB_GC_GUARD(match);
6447 } while (beg >= 0);
6448
6449 if (RSTRING_LEN(str) > offset) {
6450 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6451 }
6452 rb_pat_search0(pat, str, last, 1, &match);
6453 if (bang) {
6454 str_shared_replace(str, dest);
6455 }
6456 else {
6457 str = dest;
6458 }
6459
6460 return str;
6461}
6462
6463
6464/*
6465 * call-seq:
6466 * gsub!(pattern, replacement) -> self or nil
6467 * gsub!(pattern) {|match| ... } -> self or nil
6468 * gsub!(pattern) -> an_enumerator
6469 *
6470 * Like String#gsub, except that:
6471 *
6472 * - Performs substitutions in +self+ (not in a copy of +self+).
6473 * - Returns +self+ if any substitutions were made, +nil+ otherwise.
6474 *
6475 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6476 */
6477
6478static VALUE
6479rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6480{
6481 str_modify_keep_cr(str);
6482 return str_gsub(argc, argv, str, 1);
6483}
6484
6485
6486/*
6487 * call-seq:
6488 * gsub(pattern, replacement) -> new_string
6489 * gsub(pattern) {|match| ... } -> new_string
6490 * gsub(pattern) -> enumerator
6491 *
6492 * Returns a copy of +self+ with zero or more substrings replaced.
6493 *
6494 * Argument +pattern+ may be a string or a Regexp;
6495 * argument +replacement+ may be a string or a Hash.
6496 * Varying types for the argument values makes this method very versatile.
6497 *
6498 * Below are some simple examples;
6499 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6500 *
6501 * With arguments +pattern+ and string +replacement+ given,
6502 * replaces each matching substring with the given +replacement+ string:
6503 *
6504 * s = 'abracadabra'
6505 * s.gsub('ab', 'AB') # => "ABracadABra"
6506 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6507 *
6508 * With arguments +pattern+ and hash +replacement+ given,
6509 * replaces each matching substring with a value from the given +replacement+ hash,
6510 * or removes it:
6511 *
6512 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6513 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6514 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6515 *
6516 * With argument +pattern+ and a block given,
6517 * calls the block with each matching substring;
6518 * replaces that substring with the block's return value:
6519 *
6520 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6521 * # => "ABrACADABrA"
6522 *
6523 * With argument +pattern+ and no block given,
6524 * returns a new Enumerator.
6525 *
6526 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6527 */
6528
6529static VALUE
6530rb_str_gsub(int argc, VALUE *argv, VALUE str)
6531{
6532 return str_gsub(argc, argv, str, 0);
6533}
6534
6535
6536/*
6537 * call-seq:
6538 * replace(other_string) -> self
6539 *
6540 * Replaces the contents of +self+ with the contents of +other_string+;
6541 * returns +self+:
6542 *
6543 * s = 'foo' # => "foo"
6544 * s.replace('bar') # => "bar"
6545 *
6546 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6547 */
6548
6549VALUE
6551{
6552 str_modifiable(str);
6553 if (str == str2) return str;
6554
6555 StringValue(str2);
6556 str_discard(str);
6557 return str_replace(str, str2);
6558}
6559
6560/*
6561 * call-seq:
6562 * clear -> self
6563 *
6564 * Removes the contents of +self+:
6565 *
6566 * s = 'foo'
6567 * s.clear # => ""
6568 * s # => ""
6569 *
6570 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6571 */
6572
6573static VALUE
6574rb_str_clear(VALUE str)
6575{
6576 str_discard(str);
6577 STR_SET_EMBED(str);
6578 STR_SET_LEN(str, 0);
6579 RSTRING_PTR(str)[0] = 0;
6580 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6582 else
6584 return str;
6585}
6586
6587/*
6588 * call-seq:
6589 * chr -> string
6590 *
6591 * :include: doc/string/chr.rdoc
6592 *
6593 */
6594
6595static VALUE
6596rb_str_chr(VALUE str)
6597{
6598 return rb_str_substr(str, 0, 1);
6599}
6600
6601/*
6602 * call-seq:
6603 * getbyte(index) -> integer or nil
6604 *
6605 * :include: doc/string/getbyte.rdoc
6606 *
6607 */
6608VALUE
6609rb_str_getbyte(VALUE str, VALUE index)
6610{
6611 long pos = NUM2LONG(index);
6612
6613 if (pos < 0)
6614 pos += RSTRING_LEN(str);
6615 if (pos < 0 || RSTRING_LEN(str) <= pos)
6616 return Qnil;
6617
6618 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6619}
6620
6621/*
6622 * call-seq:
6623 * setbyte(index, integer) -> integer
6624 *
6625 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6626 * returns +integer+:
6627 *
6628 * s = 'xyzzy'
6629 * s.setbyte(2, 129) # => 129
6630 * s # => "xy\x81zy"
6631 *
6632 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6633 */
6634VALUE
6635rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6636{
6637 long pos = NUM2LONG(index);
6638 long len = RSTRING_LEN(str);
6639 char *ptr, *head, *left = 0;
6640 rb_encoding *enc;
6641 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6642
6643 if (pos < -len || len <= pos)
6644 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6645 if (pos < 0)
6646 pos += len;
6647
6648 VALUE v = rb_to_int(value);
6649 VALUE w = rb_int_and(v, INT2FIX(0xff));
6650 char byte = (char)(NUM2INT(w) & 0xFF);
6651
6652 if (!str_independent(str))
6653 str_make_independent(str);
6654 enc = STR_ENC_GET(str);
6655 head = RSTRING_PTR(str);
6656 ptr = &head[pos];
6657 if (!STR_EMBED_P(str)) {
6658 cr = ENC_CODERANGE(str);
6659 switch (cr) {
6660 case ENC_CODERANGE_7BIT:
6661 left = ptr;
6662 *ptr = byte;
6663 if (ISASCII(byte)) goto end;
6664 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6665 if (!MBCLEN_CHARFOUND_P(nlen))
6667 else
6669 goto end;
6671 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6672 width = rb_enc_precise_mbclen(left, head+len, enc);
6673 *ptr = byte;
6674 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6675 if (!MBCLEN_CHARFOUND_P(nlen))
6677 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6679 goto end;
6680 }
6681 }
6683 *ptr = byte;
6684
6685 end:
6686 return value;
6687}
6688
6689static VALUE
6690str_byte_substr(VALUE str, long beg, long len, int empty)
6691{
6692 long n = RSTRING_LEN(str);
6693
6694 if (beg > n || len < 0) return Qnil;
6695 if (beg < 0) {
6696 beg += n;
6697 if (beg < 0) return Qnil;
6698 }
6699 if (len > n - beg)
6700 len = n - beg;
6701 if (len <= 0) {
6702 if (!empty) return Qnil;
6703 len = 0;
6704 }
6705
6706 VALUE str2 = str_subseq(str, beg, len);
6707
6708 str_enc_copy_direct(str2, str);
6709
6710 if (RSTRING_LEN(str2) == 0) {
6711 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6713 else
6715 }
6716 else {
6717 switch (ENC_CODERANGE(str)) {
6718 case ENC_CODERANGE_7BIT:
6720 break;
6721 default:
6723 break;
6724 }
6725 }
6726
6727 return str2;
6728}
6729
6730VALUE
6731rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6732{
6733 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6734}
6735
6736static VALUE
6737str_byte_aref(VALUE str, VALUE indx)
6738{
6739 long idx;
6740 if (FIXNUM_P(indx)) {
6741 idx = FIX2LONG(indx);
6742 }
6743 else {
6744 /* check if indx is Range */
6745 long beg, len = RSTRING_LEN(str);
6746
6747 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6748 case Qfalse:
6749 break;
6750 case Qnil:
6751 return Qnil;
6752 default:
6753 return str_byte_substr(str, beg, len, TRUE);
6754 }
6755
6756 idx = NUM2LONG(indx);
6757 }
6758 return str_byte_substr(str, idx, 1, FALSE);
6759}
6760
6761/*
6762 * call-seq:
6763 * byteslice(offset, length = 1) -> string or nil
6764 * byteslice(range) -> string or nil
6765 *
6766 * :include: doc/string/byteslice.rdoc
6767 */
6768
6769static VALUE
6770rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6771{
6772 if (argc == 2) {
6773 long beg = NUM2LONG(argv[0]);
6774 long len = NUM2LONG(argv[1]);
6775 return str_byte_substr(str, beg, len, TRUE);
6776 }
6777 rb_check_arity(argc, 1, 2);
6778 return str_byte_aref(str, argv[0]);
6779}
6780
6781static void
6782str_check_beg_len(VALUE str, long *beg, long *len)
6783{
6784 long end, slen = RSTRING_LEN(str);
6785
6786 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6787 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6788 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6789 }
6790 if (*beg < 0) {
6791 *beg += slen;
6792 }
6793 RUBY_ASSERT(*beg >= 0);
6794 RUBY_ASSERT(*beg <= slen);
6795
6796 if (*len > slen - *beg) {
6797 *len = slen - *beg;
6798 }
6799 end = *beg + *len;
6800 str_ensure_byte_pos(str, *beg);
6801 str_ensure_byte_pos(str, end);
6802}
6803
6804/*
6805 * call-seq:
6806 * bytesplice(offset, length, str) -> self
6807 * bytesplice(offset, length, str, str_offset, str_length) -> self
6808 * bytesplice(range, str) -> self
6809 * bytesplice(range, str, str_range) -> self
6810 *
6811 * :include: doc/string/bytesplice.rdoc
6812 */
6813
6814static VALUE
6815rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6816{
6817 long beg, len, vbeg, vlen;
6818 VALUE val;
6819 int cr;
6820
6821 rb_check_arity(argc, 2, 5);
6822 if (!(argc == 2 || argc == 3 || argc == 5)) {
6823 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6824 }
6825 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6826 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6827 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6828 rb_builtin_class_name(argv[0]));
6829 }
6830 val = argv[1];
6831 StringValue(val);
6832 if (argc == 2) {
6833 /* bytesplice(range, str) */
6834 vbeg = 0;
6835 vlen = RSTRING_LEN(val);
6836 }
6837 else {
6838 /* bytesplice(range, str, str_range) */
6839 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6840 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6841 rb_builtin_class_name(argv[2]));
6842 }
6843 }
6844 }
6845 else {
6846 beg = NUM2LONG(argv[0]);
6847 len = NUM2LONG(argv[1]);
6848 val = argv[2];
6849 StringValue(val);
6850 if (argc == 3) {
6851 /* bytesplice(index, length, str) */
6852 vbeg = 0;
6853 vlen = RSTRING_LEN(val);
6854 }
6855 else {
6856 /* bytesplice(index, length, str, str_index, str_length) */
6857 vbeg = NUM2LONG(argv[3]);
6858 vlen = NUM2LONG(argv[4]);
6859 }
6860 }
6861 str_check_beg_len(str, &beg, &len);
6862 str_check_beg_len(val, &vbeg, &vlen);
6863 str_modify_keep_cr(str);
6864
6865 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6866 rb_enc_associate(str, rb_enc_check(str, val));
6867 }
6868
6869 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6871 if (cr != ENC_CODERANGE_BROKEN)
6872 ENC_CODERANGE_SET(str, cr);
6873 return str;
6874}
6875
6876/*
6877 * call-seq:
6878 * reverse -> new_string
6879 *
6880 * Returns a new string with the characters from +self+ in reverse order.
6881 *
6882 * 'drawer'.reverse # => "reward"
6883 * 'reviled'.reverse # => "deliver"
6884 * 'stressed'.reverse # => "desserts"
6885 * 'semordnilaps'.reverse # => "spalindromes"
6886 *
6887 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6888 */
6889
6890static VALUE
6891rb_str_reverse(VALUE str)
6892{
6893 rb_encoding *enc;
6894 VALUE rev;
6895 char *s, *e, *p;
6896 int cr;
6897
6898 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6899 enc = STR_ENC_GET(str);
6900 rev = rb_str_new(0, RSTRING_LEN(str));
6901 s = RSTRING_PTR(str); e = RSTRING_END(str);
6902 p = RSTRING_END(rev);
6903 cr = ENC_CODERANGE(str);
6904
6905 if (RSTRING_LEN(str) > 1) {
6906 if (single_byte_optimizable(str)) {
6907 while (s < e) {
6908 *--p = *s++;
6909 }
6910 }
6911 else if (cr == ENC_CODERANGE_VALID) {
6912 while (s < e) {
6913 int clen = rb_enc_fast_mbclen(s, e, enc);
6914
6915 p -= clen;
6916 memcpy(p, s, clen);
6917 s += clen;
6918 }
6919 }
6920 else {
6921 cr = rb_enc_asciicompat(enc) ?
6923 while (s < e) {
6924 int clen = rb_enc_mbclen(s, e, enc);
6925
6926 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6927 p -= clen;
6928 memcpy(p, s, clen);
6929 s += clen;
6930 }
6931 }
6932 }
6933 STR_SET_LEN(rev, RSTRING_LEN(str));
6934 str_enc_copy_direct(rev, str);
6935 ENC_CODERANGE_SET(rev, cr);
6936
6937 return rev;
6938}
6939
6940
6941/*
6942 * call-seq:
6943 * reverse! -> self
6944 *
6945 * Returns +self+ with its characters reversed:
6946 *
6947 * 'drawer'.reverse! # => "reward"
6948 * 'reviled'.reverse! # => "deliver"
6949 * 'stressed'.reverse! # => "desserts"
6950 * 'semordnilaps'.reverse! # => "spalindromes"
6951 *
6952 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6953 */
6954
6955static VALUE
6956rb_str_reverse_bang(VALUE str)
6957{
6958 if (RSTRING_LEN(str) > 1) {
6959 if (single_byte_optimizable(str)) {
6960 char *s, *e, c;
6961
6962 str_modify_keep_cr(str);
6963 s = RSTRING_PTR(str);
6964 e = RSTRING_END(str) - 1;
6965 while (s < e) {
6966 c = *s;
6967 *s++ = *e;
6968 *e-- = c;
6969 }
6970 }
6971 else {
6972 str_shared_replace(str, rb_str_reverse(str));
6973 }
6974 }
6975 else {
6976 str_modify_keep_cr(str);
6977 }
6978 return str;
6979}
6980
6981
6982/*
6983 * call-seq:
6984 * include?(other_string) -> true or false
6985 *
6986 * Returns whether +self+ contains +other_string+:
6987 *
6988 * s = 'bar'
6989 * s.include?('ba') # => true
6990 * s.include?('ar') # => true
6991 * s.include?('bar') # => true
6992 * s.include?('a') # => true
6993 * s.include?('') # => true
6994 * s.include?('foo') # => false
6995 *
6996 * Related: see {Querying}[rdoc-ref:String@Querying].
6997 */
6998
6999VALUE
7000rb_str_include(VALUE str, VALUE arg)
7001{
7002 long i;
7003
7004 StringValue(arg);
7005 i = rb_str_index(str, arg, 0);
7006
7007 return RBOOL(i != -1);
7008}
7009
7010
7011/*
7012 * call-seq:
7013 * to_i(base = 10) -> integer
7014 *
7015 * Returns the result of interpreting leading characters in +self+
7016 * as an integer in the given +base+;
7017 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7018 *
7019 * '123456'.to_i # => 123456
7020 * '123def'.to_i(16) # => 1195503
7021 *
7022 * With +base+ zero given, +self+ may contain leading characters
7023 * to specify the actual base:
7024 *
7025 * '123def'.to_i(0) # => 123
7026 * '0123def'.to_i(0) # => 83
7027 * '0b123def'.to_i(0) # => 1
7028 * '0o123def'.to_i(0) # => 83
7029 * '0d123def'.to_i(0) # => 123
7030 * '0x123def'.to_i(0) # => 1195503
7031 *
7032 * Characters past a leading valid number (in the given +base+) are ignored:
7033 *
7034 * '12.345'.to_i # => 12
7035 * '12345'.to_i(2) # => 1
7036 *
7037 * Returns zero if there is no leading valid number:
7038 *
7039 * 'abcdef'.to_i # => 0
7040 * '2'.to_i(2) # => 0
7041 *
7042 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7043 */
7044
7045static VALUE
7046rb_str_to_i(int argc, VALUE *argv, VALUE str)
7047{
7048 int base = 10;
7049
7050 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7051 rb_raise(rb_eArgError, "invalid radix %d", base);
7052 }
7053 return rb_str_to_inum(str, base, FALSE);
7054}
7055
7056
7057/*
7058 * call-seq:
7059 * to_f -> float
7060 *
7061 * Returns the result of interpreting leading characters in +self+ as a Float:
7062 *
7063 * '3.14159'.to_f # => 3.14159
7064 * '1.234e-2'.to_f # => 0.01234
7065 *
7066 * Characters past a leading valid number are ignored:
7067 *
7068 * '3.14 (pi to two places)'.to_f # => 3.14
7069 *
7070 * Returns zero if there is no leading valid number:
7071 *
7072 * 'abcdef'.to_f # => 0.0
7073 *
7074 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7075 */
7076
7077static VALUE
7078rb_str_to_f(VALUE str)
7079{
7080 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7081}
7082
7083
7084/*
7085 * call-seq:
7086 * to_s -> self or new_string
7087 *
7088 * Returns +self+ if +self+ is a +String+,
7089 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7090 *
7091 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7092 */
7093
7094static VALUE
7095rb_str_to_s(VALUE str)
7096{
7097 if (rb_obj_class(str) != rb_cString) {
7098 return str_duplicate(rb_cString, str);
7099 }
7100 return str;
7101}
7102
#if 0
/* Compiled out: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7114
7115#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7116
7117int
7118rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7119{
7120 char buf[CHAR_ESC_LEN + 1];
7121 int l;
7122
7123#if SIZEOF_INT > 4
7124 c &= 0xffffffff;
7125#endif
7126 if (unicode_p) {
7127 if (c < 0x7F && ISPRINT(c)) {
7128 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7129 }
7130 else if (c < 0x10000) {
7131 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7132 }
7133 else {
7134 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7135 }
7136 }
7137 else {
7138 if (c < 0x100) {
7139 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7140 }
7141 else {
7142 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7143 }
7144 }
7145 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7146 rb_str_buf_cat(result, buf, l);
7147 return l;
7148}
7149
/*
 * Returns the backslash escape used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no such escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *escape;
    } table[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].ch == c) return table[i].escape;
    }
    return NULL;
}
7167
7168VALUE
7169rb_str_escape(VALUE str)
7170{
7171 int encidx = ENCODING_GET(str);
7172 rb_encoding *enc = rb_enc_from_index(encidx);
7173 const char *p = RSTRING_PTR(str);
7174 const char *pend = RSTRING_END(str);
7175 const char *prev = p;
7176 char buf[CHAR_ESC_LEN + 1];
7177 VALUE result = rb_str_buf_new(0);
7178 int unicode_p = rb_enc_unicode_p(enc);
7179 int asciicompat = rb_enc_asciicompat(enc);
7180
7181 while (p < pend) {
7182 unsigned int c;
7183 const char *cc;
7184 int n = rb_enc_precise_mbclen(p, pend, enc);
7185 if (!MBCLEN_CHARFOUND_P(n)) {
7186 if (p > prev) str_buf_cat(result, prev, p - prev);
7187 n = rb_enc_mbminlen(enc);
7188 if (pend < p + n)
7189 n = (int)(pend - p);
7190 while (n--) {
7191 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7192 str_buf_cat(result, buf, strlen(buf));
7193 prev = ++p;
7194 }
7195 continue;
7196 }
7197 n = MBCLEN_CHARFOUND_LEN(n);
7198 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7199 p += n;
7200 cc = ruby_escaped_char(c);
7201 if (cc) {
7202 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7203 str_buf_cat(result, cc, strlen(cc));
7204 prev = p;
7205 }
7206 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7207 }
7208 else {
7209 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7210 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7211 prev = p;
7212 }
7213 }
7214 if (p > prev) str_buf_cat(result, prev, p - prev);
7215 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7216
7217 return result;
7218}
7219
7220/*
7221 * call-seq:
7222 * inspect -> string
7223 *
7224 * :include: doc/string/inspect.rdoc
7225 *
7226 */
7227
7228VALUE
7230{
7231 int encidx = ENCODING_GET(str);
7232 rb_encoding *enc = rb_enc_from_index(encidx);
7233 const char *p, *pend, *prev;
7234 char buf[CHAR_ESC_LEN + 1];
7235 VALUE result = rb_str_buf_new(0);
7236 rb_encoding *resenc = rb_default_internal_encoding();
7237 int unicode_p = rb_enc_unicode_p(enc);
7238 int asciicompat = rb_enc_asciicompat(enc);
7239
7240 if (resenc == NULL) resenc = rb_default_external_encoding();
7241 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7242 rb_enc_associate(result, resenc);
7243 str_buf_cat2(result, "\"");
7244
7245 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7246 prev = p;
7247 while (p < pend) {
7248 unsigned int c, cc;
7249 int n;
7250
7251 n = rb_enc_precise_mbclen(p, pend, enc);
7252 if (!MBCLEN_CHARFOUND_P(n)) {
7253 if (p > prev) str_buf_cat(result, prev, p - prev);
7254 n = rb_enc_mbminlen(enc);
7255 if (pend < p + n)
7256 n = (int)(pend - p);
7257 while (n--) {
7258 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7259 str_buf_cat(result, buf, strlen(buf));
7260 prev = ++p;
7261 }
7262 continue;
7263 }
7264 n = MBCLEN_CHARFOUND_LEN(n);
7265 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7266 p += n;
7267 if ((asciicompat || unicode_p) &&
7268 (c == '"'|| c == '\\' ||
7269 (c == '#' &&
7270 p < pend &&
7271 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7272 (cc = rb_enc_codepoint(p,pend,enc),
7273 (cc == '$' || cc == '@' || cc == '{'))))) {
7274 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7275 str_buf_cat2(result, "\\");
7276 if (asciicompat || enc == resenc) {
7277 prev = p - n;
7278 continue;
7279 }
7280 }
7281 switch (c) {
7282 case '\n': cc = 'n'; break;
7283 case '\r': cc = 'r'; break;
7284 case '\t': cc = 't'; break;
7285 case '\f': cc = 'f'; break;
7286 case '\013': cc = 'v'; break;
7287 case '\010': cc = 'b'; break;
7288 case '\007': cc = 'a'; break;
7289 case 033: cc = 'e'; break;
7290 default: cc = 0; break;
7291 }
7292 if (cc) {
7293 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7294 buf[0] = '\\';
7295 buf[1] = (char)cc;
7296 str_buf_cat(result, buf, 2);
7297 prev = p;
7298 continue;
7299 }
7300 /* The special casing of 0x85 (NEXT_LINE) here is because
7301 * Oniguruma historically treats it as printable, but it
7302 * doesn't match the print POSIX bracket class or character
7303 * property in regexps.
7304 *
7305 * See Ruby Bug #16842 for details:
7306 * https://bugs.ruby-lang.org/issues/16842
7307 */
7308 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7309 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7310 continue;
7311 }
7312 else {
7313 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7314 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7315 prev = p;
7316 continue;
7317 }
7318 }
7319 if (p > prev) str_buf_cat(result, prev, p - prev);
7320 str_buf_cat2(result, "\"");
7321
7322 return result;
7323}
7324
/* True when p is still before e and the byte at p is `$`, `@` or `{`. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && \
     (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7326
7327/*
7328 * call-seq:
7329 * dump -> new_string
7330 *
7331 * :include: doc/string/dump.rdoc
7332 *
7333 */
7334
7335VALUE
7337{
7338 int encidx = rb_enc_get_index(str);
7339 rb_encoding *enc = rb_enc_from_index(encidx);
7340 long len;
7341 const char *p, *pend;
7342 char *q, *qend;
7343 VALUE result;
7344 int u8 = (encidx == rb_utf8_encindex());
7345 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7346
7347 len = 2; /* "" */
7348 if (!rb_enc_asciicompat(enc)) {
7349 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7350 len += strlen(enc->name);
7351 }
7352
7353 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7354 while (p < pend) {
7355 int clen;
7356 unsigned char c = *p++;
7357
7358 switch (c) {
7359 case '"': case '\\':
7360 case '\n': case '\r':
7361 case '\t': case '\f':
7362 case '\013': case '\010': case '\007': case '\033':
7363 clen = 2;
7364 break;
7365
7366 case '#':
7367 clen = IS_EVSTR(p, pend) ? 2 : 1;
7368 break;
7369
7370 default:
7371 if (ISPRINT(c)) {
7372 clen = 1;
7373 }
7374 else {
7375 if (u8 && c > 0x7F) { /* \u notation */
7376 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7377 if (MBCLEN_CHARFOUND_P(n)) {
7378 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7379 if (cc <= 0xFFFF)
7380 clen = 6; /* \uXXXX */
7381 else if (cc <= 0xFFFFF)
7382 clen = 9; /* \u{XXXXX} */
7383 else
7384 clen = 10; /* \u{XXXXXX} */
7385 p += MBCLEN_CHARFOUND_LEN(n)-1;
7386 break;
7387 }
7388 }
7389 clen = 4; /* \xNN */
7390 }
7391 break;
7392 }
7393
7394 if (clen > LONG_MAX - len) {
7395 rb_raise(rb_eRuntimeError, "string size too big");
7396 }
7397 len += clen;
7398 }
7399
7400 result = rb_str_new(0, len);
7401 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7402 q = RSTRING_PTR(result); qend = q + len + 1;
7403
7404 *q++ = '"';
7405 while (p < pend) {
7406 unsigned char c = *p++;
7407
7408 if (c == '"' || c == '\\') {
7409 *q++ = '\\';
7410 *q++ = c;
7411 }
7412 else if (c == '#') {
7413 if (IS_EVSTR(p, pend)) *q++ = '\\';
7414 *q++ = '#';
7415 }
7416 else if (c == '\n') {
7417 *q++ = '\\';
7418 *q++ = 'n';
7419 }
7420 else if (c == '\r') {
7421 *q++ = '\\';
7422 *q++ = 'r';
7423 }
7424 else if (c == '\t') {
7425 *q++ = '\\';
7426 *q++ = 't';
7427 }
7428 else if (c == '\f') {
7429 *q++ = '\\';
7430 *q++ = 'f';
7431 }
7432 else if (c == '\013') {
7433 *q++ = '\\';
7434 *q++ = 'v';
7435 }
7436 else if (c == '\010') {
7437 *q++ = '\\';
7438 *q++ = 'b';
7439 }
7440 else if (c == '\007') {
7441 *q++ = '\\';
7442 *q++ = 'a';
7443 }
7444 else if (c == '\033') {
7445 *q++ = '\\';
7446 *q++ = 'e';
7447 }
7448 else if (ISPRINT(c)) {
7449 *q++ = c;
7450 }
7451 else {
7452 *q++ = '\\';
7453 if (u8) {
7454 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7455 if (MBCLEN_CHARFOUND_P(n)) {
7456 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7457 p += n;
7458 if (cc <= 0xFFFF)
7459 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7460 else
7461 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7462 q += strlen(q);
7463 continue;
7464 }
7465 }
7466 snprintf(q, qend-q, "x%02X", c);
7467 q += 3;
7468 }
7469 }
7470 *q++ = '"';
7471 *q = '\0';
7472 if (!rb_enc_asciicompat(enc)) {
7473 snprintf(q, qend-q, nonascii_suffix, enc->name);
7474 encidx = rb_ascii8bit_encindex();
7475 }
7476 /* result from dump is ASCII */
7477 rb_enc_associate_index(result, encidx);
7479 return result;
7480}
7481
/*
 * Maps the letter of a backslash escape (the `n` of `\n`, and so on) to the
 * byte it stands for.  Any other input yields -1; the caller in this file
 * only passes one of the eight letters handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not an escape letter; never falls off the end without a value */
        return -1;
    }
}
7505
7506static void
7507undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7508{
7509 const char *s = *ss;
7510 unsigned int c;
7511 int codelen;
7512 size_t hexlen;
7513 unsigned char buf[6];
7514 static rb_encoding *enc_utf8 = NULL;
7515
7516 switch (*s) {
7517 case '\\':
7518 case '"':
7519 case '#':
7520 rb_str_cat(undumped, s, 1); /* cat itself */
7521 s++;
7522 break;
7523 case 'n':
7524 case 'r':
7525 case 't':
7526 case 'f':
7527 case 'v':
7528 case 'b':
7529 case 'a':
7530 case 'e':
7531 *buf = unescape_ascii(*s);
7532 rb_str_cat(undumped, (char *)buf, 1);
7533 s++;
7534 break;
7535 case 'u':
7536 if (*binary) {
7537 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7538 }
7539 *utf8 = true;
7540 if (++s >= s_end) {
7541 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7542 }
7543 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7544 if (*penc != enc_utf8) {
7545 *penc = enc_utf8;
7546 rb_enc_associate(undumped, enc_utf8);
7547 }
7548 if (*s == '{') { /* handle \u{...} form */
7549 s++;
7550 for (;;) {
7551 if (s >= s_end) {
7552 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7553 }
7554 if (*s == '}') {
7555 s++;
7556 break;
7557 }
7558 if (ISSPACE(*s)) {
7559 s++;
7560 continue;
7561 }
7562 c = scan_hex(s, s_end-s, &hexlen);
7563 if (hexlen == 0 || hexlen > 6) {
7564 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7565 }
7566 if (c > 0x10ffff) {
7567 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7568 }
7569 if (0xd800 <= c && c <= 0xdfff) {
7570 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7571 }
7572 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7573 rb_str_cat(undumped, (char *)buf, codelen);
7574 s += hexlen;
7575 }
7576 }
7577 else { /* handle \uXXXX form */
7578 c = scan_hex(s, 4, &hexlen);
7579 if (hexlen != 4) {
7580 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7581 }
7582 if (0xd800 <= c && c <= 0xdfff) {
7583 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7584 }
7585 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7586 rb_str_cat(undumped, (char *)buf, codelen);
7587 s += hexlen;
7588 }
7589 break;
7590 case 'x':
7591 if (++s >= s_end) {
7592 rb_raise(rb_eRuntimeError, "invalid hex escape");
7593 }
7594 *buf = scan_hex(s, 2, &hexlen);
7595 if (hexlen != 2) {
7596 rb_raise(rb_eRuntimeError, "invalid hex escape");
7597 }
7598 if (!ISASCII(*buf)) {
7599 if (*utf8) {
7600 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7601 }
7602 *binary = true;
7603 }
7604 rb_str_cat(undumped, (char *)buf, 1);
7605 s += hexlen;
7606 break;
7607 default:
7608 rb_str_cat(undumped, s-1, 2);
7609 s++;
7610 }
7611
7612 *ss = s;
7613}
7614
7615static VALUE rb_str_is_ascii_only_p(VALUE str);
7616
7617/*
7618 * call-seq:
7619 * undump -> new_string
7620 *
7621 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7622 *
7623 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7624 */
7625
7626static VALUE
7627str_undump(VALUE str)
7628{
7629 const char *s = RSTRING_PTR(str);
7630 const char *s_end = RSTRING_END(str);
7631 rb_encoding *enc = rb_enc_get(str);
7632 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7633 bool utf8 = false;
7634 bool binary = false;
7635 int w;
7636
7638 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7639 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7640 }
7641 if (!str_null_check(str, &w)) {
7642 rb_raise(rb_eRuntimeError, "string contains null byte");
7643 }
7644 if (RSTRING_LEN(str) < 2) goto invalid_format;
7645 if (*s != '"') goto invalid_format;
7646
7647 /* strip '"' at the start */
7648 s++;
7649
7650 for (;;) {
7651 if (s >= s_end) {
7652 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7653 }
7654
7655 if (*s == '"') {
7656 /* epilogue */
7657 s++;
7658 if (s == s_end) {
7659 /* ascii compatible dumped string */
7660 break;
7661 }
7662 else {
7663 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7664 static const char dup_suffix[] = ".dup";
7665 const char *encname;
7666 int encidx;
7667 ptrdiff_t size;
7668
7669 /* check separately for strings dumped by older versions */
7670 size = sizeof(dup_suffix) - 1;
7671 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7672
7673 size = sizeof(force_encoding_suffix) - 1;
7674 if (s_end - s <= size) goto invalid_format;
7675 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7676 s += size;
7677
7678 if (utf8) {
7679 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7680 }
7681
7682 encname = s;
7683 s = memchr(s, '"', s_end-s);
7684 size = s - encname;
7685 if (!s) goto invalid_format;
7686 if (s_end - s != 2) goto invalid_format;
7687 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7688
7689 encidx = rb_enc_find_index2(encname, (long)size);
7690 if (encidx < 0) {
7691 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7692 }
7693 rb_enc_associate_index(undumped, encidx);
7694 }
7695 break;
7696 }
7697
7698 if (*s == '\\') {
7699 s++;
7700 if (s >= s_end) {
7701 rb_raise(rb_eRuntimeError, "invalid escape");
7702 }
7703 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7704 }
7705 else {
7706 rb_str_cat(undumped, s++, 1);
7707 }
7708 }
7709
7710 RB_GC_GUARD(str);
7711
7712 return undumped;
7713invalid_format:
7714 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7715}
7716
7717static void
7718rb_str_check_dummy_enc(rb_encoding *enc)
7719{
7720 if (rb_enc_dummy_p(enc)) {
7721 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7722 rb_enc_name(enc));
7723 }
7724}
7725
7726static rb_encoding *
7727str_true_enc(VALUE str)
7728{
7729 rb_encoding *enc = STR_ENC_GET(str);
7730 rb_str_check_dummy_enc(enc);
7731 return enc;
7732}
7733
7734static OnigCaseFoldType
7735check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7736{
7737 if (argc==0)
7738 return flags;
7739 if (argc>2)
7740 rb_raise(rb_eArgError, "too many options");
7741 if (argv[0]==sym_turkic) {
7742 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7743 if (argc==2) {
7744 if (argv[1]==sym_lithuanian)
7745 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7746 else
7747 rb_raise(rb_eArgError, "invalid second option");
7748 }
7749 }
7750 else if (argv[0]==sym_lithuanian) {
7751 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7752 if (argc==2) {
7753 if (argv[1]==sym_turkic)
7754 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7755 else
7756 rb_raise(rb_eArgError, "invalid second option");
7757 }
7758 }
7759 else if (argc>1)
7760 rb_raise(rb_eArgError, "too many options");
7761 else if (argv[0]==sym_ascii)
7762 flags |= ONIGENC_CASE_ASCII_ONLY;
7763 else if (argv[0]==sym_fold) {
7764 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7765 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7766 else
7767 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7768 }
7769 else
7770 rb_raise(rb_eArgError, "invalid option");
7771 return flags;
7772}
7773
7774static inline bool
7775case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7776{
7777 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7778 return true;
7779 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7780}
7781
/* 20 extra bytes should be long enough to absorb any kind of single character length increase */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Set CASEMAP_DEBUG to non-zero at build time to print buffer tuning data. */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7787
7788struct mapping_buffer;
7789typedef struct mapping_buffer {
7790 size_t capa;
7791 size_t used;
7792 struct mapping_buffer *next;
7793 OnigUChar space[FLEX_ARY_LEN];
7795
7796static void
7797mapping_buffer_free(void *p)
7798{
7799 mapping_buffer *previous_buffer;
7800 mapping_buffer *current_buffer = p;
7801 while (current_buffer) {
7802 previous_buffer = current_buffer;
7803 current_buffer = current_buffer->next;
7804 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7805 }
7806}
7807
7808static const rb_data_type_t mapping_buffer_type = {
7809 "mapping_buffer",
7810 {0, mapping_buffer_free,},
7811 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7812};
7813
7814static VALUE
7815rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7816{
7817 VALUE target;
7818
7819 const OnigUChar *source_current, *source_end;
7820 int target_length = 0;
7821 VALUE buffer_anchor;
7822 mapping_buffer *current_buffer = 0;
7823 mapping_buffer **pre_buffer;
7824 size_t buffer_count = 0;
7825 int buffer_length_or_invalid;
7826
7827 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7828
7829 source_current = (OnigUChar*)RSTRING_PTR(source);
7830 source_end = (OnigUChar*)RSTRING_END(source);
7831
7832 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7833 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7834 while (source_current < source_end) {
7835 /* increase multiplier using buffer count to converge quickly */
7836 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7837 if (CASEMAP_DEBUG) {
7838 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7839 }
7840 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7841 *pre_buffer = current_buffer;
7842 pre_buffer = &current_buffer->next;
7843 current_buffer->next = NULL;
7844 current_buffer->capa = capa;
7845 buffer_length_or_invalid = enc->case_map(flags,
7846 &source_current, source_end,
7847 current_buffer->space,
7848 current_buffer->space+current_buffer->capa,
7849 enc);
7850 if (buffer_length_or_invalid < 0) {
7851 current_buffer = DATA_PTR(buffer_anchor);
7852 DATA_PTR(buffer_anchor) = 0;
7853 mapping_buffer_free(current_buffer);
7854 rb_raise(rb_eArgError, "input string invalid");
7855 }
7856 target_length += current_buffer->used = buffer_length_or_invalid;
7857 }
7858 if (CASEMAP_DEBUG) {
7859 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7860 }
7861
7862 if (buffer_count==1) {
7863 target = rb_str_new((const char*)current_buffer->space, target_length);
7864 }
7865 else {
7866 char *target_current;
7867
7868 target = rb_str_new(0, target_length);
7869 target_current = RSTRING_PTR(target);
7870 current_buffer = DATA_PTR(buffer_anchor);
7871 while (current_buffer) {
7872 memcpy(target_current, current_buffer->space, current_buffer->used);
7873 target_current += current_buffer->used;
7874 current_buffer = current_buffer->next;
7875 }
7876 }
7877 current_buffer = DATA_PTR(buffer_anchor);
7878 DATA_PTR(buffer_anchor) = 0;
7879 mapping_buffer_free(current_buffer);
7880
7881 RB_GC_GUARD(buffer_anchor);
7882
7883 /* TODO: check about string terminator character */
7884 str_enc_copy_direct(target, source);
7885 /*ENC_CODERANGE_SET(mapped, cr);*/
7886
7887 return target;
7888}
7889
7890static VALUE
7891rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7892{
7893 const OnigUChar *source_current, *source_end;
7894 OnigUChar *target_current, *target_end;
7895 long old_length = RSTRING_LEN(source);
7896 int length_or_invalid;
7897
7898 if (old_length == 0) return Qnil;
7899
7900 source_current = (OnigUChar*)RSTRING_PTR(source);
7901 source_end = (OnigUChar*)RSTRING_END(source);
7902 if (source == target) {
7903 target_current = (OnigUChar*)source_current;
7904 target_end = (OnigUChar*)source_end;
7905 }
7906 else {
7907 target_current = (OnigUChar*)RSTRING_PTR(target);
7908 target_end = (OnigUChar*)RSTRING_END(target);
7909 }
7910
7911 length_or_invalid = onigenc_ascii_only_case_map(flags,
7912 &source_current, source_end,
7913 target_current, target_end, enc);
7914 if (length_or_invalid < 0)
7915 rb_raise(rb_eArgError, "input string invalid");
7916 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7917 fprintf(stderr, "problem with rb_str_ascii_casemap"
7918 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7919 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7920 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7921 }
7922
7923 str_enc_copy(target, source);
7924
7925 return target;
7926}
7927
7928static bool
7929upcase_single(VALUE str)
7930{
7931 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7932 bool modified = false;
7933
7934 while (s < send) {
7935 unsigned int c = *(unsigned char*)s;
7936
7937 if ('a' <= c && c <= 'z') {
7938 *s = 'A' + (c - 'a');
7939 modified = true;
7940 }
7941 s++;
7942 }
7943 return modified;
7944}
7945
7946/*
7947 * call-seq:
7948 * upcase!(mapping) -> self or nil
7949 *
7950 * Like String#upcase, except that:
7951 *
7952 * - Changes character casings in +self+ (not in a copy of +self+).
7953 * - Returns +self+ if any changes are made, +nil+ otherwise.
7954 *
7955 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7956 */
7957
7958static VALUE
7959rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7960{
7961 rb_encoding *enc;
7962 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7963
7964 flags = check_case_options(argc, argv, flags);
7965 str_modify_keep_cr(str);
7966 enc = str_true_enc(str);
7967 if (case_option_single_p(flags, enc, str)) {
7968 if (upcase_single(str))
7969 flags |= ONIGENC_CASE_MODIFIED;
7970 }
7971 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7972 rb_str_ascii_casemap(str, str, &flags, enc);
7973 else
7974 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7975
7976 if (ONIGENC_CASE_MODIFIED&flags) return str;
7977 return Qnil;
7978}
7979
7980
7981/*
7982 * call-seq:
7983 * upcase(mapping = :ascii) -> new_string
7984 *
7985 * :include: doc/string/upcase.rdoc
7986 */
7987
7988static VALUE
7989rb_str_upcase(int argc, VALUE *argv, VALUE str)
7990{
7991 rb_encoding *enc;
7992 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7993 VALUE ret;
7994
7995 flags = check_case_options(argc, argv, flags);
7996 enc = str_true_enc(str);
7997 if (case_option_single_p(flags, enc, str)) {
7998 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7999 str_enc_copy_direct(ret, str);
8000 upcase_single(ret);
8001 }
8002 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8003 ret = rb_str_new(0, RSTRING_LEN(str));
8004 rb_str_ascii_casemap(str, ret, &flags, enc);
8005 }
8006 else {
8007 ret = rb_str_casemap(str, &flags, enc);
8008 }
8009
8010 return ret;
8011}
8012
8013static bool
8014downcase_single(VALUE str)
8015{
8016 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8017 bool modified = false;
8018
8019 while (s < send) {
8020 unsigned int c = *(unsigned char*)s;
8021
8022 if ('A' <= c && c <= 'Z') {
8023 *s = 'a' + (c - 'A');
8024 modified = true;
8025 }
8026 s++;
8027 }
8028
8029 return modified;
8030}
8031
8032/*
8033 * call-seq:
8034 * downcase!(mapping) -> self or nil
8035 *
8036 * Like String#downcase, except that:
8037 *
8038 * - Changes character casings in +self+ (not in a copy of +self+).
8039 * - Returns +self+ if any changes are made, +nil+ otherwise.
8040 *
8041 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8042 */
8043
8044static VALUE
8045rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8046{
8047 rb_encoding *enc;
8048 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8049
8050 flags = check_case_options(argc, argv, flags);
8051 str_modify_keep_cr(str);
8052 enc = str_true_enc(str);
8053 if (case_option_single_p(flags, enc, str)) {
8054 if (downcase_single(str))
8055 flags |= ONIGENC_CASE_MODIFIED;
8056 }
8057 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8058 rb_str_ascii_casemap(str, str, &flags, enc);
8059 else
8060 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8061
8062 if (ONIGENC_CASE_MODIFIED&flags) return str;
8063 return Qnil;
8064}
8065
8066
8067/*
8068 * call-seq:
8069 * downcase(mapping = :ascii) -> new_string
8070 *
8071 * :include: doc/string/downcase.rdoc
8072 *
8073 */
8074
8075static VALUE
8076rb_str_downcase(int argc, VALUE *argv, VALUE str)
8077{
8078 rb_encoding *enc;
8079 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8080 VALUE ret;
8081
8082 flags = check_case_options(argc, argv, flags);
8083 enc = str_true_enc(str);
8084 if (case_option_single_p(flags, enc, str)) {
8085 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8086 str_enc_copy_direct(ret, str);
8087 downcase_single(ret);
8088 }
8089 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8090 ret = rb_str_new(0, RSTRING_LEN(str));
8091 rb_str_ascii_casemap(str, ret, &flags, enc);
8092 }
8093 else {
8094 ret = rb_str_casemap(str, &flags, enc);
8095 }
8096
8097 return ret;
8098}
8099
8100
8101/*
8102 * call-seq:
8103 * capitalize!(mapping = :ascii) -> self or nil
8104 *
8105 * Like String#capitalize, except that:
8106 *
8107 * - Changes character casings in +self+ (not in a copy of +self+).
8108 * - Returns +self+ if any changes are made, +nil+ otherwise.
8109 *
8110 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8111 */
8112
8113static VALUE
8114rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8115{
8116 rb_encoding *enc;
8117 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8118
8119 flags = check_case_options(argc, argv, flags);
8120 str_modify_keep_cr(str);
8121 enc = str_true_enc(str);
8122 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8123 if (flags&ONIGENC_CASE_ASCII_ONLY)
8124 rb_str_ascii_casemap(str, str, &flags, enc);
8125 else
8126 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8127
8128 if (ONIGENC_CASE_MODIFIED&flags) return str;
8129 return Qnil;
8130}
8131
8132
8133/*
8134 * call-seq:
8135 * capitalize(mapping = :ascii) -> new_string
8136 *
8137 * :include: doc/string/capitalize.rdoc
8138 *
8139 */
8140
8141static VALUE
8142rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8143{
8144 rb_encoding *enc;
8145 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8146 VALUE ret;
8147
8148 flags = check_case_options(argc, argv, flags);
8149 enc = str_true_enc(str);
8150 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8151 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8152 ret = rb_str_new(0, RSTRING_LEN(str));
8153 rb_str_ascii_casemap(str, ret, &flags, enc);
8154 }
8155 else {
8156 ret = rb_str_casemap(str, &flags, enc);
8157 }
8158 return ret;
8159}
8160
8161
8162/*
8163 * call-seq:
8164 * swapcase!(mapping) -> self or nil
8165 *
8166 * Like String#swapcase, except that:
8167 *
8168 * - Changes are made to +self+, not to copy of +self+.
8169 * - Returns +self+ if any changes are made, +nil+ otherwise.
8170 *
8171 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8172 */
8173
8174static VALUE
8175rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8176{
8177 rb_encoding *enc;
8178 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8179
8180 flags = check_case_options(argc, argv, flags);
8181 str_modify_keep_cr(str);
8182 enc = str_true_enc(str);
8183 if (flags&ONIGENC_CASE_ASCII_ONLY)
8184 rb_str_ascii_casemap(str, str, &flags, enc);
8185 else
8186 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8187
8188 if (ONIGENC_CASE_MODIFIED&flags) return str;
8189 return Qnil;
8190}
8191
8192
8193/*
8194 * call-seq:
8195 * swapcase(mapping = :ascii) -> new_string
8196 *
8197 * :include: doc/string/swapcase.rdoc
8198 *
8199 */
8200
8201static VALUE
8202rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8203{
8204 rb_encoding *enc;
8205 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8206 VALUE ret;
8207
8208 flags = check_case_options(argc, argv, flags);
8209 enc = str_true_enc(str);
8210 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8211 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8212 ret = rb_str_new(0, RSTRING_LEN(str));
8213 rb_str_ascii_casemap(str, ret, &flags, enc);
8214 }
8215 else {
8216 ret = rb_str_casemap(str, &flags, enc);
8217 }
8218 return ret;
8219}
8220
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification (see trnext). */
struct tr {
    int gen;            /* non-zero while a range is being enumerated */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being enumerated */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8228
8229static unsigned int
8230trnext(struct tr *t, rb_encoding *enc)
8231{
8232 int n;
8233
8234 for (;;) {
8235 nextpart:
8236 if (!t->gen) {
8237 if (t->p == t->pend) return -1;
8238 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8239 t->p += n;
8240 }
8241 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8242 t->p += n;
8243 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8244 t->p += n;
8245 if (t->p < t->pend) {
8246 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8247 t->p += n;
8248 if (t->now > c) {
8249 if (t->now < 0x80 && c < 0x80) {
8250 rb_raise(rb_eArgError,
8251 "invalid range \"%c-%c\" in string transliteration",
8252 t->now, c);
8253 }
8254 else {
8255 rb_raise(rb_eArgError, "invalid range in string transliteration");
8256 }
8257 continue; /* not reached */
8258 }
8259 else if (t->now < c) {
8260 t->gen = 1;
8261 t->max = c;
8262 }
8263 }
8264 }
8265 return t->now;
8266 }
8267 else {
8268 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8269 if (t->now == t->max) {
8270 t->gen = 0;
8271 goto nextpart;
8272 }
8273 }
8274 if (t->now < t->max) {
8275 return t->now;
8276 }
8277 else {
8278 t->gen = 0;
8279 return t->max;
8280 }
8281 }
8282 }
8283}
8284
8285static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8286
8287static VALUE
8288tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8289{
8290 const unsigned int errc = -1;
8291 unsigned int trans[256];
8292 rb_encoding *enc, *e1, *e2;
8293 struct tr trsrc, trrepl;
8294 int cflag = 0;
8295 unsigned int c, c0, last = 0;
8296 int modify = 0, i, l;
8297 unsigned char *s, *send;
8298 VALUE hash = 0;
8299 int singlebyte = single_byte_optimizable(str);
8300 int termlen;
8301 int cr;
8302
8303#define CHECK_IF_ASCII(c) \
8304 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8305 (cr = ENC_CODERANGE_VALID) : 0)
8306
8307 StringValue(src);
8308 StringValue(repl);
8309 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8310 if (RSTRING_LEN(repl) == 0) {
8311 return rb_str_delete_bang(1, &src, str);
8312 }
8313
8314 cr = ENC_CODERANGE(str);
8315 e1 = rb_enc_check(str, src);
8316 e2 = rb_enc_check(str, repl);
8317 if (e1 == e2) {
8318 enc = e1;
8319 }
8320 else {
8321 enc = rb_enc_check(src, repl);
8322 }
8323 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8324 if (RSTRING_LEN(src) > 1 &&
8325 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8326 trsrc.p + l < trsrc.pend) {
8327 cflag = 1;
8328 trsrc.p += l;
8329 }
8330 trrepl.p = RSTRING_PTR(repl);
8331 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8332 trsrc.gen = trrepl.gen = 0;
8333 trsrc.now = trrepl.now = 0;
8334 trsrc.max = trrepl.max = 0;
8335
8336 if (cflag) {
8337 for (i=0; i<256; i++) {
8338 trans[i] = 1;
8339 }
8340 while ((c = trnext(&trsrc, enc)) != errc) {
8341 if (c < 256) {
8342 trans[c] = errc;
8343 }
8344 else {
8345 if (!hash) hash = rb_hash_new();
8346 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8347 }
8348 }
8349 while ((c = trnext(&trrepl, enc)) != errc)
8350 /* retrieve last replacer */;
8351 last = trrepl.now;
8352 for (i=0; i<256; i++) {
8353 if (trans[i] != errc) {
8354 trans[i] = last;
8355 }
8356 }
8357 }
8358 else {
8359 unsigned int r;
8360
8361 for (i=0; i<256; i++) {
8362 trans[i] = errc;
8363 }
8364 while ((c = trnext(&trsrc, enc)) != errc) {
8365 r = trnext(&trrepl, enc);
8366 if (r == errc) r = trrepl.now;
8367 if (c < 256) {
8368 trans[c] = r;
8369 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8370 }
8371 else {
8372 if (!hash) hash = rb_hash_new();
8373 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8374 }
8375 }
8376 }
8377
8378 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8379 cr = ENC_CODERANGE_7BIT;
8380 str_modify_keep_cr(str);
8381 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8382 termlen = rb_enc_mbminlen(enc);
8383 if (sflag) {
8384 int clen, tlen;
8385 long offset, max = RSTRING_LEN(str);
8386 unsigned int save = -1;
8387 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8388
8389 while (s < send) {
8390 int may_modify = 0;
8391
8392 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8393 if (!MBCLEN_CHARFOUND_P(r)) {
8394 xfree(buf);
8395 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8396 }
8397 clen = MBCLEN_CHARFOUND_LEN(r);
8398 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8399
8400 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8401
8402 s += clen;
8403 if (c < 256) {
8404 c = trans[c];
8405 }
8406 else if (hash) {
8407 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8408 if (NIL_P(tmp)) {
8409 if (cflag) c = last;
8410 else c = errc;
8411 }
8412 else if (cflag) c = errc;
8413 else c = NUM2INT(tmp);
8414 }
8415 else {
8416 c = errc;
8417 }
8418 if (c != (unsigned int)-1) {
8419 if (save == c) {
8420 CHECK_IF_ASCII(c);
8421 continue;
8422 }
8423 save = c;
8424 tlen = rb_enc_codelen(c, enc);
8425 modify = 1;
8426 }
8427 else {
8428 save = -1;
8429 c = c0;
8430 if (enc != e1) may_modify = 1;
8431 }
8432 if ((offset = t - buf) + tlen > max) {
8433 size_t MAYBE_UNUSED(old) = max + termlen;
8434 max = offset + tlen + (send - s);
8435 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8436 t = buf + offset;
8437 }
8438 rb_enc_mbcput(c, t, enc);
8439 if (may_modify && memcmp(s, t, tlen) != 0) {
8440 modify = 1;
8441 }
8442 CHECK_IF_ASCII(c);
8443 t += tlen;
8444 }
8445 if (!STR_EMBED_P(str)) {
8446 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8447 }
8448 TERM_FILL((char *)t, termlen);
8449 RSTRING(str)->as.heap.ptr = (char *)buf;
8450 STR_SET_LEN(str, t - buf);
8451 STR_SET_NOEMBED(str);
8452 RSTRING(str)->as.heap.aux.capa = max;
8453 }
8454 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8455 while (s < send) {
8456 c = (unsigned char)*s;
8457 if (trans[c] != errc) {
8458 if (!cflag) {
8459 c = trans[c];
8460 *s = c;
8461 modify = 1;
8462 }
8463 else {
8464 *s = last;
8465 modify = 1;
8466 }
8467 }
8468 CHECK_IF_ASCII(c);
8469 s++;
8470 }
8471 }
8472 else {
8473 int clen, tlen;
8474 long offset, max = (long)((send - s) * 1.2);
8475 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8476
8477 while (s < send) {
8478 int may_modify = 0;
8479
8480 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8481 if (!MBCLEN_CHARFOUND_P(r)) {
8482 xfree(buf);
8483 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8484 }
8485 clen = MBCLEN_CHARFOUND_LEN(r);
8486 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8487
8488 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8489
8490 if (c < 256) {
8491 c = trans[c];
8492 }
8493 else if (hash) {
8494 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8495 if (NIL_P(tmp)) {
8496 if (cflag) c = last;
8497 else c = errc;
8498 }
8499 else if (cflag) c = errc;
8500 else c = NUM2INT(tmp);
8501 }
8502 else {
8503 c = cflag ? last : errc;
8504 }
8505 if (c != errc) {
8506 tlen = rb_enc_codelen(c, enc);
8507 modify = 1;
8508 }
8509 else {
8510 c = c0;
8511 if (enc != e1) may_modify = 1;
8512 }
8513 if ((offset = t - buf) + tlen > max) {
8514 size_t MAYBE_UNUSED(old) = max + termlen;
8515 max = offset + tlen + (long)((send - s) * 1.2);
8516 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8517 t = buf + offset;
8518 }
8519 if (s != t) {
8520 rb_enc_mbcput(c, t, enc);
8521 if (may_modify && memcmp(s, t, tlen) != 0) {
8522 modify = 1;
8523 }
8524 }
8525 CHECK_IF_ASCII(c);
8526 s += clen;
8527 t += tlen;
8528 }
8529 if (!STR_EMBED_P(str)) {
8530 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8531 }
8532 TERM_FILL((char *)t, termlen);
8533 RSTRING(str)->as.heap.ptr = (char *)buf;
8534 STR_SET_LEN(str, t - buf);
8535 STR_SET_NOEMBED(str);
8536 RSTRING(str)->as.heap.aux.capa = max;
8537 }
8538
8539 if (modify) {
8540 if (cr != ENC_CODERANGE_BROKEN)
8541 ENC_CODERANGE_SET(str, cr);
8542 rb_enc_associate(str, enc);
8543 return str;
8544 }
8545 return Qnil;
8546}
8547
8548
8549/*
8550 * call-seq:
8551 * tr!(selector, replacements) -> self or nil
8552 *
8553 * Like String#tr, except:
8554 *
8555 * - Performs substitutions in +self+ (not in a copy of +self+).
8556 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8557 *
8558 * Related: {Modifying}[rdoc-ref:String@Modifying].
8559 */
8560
8561static VALUE
8562rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8563{
8564 return tr_trans(str, src, repl, 0);
8565}
8566
8567
8568/*
8569 * call-seq:
8570 * tr(selector, replacements) -> new_string
8571 *
8572 * Returns a copy of +self+ with each character specified by string +selector+
8573 * translated to the corresponding character in string +replacements+.
8574 * The correspondence is _positional_:
8575 *
8576 * - Each occurrence of the first character specified by +selector+
8577 * is translated to the first character in +replacements+.
8578 * - Each occurrence of the second character specified by +selector+
8579 * is translated to the second character in +replacements+.
8580 * - And so on.
8581 *
8582 * Example:
8583 *
8584 * 'hello'.tr('el', 'ip') #=> "hippo"
8585 *
8586 * If +replacements+ is shorter than +selector+,
8587 * it is implicitly padded with its own last character:
8588 *
8589 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8590 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8591 *
8592 * Arguments +selector+ and +replacements+ must be valid character selectors
8593 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8594 * and may use any of its valid forms, including negation, ranges, and escapes:
8595 *
8596 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8597 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8598 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8599 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8600 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8601 *
8602 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8603 */
8604
8605static VALUE
8606rb_str_tr(VALUE str, VALUE src, VALUE repl)
8607{
8608 str = str_duplicate(rb_cString, str);
8609 tr_trans(str, src, repl, 0);
8610 return str;
8611}
8612
/* TR_TABLE_MAX is the number of byte-indexed slots (one per unsigned char
 * value).  TR_TABLE_SIZE adds one extra slot, at index TR_TABLE_MAX, which
 * this file uses as a flag (see tr_setup_table and tr_find). */
8613#define TR_TABLE_MAX (UCHAR_MAX+1)
8614#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string `str` into the lookup state shared by
 * delete/squeeze/count.
 *
 *   stable   - per-byte membership table; after the call each slot is the
 *              AND of its previous value and this selector's membership.
 *              stable[TR_TABLE_MAX] holds a flag derived from negation.
 *   first    - non-zero for the first selector: stable is reset to all 1s
 *              and stable[TR_TABLE_MAX] is set to this selector's cflag.
 *   tablep   - hash for code points >= TR_TABLE_MAX from non-negated
 *              selectors (0 when there is none).
 *   ctablep  - hash for code points >= TR_TABLE_MAX from negated selectors.
 *   enc      - encoding passed to rb_enc_ascget and trnext.
 */
8615static void
8616tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8617 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8618{
 /* trnext() result that ends the scan loop below */
8619 const unsigned int errc = -1;
 /* membership of this selector only, for codes < TR_TABLE_MAX */
8620 char buf[TR_TABLE_MAX];
8621 struct tr tr;
8622 unsigned int c;
8623 VALUE table = 0, ptable = 0;
8624 int i, l, cflag = 0;
8625
8626 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8627 tr.gen = tr.now = tr.max = 0;
8628
 /* A leading '^' negates the selector, but only when something follows it. */
8629 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8630 cflag = 1;
8631 tr.p += l;
8632 }
8633 if (first) {
8634 for (i=0; i<TR_TABLE_MAX; i++) {
8635 stable[i] = 1;
8636 }
8637 stable[TR_TABLE_MAX] = cflag;
8638 }
 /* A later non-negated selector clears the flag slot. */
8639 else if (stable[TR_TABLE_MAX] && !cflag) {
8640 stable[TR_TABLE_MAX] = 0;
8641 }
 /* Start from "nothing listed": 0 for a plain selector, 1 for a negated one. */
8642 for (i=0; i<TR_TABLE_MAX; i++) {
8643 buf[i] = cflag;
8644 }
8645
8646 while ((c = trnext(&tr, enc)) != errc) {
8647 if (c < TR_TABLE_MAX) {
8648 buf[(unsigned char)c] = !cflag;
8649 }
8650 else {
8651 VALUE key = UINT2NUM(c);
8652
 /* Pick the hash lazily, on the first large code point seen.
 * Negated selectors accumulate into *ctablep; plain selectors get a
 * fresh hash that replaces *tablep, with the old one kept in ptable. */
8653 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8654 if (cflag) {
8655 ptable = *ctablep;
8656 table = ptable ? ptable : rb_hash_new();
8657 *ctablep = table;
8658 }
8659 else {
8660 table = rb_hash_new();
8661 ptable = *tablep;
8662 *tablep = table;
8663 }
8664 }
 /* Record the key when there is no previous hash, or when cflag differs
 * from "key is present in the previous hash". */
8665 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8666 rb_hash_aset(table, key, Qtrue);
8667 }
8668 }
8669 }
 /* Intersect this selector's byte membership with the accumulated table. */
8670 for (i=0; i<TR_TABLE_MAX; i++) {
8671 stable[i] = stable[i] && buf[i];
8672 }
 /* A plain selector that produced no hash clears *tablep. */
8673 if (!table && !cflag) {
8674 *tablep = 0;
8675 }
8676}
8677
8678
8679static int
8680tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8681{
8682 if (c < TR_TABLE_MAX) {
8683 return table[c] != 0;
8684 }
8685 else {
8686 VALUE v = UINT2NUM(c);
8687
8688 if (del) {
8689 if (!NIL_P(rb_hash_lookup(del, v)) &&
8690 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8691 return TRUE;
8692 }
8693 }
8694 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8695 return FALSE;
8696 }
8697 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8698 }
8699}
8700
8701/*
8702 * call-seq:
8703 * delete!(*selectors) -> self or nil
8704 *
8705 * Like String#delete, but modifies +self+ in place;
8706 * returns +self+ if any characters were deleted, +nil+ otherwise.
8707 *
8708 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8709 */
8710
8711static VALUE
8712rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8713{
8714 char squeez[TR_TABLE_SIZE];
8715 rb_encoding *enc = 0;
8716 char *s, *send, *t;
8717 VALUE del = 0, nodel = 0;
8718 int modify = 0;
8719 int i, ascompat, cr;
8720
8721 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8723 for (i=0; i<argc; i++) {
8724 VALUE s = argv[i];
8725
8726 StringValue(s);
8727 enc = rb_enc_check(str, s);
8728 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8729 }
8730
8731 str_modify_keep_cr(str);
8732 ascompat = rb_enc_asciicompat(enc);
8733 s = t = RSTRING_PTR(str);
8734 send = RSTRING_END(str);
8735 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8736 while (s < send) {
8737 unsigned int c;
8738 int clen;
8739
8740 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8741 if (squeez[c]) {
8742 modify = 1;
8743 }
8744 else {
8745 if (t != s) *t = c;
8746 t++;
8747 }
8748 s++;
8749 }
8750 else {
8751 c = rb_enc_codepoint_len(s, send, &clen, enc);
8752
8753 if (tr_find(c, squeez, del, nodel)) {
8754 modify = 1;
8755 }
8756 else {
8757 if (t != s) rb_enc_mbcput(c, t, enc);
8758 t += clen;
8760 }
8761 s += clen;
8762 }
8763 }
8764 TERM_FILL(t, TERM_LEN(str));
8765 STR_SET_LEN(str, t - RSTRING_PTR(str));
8766 ENC_CODERANGE_SET(str, cr);
8767
8768 if (modify) return str;
8769 return Qnil;
8770}
8771
8772
8773/*
8774 * call-seq:
8775 * delete(*selectors) -> new_string
8776 *
8777 * :include: doc/string/delete.rdoc
8778 *
8779 */
8780
8781static VALUE
8782rb_str_delete(int argc, VALUE *argv, VALUE str)
8783{
8784 str = str_duplicate(rb_cString, str);
8785 rb_str_delete_bang(argc, argv, str);
8786 return str;
8787}
8788
8789
8790/*
8791 * call-seq:
8792 * squeeze!(*selectors) -> self or nil
8793 *
8794 * Like String#squeeze, except that:
8795 *
8796 * - Characters are squeezed in +self+ (not in a copy of +self+).
8797 * - Returns +self+ if any changes are made, +nil+ otherwise.
8798 *
8799 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8800 */
8801
8802static VALUE
8803rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8804{
8805 char squeez[TR_TABLE_SIZE];
8806 rb_encoding *enc = 0;
8807 VALUE del = 0, nodel = 0;
8808 unsigned char *s, *send, *t;
8809 int i, modify = 0;
8810 int ascompat, singlebyte = single_byte_optimizable(str);
8811 unsigned int save;
8812
8813 if (argc == 0) {
8814 enc = STR_ENC_GET(str);
8815 }
8816 else {
8817 for (i=0; i<argc; i++) {
8818 VALUE s = argv[i];
8819
8820 StringValue(s);
8821 enc = rb_enc_check(str, s);
8822 if (singlebyte && !single_byte_optimizable(s))
8823 singlebyte = 0;
8824 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8825 }
8826 }
8827
8828 str_modify_keep_cr(str);
8829 s = t = (unsigned char *)RSTRING_PTR(str);
8830 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8831 send = (unsigned char *)RSTRING_END(str);
8832 save = -1;
8833 ascompat = rb_enc_asciicompat(enc);
8834
8835 if (singlebyte) {
8836 while (s < send) {
8837 unsigned int c = *s++;
8838 if (c != save || (argc > 0 && !squeez[c])) {
8839 *t++ = save = c;
8840 }
8841 }
8842 }
8843 else {
8844 while (s < send) {
8845 unsigned int c;
8846 int clen;
8847
8848 if (ascompat && (c = *s) < 0x80) {
8849 if (c != save || (argc > 0 && !squeez[c])) {
8850 *t++ = save = c;
8851 }
8852 s++;
8853 }
8854 else {
8855 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8856
8857 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8858 if (t != s) rb_enc_mbcput(c, t, enc);
8859 save = c;
8860 t += clen;
8861 }
8862 s += clen;
8863 }
8864 }
8865 }
8866
8867 TERM_FILL((char *)t, TERM_LEN(str));
8868 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8869 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8870 modify = 1;
8871 }
8872
8873 if (modify) return str;
8874 return Qnil;
8875}
8876
8877
8878/*
8879 * call-seq:
8880 * squeeze(*selectors) -> new_string
8881 *
8882 * :include: doc/string/squeeze.rdoc
8883 *
8884 */
8885
8886static VALUE
8887rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8888{
8889 str = str_duplicate(rb_cString, str);
8890 rb_str_squeeze_bang(argc, argv, str);
8891 return str;
8892}
8893
8894
8895/*
8896 * call-seq:
8897 * tr_s!(selector, replacements) -> self or nil
8898 *
8899 * Like String#tr_s, except:
8900 *
8901 * - Modifies +self+ in place (not a copy of +self+).
8902 * - Returns +self+ if any changes were made, +nil+ otherwise.
8903 *
8904 * Related: {Modifying}[rdoc-ref:String@Modifying].
8905 */
8906
8907static VALUE
8908rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8909{
8910 return tr_trans(str, src, repl, 1);
8911}
8912
8913
8914/*
8915 * call-seq:
8916 * tr_s(selector, replacements) -> new_string
8917 *
8918 * Like String#tr, except:
8919 *
8920 * - Also squeezes the modified portions of the translated string;
8921 * see String#squeeze.
8922 * - Returns the translated and squeezed string.
8923 *
8924 * Examples:
8925 *
8926 * 'hello'.tr_s('l', 'r') #=> "hero"
8927 * 'hello'.tr_s('el', '-') #=> "h-o"
8928 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8929 *
8930 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8931 *
8932 */
8933
8934static VALUE
8935rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8936{
8937 str = str_duplicate(rb_cString, str);
8938 tr_trans(str, src, repl, 1);
8939 return str;
8940}
8941
8942
8943/*
8944 * call-seq:
8945 * count(*selectors) -> integer
8946 *
8947 * :include: doc/string/count.rdoc
8948 */
8949
8950static VALUE
8951rb_str_count(int argc, VALUE *argv, VALUE str)
8952{
8953 char table[TR_TABLE_SIZE];
8954 rb_encoding *enc = 0;
8955 VALUE del = 0, nodel = 0, tstr;
8956 char *s, *send;
8957 int i;
8958 int ascompat;
8959 size_t n = 0;
8960
8962
8963 tstr = argv[0];
8964 StringValue(tstr);
8965 enc = rb_enc_check(str, tstr);
8966 if (argc == 1) {
8967 const char *ptstr;
8968 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8969 (ptstr = RSTRING_PTR(tstr),
8970 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8971 !is_broken_string(str)) {
8972 int clen;
8973 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8974
8975 s = RSTRING_PTR(str);
8976 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8977 send = RSTRING_END(str);
8978 while (s < send) {
8979 if (*(unsigned char*)s++ == c) n++;
8980 }
8981 return SIZET2NUM(n);
8982 }
8983 }
8984
8985 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8986 for (i=1; i<argc; i++) {
8987 tstr = argv[i];
8988 StringValue(tstr);
8989 enc = rb_enc_check(str, tstr);
8990 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8991 }
8992
8993 s = RSTRING_PTR(str);
8994 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8995 send = RSTRING_END(str);
8996 ascompat = rb_enc_asciicompat(enc);
8997 while (s < send) {
8998 unsigned int c;
8999
9000 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9001 if (table[c]) {
9002 n++;
9003 }
9004 s++;
9005 }
9006 else {
9007 int clen;
9008 c = rb_enc_codepoint_len(s, send, &clen, enc);
9009 if (tr_find(c, table, del, nodel)) {
9010 n++;
9011 }
9012 s += clen;
9013 }
9014 }
9015
9016 return SIZET2NUM(n);
9017}
9018
9019static VALUE
9020rb_fs_check(VALUE val)
9021{
9022 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9023 val = rb_check_string_type(val);
9024 if (NIL_P(val)) return 0;
9025 }
9026 return val;
9027}
9028
/* Byte-indexed lookup table: 1 for bytes 0x09-0x0d and 0x20, 0 for every
 * other byte value (unlisted slots are zero-initialized). */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9049
9050static long
9051split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9052{
9053 if (empty_count >= 0 && len == 0) {
9054 return empty_count + 1;
9055 }
9056 if (empty_count > 0) {
9057 /* make different substrings */
9058 if (result) {
9059 do {
9060 rb_ary_push(result, str_new_empty_String(str));
9061 } while (--empty_count > 0);
9062 }
9063 else {
9064 do {
9065 rb_yield(str_new_empty_String(str));
9066 } while (--empty_count > 0);
9067 }
9068 }
9069 str = rb_str_subseq(str, beg, len);
9070 if (result) {
9071 rb_ary_push(result, str);
9072 }
9073 else {
9074 rb_yield(str);
9075 }
9076 return empty_count;
9077}
9078
/* Splitting strategies used by String#split. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9082
9083static split_type_t
9084literal_split_pattern(VALUE spat, split_type_t default_type)
9085{
9086 rb_encoding *enc = STR_ENC_GET(spat);
9087 const char *ptr;
9088 long len;
9089 RSTRING_GETMEM(spat, ptr, len);
9090 if (len == 0) {
9091 /* Special case - split into chars */
9092 return SPLIT_TYPE_CHARS;
9093 }
9094 else if (rb_enc_asciicompat(enc)) {
9095 if (len == 1 && ptr[0] == ' ') {
9096 return SPLIT_TYPE_AWK;
9097 }
9098 }
9099 else {
9100 int l;
9101 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9102 return SPLIT_TYPE_AWK;
9103 }
9104 }
9105 return default_type;
9106}
9107
9108/*
9109 * call-seq:
9110 * split(field_sep = $;, limit = 0) -> array_of_substrings
9111 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9112 *
9113 * :include: doc/string/split.rdoc
9114 *
9115 */
9116
9117static VALUE
9118rb_str_split_m(int argc, VALUE *argv, VALUE str)
9119{
9120 rb_encoding *enc;
9121 VALUE spat;
9122 VALUE limit;
9123 split_type_t split_type;
9124 long beg, end, i = 0, empty_count = -1;
9125 int lim = 0;
9126 VALUE result, tmp;
9127
9128 result = rb_block_given_p() ? Qfalse : Qnil;
9129 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9130 lim = NUM2INT(limit);
9131 if (lim <= 0) limit = Qnil;
9132 else if (lim == 1) {
9133 if (RSTRING_LEN(str) == 0)
9134 return result ? rb_ary_new2(0) : str;
9135 tmp = str_duplicate(rb_cString, str);
9136 if (!result) {
9137 rb_yield(tmp);
9138 return str;
9139 }
9140 return rb_ary_new3(1, tmp);
9141 }
9142 i = 1;
9143 }
9144 if (NIL_P(limit) && !lim) empty_count = 0;
9145
9146 enc = STR_ENC_GET(str);
9147 split_type = SPLIT_TYPE_REGEXP;
9148 if (!NIL_P(spat)) {
9149 spat = get_pat_quoted(spat, 0);
9150 }
9151 else if (NIL_P(spat = rb_fs)) {
9152 split_type = SPLIT_TYPE_AWK;
9153 }
9154 else if (!(spat = rb_fs_check(spat))) {
9155 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9156 }
9157 else {
9158 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9159 }
9160 if (split_type != SPLIT_TYPE_AWK) {
9161 switch (BUILTIN_TYPE(spat)) {
9162 case T_REGEXP:
9163 rb_reg_options(spat); /* check if uninitialized */
9164 tmp = RREGEXP_SRC(spat);
9165 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9166 if (split_type == SPLIT_TYPE_AWK) {
9167 spat = tmp;
9168 split_type = SPLIT_TYPE_STRING;
9169 }
9170 break;
9171
9172 case T_STRING:
9173 mustnot_broken(spat);
9174 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9175 break;
9176
9177 default:
9179 }
9180 }
9181
9182#define SPLIT_STR(beg, len) ( \
9183 empty_count = split_string(result, str, beg, len, empty_count), \
9184 str_mod_check(str, str_start, str_len))
9185
9186 beg = 0;
9187 char *ptr = RSTRING_PTR(str);
9188 char *const str_start = ptr;
9189 const long str_len = RSTRING_LEN(str);
9190 char *const eptr = str_start + str_len;
9191 if (split_type == SPLIT_TYPE_AWK) {
9192 char *bptr = ptr;
9193 int skip = 1;
9194 unsigned int c;
9195
9196 if (result) result = rb_ary_new();
9197 end = beg;
9198 if (is_ascii_string(str)) {
9199 while (ptr < eptr) {
9200 c = (unsigned char)*ptr++;
9201 if (skip) {
9202 if (ascii_isspace(c)) {
9203 beg = ptr - bptr;
9204 }
9205 else {
9206 end = ptr - bptr;
9207 skip = 0;
9208 if (!NIL_P(limit) && lim <= i) break;
9209 }
9210 }
9211 else if (ascii_isspace(c)) {
9212 SPLIT_STR(beg, end-beg);
9213 skip = 1;
9214 beg = ptr - bptr;
9215 if (!NIL_P(limit)) ++i;
9216 }
9217 else {
9218 end = ptr - bptr;
9219 }
9220 }
9221 }
9222 else {
9223 while (ptr < eptr) {
9224 int n;
9225
9226 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9227 ptr += n;
9228 if (skip) {
9229 if (rb_isspace(c)) {
9230 beg = ptr - bptr;
9231 }
9232 else {
9233 end = ptr - bptr;
9234 skip = 0;
9235 if (!NIL_P(limit) && lim <= i) break;
9236 }
9237 }
9238 else if (rb_isspace(c)) {
9239 SPLIT_STR(beg, end-beg);
9240 skip = 1;
9241 beg = ptr - bptr;
9242 if (!NIL_P(limit)) ++i;
9243 }
9244 else {
9245 end = ptr - bptr;
9246 }
9247 }
9248 }
9249 }
9250 else if (split_type == SPLIT_TYPE_STRING) {
9251 char *substr_start = ptr;
9252 char *sptr = RSTRING_PTR(spat);
9253 long slen = RSTRING_LEN(spat);
9254
9255 if (result) result = rb_ary_new();
9256 mustnot_broken(str);
9257 enc = rb_enc_check(str, spat);
9258 while (ptr < eptr &&
9259 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9260 /* Check we are at the start of a char */
9261 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9262 if (t != ptr + end) {
9263 ptr = t;
9264 continue;
9265 }
9266 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9267 str_mod_check(spat, sptr, slen);
9268 ptr += end + slen;
9269 substr_start = ptr;
9270 if (!NIL_P(limit) && lim <= ++i) break;
9271 }
9272 beg = ptr - str_start;
9273 }
9274 else if (split_type == SPLIT_TYPE_CHARS) {
9275 int n;
9276
9277 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9278 mustnot_broken(str);
9279 enc = rb_enc_get(str);
9280 while (ptr < eptr &&
9281 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9282 SPLIT_STR(ptr - str_start, n);
9283 ptr += n;
9284 if (!NIL_P(limit) && lim <= ++i) break;
9285 }
9286 beg = ptr - str_start;
9287 }
9288 else {
9289 if (result) result = rb_ary_new();
9290 long len = RSTRING_LEN(str);
9291 long start = beg;
9292 long idx;
9293 int last_null = 0;
9294 struct re_registers *regs;
9295 VALUE match = 0;
9296
9297 for (; rb_reg_search(spat, str, start, 0) >= 0;
9298 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9299 match = rb_backref_get();
9300 if (!result) rb_match_busy(match);
9301 regs = RMATCH_REGS(match);
9302 end = BEG(0);
9303 if (start == end && BEG(0) == END(0)) {
9304 if (!ptr) {
9305 SPLIT_STR(0, 0);
9306 break;
9307 }
9308 else if (last_null == 1) {
9309 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9310 beg = start;
9311 }
9312 else {
9313 if (start == len)
9314 start++;
9315 else
9316 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9317 last_null = 1;
9318 continue;
9319 }
9320 }
9321 else {
9322 SPLIT_STR(beg, end-beg);
9323 beg = start = END(0);
9324 }
9325 last_null = 0;
9326
9327 for (idx=1; idx < regs->num_regs; idx++) {
9328 if (BEG(idx) == -1) continue;
9329 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9330 }
9331 if (!NIL_P(limit) && lim <= ++i) break;
9332 }
9333 if (match) rb_match_unbusy(match);
9334 }
9335 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9336 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9337 }
9338
9339 return result ? result : str;
9340}
9341
9342VALUE
9343rb_str_split(VALUE str, const char *sep0)
9344{
9345 VALUE sep;
9346
9347 StringValue(str);
9348 sep = rb_str_new_cstr(sep0);
9349 return rb_str_split_m(1, &sep, str);
9350}
9351
/* Yields 0 when a block is given, otherwise a new array with capacity
 * `size`.  The first argument `m` is not used by the expansion. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
9353
9354static inline int
9355enumerator_element(VALUE ary, VALUE e)
9356{
9357 if (ary) {
9358 rb_ary_push(ary, e);
9359 return 0;
9360 }
9361 else {
9362 rb_yield(e);
9363 return 1;
9364 }
9365}
9366
9367#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9368
9369static const char *
9370chomp_newline(const char *p, const char *e, rb_encoding *enc)
9371{
9372 const char *prev = rb_enc_prev_char(p, e, e, enc);
9373 if (rb_enc_is_newline(prev, e, enc)) {
9374 e = prev;
9375 prev = rb_enc_prev_char(p, e, e, enc);
9376 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9377 e = prev;
9378 }
9379 return e;
9380}
9381
9382static VALUE
9383get_rs(void)
9384{
9385 VALUE rs = rb_rs;
9386 if (!NIL_P(rs) &&
9387 (!RB_TYPE_P(rs, T_STRING) ||
9388 RSTRING_LEN(rs) != 1 ||
9389 RSTRING_PTR(rs)[0] != '\n')) {
9390 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9391 }
9392 return rs;
9393}
9394
9395#define rb_rs get_rs()
9396
/*
 * Shared worker for the line enumeration methods.  Splits `str` into
 * records terminated by the separator and delivers each one through
 * ENUM_ELEM: pushed onto `ary` when it is non-zero, yielded otherwise.
 *
 * Arguments (argc/argv): an optional record separator and an optional
 * `chomp:` keyword.  Returns `ary` in array mode and the original receiver
 * in block mode.
 */
9397static VALUE
9398rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9399{
9400 rb_encoding *enc;
9401 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9402 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9403 long pos, len, rslen;
9404 int rsnewline = 0;
9405
 /* No positional separator given: fall back to rb_rs. */
9406 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9407 rs = rb_rs;
9408 if (!NIL_P(opts)) {
9409 static ID keywords[1];
9410 if (!keywords[0]) {
9411 keywords[0] = rb_intern_const("chomp");
9412 }
9413 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
 /* From here on chomp is a plain C truth value, not a Ruby object. */
9414 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9415 }
9416
 /* nil separator: the whole string is the single record. */
9417 if (NIL_P(rs)) {
9418 if (!ENUM_ELEM(ary, str)) {
9419 return ary;
9420 }
9421 else {
9422 return orig;
9423 }
9424 }
9425
9426 if (!RSTRING_LEN(str)) goto end;
 /* Work on a frozen copy; ptr/pend/len describe that copy. */
9427 str = rb_str_new_frozen(str);
9428 ptr = subptr = RSTRING_PTR(str);
9429 pend = RSTRING_END(str);
9430 len = RSTRING_LEN(str);
9431 StringValue(rs);
9432 rslen = RSTRING_LEN(rs);
9433
 /* The default separator skips the encoding compatibility check. */
9434 if (rs == rb_default_rs)
9435 enc = rb_enc_get(str);
9436 else
9437 enc = rb_enc_check(str, rs);
9438
9439 if (rslen == 0) {
9440 /* paragraph mode */
9441 int n;
9442 const char *eol = NULL;
9443 subend = subptr;
9444 while (subend < pend) {
9445 long chomp_rslen = 0;
9446 do {
 /* n is the width of a leading '\r', or 0 when there is none. */
9447 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9448 n = 0;
9449 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9450 if (rb_enc_is_newline(subend + n, pend, enc)) {
 /* A newline right after the previous end-of-line ends the inner scan. */
9451 if (eol == subend) break;
9452 subend += rslen;
9453 if (subptr) {
9454 eol = subend;
9455 chomp_rslen = -rslen;
9456 }
9457 }
9458 else {
 /* First non-newline character after a break starts a new record. */
9459 if (!subptr) subptr = subend;
9460 subend += rslen;
9461 }
9462 rslen = 0;
9463 } while (subend < pend);
9464 if (!subptr) break;
9465 if (rslen == 0) chomp_rslen = 0;
9466 line = rb_str_subseq(str, subptr - ptr,
9467 subend - subptr + (chomp ? chomp_rslen : rslen));
 /* ENUM_ELEM is non-zero only in block mode, so str_mod_check runs
 * only after a yield. */
9468 if (ENUM_ELEM(ary, line)) {
9469 str_mod_check(str, ptr, len);
9470 }
9471 subptr = eol = NULL;
9472 }
9473 goto end;
9474 }
9475 else {
9476 rsptr = RSTRING_PTR(rs);
 /* rsnewline: the separator is a single minimum-width newline character;
 * chomp then uses chomp_newline() instead of cutting rslen bytes. */
9477 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9478 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9479 rsnewline = 1;
9480 }
9481 }
9482
 /* The default separator is re-encoded into the receiver's encoding when
 * that encoding is not ASCII-compatible. */
9483 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9484 rs = rb_str_new(rsptr, rslen);
9485 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9486 rsptr = RSTRING_PTR(rs);
9487 rslen = RSTRING_LEN(rs);
9488 }
9489
9490 while (subptr < pend) {
9491 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9492 if (pos < 0) break;
9493 hit = subptr + pos;
 /* Ignore byte matches that do not start on a character boundary. */
9494 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9495 if (hit != adjusted) {
9496 subptr = adjusted;
9497 continue;
9498 }
 /* hit now points just past the separator; the next record starts there. */
9499 subend = hit += rslen;
9500 if (chomp) {
9501 if (rsnewline) {
9502 subend = chomp_newline(subptr, subend, enc);
9503 }
9504 else {
9505 subend -= rslen;
9506 }
9507 }
9508 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9509 if (ENUM_ELEM(ary, line)) {
9510 str_mod_check(str, ptr, len);
9511 }
9512 subptr = hit;
9513 }
9514
 /* Trailing record with no separator found after subptr. */
9515 if (subptr != pend) {
9516 if (chomp) {
9517 if (rsnewline) {
9518 pend = chomp_newline(subptr, pend, enc);
9519 }
9520 else if (pend - subptr >= rslen &&
9521 memcmp(pend - rslen, rsptr, rslen) == 0) {
9522 pend -= rslen;
9523 }
9524 }
9525 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9526 ENUM_ELEM(ary, line);
9527 RB_GC_GUARD(str);
9528 }
9529
9530 end:
9531 if (ary)
9532 return ary;
9533 else
9534 return orig;
9535}
9536
9537/*
9538 * call-seq:
9539 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9540 * each_line(record_separator = $/, chomp: false) -> enumerator
9541 *
9542 * :include: doc/string/each_line.rdoc
9543 *
9544 */
9545
9546static VALUE
9547rb_str_each_line(int argc, VALUE *argv, VALUE str)
9548{
9549 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9550 return rb_str_enumerate_lines(argc, argv, str, 0);
9551}
9552
9553/*
9554 * call-seq:
9555 * lines(record_separator = $/, chomp: false) -> array_of_strings
9556 *
9557 * Returns substrings ("lines") of +self+
9558 * according to the given arguments:
9559 *
9560 * s = <<~EOT
9561 * This is the first line.
9562 * This is line two.
9563 *
9564 * This is line four.
9565 * This is line five.
9566 * EOT
9567 *
9568 * With the default argument values:
9569 *
9570 * $/ # => "\n"
9571 * s.lines
9572 * # =>
9573 * ["This is the first line.\n",
9574 * "This is line two.\n",
9575 * "\n",
9576 * "This is line four.\n",
9577 * "This is line five.\n"]
9578 *
9579 * With a different +record_separator+:
9580 *
9581 * record_separator = ' is '
9582 * s.lines(record_separator)
9583 * # =>
9584 * ["This is ",
9585 * "the first line.\nThis is ",
9586 * "line two.\n\nThis is ",
9587 * "line four.\nThis is ",
9588 * "line five.\n"]
9589 *
9590 * With keyword argument +chomp+ as +true+,
9591 * removes the trailing newline from each line:
9592 *
9593 * s.lines(chomp: true)
9594 * # =>
9595 * ["This is the first line.",
9596 * "This is line two.",
9597 * "",
9598 * "This is line four.",
9599 * "This is line five."]
9600 *
9601 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9602 */
9603
9604static VALUE
9605rb_str_lines(int argc, VALUE *argv, VALUE str)
9606{
9607 VALUE ary = WANTARRAY("lines", 0);
9608 return rb_str_enumerate_lines(argc, argv, str, ary);
9609}
9610
9611static VALUE
9612rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9613{
9614 return LONG2FIX(RSTRING_LEN(str));
9615}
9616
9617static VALUE
9618rb_str_enumerate_bytes(VALUE str, VALUE ary)
9619{
9620 long i;
9621
9622 for (i=0; i<RSTRING_LEN(str); i++) {
9623 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9624 }
9625 if (ary)
9626 return ary;
9627 else
9628 return str;
9629}
9630
9631/*
9632 * call-seq:
9633 * each_byte {|byte| ... } -> self
9634 * each_byte -> enumerator
9635 *
9636 * :include: doc/string/each_byte.rdoc
9637 *
9638 */
9639
9640static VALUE
9641rb_str_each_byte(VALUE str)
9642{
9643 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9644 return rb_str_enumerate_bytes(str, 0);
9645}
9646
9647/*
9648 * call-seq:
9649 * bytes -> array_of_bytes
9650 *
9651 * :include: doc/string/bytes.rdoc
9652 *
9653 */
9654
9655static VALUE
9656rb_str_bytes(VALUE str)
9657{
9658 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9659 return rb_str_enumerate_bytes(str, ary);
9660}
9661
9662static VALUE
9663rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9664{
9665 return rb_str_length(str);
9666}
9667
9668static VALUE
9669rb_str_enumerate_chars(VALUE str, VALUE ary)
9670{
9671 VALUE orig = str;
9672 long i, len, n;
9673 const char *ptr;
9674 rb_encoding *enc;
9675
9676 str = rb_str_new_frozen(str);
9677 ptr = RSTRING_PTR(str);
9678 len = RSTRING_LEN(str);
9679 enc = rb_enc_get(str);
9680
9682 for (i = 0; i < len; i += n) {
9683 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9684 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9685 }
9686 }
9687 else {
9688 for (i = 0; i < len; i += n) {
9689 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9690 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9691 }
9692 }
9693 RB_GC_GUARD(str);
9694 if (ary)
9695 return ary;
9696 else
9697 return orig;
9698}
9699
9700/*
9701 * call-seq:
9702 * each_char {|char| ... } -> self
9703 * each_char -> enumerator
9704 *
9705 * :include: doc/string/each_char.rdoc
9706 *
9707 */
9708
9709static VALUE
9710rb_str_each_char(VALUE str)
9711{
9712 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9713 return rb_str_enumerate_chars(str, 0);
9714}
9715
9716/*
9717 * call-seq:
9718 * chars -> array_of_characters
9719 *
9720 * :include: doc/string/chars.rdoc
9721 *
9722 */
9723
9724static VALUE
9725rb_str_chars(VALUE str)
9726{
9727 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9728 return rb_str_enumerate_chars(str, ary);
9729}
9730
9731static VALUE
9732rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9733{
9734 VALUE orig = str;
9735 int n;
9736 unsigned int c;
9737 const char *ptr, *end;
9738 rb_encoding *enc;
9739
9740 if (single_byte_optimizable(str))
9741 return rb_str_enumerate_bytes(str, ary);
9742
9743 str = rb_str_new_frozen(str);
9744 ptr = RSTRING_PTR(str);
9745 end = RSTRING_END(str);
9746 enc = STR_ENC_GET(str);
9747
9748 while (ptr < end) {
9749 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9750 ENUM_ELEM(ary, UINT2NUM(c));
9751 ptr += n;
9752 }
9753 RB_GC_GUARD(str);
9754 if (ary)
9755 return ary;
9756 else
9757 return orig;
9758}
9759
9760/*
9761 * call-seq:
9762 * each_codepoint {|codepoint| ... } -> self
9763 * each_codepoint -> enumerator
9764 *
9765 * :include: doc/string/each_codepoint.rdoc
9766 *
9767 */
9768
9769static VALUE
9770rb_str_each_codepoint(VALUE str)
9771{
9772 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9773 return rb_str_enumerate_codepoints(str, 0);
9774}
9775
9776/*
9777 * call-seq:
9778 * codepoints -> array_of_integers
9779 *
9780 * :include: doc/string/codepoints.rdoc
9781 *
9782 */
9783
9784static VALUE
9785rb_str_codepoints(VALUE str)
9786{
9787 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9788 return rb_str_enumerate_codepoints(str, ary);
9789}
9790
/*
 * Compiles the pattern \X for encoding `enc` with onig_new() and returns
 * the new regex object.  Every call compiles a fresh object; callers that
 * do not cache it release it with onig_free().  A compile failure is
 * reported through rb_fatal().
 */
9791static regex_t *
9792get_reg_grapheme_cluster(rb_encoding *enc)
9793{
9794 int encidx = rb_enc_to_index(enc);
9795
 /* Default pattern bytes: the two ASCII characters '\' and 'X'. */
9796 const OnigUChar source_ascii[] = "\\X";
9797 const OnigUChar *source = source_ascii;
9798 size_t source_len = sizeof(source_ascii) - 1;
9799
 /* For UTF-16/UTF-32 (BE and LE) the same two characters are spelled out
 * as 2- or 4-byte units in the matching byte order.  Each CASE_UTF(e)
 * expands to one `case` that points source/source_len at that array. */
9800 switch (encidx) {
9801#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9802#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9803#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9804#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9805#define CASE_UTF(e) \
9806 case ENCINDEX_UTF_##e: { \
9807 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9808 source = source_UTF_##e; \
9809 source_len = sizeof(source_UTF_##e); \
9810 break; \
9811 }
9812 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9813#undef CASE_UTF
9814#undef CHARS_16BE
9815#undef CHARS_16LE
9816#undef CHARS_32BE
9817#undef CHARS_32LE
9818 }
9819
9820 regex_t *reg_grapheme_cluster;
9821 OnigErrorInfo einfo;
9822 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9823 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
 /* A non-zero result is treated as a compile error. */
9824 if (r) {
9825 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9826 onig_error_code_to_str(message, r, &einfo);
9827 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9828 }
9829
9830 return reg_grapheme_cluster;
9831}
9832
9833static regex_t *
9834get_cached_reg_grapheme_cluster(rb_encoding *enc)
9835{
9836 int encidx = rb_enc_to_index(enc);
9837 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9838
9839 if (encidx == rb_utf8_encindex()) {
9840 if (!reg_grapheme_cluster_utf8) {
9841 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9842 }
9843
9844 return reg_grapheme_cluster_utf8;
9845 }
9846
9847 return NULL;
9848}
9849
9850static VALUE
9851rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9852{
9853 size_t grapheme_cluster_count = 0;
9854 rb_encoding *enc = get_encoding(str);
9855 const char *ptr, *end;
9856
9857 if (!rb_enc_unicode_p(enc)) {
9858 return rb_str_length(str);
9859 }
9860
9861 bool cached_reg_grapheme_cluster = true;
9862 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9863 if (!reg_grapheme_cluster) {
9864 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9865 cached_reg_grapheme_cluster = false;
9866 }
9867
9868 ptr = RSTRING_PTR(str);
9869 end = RSTRING_END(str);
9870
9871 while (ptr < end) {
9872 OnigPosition len = onig_match(reg_grapheme_cluster,
9873 (const OnigUChar *)ptr, (const OnigUChar *)end,
9874 (const OnigUChar *)ptr, NULL, 0);
9875 if (len <= 0) break;
9876 grapheme_cluster_count++;
9877 ptr += len;
9878 }
9879
9880 if (!cached_reg_grapheme_cluster) {
9881 onig_free(reg_grapheme_cluster);
9882 }
9883
9884 return SIZET2NUM(grapheme_cluster_count);
9885}
9886
9887static VALUE
9888rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9889{
9890 VALUE orig = str;
9891 rb_encoding *enc = get_encoding(str);
9892 const char *ptr0, *ptr, *end;
9893
9894 if (!rb_enc_unicode_p(enc)) {
9895 return rb_str_enumerate_chars(str, ary);
9896 }
9897
9898 if (!ary) str = rb_str_new_frozen(str);
9899
9900 bool cached_reg_grapheme_cluster = true;
9901 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9902 if (!reg_grapheme_cluster) {
9903 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9904 cached_reg_grapheme_cluster = false;
9905 }
9906
9907 ptr0 = ptr = RSTRING_PTR(str);
9908 end = RSTRING_END(str);
9909
9910 while (ptr < end) {
9911 OnigPosition len = onig_match(reg_grapheme_cluster,
9912 (const OnigUChar *)ptr, (const OnigUChar *)end,
9913 (const OnigUChar *)ptr, NULL, 0);
9914 if (len <= 0) break;
9915 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9916 ptr += len;
9917 }
9918
9919 if (!cached_reg_grapheme_cluster) {
9920 onig_free(reg_grapheme_cluster);
9921 }
9922
9923 RB_GC_GUARD(str);
9924 if (ary)
9925 return ary;
9926 else
9927 return orig;
9928}
9929
9930/*
9931 * call-seq:
9932 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9933 * each_grapheme_cluster -> enumerator
9934 *
9935 * :include: doc/string/each_grapheme_cluster.rdoc
9936 *
9937 */
9938
9939static VALUE
9940rb_str_each_grapheme_cluster(VALUE str)
9941{
9942 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9943 return rb_str_enumerate_grapheme_clusters(str, 0);
9944}
9945
9946/*
9947 * call-seq:
9948 * grapheme_clusters -> array_of_grapheme_clusters
9949 *
9950 * :include: doc/string/grapheme_clusters.rdoc
9951 *
9952 */
9953
9954static VALUE
9955rb_str_grapheme_clusters(VALUE str)
9956{
9957 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9958 return rb_str_enumerate_grapheme_clusters(str, ary);
9959}
9960
9961static long
9962chopped_length(VALUE str)
9963{
9964 rb_encoding *enc = STR_ENC_GET(str);
9965 const char *p, *p2, *beg, *end;
9966
9967 beg = RSTRING_PTR(str);
9968 end = beg + RSTRING_LEN(str);
9969 if (beg >= end) return 0;
9970 p = rb_enc_prev_char(beg, end, end, enc);
9971 if (!p) return 0;
9972 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9973 p2 = rb_enc_prev_char(beg, p, end, enc);
9974 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9975 }
9976 return p - beg;
9977}
9978
9979/*
9980 * call-seq:
9981 * chop! -> self or nil
9982 *
9983 * Like String#chop, except that:
9984 *
9985 * - Removes trailing characters from +self+ (not from a copy of +self+).
9986 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9987 *
9988 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9989 */
9990
9991static VALUE
9992rb_str_chop_bang(VALUE str)
9993{
9994 str_modify_keep_cr(str);
9995 if (RSTRING_LEN(str) > 0) {
9996 long len;
9997 len = chopped_length(str);
9998 STR_SET_LEN(str, len);
9999 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10000 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10002 }
10003 return str;
10004 }
10005 return Qnil;
10006}
10007
10008
10009/*
10010 * call-seq:
10011 * chop -> new_string
10012 *
10013 * :include: doc/string/chop.rdoc
10014 *
10015 */
10016
10017static VALUE
10018rb_str_chop(VALUE str)
10019{
10020 return rb_str_subseq(str, 0, chopped_length(str));
10021}
10022
10023static long
10024smart_chomp(VALUE str, const char *e, const char *p)
10025{
10026 rb_encoding *enc = rb_enc_get(str);
10027 if (rb_enc_mbminlen(enc) > 1) {
10028 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10029 if (rb_enc_is_newline(pp, e, enc)) {
10030 e = pp;
10031 }
10032 pp = e - rb_enc_mbminlen(enc);
10033 if (pp >= p) {
10034 pp = rb_enc_left_char_head(p, pp, e, enc);
10035 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10036 e = pp;
10037 }
10038 }
10039 }
10040 else {
10041 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10042 case '\n':
10043 if (--e > p && *(e-1) == '\r') {
10044 --e;
10045 }
10046 break;
10047 case '\r':
10048 --e;
10049 break;
10050 }
10051 }
10052 return e - p;
10053}
10054
10055static long
10056chompped_length(VALUE str, VALUE rs)
10057{
10058 rb_encoding *enc;
10059 int newline;
10060 char *pp, *e, *rsptr;
10061 long rslen;
10062 char *const p = RSTRING_PTR(str);
10063 long len = RSTRING_LEN(str);
10064
10065 if (len == 0) return 0;
10066 e = p + len;
10067 if (rs == rb_default_rs) {
10068 return smart_chomp(str, e, p);
10069 }
10070
10071 enc = rb_enc_get(str);
10072 RSTRING_GETMEM(rs, rsptr, rslen);
10073 if (rslen == 0) {
10074 if (rb_enc_mbminlen(enc) > 1) {
10075 while (e > p) {
10076 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10077 if (!rb_enc_is_newline(pp, e, enc)) break;
10078 e = pp;
10079 pp -= rb_enc_mbminlen(enc);
10080 if (pp >= p) {
10081 pp = rb_enc_left_char_head(p, pp, e, enc);
10082 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10083 e = pp;
10084 }
10085 }
10086 }
10087 }
10088 else {
10089 while (e > p && *(e-1) == '\n') {
10090 --e;
10091 if (e > p && *(e-1) == '\r')
10092 --e;
10093 }
10094 }
10095 return e - p;
10096 }
10097 if (rslen > len) return len;
10098
10099 enc = rb_enc_get(rs);
10100 newline = rsptr[rslen-1];
10101 if (rslen == rb_enc_mbminlen(enc)) {
10102 if (rslen == 1) {
10103 if (newline == '\n')
10104 return smart_chomp(str, e, p);
10105 }
10106 else {
10107 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10108 return smart_chomp(str, e, p);
10109 }
10110 }
10111
10112 enc = rb_enc_check(str, rs);
10113 if (is_broken_string(rs)) {
10114 return len;
10115 }
10116 pp = e - rslen;
10117 if (p[len-1] == newline &&
10118 (rslen <= 1 ||
10119 memcmp(rsptr, pp, rslen) == 0)) {
10120 if (at_char_boundary(p, pp, e, enc))
10121 return len - rslen;
10122 RB_GC_GUARD(rs);
10123 }
10124 return len;
10125}
10126
10132static VALUE
10133chomp_rs(int argc, const VALUE *argv)
10134{
10135 rb_check_arity(argc, 0, 1);
10136 if (argc > 0) {
10137 VALUE rs = argv[0];
10138 if (!NIL_P(rs)) StringValue(rs);
10139 return rs;
10140 }
10141 else {
10142 return rb_rs;
10143 }
10144}
10145
10146VALUE
10147rb_str_chomp_string(VALUE str, VALUE rs)
10148{
10149 long olen = RSTRING_LEN(str);
10150 long len = chompped_length(str, rs);
10151 if (len >= olen) return Qnil;
10152 str_modify_keep_cr(str);
10153 STR_SET_LEN(str, len);
10154 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10155 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10157 }
10158 return str;
10159}
10160
10161/*
10162 * call-seq:
10163 * chomp!(line_sep = $/) -> self or nil
10164 *
10165 * Like String#chomp, except that:
10166 *
10167 * - Removes trailing characters from +self+ (not from a copy of +self+).
10168 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10169 *
10170 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10171 */
10172
10173static VALUE
10174rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10175{
10176 VALUE rs;
10177 str_modifiable(str);
10178 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10179 rs = chomp_rs(argc, argv);
10180 if (NIL_P(rs)) return Qnil;
10181 return rb_str_chomp_string(str, rs);
10182}
10183
10184
10185/*
10186 * call-seq:
10187 * chomp(line_sep = $/) -> new_string
10188 *
10189 * :include: doc/string/chomp.rdoc
10190 *
10191 */
10192
10193static VALUE
10194rb_str_chomp(int argc, VALUE *argv, VALUE str)
10195{
10196 VALUE rs = chomp_rs(argc, argv);
10197 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10198 return rb_str_subseq(str, 0, chompped_length(str, rs));
10199}
10200
10201static void
10202tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10203 VALUE str, int num_selectors, VALUE *selectors)
10204{
10205 int i;
10206
10207 for (i=0; i<num_selectors; i++) {
10208 VALUE selector = selectors[i];
10209 rb_encoding *enc;
10210
10211 StringValue(selector);
10212 enc = rb_enc_check(str, selector);
10213 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10214 }
10215}
10216
10217static long
10218lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10219{
10220 const char *const start = s;
10221
10222 if (!s || s >= e) return 0;
10223
10224 /* remove spaces at head */
10225 if (single_byte_optimizable(str)) {
10226 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10227 }
10228 else {
10229 while (s < e) {
10230 int n;
10231 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10232
10233 if (cc && !rb_isspace(cc)) break;
10234 s += n;
10235 }
10236 }
10237 return s - start;
10238}
10239
10240static long
10241lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10242 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10243{
10244 const char *const start = s;
10245
10246 if (!s || s >= e) return 0;
10247
10248 /* remove leading characters in the table */
10249 while (s < e) {
10250 int n;
10251 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10252
10253 if (!tr_find(cc, table, del, nodel)) break;
10254 s += n;
10255 }
10256 return s - start;
10257}
10258
10259/*
10260 * call-seq:
10261 * lstrip!(*selectors) -> self or nil
10262 *
10263 * Like String#lstrip, except that:
10264 *
10265 * - Performs stripping in +self+ (not in a copy of +self+).
10266 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10267 *
10268 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10269 */
10270
10271static VALUE
10272rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10273{
10274 rb_encoding *enc;
10275 char *start, *s;
10276 long olen, loffset;
10277
10278 str_modify_keep_cr(str);
10279 enc = STR_ENC_GET(str);
10280 RSTRING_GETMEM(str, start, olen);
10281 if (argc > 0) {
10282 char table[TR_TABLE_SIZE];
10283 VALUE del = 0, nodel = 0;
10284
10285 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10286 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10287 }
10288 else {
10289 loffset = lstrip_offset(str, start, start+olen, enc);
10290 }
10291
10292 if (loffset > 0) {
10293 long len = olen-loffset;
10294 s = start + loffset;
10295 memmove(start, s, len);
10296 STR_SET_LEN(str, len);
10297 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10298 return str;
10299 }
10300 return Qnil;
10301}
10302
10303
10304/*
10305 * call-seq:
10306 * lstrip(*selectors) -> new_string
10307 *
10308 * Returns a copy of +self+ with leading whitespace removed;
10309 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10310 *
10311 * whitespace = "\x00\t\n\v\f\r "
10312 * s = whitespace + 'abc' + whitespace
10313 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10314 * s.lstrip
10315 * # => "abc\u0000\t\n\v\f\r "
10316 *
10317 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10318 *
10319 * s = "---abc+++"
10320 * s.lstrip("-") # => "abc+++"
10321 *
10322 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10323 * and may use any of its valid forms, including negation, ranges, and escapes:
10324 *
10325 * "01234abc56789".lstrip("0-9") # "abc56789"
10326 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10327 *
10328 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10329 */
10330
10331static VALUE
10332rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10333{
10334 char *start;
10335 long len, loffset;
10336
10337 RSTRING_GETMEM(str, start, len);
10338 if (argc > 0) {
10339 char table[TR_TABLE_SIZE];
10340 VALUE del = 0, nodel = 0;
10341
10342 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10343 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10344 }
10345 else {
10346 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10347 }
10348 if (loffset <= 0) return str_duplicate(rb_cString, str);
10349 return rb_str_subseq(str, loffset, len - loffset);
10350}
10351
10352static long
10353rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10354{
10355 const char *t;
10356
10357 rb_str_check_dummy_enc(enc);
10359 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10360 }
10361 if (!s || s >= e) return 0;
10362 t = e;
10363
10364 /* remove trailing spaces or '\0's */
10365 if (single_byte_optimizable(str)) {
10366 unsigned char c;
10367 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10368 }
10369 else {
10370 char *tp;
10371
10372 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10373 unsigned int c = rb_enc_codepoint(tp, e, enc);
10374 if (c && !rb_isspace(c)) break;
10375 t = tp;
10376 }
10377 }
10378 return e - t;
10379}
10380
10381static long
10382rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10383 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10384{
10385 const char *t;
10386 char *tp;
10387
10388 rb_str_check_dummy_enc(enc);
10390 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10391 }
10392 if (!s || s >= e) return 0;
10393 t = e;
10394
10395 /* remove trailing characters in the table */
10396 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10397 unsigned int c = rb_enc_codepoint(tp, e, enc);
10398 if (!tr_find(c, table, del, nodel)) break;
10399 t = tp;
10400 }
10401
10402 return e - t;
10403}
10404
10405/*
10406 * call-seq:
10407 * rstrip!(*selectors) -> self or nil
10408 *
10409 * Like String#rstrip, except that:
10410 *
10411 * - Performs stripping in +self+ (not in a copy of +self+).
10412 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10413 *
10414 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10415 */
10416
10417static VALUE
10418rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10419{
10420 rb_encoding *enc;
10421 char *start;
10422 long olen, roffset;
10423
10424 str_modify_keep_cr(str);
10425 enc = STR_ENC_GET(str);
10426 RSTRING_GETMEM(str, start, olen);
10427 if (argc > 0) {
10428 char table[TR_TABLE_SIZE];
10429 VALUE del = 0, nodel = 0;
10430
10431 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10432 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10433 }
10434 else {
10435 roffset = rstrip_offset(str, start, start+olen, enc);
10436 }
10437 if (roffset > 0) {
10438 long len = olen - roffset;
10439
10440 STR_SET_LEN(str, len);
10441 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10442 return str;
10443 }
10444 return Qnil;
10445}
10446
10447
10448/*
10449 * call-seq:
10450 * rstrip(*selectors) -> new_string
10451 *
10452 * Returns a copy of +self+ with trailing whitespace removed;
10453 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10454 *
10455 * whitespace = "\x00\t\n\v\f\r "
10456 * s = whitespace + 'abc' + whitespace
10457 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10458 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10459 *
10460 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10461 *
10462 * s = "---abc+++"
10463 * s.rstrip("+") # => "---abc"
10464 *
10465 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10466 * and may use any of its valid forms, including negation, ranges, and escapes:
10467 *
10468 * "01234abc56789".rstrip("0-9") # "01234abc"
10469 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10470 *
10471 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10472 */
10473
10474static VALUE
10475rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10476{
10477 rb_encoding *enc;
10478 char *start;
10479 long olen, roffset;
10480
10481 enc = STR_ENC_GET(str);
10482 RSTRING_GETMEM(str, start, olen);
10483 if (argc > 0) {
10484 char table[TR_TABLE_SIZE];
10485 VALUE del = 0, nodel = 0;
10486
10487 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10488 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10489 }
10490 else {
10491 roffset = rstrip_offset(str, start, start+olen, enc);
10492 }
10493 if (roffset <= 0) return str_duplicate(rb_cString, str);
10494 return rb_str_subseq(str, 0, olen-roffset);
10495}
10496
10497
10498/*
10499 * call-seq:
10500 * strip!(*selectors) -> self or nil
10501 *
10502 * Like String#strip, except that:
10503 *
10504 * - Any modifications are made to +self+.
 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10506 *
10507 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10508 */
10509
10510static VALUE
10511rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10512{
10513 char *start;
10514 long olen, loffset, roffset;
10515 rb_encoding *enc;
10516
10517 str_modify_keep_cr(str);
10518 enc = STR_ENC_GET(str);
10519 RSTRING_GETMEM(str, start, olen);
10520
10521 if (argc > 0) {
10522 char table[TR_TABLE_SIZE];
10523 VALUE del = 0, nodel = 0;
10524
10525 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10526 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10527 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10528 }
10529 else {
10530 loffset = lstrip_offset(str, start, start+olen, enc);
10531 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10532 }
10533
10534 if (loffset > 0 || roffset > 0) {
10535 long len = olen-roffset;
10536 if (loffset > 0) {
10537 len -= loffset;
10538 memmove(start, start + loffset, len);
10539 }
10540 STR_SET_LEN(str, len);
10541 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10542 return str;
10543 }
10544 return Qnil;
10545}
10546
10547
10548/*
10549 * call-seq:
10550 * strip(*selectors) -> new_string
10551 *
10552 * Returns a copy of +self+ with leading and trailing whitespace removed;
10553 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10554 *
10555 * whitespace = "\x00\t\n\v\f\r "
10556 * s = whitespace + 'abc' + whitespace
10557 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10558 * s.strip # => "abc"
10559 *
10560 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10561 *
10562 * s = "---abc+++"
10563 * s.strip("-+") # => "abc"
10564 * s.strip("+-") # => "abc"
10565 *
10566 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10567 * and may use any of its valid forms, including negation, ranges, and escapes:
10568 *
10569 * "01234abc56789".strip("0-9") # "abc"
10570 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10571 *
10572 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10573 */
10574
10575static VALUE
10576rb_str_strip(int argc, VALUE *argv, VALUE str)
10577{
10578 char *start;
10579 long olen, loffset, roffset;
10580 rb_encoding *enc = STR_ENC_GET(str);
10581
10582 RSTRING_GETMEM(str, start, olen);
10583
10584 if (argc > 0) {
10585 char table[TR_TABLE_SIZE];
10586 VALUE del = 0, nodel = 0;
10587
10588 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10589 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10590 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10591 }
10592 else {
10593 loffset = lstrip_offset(str, start, start+olen, enc);
10594 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10595 }
10596
10597 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10598 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10599}
10600
10601static VALUE
10602scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10603{
10604 VALUE result = Qnil;
10605 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10606 if (pos >= 0) {
10607 VALUE match;
10608 struct re_registers *regs;
10609 if (BUILTIN_TYPE(pat) == T_STRING) {
10610 regs = NULL;
10611 end = pos + RSTRING_LEN(pat);
10612 }
10613 else {
10614 match = rb_backref_get();
10615 regs = RMATCH_REGS(match);
10616 pos = BEG(0);
10617 end = END(0);
10618 }
10619
10620 if (pos == end) {
10621 rb_encoding *enc = STR_ENC_GET(str);
10622 /*
10623 * Always consume at least one character of the input string
10624 */
10625 if (RSTRING_LEN(str) > end)
10626 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10627 RSTRING_END(str), enc);
10628 else
10629 *start = end + 1;
10630 }
10631 else {
10632 *start = end;
10633 }
10634
10635 if (!regs || regs->num_regs == 1) {
10636 result = rb_str_subseq(str, pos, end - pos);
10637 return result;
10638 }
10639 else {
10640 result = rb_ary_new2(regs->num_regs);
10641 for (int i = 1; i < regs->num_regs; i++) {
10642 VALUE s = Qnil;
10643 if (BEG(i) >= 0) {
10644 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10645 }
10646
10647 rb_ary_push(result, s);
10648 }
10649 }
10650
10651 RB_GC_GUARD(match);
10652 }
10653
10654 return result;
10655}
10656
10657
10658/*
10659 * call-seq:
10660 * scan(pattern) -> array_of_results
10661 * scan(pattern) {|result| ... } -> self
10662 *
10663 * :include: doc/string/scan.rdoc
10664 *
10665 */
10666
10667static VALUE
10668rb_str_scan(VALUE str, VALUE pat)
10669{
10670 VALUE result;
10671 long start = 0;
10672 long last = -1, prev = 0;
10673 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10674
10675 pat = get_pat_quoted(pat, 1);
10676 mustnot_broken(str);
10677 if (!rb_block_given_p()) {
10678 VALUE ary = rb_ary_new();
10679
10680 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10681 last = prev;
10682 prev = start;
10683 rb_ary_push(ary, result);
10684 }
10685 if (last >= 0) rb_pat_search(pat, str, last, 1);
10686 else rb_backref_set(Qnil);
10687 return ary;
10688 }
10689
10690 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10691 last = prev;
10692 prev = start;
10693 rb_yield(result);
10694 str_mod_check(str, p, len);
10695 }
10696 if (last >= 0) rb_pat_search(pat, str, last, 1);
10697 return str;
10698}
10699
10700
10701/*
10702 * call-seq:
10703 * hex -> integer
10704 *
10705 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10706 * returns its value as an integer.
10707 *
10708 * The leading substring is interpreted as hexadecimal when it begins with:
10709 *
 * - One or more characters representing hexadecimal digits
10711 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10712 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10713 *
10714 * 'f'.hex # => 15
10715 * '11'.hex # => 17
10716 * 'FFF'.hex # => 4095
10717 * 'fffg'.hex # => 4095
10718 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10719 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10720 * 'deadbeef'.hex # => 3735928559
10721 *
10722 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10723 *
10724 * '0xfff'.hex # => 4095
10725 * '0xfffg'.hex # => 4095
10726 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10728 *
10729 * '-fff'.hex # => -4095
10730 * '-0xFFF'.hex # => -4095
10731 *
10732 * For any substring not described above, returns zero:
10733 *
10734 * 'xxx'.hex # => 0
10735 * ''.hex # => 0
10736 *
10737 * Note that, unlike #oct, this method interprets only hexadecimal,
10738 * and not binary, octal, or decimal notations:
10739 *
10740 * '0b111'.hex # => 45329
10741 * '0o777'.hex # => 0
10742 * '0d999'.hex # => 55705
10743 *
10744 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10745 */
10746
10747static VALUE
10748rb_str_hex(VALUE str)
10749{
10750 return rb_str_to_inum(str, 16, FALSE);
10751}
10752
10753
10754/*
10755 * call-seq:
10756 * oct -> integer
10757 *
10758 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10759 * returns their value as an integer.
10760 *
10761 * In brief:
10762 *
10763 * # Interpreted as octal.
10764 * '777'.oct # => 511
10765 * '777x'.oct # => 511
10766 * '0777'.oct # => 511
10767 * '0o777'.oct # => 511
10768 * '-777'.oct # => -511
10769 * # Not interpreted as octal.
10770 * '0b111'.oct # => 7 # Interpreted as binary.
10771 * '0d999'.oct # => 999 # Interpreted as decimal.
10772 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10773 *
10774 * The leading substring is interpreted as octal when it begins with:
10775 *
 * - One or more characters representing octal digits
10777 * (each in the range <tt>'0'..'7'</tt>);
10778 * the string to be interpreted ends at the first character that does not represent an octal digit:
10779 *
 * '7'.oct # => 7
10781 * '11'.oct # => 9
10782 * '777'.oct # => 511
10783 * '0777'.oct # => 511
10784 * '7778'.oct # => 511
10785 * '777x'.oct # => 511
10786 *
10787 * - <tt>'0o'</tt>, followed by one or more octal digits:
10788 *
10789 * '0o777'.oct # => 511
10790 * '0o7778'.oct # => 511
10791 *
10792 * The leading substring is _not_ interpreted as octal when it begins with:
10793 *
10794 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10795 * (each in the range <tt>'0'..'1'</tt>);
10796 * the string to be interpreted ends at the first character that does not represent a binary digit.
10797 * the string is interpreted as binary digits (base 2):
10798 *
10799 * '0b111'.oct # => 7
10800 * '0b1112'.oct # => 7
10801 *
10802 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10803 * (each in the range <tt>'0'..'9'</tt>);
10804 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10805 * the string is interpreted as decimal digits (base 10):
10806 *
10807 * '0d999'.oct # => 999
10808 * '0d999x'.oct # => 999
10809 *
10810 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10811 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10812 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10813 * the string is interpreted as hexadecimal digits (base 16):
10814 *
10815 * '0xfff'.oct # => 4095
10816 * '0xfffg'.oct # => 4095
10817 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10819 *
10820 * '-777'.oct # => -511
10821 * '-0777'.oct # => -511
10822 * '-0b111'.oct # => -7
10823 * '-0xfff'.oct # => -4095
10824 *
10825 * For any substring not described above, returns zero:
10826 *
10827 * 'foo'.oct # => 0
10828 * ''.oct # => 0
10829 *
10830 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10831 */
10832
10833static VALUE
10834rb_str_oct(VALUE str)
10835{
10836 return rb_str_to_inum(str, -8, FALSE);
10837}
10838
10839#ifndef HAVE_CRYPT_R
10840# include "ruby/thread_native.h"
10841# include "ruby/atomic.h"
10842
10843static struct {
10844 rb_nativethread_lock_t lock;
10845} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10846#endif
10847
10848/*
10849 * call-seq:
10850 * crypt(salt_str) -> new_string
10851 *
10852 * Returns the string generated by calling <code>crypt(3)</code>
10853 * standard library function with <code>str</code> and
10854 * <code>salt_str</code>, in this order, as its arguments. Please do
10855 * not use this method any longer. It is legacy; provided only for
10856 * backward compatibility with ruby scripts in earlier days. It is
10857 * bad to use in contemporary programs for several reasons:
10858 *
10859 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10860 * run. The generated string lacks data portability.
10861 *
10862 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10863 * (i.e. silently ends up in unexpected results).
10864 *
10865 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10866 * thread safe.
10867 *
10868 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10869 * very very weak. According to its manpage, Linux's traditional
10870 * <code>crypt(3)</code> output has only 2**56 variations; too
10871 * easy to brute force today. And this is the default behaviour.
10872 *
10873 * * In order to make things robust some OSes implement so-called
10874 * "modular" usage. To go through, you have to do a complex
10875 * build-up of the <code>salt_str</code> parameter, by hand.
10876 * Failure in generation of a proper salt string tends not to
10877 * yield any errors; typos in parameters are normally not
10878 * detectable.
10879 *
10880 * * For instance, in the following example, the second invocation
10881 * of String#crypt is wrong; it has a typo in "round=" (lacks
10882 * "s"). However the call does not fail and something unexpected
10883 * is generated.
10884 *
10885 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10886 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10887 *
10888 * * Even in the "modular" mode, some hash functions are considered
10889 * archaic and no longer recommended at all; for instance module
10890 * <code>$1$</code> is officially abandoned by its author: see
10891 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10892 * instance module <code>$3$</code> is considered completely
10893 * broken: see the manpage of FreeBSD.
10894 *
10895 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10896 * written above, <code>crypt(3)</code> on Mac OS never fails.
10897 * This means even if you build up a proper salt string it
10898 * generates a traditional DES hash anyways, and there is no way
10899 * for you to be aware of.
10900 *
10901 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10902 *
10903 * If for some reason you cannot migrate to other secure contemporary
10904 * password hashing algorithms, install the string-crypt gem and
10905 * <code>require 'string/crypt'</code> to continue using it.
10906 */
10907
10908static VALUE
10909rb_str_crypt(VALUE str, VALUE salt)
10910{
10911#ifdef HAVE_CRYPT_R
10912 VALUE databuf;
10913 struct crypt_data *data;
10914# define CRYPT_END() ALLOCV_END(databuf)
10915#else
10916 char *tmp_buf;
10917 extern char *crypt(const char *, const char *);
10918# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10919#endif
10920 VALUE result;
10921 const char *s, *saltp;
10922 char *res;
10923#ifdef BROKEN_CRYPT
10924 char salt_8bit_clean[3];
10925#endif
10926
10927 StringValue(salt);
10928 mustnot_wchar(str);
10929 mustnot_wchar(salt);
10930 s = StringValueCStr(str);
10931 saltp = RSTRING_PTR(salt);
10932 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10933 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10934 }
10935
10936#ifdef BROKEN_CRYPT
10937 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10938 salt_8bit_clean[0] = saltp[0] & 0x7f;
10939 salt_8bit_clean[1] = saltp[1] & 0x7f;
10940 salt_8bit_clean[2] = '\0';
10941 saltp = salt_8bit_clean;
10942 }
10943#endif
10944#ifdef HAVE_CRYPT_R
10945 data = ALLOCV(databuf, sizeof(struct crypt_data));
10946# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10947 data->initialized = 0;
10948# endif
10949 res = crypt_r(s, saltp, data);
10950#else
10951 rb_nativethread_lock_lock(&crypt_mutex.lock);
10952 res = crypt(s, saltp);
10953#endif
10954 if (!res) {
10955 int err = errno;
10956 CRYPT_END();
10957 rb_syserr_fail(err, "crypt");
10958 }
10959#ifdef HAVE_CRYPT_R
10960 result = rb_str_new_cstr(res);
10961 CRYPT_END();
10962#else
10963 // We need to copy this buffer because it's static and we need to unlock the mutex
10964 // before allocating a new object (the string to be returned). If we allocate while
10965 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10966 // if other ractors are waiting on this lock.
10967 size_t res_size = strlen(res)+1;
10968 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10969 memcpy(tmp_buf, res, res_size);
10970 res = tmp_buf;
10971 CRYPT_END();
10972 result = rb_str_new_cstr(res);
10973#endif
10974 return result;
10975}
10976
10977
10978/*
10979 * call-seq:
10980 * ord -> integer
10981 *
10982 * :include: doc/string/ord.rdoc
10983 *
10984 */
10985
10986static VALUE
10987rb_str_ord(VALUE s)
10988{
10989 unsigned int c;
10990
10991 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10992 return UINT2NUM(c);
10993}
10994/*
10995 * call-seq:
10996 * sum(n = 16) -> integer
10997 *
10998 * :include: doc/string/sum.rdoc
10999 *
11000 */
11001
11002static VALUE
11003rb_str_sum(int argc, VALUE *argv, VALUE str)
11004{
11005 int bits = 16;
11006 char *ptr, *p, *pend;
11007 long len;
11008 VALUE sum = INT2FIX(0);
11009 unsigned long sum0 = 0;
11010
11011 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11012 bits = 0;
11013 }
11014 ptr = p = RSTRING_PTR(str);
11015 len = RSTRING_LEN(str);
11016 pend = p + len;
11017
11018 while (p < pend) {
11019 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11020 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11021 str_mod_check(str, ptr, len);
11022 sum0 = 0;
11023 }
11024 sum0 += (unsigned char)*p;
11025 p++;
11026 }
11027
11028 if (bits == 0) {
11029 if (sum0) {
11030 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11031 }
11032 }
11033 else {
11034 if (sum == INT2FIX(0)) {
11035 if (bits < (int)sizeof(long)*CHAR_BIT) {
11036 sum0 &= (((unsigned long)1)<<bits)-1;
11037 }
11038 sum = LONG2FIX(sum0);
11039 }
11040 else {
11041 VALUE mod;
11042
11043 if (sum0) {
11044 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11045 }
11046
11047 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11048 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11049 sum = rb_funcall(sum, '&', 1, mod);
11050 }
11051 }
11052 return sum;
11053}
11054
11055static VALUE
11056rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11057{
11058 rb_encoding *enc;
11059 VALUE w;
11060 long width, len, flen = 1, fclen = 1;
11061 VALUE res;
11062 char *p;
11063 const char *f = " ";
11064 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11065 VALUE pad;
11066 int singlebyte = 1, cr;
11067 int termlen;
11068
11069 rb_scan_args(argc, argv, "11", &w, &pad);
11070 enc = STR_ENC_GET(str);
11071 termlen = rb_enc_mbminlen(enc);
11072 width = NUM2LONG(w);
11073 if (argc == 2) {
11074 StringValue(pad);
11075 enc = rb_enc_check(str, pad);
11076 f = RSTRING_PTR(pad);
11077 flen = RSTRING_LEN(pad);
11078 fclen = str_strlen(pad, enc); /* rb_enc_check */
11079 singlebyte = single_byte_optimizable(pad);
11080 if (flen == 0 || fclen == 0) {
11081 rb_raise(rb_eArgError, "zero width padding");
11082 }
11083 }
11084 len = str_strlen(str, enc); /* rb_enc_check */
11085 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11086 n = width - len;
11087 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11088 rlen = n - llen;
11089 cr = ENC_CODERANGE(str);
11090 if (flen > 1) {
11091 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11092 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11093 }
11094 size = RSTRING_LEN(str);
11095 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11096 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11097 (len += llen2 + rlen2) >= LONG_MAX - size) {
11098 rb_raise(rb_eArgError, "argument too big");
11099 }
11100 len += size;
11101 res = str_enc_new(rb_cString, 0, len, enc);
11102 p = RSTRING_PTR(res);
11103 if (flen <= 1) {
11104 memset(p, *f, llen);
11105 p += llen;
11106 }
11107 else {
11108 while (llen >= fclen) {
11109 memcpy(p,f,flen);
11110 p += flen;
11111 llen -= fclen;
11112 }
11113 if (llen > 0) {
11114 memcpy(p, f, llen2);
11115 p += llen2;
11116 }
11117 }
11118 memcpy(p, RSTRING_PTR(str), size);
11119 p += size;
11120 if (flen <= 1) {
11121 memset(p, *f, rlen);
11122 p += rlen;
11123 }
11124 else {
11125 while (rlen >= fclen) {
11126 memcpy(p,f,flen);
11127 p += flen;
11128 rlen -= fclen;
11129 }
11130 if (rlen > 0) {
11131 memcpy(p, f, rlen2);
11132 p += rlen2;
11133 }
11134 }
11135 TERM_FILL(p, termlen);
11136 STR_SET_LEN(res, p-RSTRING_PTR(res));
11137
11138 if (argc == 2)
11139 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11140 if (cr != ENC_CODERANGE_BROKEN)
11141 ENC_CODERANGE_SET(res, cr);
11142
11143 RB_GC_GUARD(pad);
11144 return res;
11145}
11146
11147
11148/*
11149 * call-seq:
11150 * ljust(width, pad_string = ' ') -> new_string
11151 *
11152 * :include: doc/string/ljust.rdoc
11153 *
11154 */
11155
11156static VALUE
11157rb_str_ljust(int argc, VALUE *argv, VALUE str)
11158{
11159 return rb_str_justify(argc, argv, str, 'l');
11160}
11161
11162/*
11163 * call-seq:
11164 * rjust(width, pad_string = ' ') -> new_string
11165 *
11166 * :include: doc/string/rjust.rdoc
11167 *
11168 */
11169
11170static VALUE
11171rb_str_rjust(int argc, VALUE *argv, VALUE str)
11172{
11173 return rb_str_justify(argc, argv, str, 'r');
11174}
11175
11176
11177/*
11178 * call-seq:
11179 * center(size, pad_string = ' ') -> new_string
11180 *
11181 * :include: doc/string/center.rdoc
11182 *
11183 */
11184
11185static VALUE
11186rb_str_center(int argc, VALUE *argv, VALUE str)
11187{
11188 return rb_str_justify(argc, argv, str, 'c');
11189}
11190
11191/*
11192 * call-seq:
11193 * partition(pattern) -> [pre_match, first_match, post_match]
11194 *
11195 * :include: doc/string/partition.rdoc
11196 *
11197 */
11198
11199static VALUE
11200rb_str_partition(VALUE str, VALUE sep)
11201{
11202 long pos;
11203
11204 sep = get_pat_quoted(sep, 0);
11205 if (RB_TYPE_P(sep, T_REGEXP)) {
11206 if (rb_reg_search(sep, str, 0, 0) < 0) {
11207 goto failed;
11208 }
11209 VALUE match = rb_backref_get();
11210 struct re_registers *regs = RMATCH_REGS(match);
11211
11212 pos = BEG(0);
11213 sep = rb_str_subseq(str, pos, END(0) - pos);
11214 }
11215 else {
11216 pos = rb_str_index(str, sep, 0);
11217 if (pos < 0) goto failed;
11218 }
11219 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11220 sep,
11221 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11222 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11223
11224 failed:
11225 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11226}
11227
11228/*
11229 * call-seq:
11230 * rpartition(pattern) -> [pre_match, last_match, post_match]
11231 *
11232 * :include: doc/string/rpartition.rdoc
11233 *
11234 */
11235
11236static VALUE
11237rb_str_rpartition(VALUE str, VALUE sep)
11238{
11239 long pos = RSTRING_LEN(str);
11240
11241 sep = get_pat_quoted(sep, 0);
11242 if (RB_TYPE_P(sep, T_REGEXP)) {
11243 if (rb_reg_search(sep, str, pos, 1) < 0) {
11244 goto failed;
11245 }
11246 VALUE match = rb_backref_get();
11247 struct re_registers *regs = RMATCH_REGS(match);
11248
11249 pos = BEG(0);
11250 sep = rb_str_subseq(str, pos, END(0) - pos);
11251 }
11252 else {
11253 pos = rb_str_sublen(str, pos);
11254 pos = rb_str_rindex(str, sep, pos);
11255 if (pos < 0) {
11256 goto failed;
11257 }
11258 }
11259
11260 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11261 sep,
11262 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11263 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11264 failed:
11265 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11266}
11267
11268/*
11269 * call-seq:
11270 * start_with?(*patterns) -> true or false
11271 *
11272 * :include: doc/string/start_with_p.rdoc
11273 *
11274 */
11275
11276static VALUE
11277rb_str_start_with(int argc, VALUE *argv, VALUE str)
11278{
11279 int i;
11280
11281 for (i=0; i<argc; i++) {
11282 VALUE tmp = argv[i];
11283 if (RB_TYPE_P(tmp, T_REGEXP)) {
11284 if (rb_reg_start_with_p(tmp, str))
11285 return Qtrue;
11286 }
11287 else {
11288 const char *p, *s, *e;
11289 long slen, tlen;
11290 rb_encoding *enc;
11291
11292 StringValue(tmp);
11293 enc = rb_enc_check(str, tmp);
11294 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11295 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11296 p = RSTRING_PTR(str);
11297 e = p + slen;
11298 s = p + tlen;
11299 if (!at_char_right_boundary(p, s, e, enc))
11300 continue;
11301 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11302 return Qtrue;
11303 }
11304 }
11305 return Qfalse;
11306}
11307
11308/*
11309 * call-seq:
11310 * end_with?(*strings) -> true or false
11311 *
11312 * :include: doc/string/end_with_p.rdoc
11313 *
11314 */
11315
11316static VALUE
11317rb_str_end_with(int argc, VALUE *argv, VALUE str)
11318{
11319 int i;
11320
11321 for (i=0; i<argc; i++) {
11322 VALUE tmp = argv[i];
11323 const char *p, *s, *e;
11324 long slen, tlen;
11325 rb_encoding *enc;
11326
11327 StringValue(tmp);
11328 enc = rb_enc_check(str, tmp);
11329 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11330 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11331 p = RSTRING_PTR(str);
11332 e = p + slen;
11333 s = e - tlen;
11334 if (!at_char_boundary(p, s, e, enc))
11335 continue;
11336 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11337 return Qtrue;
11338 }
11339 return Qfalse;
11340}
11341
11351static long
11352deleted_prefix_length(VALUE str, VALUE prefix)
11353{
11354 const char *strptr, *prefixptr;
11355 long olen, prefixlen;
11356 rb_encoding *enc = rb_enc_get(str);
11357
11358 StringValue(prefix);
11359
11360 if (!is_broken_string(prefix) ||
11361 !rb_enc_asciicompat(enc) ||
11362 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11363 enc = rb_enc_check(str, prefix);
11364 }
11365
11366 /* return 0 if not start with prefix */
11367 prefixlen = RSTRING_LEN(prefix);
11368 if (prefixlen <= 0) return 0;
11369 olen = RSTRING_LEN(str);
11370 if (olen < prefixlen) return 0;
11371 strptr = RSTRING_PTR(str);
11372 prefixptr = RSTRING_PTR(prefix);
11373 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11374 if (is_broken_string(prefix)) {
11375 if (!is_broken_string(str)) {
11376 /* prefix in a valid string cannot be broken */
11377 return 0;
11378 }
11379 const char *strend = strptr + olen;
11380 const char *after_prefix = strptr + prefixlen;
11381 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11382 /* prefix does not end at char-boundary */
11383 return 0;
11384 }
11385 }
11386 /* prefix part in `str` also should be valid. */
11387
11388 return prefixlen;
11389}
11390
11391/*
11392 * call-seq:
11393 * delete_prefix!(prefix) -> self or nil
11394 *
11395 * Like String#delete_prefix, except that +self+ is modified in place;
11396 * returns +self+ if the prefix is removed, +nil+ otherwise.
11397 *
11398 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11399 */
11400
11401static VALUE
11402rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11403{
11404 long prefixlen;
11405 str_modify_keep_cr(str);
11406
11407 prefixlen = deleted_prefix_length(str, prefix);
11408 if (prefixlen <= 0) return Qnil;
11409
11410 return rb_str_drop_bytes(str, prefixlen);
11411}
11412
11413/*
11414 * call-seq:
11415 * delete_prefix(prefix) -> new_string
11416 *
11417 * :include: doc/string/delete_prefix.rdoc
11418 *
11419 */
11420
11421static VALUE
11422rb_str_delete_prefix(VALUE str, VALUE prefix)
11423{
11424 long prefixlen;
11425
11426 prefixlen = deleted_prefix_length(str, prefix);
11427 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11428
11429 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11430}
11431
11441static long
11442deleted_suffix_length(VALUE str, VALUE suffix)
11443{
11444 const char *strptr, *suffixptr;
11445 long olen, suffixlen;
11446 rb_encoding *enc;
11447
11448 StringValue(suffix);
11449 if (is_broken_string(suffix)) return 0;
11450 enc = rb_enc_check(str, suffix);
11451
11452 /* return 0 if not start with suffix */
11453 suffixlen = RSTRING_LEN(suffix);
11454 if (suffixlen <= 0) return 0;
11455 olen = RSTRING_LEN(str);
11456 if (olen < suffixlen) return 0;
11457 strptr = RSTRING_PTR(str);
11458 suffixptr = RSTRING_PTR(suffix);
11459 const char *strend = strptr + olen;
11460 const char *before_suffix = strend - suffixlen;
11461 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11462 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11463
11464 return suffixlen;
11465}
11466
11467/*
11468 * call-seq:
11469 * delete_suffix!(suffix) -> self or nil
11470 *
11471 * Like String#delete_suffix, except that +self+ is modified in place;
11472 * returns +self+ if the suffix is removed, +nil+ otherwise.
11473 *
11474 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11475 */
11476
11477static VALUE
11478rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11479{
11480 long olen, suffixlen, len;
11481 str_modifiable(str);
11482
11483 suffixlen = deleted_suffix_length(str, suffix);
11484 if (suffixlen <= 0) return Qnil;
11485
11486 olen = RSTRING_LEN(str);
11487 str_modify_keep_cr(str);
11488 len = olen - suffixlen;
11489 STR_SET_LEN(str, len);
11490 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11491 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11493 }
11494 return str;
11495}
11496
11497/*
11498 * call-seq:
11499 * delete_suffix(suffix) -> new_string
11500 *
11501 * :include: doc/string/delete_suffix.rdoc
11502 *
11503 */
11504
11505static VALUE
11506rb_str_delete_suffix(VALUE str, VALUE suffix)
11507{
11508 long suffixlen;
11509
11510 suffixlen = deleted_suffix_length(str, suffix);
11511 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11512
11513 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11514}
11515
11516void
11517rb_str_setter(VALUE val, ID id, VALUE *var)
11518{
11519 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11520 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11521 }
11522 *var = val;
11523}
11524
11525static void
11526nil_setter_warning(ID id)
11527{
11528 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11529}
11530
11531void
11532rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11533{
11534 rb_str_setter(val, id, var);
11535 if (!NIL_P(*var)) {
11536 nil_setter_warning(id);
11537 }
11538}
11539
11540static void
11541rb_fs_setter(VALUE val, ID id, VALUE *var)
11542{
11543 val = rb_fs_check(val);
11544 if (!val) {
11545 rb_raise(rb_eTypeError,
11546 "value of %"PRIsVALUE" must be String or Regexp",
11547 rb_id2str(id));
11548 }
11549 if (!NIL_P(val)) {
11550 nil_setter_warning(id);
11551 }
11552 *var = val;
11553}
11554
11555
11556/*
11557 * call-seq:
11558 * force_encoding(encoding) -> self
11559 *
11560 * :include: doc/string/force_encoding.rdoc
11561 *
11562 */
11563
11564static VALUE
11565rb_str_force_encoding(VALUE str, VALUE enc)
11566{
11567 str_modifiable(str);
11568
11569 rb_encoding *encoding = rb_to_encoding(enc);
11570 int idx = rb_enc_to_index(encoding);
11571
11572 // If the encoding is unchanged, we do nothing.
11573 if (ENCODING_GET(str) == idx) {
11574 return str;
11575 }
11576
11577 rb_enc_associate_index(str, idx);
11578
11579 // If the coderange was 7bit and the new encoding is ASCII-compatible
11580 // we can keep the coderange.
11581 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11582 return str;
11583 }
11584
11586 return str;
11587}
11588
11589/*
11590 * call-seq:
11591 * b -> new_string
11592 *
11593 * :include: doc/string/b.rdoc
11594 *
11595 */
11596
11597static VALUE
11598rb_str_b(VALUE str)
11599{
11600 VALUE str2;
11601 if (STR_EMBED_P(str)) {
11602 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11603 }
11604 else {
11605 str2 = str_alloc_heap(rb_cString);
11606 }
11607 str_replace_shared_without_enc(str2, str);
11608
11609 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11610 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11611 // If we know the receiver's code range then we know the result's code range.
11612 int cr = ENC_CODERANGE(str);
11613 switch (cr) {
11614 case ENC_CODERANGE_7BIT:
11616 break;
11620 break;
11621 default:
11622 ENC_CODERANGE_CLEAR(str2);
11623 break;
11624 }
11625 }
11626
11627 return str2;
11628}
11629
11630/*
11631 * call-seq:
11632 * valid_encoding? -> true or false
11633 *
11634 * :include: doc/string/valid_encoding_p.rdoc
11635 *
11636 */
11637
11638static VALUE
11639rb_str_valid_encoding_p(VALUE str)
11640{
11641 int cr = rb_enc_str_coderange(str);
11642
11643 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11644}
11645
11646/*
11647 * call-seq:
11648 * ascii_only? -> true or false
11649 *
11650 * Returns whether +self+ contains only ASCII characters:
11651 *
11652 * 'abc'.ascii_only? # => true
11653 * "abc\u{6666}".ascii_only? # => false
11654 *
11655 * Related: see {Querying}[rdoc-ref:String@Querying].
11656 */
11657
11658static VALUE
11659rb_str_is_ascii_only_p(VALUE str)
11660{
11661 int cr = rb_enc_str_coderange(str);
11662
11663 return RBOOL(cr == ENC_CODERANGE_7BIT);
11664}
11665
11666VALUE
11668{
11669 static const char ellipsis[] = "...";
11670 const long ellipsislen = sizeof(ellipsis) - 1;
11671 rb_encoding *const enc = rb_enc_get(str);
11672 const long blen = RSTRING_LEN(str);
11673 const char *const p = RSTRING_PTR(str), *e = p + blen;
11674 VALUE estr, ret = 0;
11675
11676 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11677 if (len * rb_enc_mbminlen(enc) >= blen ||
11678 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11679 ret = str;
11680 }
11681 else if (len <= ellipsislen ||
11682 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11683 if (rb_enc_asciicompat(enc)) {
11684 ret = rb_str_new(ellipsis, len);
11685 rb_enc_associate(ret, enc);
11686 }
11687 else {
11688 estr = rb_usascii_str_new(ellipsis, len);
11689 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11690 }
11691 }
11692 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11693 rb_str_cat(ret, ellipsis, ellipsislen);
11694 }
11695 else {
11696 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11697 rb_enc_from_encoding(enc), 0, Qnil);
11698 rb_str_append(ret, estr);
11699 }
11700 return ret;
11701}
11702
11703static VALUE
11704str_compat_and_valid(VALUE str, rb_encoding *enc)
11705{
11706 int cr;
11707 str = StringValue(str);
11708 cr = rb_enc_str_coderange(str);
11709 if (cr == ENC_CODERANGE_BROKEN) {
11710 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11711 }
11712 else {
11713 rb_encoding *e = STR_ENC_GET(str);
11714 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11715 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11716 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11717 }
11718 }
11719 return str;
11720}
11721
11722static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11723
11724VALUE
11726{
11727 rb_encoding *enc = STR_ENC_GET(str);
11728 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11729}
11730
11731VALUE
11732rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11733{
11734 int cr = ENC_CODERANGE_UNKNOWN;
11735 if (enc == STR_ENC_GET(str)) {
11736 /* cached coderange makes sense only when enc equals the
11737 * actual encoding of str */
11738 cr = ENC_CODERANGE(str);
11739 }
11740 return enc_str_scrub(enc, str, repl, cr);
11741}
11742
11743static VALUE
11744enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11745{
11746 int encidx;
11747 VALUE buf = Qnil;
11748 const char *rep, *p, *e, *p1, *sp;
11749 long replen = -1;
11750 long slen;
11751
11752 if (rb_block_given_p()) {
11753 if (!NIL_P(repl))
11754 rb_raise(rb_eArgError, "both of block and replacement given");
11755 replen = 0;
11756 }
11757
11758 if (ENC_CODERANGE_CLEAN_P(cr))
11759 return Qnil;
11760
11761 if (!NIL_P(repl)) {
11762 repl = str_compat_and_valid(repl, enc);
11763 }
11764
11765 if (rb_enc_dummy_p(enc)) {
11766 return Qnil;
11767 }
11768 encidx = rb_enc_to_index(enc);
11769
11770#define DEFAULT_REPLACE_CHAR(str) do { \
11771 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11772 rep = replace; replen = (int)sizeof(replace); \
11773 } while (0)
11774
11775 slen = RSTRING_LEN(str);
11776 p = RSTRING_PTR(str);
11777 e = RSTRING_END(str);
11778 p1 = p;
11779 sp = p;
11780
11781 if (rb_enc_asciicompat(enc)) {
11782 int rep7bit_p;
11783 if (!replen) {
11784 rep = NULL;
11785 rep7bit_p = FALSE;
11786 }
11787 else if (!NIL_P(repl)) {
11788 rep = RSTRING_PTR(repl);
11789 replen = RSTRING_LEN(repl);
11790 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11791 }
11792 else if (encidx == rb_utf8_encindex()) {
11793 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11794 rep7bit_p = FALSE;
11795 }
11796 else {
11797 DEFAULT_REPLACE_CHAR("?");
11798 rep7bit_p = TRUE;
11799 }
11800 cr = ENC_CODERANGE_7BIT;
11801
11802 p = search_nonascii(p, e);
11803 if (!p) {
11804 p = e;
11805 }
11806 while (p < e) {
11807 int ret = rb_enc_precise_mbclen(p, e, enc);
11808 if (MBCLEN_NEEDMORE_P(ret)) {
11809 break;
11810 }
11811 else if (MBCLEN_CHARFOUND_P(ret)) {
11813 p += MBCLEN_CHARFOUND_LEN(ret);
11814 }
11815 else if (MBCLEN_INVALID_P(ret)) {
11816 /*
11817 * p1~p: valid ascii/multibyte chars
11818 * p ~e: invalid bytes + unknown bytes
11819 */
11820 long clen = rb_enc_mbmaxlen(enc);
11821 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11822 if (p > p1) {
11823 rb_str_buf_cat(buf, p1, p - p1);
11824 }
11825
11826 if (e - p < clen) clen = e - p;
11827 if (clen <= 2) {
11828 clen = 1;
11829 }
11830 else {
11831 const char *q = p;
11832 clen--;
11833 for (; clen > 1; clen--) {
11834 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11835 if (MBCLEN_NEEDMORE_P(ret)) break;
11836 if (MBCLEN_INVALID_P(ret)) continue;
11838 }
11839 }
11840 if (rep) {
11841 rb_str_buf_cat(buf, rep, replen);
11842 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11843 }
11844 else {
11845 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11846 str_mod_check(str, sp, slen);
11847 repl = str_compat_and_valid(repl, enc);
11848 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11851 }
11852 p += clen;
11853 p1 = p;
11854 p = search_nonascii(p, e);
11855 if (!p) {
11856 p = e;
11857 break;
11858 }
11859 }
11860 else {
11862 }
11863 }
11864 if (NIL_P(buf)) {
11865 if (p == e) {
11866 ENC_CODERANGE_SET(str, cr);
11867 return Qnil;
11868 }
11869 buf = rb_str_buf_new(RSTRING_LEN(str));
11870 }
11871 if (p1 < p) {
11872 rb_str_buf_cat(buf, p1, p - p1);
11873 }
11874 if (p < e) {
11875 if (rep) {
11876 rb_str_buf_cat(buf, rep, replen);
11877 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11878 }
11879 else {
11880 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11881 str_mod_check(str, sp, slen);
11882 repl = str_compat_and_valid(repl, enc);
11883 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11886 }
11887 }
11888 }
11889 else {
11890 /* ASCII incompatible */
11891 long mbminlen = rb_enc_mbminlen(enc);
11892 if (!replen) {
11893 rep = NULL;
11894 }
11895 else if (!NIL_P(repl)) {
11896 rep = RSTRING_PTR(repl);
11897 replen = RSTRING_LEN(repl);
11898 }
11899 else if (encidx == ENCINDEX_UTF_16BE) {
11900 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11901 }
11902 else if (encidx == ENCINDEX_UTF_16LE) {
11903 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11904 }
11905 else if (encidx == ENCINDEX_UTF_32BE) {
11906 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11907 }
11908 else if (encidx == ENCINDEX_UTF_32LE) {
11909 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11910 }
11911 else {
11912 DEFAULT_REPLACE_CHAR("?");
11913 }
11914
11915 while (p < e) {
11916 int ret = rb_enc_precise_mbclen(p, e, enc);
11917 if (MBCLEN_NEEDMORE_P(ret)) {
11918 break;
11919 }
11920 else if (MBCLEN_CHARFOUND_P(ret)) {
11921 p += MBCLEN_CHARFOUND_LEN(ret);
11922 }
11923 else if (MBCLEN_INVALID_P(ret)) {
11924 const char *q = p;
11925 long clen = rb_enc_mbmaxlen(enc);
11926 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11927 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11928
11929 if (e - p < clen) clen = e - p;
11930 if (clen <= mbminlen * 2) {
11931 clen = mbminlen;
11932 }
11933 else {
11934 clen -= mbminlen;
11935 for (; clen > mbminlen; clen-=mbminlen) {
11936 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11937 if (MBCLEN_NEEDMORE_P(ret)) break;
11938 if (MBCLEN_INVALID_P(ret)) continue;
11940 }
11941 }
11942 if (rep) {
11943 rb_str_buf_cat(buf, rep, replen);
11944 }
11945 else {
11946 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11947 str_mod_check(str, sp, slen);
11948 repl = str_compat_and_valid(repl, enc);
11949 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11950 }
11951 p += clen;
11952 p1 = p;
11953 }
11954 else {
11956 }
11957 }
11958 if (NIL_P(buf)) {
11959 if (p == e) {
11961 return Qnil;
11962 }
11963 buf = rb_str_buf_new(RSTRING_LEN(str));
11964 }
11965 if (p1 < p) {
11966 rb_str_buf_cat(buf, p1, p - p1);
11967 }
11968 if (p < e) {
11969 if (rep) {
11970 rb_str_buf_cat(buf, rep, replen);
11971 }
11972 else {
11973 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11974 str_mod_check(str, sp, slen);
11975 repl = str_compat_and_valid(repl, enc);
11976 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11977 }
11978 }
11980 }
11981 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11982 return buf;
11983}
11984
11985/*
11986 * call-seq:
11987 * scrub(replacement_string = default_replacement_string) -> new_string
11988 * scrub{|sequence| ... } -> new_string
11989 *
11990 * :include: doc/string/scrub.rdoc
11991 *
11992 */
11993static VALUE
11994str_scrub(int argc, VALUE *argv, VALUE str)
11995{
11996 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11997 VALUE new = rb_str_scrub(str, repl);
11998 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11999}
12000
12001/*
12002 * call-seq:
12003 * scrub!(replacement_string = default_replacement_string) -> self
12004 * scrub!{|sequence| ... } -> self
12005 *
12006 * Like String#scrub, except that:
12007 *
12008 * - Any replacements are made in +self+.
12009 * - Returns +self+.
12010 *
12011 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12012 *
12013 */
12014static VALUE
12015str_scrub_bang(int argc, VALUE *argv, VALUE str)
12016{
12017 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12018 VALUE new = rb_str_scrub(str, repl);
12019 if (!NIL_P(new)) rb_str_replace(str, new);
12020 return str;
12021}
12022
12023static ID id_normalize;
12024static ID id_normalized_p;
12025static VALUE mUnicodeNormalize;
12026
12027static VALUE
12028unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12029{
12030 static int UnicodeNormalizeRequired = 0;
12031 VALUE argv2[2];
12032
12033 if (!UnicodeNormalizeRequired) {
12034 rb_require("unicode_normalize/normalize.rb");
12035 UnicodeNormalizeRequired = 1;
12036 }
12037 argv2[0] = str;
12038 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12039 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12040}
12041
12042/*
12043 * call-seq:
12044 * unicode_normalize(form = :nfc) -> string
12045 *
12046 * :include: doc/string/unicode_normalize.rdoc
12047 *
12048 */
12049static VALUE
12050rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12051{
12052 return unicode_normalize_common(argc, argv, str, id_normalize);
12053}
12054
12055/*
12056 * call-seq:
12057 * unicode_normalize!(form = :nfc) -> self
12058 *
12059 * Like String#unicode_normalize, except that the normalization
12060 * is performed on +self+ (not on a copy of +self+).
12061 *
12062 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12063 *
12064 */
12065static VALUE
12066rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12067{
12068 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12069}
12070
12071/* call-seq:
12072 * unicode_normalized?(form = :nfc) -> true or false
12073 *
12074 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12075 * see String#unicode_normalize.
12076 *
12077 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12078 *
12079 * Examples:
12080 *
12081 * "a\u0300".unicode_normalized? # => false
12082 * "a\u0300".unicode_normalized?(:nfd) # => true
12083 * "\u00E0".unicode_normalized? # => true
12084 * "\u00E0".unicode_normalized?(:nfd) # => false
12085 *
12086 *
12087 * Raises an exception if +self+ is not in a Unicode encoding:
12088 *
12089 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12090 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12091 *
12092 * Related: see {Querying}[rdoc-ref:String@Querying].
12093 */
12094static VALUE
12095rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12096{
12097 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12098}
12099
12100/**********************************************************************
12101 * Document-class: Symbol
12102 *
12103 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12104 *
12105 * You can create a +Symbol+ object explicitly with:
12106 *
12107 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12108 *
12109 * The same +Symbol+ object will be
12110 * created for a given name or string for the duration of a program's
12111 * execution, regardless of the context or meaning of that name. Thus
12112 * if <code>Fred</code> is a constant in one context, a method in
12113 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12114 * will be the same object in all three contexts.
12115 *
12116 * module One
12117 * class Fred
12118 * end
12119 * $f1 = :Fred
12120 * end
12121 * module Two
12122 * Fred = 1
12123 * $f2 = :Fred
12124 * end
12125 * def Fred()
12126 * end
12127 * $f3 = :Fred
12128 * $f1.object_id #=> 2514190
12129 * $f2.object_id #=> 2514190
12130 * $f3.object_id #=> 2514190
12131 *
12132 * Constant, method, and variable names are returned as symbols:
12133 *
12134 * module One
12135 * Two = 2
12136 * def three; 3 end
12137 * @four = 4
12138 * @@five = 5
12139 * $six = 6
12140 * end
12141 * seven = 7
12142 *
12143 * One.constants
12144 * # => [:Two]
12145 * One.instance_methods(true)
12146 * # => [:three]
12147 * One.instance_variables
12148 * # => [:@four]
12149 * One.class_variables
12150 * # => [:@@five]
12151 * global_variables.grep(/six/)
12152 * # => [:$six]
12153 * local_variables
12154 * # => [:seven]
12155 *
12156 * A +Symbol+ object differs from a String object in that
12157 * a +Symbol+ object represents an identifier, while a String object
12158 * represents text or data.
12159 *
12160 * == What's Here
12161 *
12162 * First, what's elsewhere. Class +Symbol+:
12163 *
12164 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12165 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12166 *
12167 * Here, class +Symbol+ provides methods that are useful for:
12168 *
12169 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12170 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12171 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12172 *
12173 * === Methods for Querying
12174 *
12175 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12176 * - #=~: Returns the index of the first substring in symbol that matches a
12177 * given Regexp or other object; returns +nil+ if no match is found.
12178 * - #[], #slice : Returns a substring of symbol
12179 * determined by a given index, start/length, or range, or string.
12180 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12181 * - #encoding: Returns the Encoding object that represents the encoding
12182 * of symbol.
12183 * - #end_with?: Returns +true+ if symbol ends with
12184 * any of the given strings.
12185 * - #match: Returns a MatchData object if symbol
12186 * matches a given Regexp; +nil+ otherwise.
12187 * - #match?: Returns +true+ if symbol
12188 * matches a given Regexp; +false+ otherwise.
12189 * - #length, #size: Returns the number of characters in symbol.
12190 * - #start_with?: Returns +true+ if symbol starts with
12191 * any of the given strings.
12192 *
12193 * === Methods for Comparing
12194 *
12195 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12196 * or larger than symbol.
12197 * - #==, #===: Returns +true+ if a given symbol has the same content and
12198 * encoding.
12199 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12200 * symbol is smaller than, equal to, or larger than symbol.
12201 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12202 * after Unicode case folding; +false+ otherwise.
12203 *
12204 * === Methods for Converting
12205 *
12206 * - #capitalize: Returns symbol with the first character upcased
12207 * and all other characters downcased.
12208 * - #downcase: Returns symbol with all characters downcased.
12209 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12210 * - #name: Returns the frozen string corresponding to symbol.
12211 * - #succ, #next: Returns the symbol that is the successor to symbol.
12212 * - #swapcase: Returns symbol with all upcase characters downcased
12213 * and all downcase characters upcased.
12214 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12215 * - #to_s, #id2name: Returns the string corresponding to +self+.
12216 * - #to_sym, #intern: Returns +self+.
12217 * - #upcase: Returns symbol with all characters upcased.
12218 *
12219 */
12220
12221
12222/*
12223 * call-seq:
12224 * self == other -> true or false
12225 *
12226 * Returns whether +other+ is the same object as +self+.
12227 */
12228
12229#define sym_equal rb_obj_equal
12230
12231static int
12232sym_printable(const char *s, const char *send, rb_encoding *enc)
12233{
12234 while (s < send) {
12235 int n;
12236 int c = rb_enc_precise_mbclen(s, send, enc);
12237
12238 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12239 n = MBCLEN_CHARFOUND_LEN(c);
12240 c = rb_enc_mbc_to_codepoint(s, send, enc);
12241 if (!rb_enc_isprint(c, enc)) return FALSE;
12242 s += n;
12243 }
12244 return TRUE;
12245}
12246
12247int
12248rb_str_symname_p(VALUE sym)
12249{
12250 rb_encoding *enc;
12251 const char *ptr;
12252 long len;
12253 rb_encoding *resenc = rb_default_internal_encoding();
12254
12255 if (resenc == NULL) resenc = rb_default_external_encoding();
12256 enc = STR_ENC_GET(sym);
12257 ptr = RSTRING_PTR(sym);
12258 len = RSTRING_LEN(sym);
12259 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12260 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12261 return FALSE;
12262 }
12263 return TRUE;
12264}
12265
12266VALUE
12267rb_str_quote_unprintable(VALUE str)
12268{
12269 rb_encoding *enc;
12270 const char *ptr;
12271 long len;
12272 rb_encoding *resenc;
12273
12274 Check_Type(str, T_STRING);
12275 resenc = rb_default_internal_encoding();
12276 if (resenc == NULL) resenc = rb_default_external_encoding();
12277 enc = STR_ENC_GET(str);
12278 ptr = RSTRING_PTR(str);
12279 len = RSTRING_LEN(str);
12280 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12281 !sym_printable(ptr, ptr + len, enc)) {
12282 return rb_str_escape(str);
12283 }
12284 return str;
12285}
12286
12287VALUE
12288rb_id_quote_unprintable(ID id)
12289{
12290 VALUE str = rb_id2str(id);
12291 if (!rb_str_symname_p(str)) {
12292 return rb_str_escape(str);
12293 }
12294 return str;
12295}
12296
12297/*
12298 * call-seq:
12299 * inspect -> string
12300 *
12301 * Returns a string representation of +self+ (including the leading colon):
12302 *
12303 * :foo.inspect # => ":foo"
12304 *
12305 * Related: Symbol#to_s, Symbol#name.
12306 *
12307 */
12308
12309static VALUE
12310sym_inspect(VALUE sym)
12311{
12312 VALUE str = rb_sym2str(sym);
12313 const char *ptr;
12314 long len;
12315 char *dest;
12316
12317 if (!rb_str_symname_p(str)) {
12318 str = rb_str_inspect(str);
12319 len = RSTRING_LEN(str);
12320 rb_str_resize(str, len + 1);
12321 dest = RSTRING_PTR(str);
12322 memmove(dest + 1, dest, len);
12323 }
12324 else {
12325 rb_encoding *enc = STR_ENC_GET(str);
12326 VALUE orig_str = str;
12327
12328 len = RSTRING_LEN(orig_str);
12329 str = rb_enc_str_new(0, len + 1, enc);
12330
12331 // Get data pointer after allocation
12332 ptr = RSTRING_PTR(orig_str);
12333 dest = RSTRING_PTR(str);
12334 memcpy(dest + 1, ptr, len);
12335
12336 RB_GC_GUARD(orig_str);
12337 }
12338 dest[0] = ':';
12339
12341
12342 return str;
12343}
12344
12345VALUE
12347{
12348 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12349 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12350 return str;
12351}
12352
12353VALUE
12354rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12355{
12356 VALUE obj;
12357
12358 if (argc < 1) {
12359 rb_raise(rb_eArgError, "no receiver given");
12360 }
12361 obj = argv[0];
12362 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12363}
12364
12365/*
12366 * call-seq:
12367 * succ
12368 *
12369 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12370 *
12371 * :foo.succ # => :fop
12372 *
12373 * Related: String#succ.
12374 */
12375
12376static VALUE
12377sym_succ(VALUE sym)
12378{
12379 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12380}
12381
12382/*
12383 * call-seq:
12384 * self <=> other -> -1, 0, 1, or nil
12385 *
12386 * Compares +self+ and +other+, using String#<=>.
12387 *
12388 * Returns:
12389 *
12390 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12391 * - +nil+, otherwise.
12392 *
12393 * Examples:
12394 *
12395 * :bar <=> :foo # => -1
12396 * :foo <=> :foo # => 0
12397 * :foo <=> :bar # => 1
12398 * :foo <=> 'bar' # => nil
12399 *
12400 * \Class \Symbol includes module Comparable,
12401 * each of whose methods uses Symbol#<=> for comparison.
12402 *
12403 * Related: String#<=>.
12404 */
12405
12406static VALUE
12407sym_cmp(VALUE sym, VALUE other)
12408{
12409 if (!SYMBOL_P(other)) {
12410 return Qnil;
12411 }
12412 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12413}
12414
12415/*
12416 * call-seq:
12417 * casecmp(object) -> -1, 0, 1, or nil
12418 *
12419 * :include: doc/symbol/casecmp.rdoc
12420 *
12421 */
12422
12423static VALUE
12424sym_casecmp(VALUE sym, VALUE other)
12425{
12426 if (!SYMBOL_P(other)) {
12427 return Qnil;
12428 }
12429 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12430}
12431
12432/*
12433 * call-seq:
12434 * casecmp?(object) -> true, false, or nil
12435 *
12436 * :include: doc/symbol/casecmp_p.rdoc
12437 *
12438 */
12439
12440static VALUE
12441sym_casecmp_p(VALUE sym, VALUE other)
12442{
12443 if (!SYMBOL_P(other)) {
12444 return Qnil;
12445 }
12446 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12447}
12448
12449/*
12450 * call-seq:
12451 * self =~ other -> integer or nil
12452 *
12453 * Equivalent to <tt>self.to_s =~ other</tt>,
12454 * including possible updates to global variables;
12455 * see String#=~.
12456 *
12457 */
12458
12459static VALUE
12460sym_match(VALUE sym, VALUE other)
12461{
12462 return rb_str_match(rb_sym2str(sym), other);
12463}
12464
12465/*
12466 * call-seq:
12467 * match(pattern, offset = 0) -> matchdata or nil
12468 * match(pattern, offset = 0) {|matchdata| } -> object
12469 *
12470 * Equivalent to <tt>self.to_s.match</tt>,
12471 * including possible updates to global variables;
12472 * see String#match.
12473 *
12474 */
12475
12476static VALUE
12477sym_match_m(int argc, VALUE *argv, VALUE sym)
12478{
12479 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12480}
12481
12482/*
12483 * call-seq:
12484 * match?(pattern, offset) -> true or false
12485 *
12486 * Equivalent to <tt>sym.to_s.match?</tt>;
12487 * see String#match?.
12488 *
12489 */
12490
12491static VALUE
12492sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12493{
12494 return rb_str_match_m_p(argc, argv, sym);
12495}
12496
12497/*
12498 * call-seq:
12499 * self[offset] -> string or nil
12500 * self[offset, size] -> string or nil
12501 * self[range] -> string or nil
12502 * self[regexp, capture = 0] -> string or nil
12503 * self[substring] -> string or nil
12504 *
12505 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12506 *
12507 */
12508
12509static VALUE
12510sym_aref(int argc, VALUE *argv, VALUE sym)
12511{
12512 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12513}
12514
12515/*
12516 * call-seq:
12517 * length -> integer
12518 *
12519 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12520 */
12521
12522static VALUE
12523sym_length(VALUE sym)
12524{
12525 return rb_str_length(rb_sym2str(sym));
12526}
12527
12528/*
12529 * call-seq:
12530 * empty? -> true or false
12531 *
12532 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12533 *
12534 */
12535
12536static VALUE
12537sym_empty(VALUE sym)
12538{
12539 return rb_str_empty(rb_sym2str(sym));
12540}
12541
12542/*
12543 * call-seq:
12544 * upcase(mapping) -> symbol
12545 *
12546 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12547 *
12548 * See String#upcase.
12549 *
12550 */
12551
12552static VALUE
12553sym_upcase(int argc, VALUE *argv, VALUE sym)
12554{
12555 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12556}
12557
12558/*
12559 * call-seq:
12560 * downcase(mapping) -> symbol
12561 *
12562 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12563 *
12564 * See String#downcase.
12565 *
12566 * Related: Symbol#upcase.
12567 *
12568 */
12569
12570static VALUE
12571sym_downcase(int argc, VALUE *argv, VALUE sym)
12572{
12573 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12574}
12575
12576/*
12577 * call-seq:
12578 * capitalize(mapping) -> symbol
12579 *
12580 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12581 *
12582 * See String#capitalize.
12583 *
12584 */
12585
12586static VALUE
12587sym_capitalize(int argc, VALUE *argv, VALUE sym)
12588{
12589 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12590}
12591
12592/*
12593 * call-seq:
12594 * swapcase(mapping) -> symbol
12595 *
12596 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12597 *
12598 * See String#swapcase.
12599 *
12600 */
12601
12602static VALUE
12603sym_swapcase(int argc, VALUE *argv, VALUE sym)
12604{
12605 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12606}
12607
12608/*
12609 * call-seq:
12610 * start_with?(*string_or_regexp) -> true or false
12611 *
12612 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12613 *
12614 */
12615
12616static VALUE
12617sym_start_with(int argc, VALUE *argv, VALUE sym)
12618{
12619 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12620}
12621
12622/*
12623 * call-seq:
12624 * end_with?(*strings) -> true or false
12625 *
12626 *
12627 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12628 *
12629 */
12630
12631static VALUE
12632sym_end_with(int argc, VALUE *argv, VALUE sym)
12633{
12634 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12635}
12636
12637/*
12638 * call-seq:
12639 * encoding -> encoding
12640 *
12641 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12642 *
12643 */
12644
12645static VALUE
12646sym_encoding(VALUE sym)
12647{
12648 return rb_obj_encoding(rb_sym2str(sym));
12649}
12650
12651static VALUE
12652string_for_symbol(VALUE name)
12653{
12654 if (!RB_TYPE_P(name, T_STRING)) {
12655 VALUE tmp = rb_check_string_type(name);
12656 if (NIL_P(tmp)) {
12657 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12658 name);
12659 }
12660 name = tmp;
12661 }
12662 return name;
12663}
12664
12665ID
12667{
12668 if (SYMBOL_P(name)) {
12669 return SYM2ID(name);
12670 }
12671 name = string_for_symbol(name);
12672 return rb_intern_str(name);
12673}
12674
12675VALUE
12677{
12678 if (SYMBOL_P(name)) {
12679 return name;
12680 }
12681 name = string_for_symbol(name);
12682 return rb_str_intern(name);
12683}
12684
12685/*
12686 * call-seq:
12687 * Symbol.all_symbols -> array_of_symbols
12688 *
12689 * Returns an array of all symbols currently in Ruby's symbol table:
12690 *
12691 * Symbol.all_symbols.size # => 9334
12692 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12693 *
12694 */
12695
12696static VALUE
12697sym_all_symbols(VALUE _)
12698{
12699 return rb_sym_all_symbols();
12700}
12701
12702VALUE
12703rb_str_to_interned_str(VALUE str)
12704{
12705 return rb_fstring(str);
12706}
12707
12708VALUE
12709rb_interned_str(const char *ptr, long len)
12710{
12711 struct RString fake_str = {RBASIC_INIT};
12712 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12713}
12714
12715VALUE
12717{
12718 return rb_interned_str(ptr, strlen(ptr));
12719}
12720
12721VALUE
12722rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12723{
12724 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12725 rb_enc_autoload(enc);
12726 }
12727
12728 struct RString fake_str = {RBASIC_INIT};
12729 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12730}
12731
12732VALUE
12733rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12734{
12735 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12736 rb_enc_autoload(enc);
12737 }
12738
12739 struct RString fake_str = {RBASIC_INIT};
12740 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12741 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12742 return str;
12743}
12744
12745VALUE
12747{
12748 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12749}
12750
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * When +str+'s inlined encoding index is the ASCII-8BIT one and the
 * codepoint, read as an ssize_t, lies in 0...0xff, the single byte is
 * appended directly with rb_str_buf_cat_byte().  Every other case,
 * including 0xff itself, goes through rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int in_fast_range = (code >= 0) && (code < 0xff);

        if (RB_LIKELY(in_fast_range)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    /* General path. */
    rb_str_concat(str, codepoint);
}
#endif
12767
12768static int
12769fstring_set_class_i(VALUE *str, void *data)
12770{
12771 RBASIC_SET_CLASS(*str, rb_cString);
12772
12773 return ST_CONTINUE;
12774}
12775
/*
 * Registers the String and Symbol methods implemented in this file, creates
 * the UnicodeNormalize module handle, caches a few option symbols and
 * method IDs, and defines the hooked variables $; and $-F.
 *
 * In the rb_define_method() calls below the last argument is the arity;
 * the functions registered with -1 in this file take (argc, argv, self).
 *
 * NOTE(review): rb_cString and rb_cSymbol are used here as already existing
 * class handles; where they are created is not visible in this block --
 * confirm against the full file.
 */
12776void
12777Init_String(void)
12778{
12780
    /* Walk fstring_table_obj and set each entry's class to rb_cString
     * (see fstring_set_class_i). */
12781 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12782
    /* String: allocation, construction and comparison. */
12784 rb_define_alloc_func(rb_cString, empty_str_alloc);
12785 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12786 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12787 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12789 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12790 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12793 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12794 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12795 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12796 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    /* String: formatting, element access, matching, searching, bytes. */
12799 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12800 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12801 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12802 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12805 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12806 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12807 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12808 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12809 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    /* "succ!" and "next!" share one implementation. */
12811 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12813 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12814 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12815 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12816 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12817 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12818 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12819 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12820 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12821 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12822 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12823 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12824 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12825 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12826 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12828 rb_define_method(rb_cString, "+@", str_uplus, 0);
12829 rb_define_method(rb_cString, "-@", str_uminus, 0);
12830 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
    /* "dedup" is an alias of "-@". */
12831 rb_define_alias(rb_cString, "dedup", "-@");
12832
    /* String: conversions; "to_s" and "to_str" share one implementation. */
12833 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12834 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12835 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12836 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12839 rb_define_method(rb_cString, "undump", str_undump, 0);
12840
    /* Cache the symbols :ascii, :turkic, :lithuanian and :fold. */
12841 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12842 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12843 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12844 sym_fold = ID2SYM(rb_intern_const("fold"));
12845
    /* String: case conversion, non-destructive then destructive (!) forms. */
12846 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12847 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12848 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12849 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12850
12851 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12852 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12853 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12854 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12855
12856 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12857 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12858 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12859 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12860 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12861 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12862 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12863 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12864 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12865 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12866 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12867 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12869 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12870 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    /* "intern" and "to_sym" share one implementation. */
12871 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12872 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12873 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12874
12875 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12876 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12877 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12878
12879 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12880
12881 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12882 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12883 rb_define_method(rb_cString, "center", rb_str_center, -1);
12884
    /* String: substitution and trimming, non-destructive forms. */
12885 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12886 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12887 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12888 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12889 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12890 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12891 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12892 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12893 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12894
    /* String: substitution and trimming, destructive (!) forms. */
12895 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12896 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12897 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12898 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12899 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12900 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12901 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12902 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12903 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12904
    /* String: character-set operations, non-destructive then destructive. */
12905 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12906 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12907 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12908 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12909 rb_define_method(rb_cString, "count", rb_str_count, -1);
12910
12911 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12912 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12913 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12914 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12915
    /* String: iteration. */
12916 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12917 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12918 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12919 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12920 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12921
12922 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12923
    /* "slice" shares its implementation with "[]". */
12924 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12925 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12926
12927 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12928 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12929
    /* String: encoding queries and changes. */
12930 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12931 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12932 rb_define_method(rb_cString, "b", rb_str_b, 0);
12933 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12934 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12935
12936 /* define UnicodeNormalize module here so that we don't have to look it up */
12937 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12938 id_normalize = rb_intern_const("normalize");
12939 id_normalized_p = rb_intern_const("normalized?");
12940
12941 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12942 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12943 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12944
    /* $; and $-F are both backed by rb_fs (initially nil) and share
     * rb_fs_setter; the address of rb_fs is registered with the GC. */
12945 rb_fs = Qnil;
12946 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12947 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12948 rb_gc_register_address(&rb_fs);
12949
    /* Symbol: singleton method. */
12954 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12955
    /* Symbol: "==" and "===" both use sym_equal; "succ" and "next" both use
     * sym_succ. */
12956 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12957 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12958 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12959 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12960 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12961 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12962
12963 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12964 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12965 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12966 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12967
    /* Symbol: "[]"/"slice" share sym_aref; "length"/"size" share sym_length. */
12968 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12969 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12970 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12971 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12972 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12973 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12974 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12975
    /* Symbol: case conversion. */
12976 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12977 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12978 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12979 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12980
12981 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12982 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12983
12984 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12985}
12986
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:707
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:415
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1798
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1591
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1704
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2958
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2770
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references, according to the format string.
Definition class.c:3248
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1010
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3037
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:132
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:135
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:133
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:130
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:127
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:124
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:129
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:131
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:128
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:136
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3896
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1422
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1418
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1425
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1416
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1420
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2258
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2276
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3672
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3356
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1343
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:948
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1208
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3029
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1227
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12722
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:254
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2335
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3733
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1156
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1448
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1349
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:967
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12746
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:832
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:715
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1232
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4222
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1486
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1923
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1753
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1513
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2488
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3798
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1424
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12346
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2561
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1400
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1747
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3057
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5343
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4161
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3154
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11667
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1783
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1789
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1190
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1002
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1519
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1997
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4147
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3566
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2424
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2015
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6550
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3162
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12716
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1430
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3764
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3104
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4268
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3388
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7229
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2791
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12709
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4215
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4035
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4190
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3740
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3279
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5827
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11725
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1703
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2951
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argument String#slice.
Definition string.c:3251
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3370
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1202
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2745
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7336
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1412
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1719
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2438
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5745
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9343
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1196
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1851
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2017
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2096
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3403
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1651
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12676
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12666
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1862
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4466
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1442
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2928
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2810
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1436
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2823
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1780
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:119
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:507
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:211
Definition string.c:8223
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113