Ruby 4.1.0dev (2025-12-30 revision 27d6c966583c65c9ffd02f931d9c4efe8d7232e0)
string.c (27d6c966583c65c9ffd02f931d9c4efe8d7232e0)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Flag `str` as not embedded: set STR_NOEMBED and clear the sharing-related
 * flags STR_SHARED, STR_SHARED_ROOT and STR_BORROWED. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Flag `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` as the length of `str`; no other field is touched. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length of `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length (rb_enc_mbminlen) of the string's encoding. */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` bytes at `ptr`.  The first byte is always
 * zeroed; the memset over the full terminator runs only when termlen > 1.
 * Both arguments are evaluated exactly once. */
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
176} while (0)
177
/* Resize the buffer of `str` so it can hold `capacity` bytes plus the
 * terminator required by its current encoding (see TERM_LEN). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` so it can hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded capacity suffices;
 *   otherwise the current contents are copied into a freshly allocated heap
 *   buffer and the string is switched to the non-embedded representation.
 * - Heap string: the buffer is reallocated in place.  It must not be shared.
 *
 * `capacity` and `termlen` are parenthesized at every use so that expression
 * arguments (e.g. a conditional expression) keep their intended grouping. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Make `str` refer to `shared_str` as the owner of its buffer.
 *
 * Nothing is done when `str` carries STR_FAKESTR.  Otherwise the macro
 * asserts that the pointer of `str` lies within the buffer of `shared_str`,
 * stores `shared_str` in as.heap.aux.shared through RB_OBJ_WRITE, sets
 * STR_SHARED on `str`, registers `str` with rb_gc_register_pinning_obj, and
 * sets STR_SHARED_ROOT on `shared_str`.  A root whose class is 0 is also
 * flagged STR_BORROWED. */
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 rb_gc_register_pinning_obj(str); \
209 FL_SET((shared_str), STR_SHARED_ROOT); \
210 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
211 FL_SET_RAW((shared_str), STR_BORROWED); \
212 } \
213} while (0)
214
/* Pointer to the heap buffer of a non-embedded string. */
215#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: the recorded capacity plus the terminator
 * length, because capa itself does not count the terminator. */
216#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
217/* TODO: include the terminator size in capa. */
218
/* Encoding object of `str` (thin alias of get_encoding). */
219#define STR_ENC_GET(str) get_encoding(str)
220
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined by the build. */
221#if !defined SHARABLE_MIDDLE_SUBSTRING
222# define SHARABLE_MIDDLE_SUBSTRING 0
223#endif
/* SHARABLE_SUBSTRING_P(beg, len, end): with the default setting a substring
 * qualifies only when it ends exactly at `end` (beg + len == end); when
 * SHARABLE_MIDDLE_SUBSTRING is non-zero every substring qualifies. */
224#if !SHARABLE_MIDDLE_SUBSTRING
225#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
226#else
227#define SHARABLE_SUBSTRING_P(beg, len, end) 1
228#endif
229
230
231static inline long
232str_embed_capa(VALUE str)
233{
234 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
235}
236
237bool
238rb_str_reembeddable_p(VALUE str)
239{
240 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
241}
242
243static inline size_t
244rb_str_embed_size(long capa, long termlen)
245{
246 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
247 if (size < sizeof(struct RString)) size = sizeof(struct RString);
248 return size;
249}
250
251size_t
252rb_str_size_as_embedded(VALUE str)
253{
254 size_t real_size;
255 if (STR_EMBED_P(str)) {
256 size_t capa = RSTRING(str)->len;
257 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
258
259 real_size = rb_str_embed_size(capa, TERM_LEN(str));
260 }
261 /* if the string is not currently embedded, but it can be embedded, how
262 * much space would it require */
263 else if (rb_str_reembeddable_p(str)) {
264 size_t capa = RSTRING(str)->as.heap.aux.capa;
265 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
266
267 real_size = rb_str_embed_size(capa, TERM_LEN(str));
268 }
269 else {
270 real_size = sizeof(struct RString);
271 }
272
273 return real_size;
274}
275
/* Tell whether a string of `len` bytes plus a `termlen`-byte terminator
 * fits in an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(object_size);
}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
291
292static inline void
293str_make_independent(VALUE str)
294{
295 long len = RSTRING_LEN(str);
296 int termlen = TERM_LEN(str);
297 str_make_independent_expand((str), len, 0L, termlen);
298}
299
300static inline int str_dependent_p(VALUE str);
301
302void
303rb_str_make_independent(VALUE str)
304{
305 if (str_dependent_p(str)) {
306 str_make_independent(str);
307 }
308}
309
310void
311rb_str_make_embedded(VALUE str)
312{
313 RUBY_ASSERT(rb_str_reembeddable_p(str));
314 RUBY_ASSERT(!STR_EMBED_P(str));
315
316 char *buf = RSTRING(str)->as.heap.ptr;
317 long len = RSTRING(str)->len;
318
319 STR_SET_EMBED(str);
320 STR_SET_LEN(str, len);
321
322 if (len > 0) {
323 memcpy(RSTRING_PTR(str), buf, len);
324 ruby_xfree(buf);
325 }
326
327 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
328}
329
/* Print a diagnostic to stderr saying that the function named `func` is
 * about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
339
340/* symbols for [up|down|swap]case/capitalize options */
341static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
342
343static rb_encoding *
344get_encoding(VALUE str)
345{
346 return rb_enc_from_index(ENCODING_GET(str));
347}
348
349static void
350mustnot_broken(VALUE str)
351{
352 if (is_broken_string(str)) {
353 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
354 }
355}
356
357static void
358mustnot_wchar(VALUE str)
359{
360 rb_encoding *enc = STR_ENC_GET(str);
361 if (rb_enc_mbminlen(enc) > 1) {
362 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
363 }
364}
365
366static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
367
/* PRECOMPUTED_FAKESTR_HASH is defined only when long and void* have the same
 * size.  Code guarded by it reads a fake string's hash back out of the
 * `capa` field; the #else branch is intentionally empty. */
368#if SIZEOF_LONG == SIZEOF_VOIDP
369#define PRECOMPUTED_FAKESTR_HASH 1
370#else
371#endif
372
373static inline bool
374BARE_STRING_P(VALUE str)
375{
376 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
377}
378
379static inline st_index_t
380str_do_hash(VALUE str)
381{
382 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
383 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
384 if (e && !is_ascii_string(str)) {
385 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
386 }
387 return h;
388}
389
390static VALUE
391str_store_precomputed_hash(VALUE str, st_index_t hash)
392{
393 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
394 RUBY_ASSERT(STR_EMBED_P(str));
395
396#if RUBY_DEBUG
397 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
398 size_t free_bytes = str_embed_capa(str) - used_bytes;
399 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
400#endif
401
402 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
403
404 FL_SET(str, STR_PRECOMPUTED_HASH);
405
406 return str;
407}
408
409VALUE
410rb_fstring(VALUE str)
411{
412 VALUE fstr;
413 int bare;
414
415 Check_Type(str, T_STRING);
416
417 if (FL_TEST(str, RSTRING_FSTR))
418 return str;
419
420 bare = BARE_STRING_P(str);
421 if (!bare) {
422 if (STR_EMBED_P(str)) {
423 OBJ_FREEZE(str);
424 return str;
425 }
426
427 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
429 return str;
430 }
431 }
432
433 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
434 rb_str_resize(str, RSTRING_LEN(str));
435
436 fstr = register_fstring(str, false, false);
437
438 if (!bare) {
439 str_replace_shared_without_enc(str, fstr);
440 OBJ_FREEZE(str);
441 return str;
442 }
443 return fstr;
444}
445
446static VALUE fstring_table_obj;
447
448static VALUE
449fstring_concurrent_set_hash(VALUE str)
450{
451#ifdef PRECOMPUTED_FAKESTR_HASH
452 st_index_t h;
453 if (FL_TEST_RAW(str, STR_FAKESTR)) {
454 // register_fstring precomputes the hash and stores it in capa for fake strings
455 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
456 }
457 else {
458 h = rb_str_hash(str);
459 }
460 // rb_str_hash doesn't include the encoding for ascii only strings, so
461 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
462 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
463#else
464 return (VALUE)rb_str_hash(str);
465#endif
466}
467
468static bool
469fstring_concurrent_set_cmp(VALUE a, VALUE b)
470{
471 long alen, blen;
472 const char *aptr, *bptr;
473
476
477 RSTRING_GETMEM(a, aptr, alen);
478 RSTRING_GETMEM(b, bptr, blen);
479 return (alen == blen &&
480 ENCODING_GET(a) == ENCODING_GET(b) &&
481 memcmp(aptr, bptr, alen) == 0);
482}
483
485 bool copy;
486 bool force_precompute_hash;
487};
488
489static VALUE
490fstring_concurrent_set_create(VALUE str, void *data)
491{
492 struct fstr_create_arg *arg = data;
493
494 // Unless the string is empty or binary, its coderange has been precomputed.
495 int coderange = ENC_CODERANGE(str);
496
497 if (FL_TEST_RAW(str, STR_FAKESTR)) {
498 if (arg->copy) {
499 VALUE new_str;
500 long len = RSTRING_LEN(str);
501 long capa = len + sizeof(st_index_t);
502 int term_len = TERM_LEN(str);
503
504 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
505 new_str = str_alloc_embed(rb_cString, capa + term_len);
506 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
507 STR_SET_LEN(new_str, RSTRING_LEN(str));
508 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
509 rb_enc_copy(new_str, str);
510 str_store_precomputed_hash(new_str, str_do_hash(str));
511 }
512 else {
513 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
514 rb_enc_copy(new_str, str);
515#ifdef PRECOMPUTED_FAKESTR_HASH
516 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
517 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
518 }
519#endif
520 }
521 str = new_str;
522 }
523 else {
524 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
525 RSTRING(str)->len,
526 ENCODING_GET(str));
527 }
528 OBJ_FREEZE(str);
529 }
530 else {
531 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
532 str = str_new_frozen(rb_cString, str);
533 }
534 if (STR_SHARED_P(str)) { /* str should not be shared */
535 /* shared substring */
536 str_make_independent(str);
538 }
539 if (!BARE_STRING_P(str)) {
540 str = str_new_frozen(rb_cString, str);
541 }
542 }
543
544 ENC_CODERANGE_SET(str, coderange);
545 RBASIC(str)->flags |= RSTRING_FSTR;
546 if (!RB_OBJ_SHAREABLE_P(str)) {
547 RB_OBJ_SET_SHAREABLE(str);
548 }
549 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
552 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
553 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
555 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
556
557 return str;
558}
559
560static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
561 .hash = fstring_concurrent_set_hash,
562 .cmp = fstring_concurrent_set_cmp,
563 .create = fstring_concurrent_set_create,
564 .free = NULL,
565};
566
567void
568Init_fstring_table(void)
569{
570 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
571 rb_gc_register_address(&fstring_table_obj);
572}
573
574static VALUE
575register_fstring(VALUE str, bool copy, bool force_precompute_hash)
576{
577 struct fstr_create_arg args = {
578 .copy = copy,
579 .force_precompute_hash = force_precompute_hash
580 };
581
582#if SIZEOF_VOIDP == SIZEOF_LONG
583 if (FL_TEST_RAW(str, STR_FAKESTR)) {
584 // if the string hasn't been interned, we'll need the hash twice, so we
585 // compute it once and store it in capa
586 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
587 }
588#endif
589
590 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
591
592 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
594 RUBY_ASSERT(OBJ_FROZEN(result));
596 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
597 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
599
600 return result;
601}
602
603bool
604rb_obj_is_fstring_table(VALUE obj)
605{
606 ASSERT_vm_locking();
607
608 return obj == fstring_table_obj;
609}
610
611void
612rb_gc_free_fstring(VALUE obj)
613{
614 ASSERT_vm_locking_with_barrier();
615
616 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
618 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
619
620 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
621
622 RB_DEBUG_COUNTER_INC(obj_str_fstr);
623
624 FL_UNSET(obj, RSTRING_FSTR);
625}
626
627void
628rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
629{
630 if (fstring_table_obj) {
631 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
632 }
633}
634
635static VALUE
636setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
637{
638 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
639 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
640
641 if (!name) {
643 name = "";
644 }
645
646 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
647
648 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
649 fake_str->len = len;
650 fake_str->as.heap.ptr = (char *)name;
651 fake_str->as.heap.aux.capa = len;
652 return (VALUE)fake_str;
653}
654
655/*
656 * set up a fake string which refers a static string literal.
657 */
658VALUE
659rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
660{
661 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
662}
663
664/*
665 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
666 * shared string which refers a static string literal. `ptr` must
667 * point a constant string.
668 */
669VALUE
670rb_fstring_new(const char *ptr, long len)
671{
672 struct RString fake_str = {RBASIC_INIT};
673 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
674}
675
676VALUE
677rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
678{
679 struct RString fake_str = {RBASIC_INIT};
680 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
681}
682
683VALUE
684rb_fstring_cstr(const char *ptr)
685{
686 return rb_fstring_new(ptr, strlen(ptr));
687}
688
689static inline bool
690single_byte_optimizable(VALUE str)
691{
692 int encindex = ENCODING_GET(str);
693 switch (encindex) {
694 case ENCINDEX_ASCII_8BIT:
695 case ENCINDEX_US_ASCII:
696 return true;
697 case ENCINDEX_UTF_8:
698 // For UTF-8 it's worth scanning the string coderange when unknown.
700 }
701 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
702 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
703 return true;
704 }
705
706 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
707 return true;
708 }
709
710 /* Conservative. Possibly single byte.
711 * "\xa1" in Shift_JIS for example. */
712 return false;
713}
714
716
/*
 * Find the first byte with its high bit (0x80) set in the range [p, e).
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * The bulk of the range is examined one uintptr_t word at a time by masking
 * with NONASCII_MASK; leading bytes up to a word boundary (only when
 * unaligned word access is not allowed) and the trailing bytes are checked
 * one by one through fall-through switch statements.
 */
717static inline const char *
718search_nonascii(const char *p, const char *e)
719{
720 const char *s, *t;
721
/* NONASCII_MASK has bit 7 of every byte in a uintptr_t set. */
722#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
723# if SIZEOF_UINTPTR_T == 8
724# define NONASCII_MASK UINT64_C(0x8080808080808080)
725# elif SIZEOF_UINTPTR_T == 4
726# define NONASCII_MASK UINT32_C(0x80808080)
727# else
728# error "don't know what to do."
729# endif
730#else
731# if SIZEOF_UINTPTR_T == 8
732# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
733# elif SIZEOF_UINTPTR_T == 4
734# define NONASCII_MASK 0x80808080UL /* or...? */
735# else
736# error "don't know what to do."
737# endif
738#endif
739
/* Word-wise scan: always taken when unaligned access is allowed, otherwise
 * only when at least one full word of input is available. */
740 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
741#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, then check the l skipped bytes.
 * The cases fall through on purpose, from the earliest byte to the latest. */
742 if ((uintptr_t)p % SIZEOF_VOIDP) {
743 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
744 p += l;
745 switch (l) {
746 default: UNREACHABLE;
747#if SIZEOF_VOIDP > 4
748 case 7: if (p[-7]&0x80) return p-7;
749 case 6: if (p[-6]&0x80) return p-6;
750 case 5: if (p[-5]&0x80) return p-5;
751 case 4: if (p[-4]&0x80) return p-4;
752#endif
753 case 3: if (p[-3]&0x80) return p-3;
754 case 2: if (p[-2]&0x80) return p-2;
755 case 1: if (p[-1]&0x80) return p-1;
756 case 0: break;
757 }
758 }
759#endif
/* aligned_ptr: alignment hint for the compiler when the builtin exists and
 * the pointer was aligned above; otherwise the value is passed through. */
760#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
761#define aligned_ptr(value) \
762 __builtin_assume_aligned((value), sizeof(uintptr_t))
763#else
764#define aligned_ptr(value) (value)
765#endif
766 s = aligned_ptr(p);
/* t bounds the loop so that a whole word starting at s never passes e. */
767 t = (e - (SIZEOF_VOIDP-1));
768#undef aligned_ptr
769 for (;s < t; s += sizeof(uintptr_t)) {
770 uintptr_t word;
/* memcpy loads the word without casting the char pointer. */
771 memcpy(&word, s, sizeof(word));
772 if (word & NONASCII_MASK) {
/* Bit position of the first flagged byte divided by 8 gives its byte
 * offset; presumably nlz/ntz count leading/trailing zero bits -- confirm. */
773#ifdef WORDS_BIGENDIAN
774 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
775#else
776 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
777#endif
778 }
779 }
780 p = (const char *)s;
781 }
782
/* Fewer than SIZEOF_VOIDP bytes remain; check them one by one, again with
 * intentional fall-through from the earliest remaining byte to the last. */
783 switch (e - p) {
784 default: UNREACHABLE;
785#if SIZEOF_VOIDP > 4
786 case 7: if (e[-7]&0x80) return e-7;
787 case 6: if (e[-6]&0x80) return e-6;
788 case 5: if (e[-5]&0x80) return e-5;
789 case 4: if (e[-4]&0x80) return e-4;
790#endif
791 case 3: if (e[-3]&0x80) return e-3;
792 case 2: if (e[-2]&0x80) return e-2;
793 case 1: if (e[-1]&0x80) return e-1;
794 case 0: return NULL;
795 }
796}
797
798static int
799coderange_scan(const char *p, long len, rb_encoding *enc)
800{
801 const char *e = p + len;
802
803 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
804 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
805 p = search_nonascii(p, e);
807 }
808
809 if (rb_enc_asciicompat(enc)) {
810 p = search_nonascii(p, e);
811 if (!p) return ENC_CODERANGE_7BIT;
812 for (;;) {
813 int ret = rb_enc_precise_mbclen(p, e, enc);
815 p += MBCLEN_CHARFOUND_LEN(ret);
816 if (p == e) break;
817 p = search_nonascii(p, e);
818 if (!p) break;
819 }
820 }
821 else {
822 while (p < e) {
823 int ret = rb_enc_precise_mbclen(p, e, enc);
825 p += MBCLEN_CHARFOUND_LEN(ret);
826 }
827 }
828 return ENC_CODERANGE_VALID;
829}
830
831long
832rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
833{
834 const char *p = s;
835
836 if (*cr == ENC_CODERANGE_BROKEN)
837 return e - s;
838
839 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
840 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
841 if (*cr == ENC_CODERANGE_VALID) return e - s;
842 p = search_nonascii(p, e);
844 return e - s;
845 }
846 else if (rb_enc_asciicompat(enc)) {
847 p = search_nonascii(p, e);
848 if (!p) {
849 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
850 return e - s;
851 }
852 for (;;) {
853 int ret = rb_enc_precise_mbclen(p, e, enc);
854 if (!MBCLEN_CHARFOUND_P(ret)) {
856 return p - s;
857 }
858 p += MBCLEN_CHARFOUND_LEN(ret);
859 if (p == e) break;
860 p = search_nonascii(p, e);
861 if (!p) break;
862 }
863 }
864 else {
865 while (p < e) {
866 int ret = rb_enc_precise_mbclen(p, e, enc);
867 if (!MBCLEN_CHARFOUND_P(ret)) {
869 return p - s;
870 }
871 p += MBCLEN_CHARFOUND_LEN(ret);
872 }
873 }
875 return e - s;
876}
877
878static inline void
879str_enc_copy(VALUE str1, VALUE str2)
880{
881 rb_enc_set_index(str1, ENCODING_GET(str2));
882}
883
884/* Like str_enc_copy, but does not check frozen status of str1.
885 * You should use this only if you're certain that str1 is not frozen. */
886static inline void
887str_enc_copy_direct(VALUE str1, VALUE str2)
888{
889 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
890 if (inlined_encoding == ENCODING_INLINE_MAX) {
891 rb_enc_set_index(str1, rb_enc_get_index(str2));
892 }
893 else {
894 ENCODING_SET_INLINED(str1, inlined_encoding);
895 }
896}
897
898static void
899rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
900{
901 /* this function is designed for copying encoding and coderange
902 * from src to new string "dest" which is made from the part of src.
903 */
904 str_enc_copy(dest, src);
905 if (RSTRING_LEN(dest) == 0) {
906 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
908 else
910 return;
911 }
912 switch (ENC_CODERANGE(src)) {
915 break;
917 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
918 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
920 else
922 break;
923 default:
924 break;
925 }
926}
927
928static void
929rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
930{
931 str_enc_copy(dest, src);
933}
934
935static int
936enc_coderange_scan(VALUE str, rb_encoding *enc)
937{
938 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
939}
940
941int
942rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
943{
944 return enc_coderange_scan(str, enc);
945}
946
947int
949{
950 int cr = ENC_CODERANGE(str);
951
952 if (cr == ENC_CODERANGE_UNKNOWN) {
953 cr = enc_coderange_scan(str, get_encoding(str));
954 ENC_CODERANGE_SET(str, cr);
955 }
956 return cr;
957}
958
959static inline bool
960rb_enc_str_asciicompat(VALUE str)
961{
962 int encindex = ENCODING_GET_INLINED(str);
963 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
964}
965
966int
968{
969 switch(ENC_CODERANGE(str)) {
971 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
973 return true;
974 default:
975 return false;
976 }
977}
978
979static inline void
980str_mod_check(VALUE s, const char *p, long len)
981{
982 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
983 rb_raise(rb_eRuntimeError, "string modified");
984 }
985}
986
987static size_t
988str_capacity(VALUE str, const int termlen)
989{
990 if (STR_EMBED_P(str)) {
991 return str_embed_capa(str) - termlen;
992 }
993 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
994 return RSTRING(str)->len;
995 }
996 else {
997 return RSTRING(str)->as.heap.aux.capa;
998 }
999}
1000
1001size_t
1003{
1004 return str_capacity(str, TERM_LEN(str));
1005}
1006
1007static inline void
1008must_not_null(const char *ptr)
1009{
1010 if (!ptr) {
1011 rb_raise(rb_eArgError, "NULL pointer given");
1012 }
1013}
1014
1015static inline VALUE
1016str_alloc_embed(VALUE klass, size_t capa)
1017{
1018 size_t size = rb_str_embed_size(capa, 0);
1019 RUBY_ASSERT(size > 0);
1020 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1021
1022 NEWOBJ_OF(str, struct RString, klass,
1024
1025 str->len = 0;
1026 str->as.embed.ary[0] = 0;
1027
1028 return (VALUE)str;
1029}
1030
1031static inline VALUE
1032str_alloc_heap(VALUE klass)
1033{
1034 NEWOBJ_OF(str, struct RString, klass,
1035 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1036
1037 str->len = 0;
1038 str->as.heap.aux.capa = 0;
1039 str->as.heap.ptr = NULL;
1040
1041 return (VALUE)str;
1042}
1043
1044static inline VALUE
1045empty_str_alloc(VALUE klass)
1046{
1047 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1048 VALUE str = str_alloc_embed(klass, 0);
1049 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1051 return str;
1052}
1053
1054static VALUE
1055str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1056{
1057 VALUE str;
1058
1059 if (len < 0) {
1060 rb_raise(rb_eArgError, "negative string size (or size too big)");
1061 }
1062
1063 if (enc == NULL) {
1064 enc = rb_ascii8bit_encoding();
1065 }
1066
1067 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1068
1069 int termlen = rb_enc_mbminlen(enc);
1070
1071 if (STR_EMBEDDABLE_P(len, termlen)) {
1072 str = str_alloc_embed(klass, len + termlen);
1073 if (len == 0) {
1074 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1075 }
1076 }
1077 else {
1078 str = str_alloc_heap(klass);
1079 RSTRING(str)->as.heap.aux.capa = len;
1080 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1081 * integer overflow. If we can STATIC_ASSERT that, the following
1082 * mul_add_mul can be reverted to a simple ALLOC_N. */
1083 RSTRING(str)->as.heap.ptr =
1084 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1085 }
1086
1087 rb_enc_raw_set(str, enc);
1088
1089 if (ptr) {
1090 memcpy(RSTRING_PTR(str), ptr, len);
1091 }
1092 else {
1093 memset(RSTRING_PTR(str), 0, len);
1094 }
1095
1096 STR_SET_LEN(str, len);
1097 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1098 return str;
1099}
1100
1101static VALUE
1102str_new(VALUE klass, const char *ptr, long len)
1103{
1104 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1105}
1106
1107VALUE
1108rb_str_new(const char *ptr, long len)
1109{
1110 return str_new(rb_cString, ptr, len);
1111}
1112
1113VALUE
1114rb_usascii_str_new(const char *ptr, long len)
1115{
1116 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1117}
1118
1119VALUE
1120rb_utf8_str_new(const char *ptr, long len)
1121{
1122 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1123}
1124
1125VALUE
1126rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1127{
1128 return str_enc_new(rb_cString, ptr, len, enc);
1129}
1130
1131VALUE
1133{
1134 must_not_null(ptr);
1135 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1136 * memory regions, and that cannot be detected by the MSAN. Just
1137 * trust the programmer that the argument passed here is a sane C
1138 * string. */
1139 __msan_unpoison_string(ptr);
1140 return rb_str_new(ptr, strlen(ptr));
1141}
1142
1143VALUE
1145{
1146 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1147}
1148
1149VALUE
1151{
1152 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1153}
1154
1155VALUE
1157{
1158 must_not_null(ptr);
1159 if (rb_enc_mbminlen(enc) != 1) {
1160 rb_raise(rb_eArgError, "wchar encoding given");
1161 }
1162 return rb_enc_str_new(ptr, strlen(ptr), enc);
1163}
1164
1165static VALUE
1166str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1167{
1168 VALUE str;
1169
1170 if (len < 0) {
1171 rb_raise(rb_eArgError, "negative string size (or size too big)");
1172 }
1173
1174 if (!ptr) {
1175 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1176 }
1177 else {
1178 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1179 str = str_alloc_heap(klass);
1180 RSTRING(str)->len = len;
1181 RSTRING(str)->as.heap.ptr = (char *)ptr;
1182 RSTRING(str)->as.heap.aux.capa = len;
1183 RBASIC(str)->flags |= STR_NOFREE;
1184 rb_enc_associate_index(str, encindex);
1185 }
1186 return str;
1187}
1188
1189VALUE
1190rb_str_new_static(const char *ptr, long len)
1191{
1192 return str_new_static(rb_cString, ptr, len, 0);
1193}
1194
1195VALUE
1197{
1198 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1199}
1200
1201VALUE
1203{
1204 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1205}
1206
1207VALUE
1209{
1210 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1211}
1212
1213static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1214 rb_encoding *from, rb_encoding *to,
1215 int ecflags, VALUE ecopts);
1216
1217static inline bool
1218is_enc_ascii_string(VALUE str, rb_encoding *enc)
1219{
1220 int encidx = rb_enc_to_index(enc);
1221 if (rb_enc_get_index(str) == encidx)
1222 return is_ascii_string(str);
1223 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1224}
1225
1226VALUE
1227rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1228{
1229 long len;
1230 const char *ptr;
1231 VALUE newstr;
1232
1233 if (!to) return str;
1234 if (!from) from = rb_enc_get(str);
1235 if (from == to) return str;
1236 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1237 rb_is_ascii8bit_enc(to)) {
1238 if (STR_ENC_GET(str) != to) {
1239 str = rb_str_dup(str);
1240 rb_enc_associate(str, to);
1241 }
1242 return str;
1243 }
1244
1245 RSTRING_GETMEM(str, ptr, len);
1246 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1247 from, to, ecflags, ecopts);
1248 if (NIL_P(newstr)) {
1249 /* some error, return original */
1250 return str;
1251 }
1252 return newstr;
1253}
1254
1255VALUE
1256rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1257 rb_encoding *from, int ecflags, VALUE ecopts)
1258{
1259 long olen;
1260
1261 olen = RSTRING_LEN(newstr);
1262 if (ofs < -olen || olen < ofs)
1263 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1264 if (ofs < 0) ofs += olen;
1265 if (!from) {
1266 STR_SET_LEN(newstr, ofs);
1267 return rb_str_cat(newstr, ptr, len);
1268 }
1269
1270 rb_str_modify(newstr);
1271 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1272 rb_enc_get(newstr),
1273 ecflags, ecopts);
1274}
1275
1276VALUE
1277rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1278{
1279 STR_SET_LEN(str, 0);
1280 rb_enc_associate(str, enc);
1281 rb_str_cat(str, ptr, len);
1282 return str;
1283}
1284
1285static VALUE
1286str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1287 rb_encoding *from, rb_encoding *to,
1288 int ecflags, VALUE ecopts)
1289{
1290 rb_econv_t *ec;
1292 long olen;
1293 VALUE econv_wrapper;
1294 const unsigned char *start, *sp;
1295 unsigned char *dest, *dp;
1296 size_t converted_output = (size_t)ofs;
1297
1298 olen = rb_str_capacity(newstr);
1299
1300 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1301 RBASIC_CLEAR_CLASS(econv_wrapper);
1302 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1303 if (!ec) return Qnil;
1304 DATA_PTR(econv_wrapper) = ec;
1305
1306 sp = (unsigned char*)ptr;
1307 start = sp;
1308 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1309 (dp = dest + converted_output),
1310 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1312 /* destination buffer short */
1313 size_t converted_input = sp - start;
1314 size_t rest = len - converted_input;
1315 converted_output = dp - dest;
1316 rb_str_set_len(newstr, converted_output);
1317 if (converted_input && converted_output &&
1318 rest < (LONG_MAX / converted_output)) {
1319 rest = (rest * converted_output) / converted_input;
1320 }
1321 else {
1322 rest = olen;
1323 }
1324 olen += rest < 2 ? 2 : rest;
1325 rb_str_resize(newstr, olen);
1326 }
1327 DATA_PTR(econv_wrapper) = 0;
1328 RB_GC_GUARD(econv_wrapper);
1329 rb_econv_close(ec);
1330 switch (ret) {
1331 case econv_finished:
1332 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1333 rb_str_set_len(newstr, len);
1334 rb_enc_associate(newstr, to);
1335 return newstr;
1336
1337 default:
1338 return Qnil;
1339 }
1340}
1341
1342VALUE
1344{
1345 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1346}
1347
1348VALUE
1350{
1351 rb_encoding *ienc;
1352 VALUE str;
1353 const int eidx = rb_enc_to_index(eenc);
1354
1355 if (!ptr) {
1356 return rb_enc_str_new(ptr, len, eenc);
1357 }
1358
1359 /* ASCII-8BIT case, no conversion */
1360 if ((eidx == rb_ascii8bit_encindex()) ||
1361 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1362 return rb_str_new(ptr, len);
1363 }
1364 /* no default_internal or same encoding, no conversion */
1365 ienc = rb_default_internal_encoding();
1366 if (!ienc || eenc == ienc) {
1367 return rb_enc_str_new(ptr, len, eenc);
1368 }
1369 /* ASCII compatible, and ASCII only string, no conversion in
1370 * default_internal */
1371 if ((eidx == rb_ascii8bit_encindex()) ||
1372 (eidx == rb_usascii_encindex()) ||
1373 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1374 return rb_enc_str_new(ptr, len, ienc);
1375 }
1376 /* convert from the given encoding to default_internal */
1377 str = rb_enc_str_new(NULL, 0, ienc);
1378 /* when the conversion failed for some reason, just ignore the
1379 * default_internal and result in the given encoding as-is. */
1380 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1381 rb_str_initialize(str, ptr, len, eenc);
1382 }
1383 return str;
1384}
1385
1386VALUE
1387rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1388{
1389 int eidx = rb_enc_to_index(eenc);
1390 if (eidx == rb_usascii_encindex() &&
1391 !is_ascii_string(str)) {
1392 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1393 return str;
1394 }
1395 rb_enc_associate_index(str, eidx);
1396 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1397}
1398
1399VALUE
1400rb_external_str_new(const char *ptr, long len)
1401{
1402 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1403}
1404
1405VALUE
1407{
1408 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1409}
1410
1411VALUE
1412rb_locale_str_new(const char *ptr, long len)
1413{
1414 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1415}
1416
1417VALUE
1419{
1420 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1421}
1422
1423VALUE
1425{
1426 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1427}
1428
1429VALUE
1431{
1432 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1433}
1434
1435VALUE
1437{
1438 return rb_str_export_to_enc(str, rb_default_external_encoding());
1439}
1440
1441VALUE
1443{
1444 return rb_str_export_to_enc(str, rb_locale_encoding());
1445}
1446
1447VALUE
1449{
1450 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1451}
1452
1453static VALUE
1454str_replace_shared_without_enc(VALUE str2, VALUE str)
1455{
1456 const int termlen = TERM_LEN(str);
1457 char *ptr;
1458 long len;
1459
1460 RSTRING_GETMEM(str, ptr, len);
1461 if (str_embed_capa(str2) >= len + termlen) {
1462 char *ptr2 = RSTRING(str2)->as.embed.ary;
1463 STR_SET_EMBED(str2);
1464 memcpy(ptr2, RSTRING_PTR(str), len);
1465 TERM_FILL(ptr2+len, termlen);
1466 }
1467 else {
1468 VALUE root;
1469 if (STR_SHARED_P(str)) {
1470 root = RSTRING(str)->as.heap.aux.shared;
1471 RSTRING_GETMEM(str, ptr, len);
1472 }
1473 else {
1474 root = rb_str_new_frozen(str);
1475 RSTRING_GETMEM(root, ptr, len);
1476 }
1477 RUBY_ASSERT(OBJ_FROZEN(root));
1478
1479 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1480 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1481 rb_fatal("about to free a possible shared root");
1482 }
1483 char *ptr2 = STR_HEAP_PTR(str2);
1484 if (ptr2 != ptr) {
1485 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1486 }
1487 }
1488 FL_SET(str2, STR_NOEMBED);
1489 RSTRING(str2)->as.heap.ptr = ptr;
1490 STR_SET_SHARED(str2, root);
1491 }
1492
1493 STR_SET_LEN(str2, len);
1494
1495 return str2;
1496}
1497
1498static VALUE
1499str_replace_shared(VALUE str2, VALUE str)
1500{
1501 str_replace_shared_without_enc(str2, str);
1502 rb_enc_cr_str_exact_copy(str2, str);
1503 return str2;
1504}
1505
1506static VALUE
1507str_new_shared(VALUE klass, VALUE str)
1508{
1509 return str_replace_shared(str_alloc_heap(klass), str);
1510}
1511
1512VALUE
1514{
1515 return str_new_shared(rb_obj_class(str), str);
1516}
1517
1518VALUE
1520{
1521 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1522 return str_new_frozen(rb_obj_class(orig), orig);
1523}
1524
1525static VALUE
1526rb_str_new_frozen_String(VALUE orig)
1527{
1528 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1529 return str_new_frozen(rb_cString, orig);
1530}
1531
1532
1533VALUE
1534rb_str_frozen_bare_string(VALUE orig)
1535{
1536 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1537 return str_new_frozen(rb_cString, orig);
1538}
1539
1540VALUE
1541rb_str_tmp_frozen_acquire(VALUE orig)
1542{
1543 if (OBJ_FROZEN_RAW(orig)) return orig;
1544 return str_new_frozen_buffer(0, orig, FALSE);
1545}
1546
1547VALUE
1548rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1549{
1550 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1551 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1552
1553 VALUE str = str_alloc_heap(0);
1554 OBJ_FREEZE(str);
1555 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1556 FL_SET(str, STR_SHARED_ROOT);
1557
1558 size_t capa = str_capacity(orig, TERM_LEN(orig));
1559
1560 /* If the string is embedded then we want to create a copy that is heap
1561 * allocated. If the string is shared then the shared root must be
1562 * embedded, so we want to create a copy. If the string is a shared root
1563 * then it must be embedded, so we want to create a copy. */
1564 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1565 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1566 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1567 }
1568 else {
1569 /* orig must be heap allocated and not shared, so we can safely transfer
1570 * the pointer to str. */
1571 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1572 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1573 RBASIC(orig)->flags &= ~STR_NOFREE;
1574 STR_SET_SHARED(orig, str);
1575 if (RB_OBJ_SHAREABLE_P(orig)) {
1576 RB_OBJ_SET_SHAREABLE(str);
1577 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1578 }
1579 }
1580
1581 RSTRING(str)->len = RSTRING(orig)->len;
1582 RSTRING(str)->as.heap.aux.capa = capa;
1583
1584 return str;
1585}
1586
1587void
1588rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1589{
1590 if (RBASIC_CLASS(tmp) != 0)
1591 return;
1592
1593 if (STR_EMBED_P(tmp)) {
1595 }
1596 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1597 !OBJ_FROZEN_RAW(orig)) {
1598 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1599
1600 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1601 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1602 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1603
1604 /* Unshare orig since the root (tmp) only has this one child. */
1605 FL_UNSET_RAW(orig, STR_SHARED);
1606 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1607 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1609
1610 /* Make tmp embedded and empty so it is safe for sweeping. */
1611 STR_SET_EMBED(tmp);
1612 STR_SET_LEN(tmp, 0);
1613 }
1614 }
1615}
1616
1617static VALUE
1618str_new_frozen(VALUE klass, VALUE orig)
1619{
1620 return str_new_frozen_buffer(klass, orig, TRUE);
1621}
1622
1623static VALUE
1624heap_str_make_shared(VALUE klass, VALUE orig)
1625{
1626 RUBY_ASSERT(!STR_EMBED_P(orig));
1627 RUBY_ASSERT(!STR_SHARED_P(orig));
1629
1630 VALUE str = str_alloc_heap(klass);
1631 STR_SET_LEN(str, RSTRING_LEN(orig));
1632 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1633 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1634 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1635 RBASIC(orig)->flags &= ~STR_NOFREE;
1636 STR_SET_SHARED(orig, str);
1637 if (klass == 0)
1638 FL_UNSET_RAW(str, STR_BORROWED);
1639 return str;
1640}
1641
1642static VALUE
1643str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1644{
1645 VALUE str;
1646
1647 long len = RSTRING_LEN(orig);
1648 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1649 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1650
1651 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1652 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1653 RUBY_ASSERT(STR_EMBED_P(str));
1654 }
1655 else {
1656 if (FL_TEST_RAW(orig, STR_SHARED)) {
1657 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1658 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1659 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1660 RUBY_ASSERT(ofs >= 0);
1661 RUBY_ASSERT(rest >= 0);
1662 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1664
1665 if ((ofs > 0) || (rest > 0) ||
1666 (klass != RBASIC(shared)->klass) ||
1667 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1668 str = str_new_shared(klass, shared);
1669 RUBY_ASSERT(!STR_EMBED_P(str));
1670 RSTRING(str)->as.heap.ptr += ofs;
1671 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1672 }
1673 else {
1674 if (RBASIC_CLASS(shared) == 0)
1675 FL_SET_RAW(shared, STR_BORROWED);
1676 return shared;
1677 }
1678 }
1679 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1680 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1681 STR_SET_EMBED(str);
1682 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1683 STR_SET_LEN(str, RSTRING_LEN(orig));
1684 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1685 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1686 }
1687 else {
1688 if (RB_OBJ_SHAREABLE_P(orig)) {
1689 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1690 }
1691 else {
1692 str = heap_str_make_shared(klass, orig);
1693 }
1694 }
1695 }
1696
1697 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1698 OBJ_FREEZE(str);
1699 return str;
1700}
1701
1702VALUE
1703rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1704{
1705 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1706}
1707
1708static VALUE
1709str_new_empty_String(VALUE str)
1710{
1711 VALUE v = rb_str_new(0, 0);
1712 rb_enc_copy(v, str);
1713 return v;
1714}
1715
1716#define STR_BUF_MIN_SIZE 63
1717
1718VALUE
1720{
1721 if (STR_EMBEDDABLE_P(capa, 1)) {
1722 return str_alloc_embed(rb_cString, capa + 1);
1723 }
1724
1725 VALUE str = str_alloc_heap(rb_cString);
1726
1727 RSTRING(str)->as.heap.aux.capa = capa;
1728 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1729 RSTRING(str)->as.heap.ptr[0] = '\0';
1730
1731 return str;
1732}
1733
1734VALUE
1736{
1737 VALUE str;
1738 long len = strlen(ptr);
1739
1740 str = rb_str_buf_new(len);
1741 rb_str_buf_cat(str, ptr, len);
1742
1743 return str;
1744}
1745
1746VALUE
1748{
1749 return str_new(0, 0, len);
1750}
1751
1752void
1754{
1755 if (STR_EMBED_P(str)) {
1756 RB_DEBUG_COUNTER_INC(obj_str_embed);
1757 }
1758 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1760 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1761 }
1762 else {
1763 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1764 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1765 }
1766}
1767
1768size_t
1769rb_str_memsize(VALUE str)
1770{
1771 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1772 return STR_HEAP_SIZE(str);
1773 }
1774 else {
1775 return 0;
1776 }
1777}
1778
1779VALUE
1781{
1782 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1783}
1784
1785static inline void str_discard(VALUE str);
1786static void str_shared_replace(VALUE str, VALUE str2);
1787
1788void
1790{
1791 if (str != str2) str_shared_replace(str, str2);
1792}
1793
1794static void
1795str_shared_replace(VALUE str, VALUE str2)
1796{
1797 rb_encoding *enc;
1798 int cr;
1799 int termlen;
1800
1801 RUBY_ASSERT(str2 != str);
1802 enc = STR_ENC_GET(str2);
1803 cr = ENC_CODERANGE(str2);
1804 str_discard(str);
1805 termlen = rb_enc_mbminlen(enc);
1806
1807 STR_SET_LEN(str, RSTRING_LEN(str2));
1808
1809 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1810 STR_SET_EMBED(str);
1811 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1812 rb_enc_associate(str, enc);
1813 ENC_CODERANGE_SET(str, cr);
1814 }
1815 else {
1816 if (STR_EMBED_P(str2)) {
1817 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1818 long len = RSTRING_LEN(str2);
1819 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1820
1821 char *new_ptr = ALLOC_N(char, len + termlen);
1822 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1823 RSTRING(str2)->as.heap.ptr = new_ptr;
1824 STR_SET_LEN(str2, len);
1825 RSTRING(str2)->as.heap.aux.capa = len;
1826 STR_SET_NOEMBED(str2);
1827 }
1828
1829 STR_SET_NOEMBED(str);
1830 FL_UNSET(str, STR_SHARED);
1831 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1832
1833 if (FL_TEST(str2, STR_SHARED)) {
1834 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1835 STR_SET_SHARED(str, shared);
1836 }
1837 else {
1838 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1839 }
1840
1841 /* abandon str2 */
1842 STR_SET_EMBED(str2);
1843 RSTRING_PTR(str2)[0] = 0;
1844 STR_SET_LEN(str2, 0);
1845 rb_enc_associate(str, enc);
1846 ENC_CODERANGE_SET(str, cr);
1847 }
1848}
1849
1850VALUE
1852{
1853 VALUE str;
1854
1855 if (RB_TYPE_P(obj, T_STRING)) {
1856 return obj;
1857 }
1858 str = rb_funcall(obj, idTo_s, 0);
1859 return rb_obj_as_string_result(str, obj);
1860}
1861
1862VALUE
1863rb_obj_as_string_result(VALUE str, VALUE obj)
1864{
1865 if (!RB_TYPE_P(str, T_STRING))
1866 return rb_any_to_s(obj);
1867 return str;
1868}
1869
1870static VALUE
1871str_replace(VALUE str, VALUE str2)
1872{
1873 long len;
1874
1875 len = RSTRING_LEN(str2);
1876 if (STR_SHARED_P(str2)) {
1877 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1879 STR_SET_NOEMBED(str);
1880 STR_SET_LEN(str, len);
1881 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1882 STR_SET_SHARED(str, shared);
1883 rb_enc_cr_str_exact_copy(str, str2);
1884 }
1885 else {
1886 str_replace_shared(str, str2);
1887 }
1888
1889 return str;
1890}
1891
1892static inline VALUE
1893ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1894{
1895 size_t size = rb_str_embed_size(capa, 0);
1896 RUBY_ASSERT(size > 0);
1897 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1898
1899 NEWOBJ_OF(str, struct RString, klass,
1901
1902 str->len = 0;
1903
1904 return (VALUE)str;
1905}
1906
1907static inline VALUE
1908ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1909{
1910 NEWOBJ_OF(str, struct RString, klass,
1911 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1912
1913 str->as.heap.aux.capa = 0;
1914 str->as.heap.ptr = NULL;
1915
1916 return (VALUE)str;
1917}
1918
1919static inline VALUE
1920str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1921{
1922 int encidx = 0;
1923 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1924 encidx = rb_enc_get_index(str);
1925 flags &= ~ENCODING_MASK;
1926 }
1927 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1928 if (encidx) rb_enc_associate_index(dup, encidx);
1929 return dup;
1930}
1931
1932static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1933
1934static inline VALUE
1935str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1936{
1937 VALUE flags = FL_TEST_RAW(str, flag_mask);
1938 long len = RSTRING_LEN(str);
1939
1940 RUBY_ASSERT(STR_EMBED_P(dup));
1941 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1942 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1943 STR_SET_LEN(dup, RSTRING_LEN(str));
1944 return str_duplicate_setup_encoding(str, dup, flags);
1945}
1946
1947static inline VALUE
1948str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1949{
1950 VALUE flags = FL_TEST_RAW(str, flag_mask);
1951 VALUE root = str;
1952 if (FL_TEST_RAW(str, STR_SHARED)) {
1953 root = RSTRING(str)->as.heap.aux.shared;
1954 }
1955 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1956 root = str = str_new_frozen(klass, str);
1957 flags = FL_TEST_RAW(str, flag_mask);
1958 }
1959 RUBY_ASSERT(!STR_SHARED_P(root));
1961
1962 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1963 FL_SET(root, STR_SHARED_ROOT);
1964 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1965 flags |= RSTRING_NOEMBED | STR_SHARED;
1966
1967 STR_SET_LEN(dup, RSTRING_LEN(str));
1968 return str_duplicate_setup_encoding(str, dup, flags);
1969}
1970
1971static inline VALUE
1972str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1973{
1974 if (STR_EMBED_P(str)) {
1975 return str_duplicate_setup_embed(klass, str, dup);
1976 }
1977 else {
1978 return str_duplicate_setup_heap(klass, str, dup);
1979 }
1980}
1981
1982static inline VALUE
1983str_duplicate(VALUE klass, VALUE str)
1984{
1985 VALUE dup;
1986 if (STR_EMBED_P(str)) {
1987 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 }
1989 else {
1990 dup = str_alloc_heap(klass);
1991 }
1992
1993 return str_duplicate_setup(klass, str, dup);
1994}
1995
1996VALUE
1998{
1999 return str_duplicate(rb_obj_class(str), str);
2000}
2001
2002/* :nodoc: */
2003VALUE
2004rb_str_dup_m(VALUE str)
2005{
2006 if (LIKELY(BARE_STRING_P(str))) {
2007 return str_duplicate(rb_cString, str);
2008 }
2009 else {
2010 return rb_obj_dup(str);
2011 }
2012}
2013
2014VALUE
2016{
2017 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2018 return str_duplicate(rb_cString, str);
2019}
2020
2021VALUE
2022rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2023{
2024 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2025 VALUE new_str, klass = rb_cString;
2026
2027 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2028 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2029 str_duplicate_setup_embed(klass, str, new_str);
2030 }
2031 else {
2032 new_str = ec_str_alloc_heap(ec, klass);
2033 str_duplicate_setup_heap(klass, str, new_str);
2034 }
2035 if (chilled) {
2036 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2037 }
2038 return new_str;
2039}
2040
2041VALUE
2042rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2043{
2044 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2045 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2046 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2047 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2048 return rb_str_freeze(str);
2049}
2050
2051/*
2052 * The documentation block below uses an include (instead of inline text)
2053 * because the included text has non-ASCII characters (which are not allowed in a C file).
2054 */
2055
2056/*
2057 *
2058 * call-seq:
2059 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2060 *
2061 * :include: doc/string/new.rdoc
2062 *
2063 */
2064
2065static VALUE
2066rb_str_init(int argc, VALUE *argv, VALUE str)
2067{
2068 static ID keyword_ids[2];
2069 VALUE orig, opt, venc, vcapa;
2070 VALUE kwargs[2];
2071 rb_encoding *enc = 0;
2072 int n;
2073
2074 if (!keyword_ids[0]) {
2075 keyword_ids[0] = rb_id_encoding();
2076 CONST_ID(keyword_ids[1], "capacity");
2077 }
2078
2079 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2080 if (!NIL_P(opt)) {
2081 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2082 venc = kwargs[0];
2083 vcapa = kwargs[1];
2084 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2085 enc = rb_to_encoding(venc);
2086 }
2087 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2088 long capa = NUM2LONG(vcapa);
2089 long len = 0;
2090 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2091
2092 if (capa < STR_BUF_MIN_SIZE) {
2093 capa = STR_BUF_MIN_SIZE;
2094 }
2095 if (n == 1) {
2096 StringValue(orig);
2097 len = RSTRING_LEN(orig);
2098 if (capa < len) {
2099 capa = len;
2100 }
2101 if (orig == str) n = 0;
2102 }
2103 str_modifiable(str);
2104 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2105 /* make noembed always */
2106 const size_t size = (size_t)capa + termlen;
2107 const char *const old_ptr = RSTRING_PTR(str);
2108 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2109 char *new_ptr = ALLOC_N(char, size);
2110 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2111 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2112 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2113 RSTRING(str)->as.heap.ptr = new_ptr;
2114 }
2115 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2116 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2117 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2118 }
2119 STR_SET_LEN(str, len);
2120 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2121 if (n == 1) {
2122 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2123 rb_enc_cr_str_exact_copy(str, orig);
2124 }
2125 FL_SET(str, STR_NOEMBED);
2126 RSTRING(str)->as.heap.aux.capa = capa;
2127 }
2128 else if (n == 1) {
2129 rb_str_replace(str, orig);
2130 }
2131 if (enc) {
2132 rb_enc_associate(str, enc);
2134 }
2135 }
2136 else if (n == 1) {
2137 rb_str_replace(str, orig);
2138 }
2139 return str;
2140}
2141
2142/* :nodoc: */
2143static VALUE
2144rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2145{
2146 if (klass != rb_cString) {
2147 return rb_class_new_instance_pass_kw(argc, argv, klass);
2148 }
2149
2150 static ID keyword_ids[2];
2151 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2152 VALUE kwargs[2];
2153 rb_encoding *enc = NULL;
2154
2155 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2156 if (NIL_P(opt)) {
2157 return rb_class_new_instance_pass_kw(argc, argv, klass);
2158 }
2159
2160 keyword_ids[0] = rb_id_encoding();
2161 CONST_ID(keyword_ids[1], "capacity");
2162 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2163 encoding = kwargs[0];
2164 capacity = kwargs[1];
2165
2166 if (n == 1) {
2167 orig = StringValue(orig);
2168 }
2169 else {
2170 orig = Qnil;
2171 }
2172
2173 if (UNDEF_P(encoding)) {
2174 if (!NIL_P(orig)) {
2175 encoding = rb_obj_encoding(orig);
2176 }
2177 }
2178
2179 if (!UNDEF_P(encoding)) {
2180 enc = rb_to_encoding(encoding);
2181 }
2182
2183 // If capacity is nil, we're basically just duping `orig`.
2184 if (UNDEF_P(capacity)) {
2185 if (NIL_P(orig)) {
2186 VALUE empty_str = str_new(klass, "", 0);
2187 if (enc) {
2188 rb_enc_associate(empty_str, enc);
2189 }
2190 return empty_str;
2191 }
2192 VALUE copy = str_duplicate(klass, orig);
2193 rb_enc_associate(copy, enc);
2194 ENC_CODERANGE_CLEAR(copy);
2195 return copy;
2196 }
2197
2198 long capa = 0;
2199 capa = NUM2LONG(capacity);
2200 if (capa < 0) {
2201 capa = 0;
2202 }
2203
2204 if (!NIL_P(orig)) {
2205 long orig_capa = rb_str_capacity(orig);
2206 if (orig_capa > capa) {
2207 capa = orig_capa;
2208 }
2209 }
2210
2211 VALUE str = str_enc_new(klass, NULL, capa, enc);
2212 STR_SET_LEN(str, 0);
2213 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2214
2215 if (!NIL_P(orig)) {
2216 rb_str_buf_append(str, orig);
2217 }
2218
2219 return str;
2220}
2221
2222#ifdef NONASCII_MASK
2223#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2224
2225/*
2226 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2227 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2228 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2229 *
2230 * if (!(byte & 0x80))
2231 * byte |= 0x40; // turn on bit6
2232 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2233 *
2234 * This function calculates whether a byte is leading or not for all bytes
2235 * in the argument word by concurrently using the above logic, and then
2236 * adds up the number of leading bytes in the word.
2237 */
2238static inline uintptr_t
2239count_utf8_lead_bytes_with_word(const uintptr_t *s)
2240{
2241 uintptr_t d = *s;
2242
2243 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2244 d = (d>>6) | (~d>>7);
2245 d &= NONASCII_MASK >> 7;
2246
2247 /* Gather all bytes. */
2248#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2249 /* use only if it can use POPCNT */
2250 return rb_popcount_intptr(d);
2251#else
2252 d += (d>>8);
2253 d += (d>>16);
2254# if SIZEOF_VOIDP == 8
2255 d += (d>>32);
2256# endif
2257 return (d&0xF);
2258#endif
2259}
2260#endif
2261
2262static inline long
2263enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2264{
2265 long c;
2266 const char *q;
2267
2268 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2269 long diff = (long)(e - p);
2270 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2271 }
2272#ifdef NONASCII_MASK
2273 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2274 uintptr_t len = 0;
2275 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2276 const uintptr_t *s, *t;
2277 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2278 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2279 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2280 while (p < (const char *)s) {
2281 if (is_utf8_lead_byte(*p)) len++;
2282 p++;
2283 }
2284 while (s < t) {
2285 len += count_utf8_lead_bytes_with_word(s);
2286 s++;
2287 }
2288 p = (const char *)s;
2289 }
2290 while (p < e) {
2291 if (is_utf8_lead_byte(*p)) len++;
2292 p++;
2293 }
2294 return (long)len;
2295 }
2296#endif
2297 else if (rb_enc_asciicompat(enc)) {
2298 c = 0;
2299 if (ENC_CODERANGE_CLEAN_P(cr)) {
2300 while (p < e) {
2301 if (ISASCII(*p)) {
2302 q = search_nonascii(p, e);
2303 if (!q)
2304 return c + (e - p);
2305 c += q - p;
2306 p = q;
2307 }
2308 p += rb_enc_fast_mbclen(p, e, enc);
2309 c++;
2310 }
2311 }
2312 else {
2313 while (p < e) {
2314 if (ISASCII(*p)) {
2315 q = search_nonascii(p, e);
2316 if (!q)
2317 return c + (e - p);
2318 c += q - p;
2319 p = q;
2320 }
2321 p += rb_enc_mbclen(p, e, enc);
2322 c++;
2323 }
2324 }
2325 return c;
2326 }
2327
2328 for (c=0; p<e; c++) {
2329 p += rb_enc_mbclen(p, e, enc);
2330 }
2331 return c;
2332}
2333
2334long
2335rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2336{
2337 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2338}
2339
/* Gets the character length and reports the code range through *cr.
 * Note that the value *cr holds on entry is not used.
 */
2343long
2344rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2345{
2346 long c;
2347 const char *q;
2348 int ret;
2349
2350 *cr = 0;
2351 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2352 long diff = (long)(e - p);
2353 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2354 }
2355 else if (rb_enc_asciicompat(enc)) {
2356 c = 0;
2357 while (p < e) {
2358 if (ISASCII(*p)) {
2359 q = search_nonascii(p, e);
2360 if (!q) {
2361 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2362 return c + (e - p);
2363 }
2364 c += q - p;
2365 p = q;
2366 }
2367 ret = rb_enc_precise_mbclen(p, e, enc);
2368 if (MBCLEN_CHARFOUND_P(ret)) {
2369 *cr |= ENC_CODERANGE_VALID;
2370 p += MBCLEN_CHARFOUND_LEN(ret);
2371 }
2372 else {
2374 p++;
2375 }
2376 c++;
2377 }
2378 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2379 return c;
2380 }
2381
2382 for (c=0; p<e; c++) {
2383 ret = rb_enc_precise_mbclen(p, e, enc);
2384 if (MBCLEN_CHARFOUND_P(ret)) {
2385 *cr |= ENC_CODERANGE_VALID;
2386 p += MBCLEN_CHARFOUND_LEN(ret);
2387 }
2388 else {
2390 if (p + rb_enc_mbminlen(enc) <= e)
2391 p += rb_enc_mbminlen(enc);
2392 else
2393 p = e;
2394 }
2395 }
2396 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2397 return c;
2398}
2399
2400/* enc must be str's enc or rb_enc_check(str, str2) */
2401static long
2402str_strlen(VALUE str, rb_encoding *enc)
2403{
2404 const char *p, *e;
2405 int cr;
2406
2407 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2408 if (!enc) enc = STR_ENC_GET(str);
2409 p = RSTRING_PTR(str);
2410 e = RSTRING_END(str);
2411 cr = ENC_CODERANGE(str);
2412
2413 if (cr == ENC_CODERANGE_UNKNOWN) {
2414 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2415 if (cr) ENC_CODERANGE_SET(str, cr);
2416 return n;
2417 }
2418 else {
2419 return enc_strlen(p, e, enc, cr);
2420 }
2421}
2422
2423long
2425{
2426 return str_strlen(str, NULL);
2427}
2428
2429/*
2430 * call-seq:
2431 * length -> integer
2432 *
2433 * :include: doc/string/length.rdoc
2434 *
2435 */
2436
2437VALUE
2439{
2440 return LONG2NUM(str_strlen(str, NULL));
2441}
2442
2443/*
2444 * call-seq:
2445 * bytesize -> integer
2446 *
2447 * :include: doc/string/bytesize.rdoc
2448 *
2449 */
2450
2451VALUE
2452rb_str_bytesize(VALUE str)
2453{
2454 return LONG2NUM(RSTRING_LEN(str));
2455}
2456
2457/*
2458 * call-seq:
2459 * empty? -> true or false
2460 *
2461 * Returns whether the length of +self+ is zero:
2462 *
2463 * 'hello'.empty? # => false
2464 * ' '.empty? # => false
2465 * ''.empty? # => true
2466 *
2467 * Related: see {Querying}[rdoc-ref:String@Querying].
2468 */
2469
2470static VALUE
2471rb_str_empty(VALUE str)
2472{
2473 return RBOOL(RSTRING_LEN(str) == 0);
2474}
2475
2476/*
2477 * call-seq:
2478 * self + other_string -> new_string
2479 *
2480 * Returns a new string containing +other_string+ concatenated to +self+:
2481 *
2482 * 'Hello from ' + self.to_s # => "Hello from main"
2483 *
2484 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2485 */
2486
2487VALUE
2489{
2490 VALUE str3;
2491 rb_encoding *enc;
2492 char *ptr1, *ptr2, *ptr3;
2493 long len1, len2;
2494 int termlen;
2495
2496 StringValue(str2);
2497 enc = rb_enc_check_str(str1, str2);
2498 RSTRING_GETMEM(str1, ptr1, len1);
2499 RSTRING_GETMEM(str2, ptr2, len2);
2500 termlen = rb_enc_mbminlen(enc);
2501 if (len1 > LONG_MAX - len2) {
2502 rb_raise(rb_eArgError, "string size too big");
2503 }
2504 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2505 ptr3 = RSTRING_PTR(str3);
2506 memcpy(ptr3, ptr1, len1);
2507 memcpy(ptr3+len1, ptr2, len2);
2508 TERM_FILL(&ptr3[len1+len2], termlen);
2509
2510 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2512 RB_GC_GUARD(str1);
2513 RB_GC_GUARD(str2);
2514 return str3;
2515}
2516
2517/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2518VALUE
2519rb_str_opt_plus(VALUE str1, VALUE str2)
2520{
2523 long len1, len2;
2524 MAYBE_UNUSED(char) *ptr1, *ptr2;
2525 RSTRING_GETMEM(str1, ptr1, len1);
2526 RSTRING_GETMEM(str2, ptr2, len2);
2527 int enc1 = rb_enc_get_index(str1);
2528 int enc2 = rb_enc_get_index(str2);
2529
2530 if (enc1 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc2 < 0) {
2534 return Qundef;
2535 }
2536 else if (enc1 != enc2) {
2537 return Qundef;
2538 }
2539 else if (len1 > LONG_MAX - len2) {
2540 return Qundef;
2541 }
2542 else {
2543 return rb_str_plus(str1, str2);
2544 }
2545
2546}
2547
2548/*
2549 * call-seq:
2550 * self * n -> new_string
2551 *
2552 * Returns a new string containing +n+ copies of +self+:
2553 *
2554 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2555 * 'No!' * 0 # => ""
2556 *
2557 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2558 */
2559
2560VALUE
2562{
2563 VALUE str2;
2564 long n, len;
2565 char *ptr2;
2566 int termlen;
2567
2568 if (times == INT2FIX(1)) {
2569 return str_duplicate(rb_cString, str);
2570 }
2571 if (times == INT2FIX(0)) {
2572 str2 = str_alloc_embed(rb_cString, 0);
2573 rb_enc_copy(str2, str);
2574 return str2;
2575 }
2576 len = NUM2LONG(times);
2577 if (len < 0) {
2578 rb_raise(rb_eArgError, "negative argument");
2579 }
2580 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2581 if (STR_EMBEDDABLE_P(len, 1)) {
2582 str2 = str_alloc_embed(rb_cString, len + 1);
2583 memset(RSTRING_PTR(str2), 0, len + 1);
2584 }
2585 else {
2586 str2 = str_alloc_heap(rb_cString);
2587 RSTRING(str2)->as.heap.aux.capa = len;
2588 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2589 }
2590 STR_SET_LEN(str2, len);
2591 rb_enc_copy(str2, str);
2592 return str2;
2593 }
2594 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2595 rb_raise(rb_eArgError, "argument too big");
2596 }
2597
2598 len *= RSTRING_LEN(str);
2599 termlen = TERM_LEN(str);
2600 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2601 ptr2 = RSTRING_PTR(str2);
2602 if (len) {
2603 n = RSTRING_LEN(str);
2604 memcpy(ptr2, RSTRING_PTR(str), n);
2605 while (n <= len/2) {
2606 memcpy(ptr2 + n, ptr2, n);
2607 n *= 2;
2608 }
2609 memcpy(ptr2 + n, ptr2, len-n);
2610 }
2611 STR_SET_LEN(str2, len);
2612 TERM_FILL(&ptr2[len], termlen);
2613 rb_enc_cr_str_copy_for_substr(str2, str);
2614
2615 return str2;
2616}
2617
2618/*
2619 * call-seq:
2620 * self % object -> new_string
2621 *
2622 * Returns the result of formatting +object+ into the format specifications
2623 * contained in +self+
2624 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2625 *
2626 * '%05d' % 123 # => "00123"
2627 *
2628 * If +self+ contains multiple format specifications,
2629 * +object+ must be an array or hash containing the objects to be formatted:
2630 *
2631 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2632 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2633 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2634 *
2635 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2636 */
2637
2638static VALUE
2639rb_str_format_m(VALUE str, VALUE arg)
2640{
2641 VALUE tmp = rb_check_array_type(arg);
2642
2643 if (!NIL_P(tmp)) {
2644 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2645 }
2646 return rb_str_format(1, &arg, str);
2647}
2648
2649static inline void
2650rb_check_lockedtmp(VALUE str)
2651{
2652 if (FL_TEST(str, STR_TMPLOCK)) {
2653 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2654 }
2655}
2656
2657// If none of these flags are set, we know we have an modifiable string.
2658// If any is set, we need to do more detailed checks.
2659#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2660static inline void
2661str_modifiable(VALUE str)
2662{
2663 RUBY_ASSERT(ruby_thread_has_gvl_p());
2664
2665 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2666 if (CHILLED_STRING_P(str)) {
2667 CHILLED_STRING_MUTATED(str);
2668 }
2669 rb_check_lockedtmp(str);
2670 rb_check_frozen(str);
2671 }
2672}
2673
2674static inline int
2675str_dependent_p(VALUE str)
2676{
2677 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2678 return FALSE;
2679 }
2680 else {
2681 return TRUE;
2682 }
2683}
2684
2685// If none of these flags are set, we know we have an independent string.
2686// If any is set, we need to do more detailed checks.
2687#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2688static inline int
2689str_independent(VALUE str)
2690{
2691 RUBY_ASSERT(ruby_thread_has_gvl_p());
2692
2693 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2694 str_modifiable(str);
2695 return !str_dependent_p(str);
2696 }
2697 return TRUE;
2698}
2699
2700static void
2701str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2702{
2703 RUBY_ASSERT(ruby_thread_has_gvl_p());
2704
2705 char *ptr;
2706 char *oldptr;
2707 long capa = len + expand;
2708
2709 if (len > capa) len = capa;
2710
2711 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2712 ptr = RSTRING(str)->as.heap.ptr;
2713 STR_SET_EMBED(str);
2714 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2715 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2716 STR_SET_LEN(str, len);
2717 return;
2718 }
2719
2720 ptr = ALLOC_N(char, (size_t)capa + termlen);
2721 oldptr = RSTRING_PTR(str);
2722 if (oldptr) {
2723 memcpy(ptr, oldptr, len);
2724 }
2725 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2726 xfree(oldptr);
2727 }
2728 STR_SET_NOEMBED(str);
2729 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2730 TERM_FILL(ptr + len, termlen);
2731 RSTRING(str)->as.heap.ptr = ptr;
2732 STR_SET_LEN(str, len);
2733 RSTRING(str)->as.heap.aux.capa = capa;
2734}
2735
2736void
2737rb_str_modify(VALUE str)
2738{
2739 if (!str_independent(str))
2740 str_make_independent(str);
2742}
2743
2744void
2746{
2747 RUBY_ASSERT(ruby_thread_has_gvl_p());
2748
2749 int termlen = TERM_LEN(str);
2750 long len = RSTRING_LEN(str);
2751
2752 if (expand < 0) {
2753 rb_raise(rb_eArgError, "negative expanding string size");
2754 }
2755 if (expand >= LONG_MAX - len) {
2756 rb_raise(rb_eArgError, "string size too big");
2757 }
2758
2759 if (!str_independent(str)) {
2760 str_make_independent_expand(str, len, expand, termlen);
2761 }
2762 else if (expand > 0) {
2763 RESIZE_CAPA_TERM(str, len + expand, termlen);
2764 }
2766}
2767
2768/* As rb_str_modify(), but don't clear coderange */
2769static void
2770str_modify_keep_cr(VALUE str)
2771{
2772 if (!str_independent(str))
2773 str_make_independent(str);
2775 /* Force re-scan later */
2777}
2778
2779static inline void
2780str_discard(VALUE str)
2781{
2782 str_modifiable(str);
2783 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2784 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2785 RSTRING(str)->as.heap.ptr = 0;
2786 STR_SET_LEN(str, 0);
2787 }
2788}
2789
2790void
2792{
2793 int encindex = rb_enc_get_index(str);
2794
2795 if (RB_UNLIKELY(encindex == -1)) {
2796 rb_raise(rb_eTypeError, "not encoding capable object");
2797 }
2798
2799 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2800 return;
2801 }
2802
2803 rb_encoding *enc = rb_enc_from_index(encindex);
2804 if (!rb_enc_asciicompat(enc)) {
2805 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2806 }
2807}
2808
2809VALUE
2811{
2812 RUBY_ASSERT(ruby_thread_has_gvl_p());
2813
2814 VALUE s = *ptr;
2815 if (!RB_TYPE_P(s, T_STRING)) {
2816 s = rb_str_to_str(s);
2817 *ptr = s;
2818 }
2819 return s;
2820}
2821
2822char *
2824{
2825 VALUE str = rb_string_value(ptr);
2826 return RSTRING_PTR(str);
2827}
2828
/* Returns 1 when the first n bytes at s are all zero (also when n <= 0),
 * otherwise 0. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2837
2838static const char *
2839str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2840{
2841 const char *e = s + len;
2842
2843 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2844 if (zero_filled(s, minlen)) return s;
2845 }
2846 return 0;
2847}
2848
2849static char *
2850str_fill_term(VALUE str, char *s, long len, int termlen)
2851{
2852 /* This function assumes that (capa + termlen) bytes of memory
2853 * is allocated, like many other functions in this file.
2854 */
2855 if (str_dependent_p(str)) {
2856 if (!zero_filled(s + len, termlen))
2857 str_make_independent_expand(str, len, 0L, termlen);
2858 }
2859 else {
2860 TERM_FILL(s + len, termlen);
2861 return s;
2862 }
2863 return RSTRING_PTR(str);
2864}
2865
2866void
2867rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2868{
2869 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2870 long len = RSTRING_LEN(str);
2871
2872 RUBY_ASSERT(capa >= len);
2873 if (capa - len < termlen) {
2874 rb_check_lockedtmp(str);
2875 str_make_independent_expand(str, len, 0L, termlen);
2876 }
2877 else if (str_dependent_p(str)) {
2878 if (termlen > oldtermlen)
2879 str_make_independent_expand(str, len, 0L, termlen);
2880 }
2881 else {
2882 if (!STR_EMBED_P(str)) {
2883 /* modify capa instead of realloc */
2884 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2885 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2886 }
2887 if (termlen > oldtermlen) {
2888 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2889 }
2890 }
2891
2892 return;
2893}
2894
2895static char *
2896str_null_check(VALUE str, int *w)
2897{
2898 char *s = RSTRING_PTR(str);
2899 long len = RSTRING_LEN(str);
2900 rb_encoding *enc = rb_enc_get(str);
2901 const int minlen = rb_enc_mbminlen(enc);
2902
2903 if (minlen > 1) {
2904 *w = 1;
2905 if (str_null_char(s, len, minlen, enc)) {
2906 return NULL;
2907 }
2908 return str_fill_term(str, s, len, minlen);
2909 }
2910 *w = 0;
2911 if (!s || memchr(s, 0, len)) {
2912 return NULL;
2913 }
2914 if (s[len]) {
2915 s = str_fill_term(str, s, len, minlen);
2916 }
2917 return s;
2918}
2919
2920char *
2921rb_str_to_cstr(VALUE str)
2922{
2923 int w;
2924 return str_null_check(str, &w);
2925}
2926
2927char *
2929{
2930 VALUE str = rb_string_value(ptr);
2931 int w;
2932 char *s = str_null_check(str, &w);
2933 if (!s) {
2934 if (w) {
2935 rb_raise(rb_eArgError, "string contains null char");
2936 }
2937 rb_raise(rb_eArgError, "string contains null byte");
2938 }
2939 return s;
2940}
2941
2942char *
2943rb_str_fill_terminator(VALUE str, const int newminlen)
2944{
2945 char *s = RSTRING_PTR(str);
2946 long len = RSTRING_LEN(str);
2947 return str_fill_term(str, s, len, newminlen);
2948}
2949
2950VALUE
2952{
2953 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2954 return str;
2955}
2956
2957/*
2958 * call-seq:
2959 * String.try_convert(object) -> object, new_string, or nil
2960 *
2961 * Attempts to convert the given +object+ to a string.
2962 *
2963 * If +object+ is already a string, returns +object+, unmodified.
2964 *
2965 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2966 * calls <tt>object.to_str</tt> and returns the result.
2967 *
2968 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2969 *
2970 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2971 */
2972static VALUE
2973rb_str_s_try_convert(VALUE dummy, VALUE str)
2974{
2975 return rb_check_string_type(str);
2976}
2977
2978static char*
2979str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2980{
2981 long nth = *nthp;
2982 if (rb_enc_mbmaxlen(enc) == 1) {
2983 p += nth;
2984 }
2985 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2986 p += nth * rb_enc_mbmaxlen(enc);
2987 }
2988 else if (rb_enc_asciicompat(enc)) {
2989 const char *p2, *e2;
2990 int n;
2991
2992 while (p < e && 0 < nth) {
2993 e2 = p + nth;
2994 if (e < e2) {
2995 *nthp = nth;
2996 return (char *)e;
2997 }
2998 if (ISASCII(*p)) {
2999 p2 = search_nonascii(p, e2);
3000 if (!p2) {
3001 nth -= e2 - p;
3002 *nthp = nth;
3003 return (char *)e2;
3004 }
3005 nth -= p2 - p;
3006 p = p2;
3007 }
3008 n = rb_enc_mbclen(p, e, enc);
3009 p += n;
3010 nth--;
3011 }
3012 *nthp = nth;
3013 if (nth != 0) {
3014 return (char *)e;
3015 }
3016 return (char *)p;
3017 }
3018 else {
3019 while (p < e && nth--) {
3020 p += rb_enc_mbclen(p, e, enc);
3021 }
3022 }
3023 if (p > e) p = e;
3024 *nthp = nth;
3025 return (char*)p;
3026}
3027
3028char*
3029rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3030{
3031 return str_nth_len(p, e, &nth, enc);
3032}
3033
3034static char*
3035str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3036{
3037 if (singlebyte)
3038 p += nth;
3039 else {
3040 p = str_nth_len(p, e, &nth, enc);
3041 }
3042 if (!p) return 0;
3043 if (p > e) p = e;
3044 return (char *)p;
3045}
3046
3047/* char offset to byte offset */
3048static long
3049str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3050{
3051 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3052 if (!pp) return e - p;
3053 return pp - p;
3054}
3055
3056long
3057rb_str_offset(VALUE str, long pos)
3058{
3059 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3060 STR_ENC_GET(str), single_byte_optimizable(str));
3061}
3062
3063#ifdef NONASCII_MASK
3064static char *
3065str_utf8_nth(const char *p, const char *e, long *nthp)
3066{
3067 long nth = *nthp;
3068 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3069 const uintptr_t *s, *t;
3070 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3071 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3072 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3073 while (p < (const char *)s) {
3074 if (is_utf8_lead_byte(*p)) nth--;
3075 p++;
3076 }
3077 do {
3078 nth -= count_utf8_lead_bytes_with_word(s);
3079 s++;
3080 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3081 p = (char *)s;
3082 }
3083 while (p < e) {
3084 if (is_utf8_lead_byte(*p)) {
3085 if (nth == 0) break;
3086 nth--;
3087 }
3088 p++;
3089 }
3090 *nthp = nth;
3091 return (char *)p;
3092}
3093
/* Byte distance from p to the position str_utf8_nth() reports for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
3100#endif
3101
3102/* byte offset to char offset */
3103long
3104rb_str_sublen(VALUE str, long pos)
3105{
3106 if (single_byte_optimizable(str) || pos < 0)
3107 return pos;
3108 else {
3109 char *p = RSTRING_PTR(str);
3110 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3111 }
3112}
3113
3114static VALUE
3115str_subseq(VALUE str, long beg, long len)
3116{
3117 VALUE str2;
3118
3119 RUBY_ASSERT(beg >= 0);
3120 RUBY_ASSERT(len >= 0);
3121 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3122
3123 const int termlen = TERM_LEN(str);
3124 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3125 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3126 RB_GC_GUARD(str);
3127 return str2;
3128 }
3129
3130 str2 = str_alloc_heap(rb_cString);
3131 if (str_embed_capa(str2) >= len + termlen) {
3132 char *ptr2 = RSTRING(str2)->as.embed.ary;
3133 STR_SET_EMBED(str2);
3134 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3135 TERM_FILL(ptr2+len, termlen);
3136
3137 STR_SET_LEN(str2, len);
3138 RB_GC_GUARD(str);
3139 }
3140 else {
3141 str_replace_shared(str2, str);
3142 RUBY_ASSERT(!STR_EMBED_P(str2));
3143 ENC_CODERANGE_CLEAR(str2);
3144 RSTRING(str2)->as.heap.ptr += beg;
3145 if (RSTRING_LEN(str2) > len) {
3146 STR_SET_LEN(str2, len);
3147 }
3148 }
3149
3150 return str2;
3151}
3152
3153VALUE
3154rb_str_subseq(VALUE str, long beg, long len)
3155{
3156 VALUE str2 = str_subseq(str, beg, len);
3157 rb_enc_cr_str_copy_for_substr(str2, str);
3158 return str2;
3159}
3160
3161char *
3162rb_str_subpos(VALUE str, long beg, long *lenp)
3163{
3164 long len = *lenp;
3165 long slen = -1L;
3166 const long blen = RSTRING_LEN(str);
3167 rb_encoding *enc = STR_ENC_GET(str);
3168 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3169
3170 if (len < 0) return 0;
3171 if (beg < 0 && -beg < 0) return 0;
3172 if (!blen) {
3173 len = 0;
3174 }
3175 if (single_byte_optimizable(str)) {
3176 if (beg > blen) return 0;
3177 if (beg < 0) {
3178 beg += blen;
3179 if (beg < 0) return 0;
3180 }
3181 if (len > blen - beg)
3182 len = blen - beg;
3183 if (len < 0) return 0;
3184 p = s + beg;
3185 goto end;
3186 }
3187 if (beg < 0) {
3188 if (len > -beg) len = -beg;
3189 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3190 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3191 beg = -beg;
3192 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3193 p = e;
3194 if (!p) return 0;
3195 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3196 if (!p) return 0;
3197 len = e - p;
3198 goto end;
3199 }
3200 else {
3201 slen = str_strlen(str, enc);
3202 beg += slen;
3203 if (beg < 0) return 0;
3204 p = s + beg;
3205 if (len == 0) goto end;
3206 }
3207 }
3208 else if (beg > 0 && beg > blen) {
3209 return 0;
3210 }
3211 if (len == 0) {
3212 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3213 p = s + beg;
3214 }
3215#ifdef NONASCII_MASK
3216 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3217 enc == rb_utf8_encoding()) {
3218 p = str_utf8_nth(s, e, &beg);
3219 if (beg > 0) return 0;
3220 len = str_utf8_offset(p, e, len);
3221 }
3222#endif
3223 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3224 int char_sz = rb_enc_mbmaxlen(enc);
3225
3226 p = s + beg * char_sz;
3227 if (p > e) {
3228 return 0;
3229 }
3230 else if (len * char_sz > e - p)
3231 len = e - p;
3232 else
3233 len *= char_sz;
3234 }
3235 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3236 if (beg > 0) return 0;
3237 len = 0;
3238 }
3239 else {
3240 len = str_offset(p, e, len, enc, 0);
3241 }
3242 end:
3243 *lenp = len;
3244 RB_GC_GUARD(str);
3245 return p;
3246}
3247
3248static VALUE str_substr(VALUE str, long beg, long len, int empty);
3249
3250VALUE
3251rb_str_substr(VALUE str, long beg, long len)
3252{
3253 return str_substr(str, beg, len, TRUE);
3254}
3255
3256VALUE
3257rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3258{
3259 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3260}
3261
3262static VALUE
3263str_substr(VALUE str, long beg, long len, int empty)
3264{
3265 char *p = rb_str_subpos(str, beg, &len);
3266
3267 if (!p) return Qnil;
3268 if (!len && !empty) return Qnil;
3269
3270 beg = p - RSTRING_PTR(str);
3271
3272 VALUE str2 = str_subseq(str, beg, len);
3273 rb_enc_cr_str_copy_for_substr(str2, str);
3274 return str2;
3275}
3276
3277/* :nodoc: */
3278VALUE
3280{
3281 if (CHILLED_STRING_P(str)) {
3282 FL_UNSET_RAW(str, STR_CHILLED);
3283 }
3284
3285 if (OBJ_FROZEN(str)) return str;
3286 rb_str_resize(str, RSTRING_LEN(str));
3287 return rb_obj_freeze(str);
3288}
3289
3290/*
3291 * call-seq:
3292 * +string -> new_string or self
3293 *
3294 * Returns +self+ if +self+ is not frozen and can be mutated
3295 * without warning issuance.
3296 *
3297 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3298 *
3299 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3300 */
3301static VALUE
3302str_uplus(VALUE str)
3303{
3304 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3305 return rb_str_dup(str);
3306 }
3307 else {
3308 return str;
3309 }
3310}
3311
3312/*
3313 * call-seq:
3314 * -self -> frozen_string
3315 *
3316 * Returns a frozen string equal to +self+.
3317 *
3318 * The returned string is +self+ if and only if all of the following are true:
3319 *
3320 * - +self+ is already frozen.
3321 * - +self+ is an instance of \String (rather than of a subclass of \String)
3322 * - +self+ has no instance variables set on it.
3323 *
3324 * Otherwise, the returned string is a frozen copy of +self+.
3325 *
3326 * Returning +self+, when possible, saves duplicating +self+;
3327 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3328 *
3329 * It may also save duplicating other, already-existing, strings:
3330 *
3331 * s0 = 'foo'
3332 * s1 = 'foo'
3333 * s0.object_id == s1.object_id # => false
3334 * (-s0).object_id == (-s1).object_id # => true
3335 *
3336 * Note that method #-@ is convenient for defining a constant:
3337 *
3338 * FileName = -'config/database.yml'
3339 *
3340 * While its alias #dedup is better suited for chaining:
3341 *
3342 * 'foo'.dedup.gsub!('o')
3343 *
3344 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3345 */
3346static VALUE
3347str_uminus(VALUE str)
3348{
3349 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3350 str = rb_str_dup(str);
3351 }
3352 return rb_fstring(str);
3353}
3354
3355RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3356#define rb_str_dup_frozen rb_str_new_frozen
3357
3358VALUE
3360{
3361 rb_check_frozen(str);
3362 if (FL_TEST(str, STR_TMPLOCK)) {
3363 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3364 }
3365 FL_SET(str, STR_TMPLOCK);
3366 return str;
3367}
3368
3369VALUE
3371{
3372 rb_check_frozen(str);
3373 if (!FL_TEST(str, STR_TMPLOCK)) {
3374 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3375 }
3376 FL_UNSET(str, STR_TMPLOCK);
3377 return str;
3378}
3379
3380VALUE
3381rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3382{
3383 rb_str_locktmp(str);
3384 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3385}
3386
3387void
3389{
3390 RUBY_ASSERT(ruby_thread_has_gvl_p());
3391
3392 long capa;
3393 const int termlen = TERM_LEN(str);
3394
3395 str_modifiable(str);
3396 if (STR_SHARED_P(str)) {
3397 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3398 }
3399 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3400 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3401 }
3402
3403 int cr = ENC_CODERANGE(str);
3404 if (len == 0) {
3405 /* Empty string does not contain non-ASCII */
3407 }
3408 else if (cr == ENC_CODERANGE_UNKNOWN) {
3409 /* Leave unknown. */
3410 }
3411 else if (len > RSTRING_LEN(str)) {
3412 if (ENC_CODERANGE_CLEAN_P(cr)) {
3413 /* Update the coderange regarding the extended part. */
3414 const char *const prev_end = RSTRING_END(str);
3415 const char *const new_end = RSTRING_PTR(str) + len;
3416 rb_encoding *enc = rb_enc_get(str);
3417 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3418 ENC_CODERANGE_SET(str, cr);
3419 }
3420 else if (cr == ENC_CODERANGE_BROKEN) {
3421 /* May be valid now, by appended part. */
3423 }
3424 }
3425 else if (len < RSTRING_LEN(str)) {
3426 if (cr != ENC_CODERANGE_7BIT) {
3427 /* ASCII-only string is keeping after truncated. Valid
3428 * and broken may be invalid or valid, leave unknown. */
3430 }
3431 }
3432
3433 STR_SET_LEN(str, len);
3434 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3435}
3436
3437VALUE
3438rb_str_resize(VALUE str, long len)
3439{
3440 if (len < 0) {
3441 rb_raise(rb_eArgError, "negative string size (or size too big)");
3442 }
3443
3444 int independent = str_independent(str);
3445 long slen = RSTRING_LEN(str);
3446 const int termlen = TERM_LEN(str);
3447
3448 if (slen > len || (termlen != 1 && slen < len)) {
3450 }
3451
3452 {
3453 long capa;
3454 if (STR_EMBED_P(str)) {
3455 if (len == slen) return str;
3456 if (str_embed_capa(str) >= len + termlen) {
3457 STR_SET_LEN(str, len);
3458 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3459 return str;
3460 }
3461 str_make_independent_expand(str, slen, len - slen, termlen);
3462 }
3463 else if (str_embed_capa(str) >= len + termlen) {
3464 char *ptr = STR_HEAP_PTR(str);
3465 STR_SET_EMBED(str);
3466 if (slen > len) slen = len;
3467 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3468 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3469 STR_SET_LEN(str, len);
3470 if (independent) ruby_xfree(ptr);
3471 return str;
3472 }
3473 else if (!independent) {
3474 if (len == slen) return str;
3475 str_make_independent_expand(str, slen, len - slen, termlen);
3476 }
3477 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3478 (capa - len) > (len < 1024 ? len : 1024)) {
3479 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3480 (size_t)len + termlen, STR_HEAP_SIZE(str));
3481 RSTRING(str)->as.heap.aux.capa = len;
3482 }
3483 else if (len == slen) return str;
3484 STR_SET_LEN(str, len);
3485 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3486 }
3487 return str;
3488}
3489
3490static void
3491str_ensure_available_capa(VALUE str, long len)
3492{
3493 str_modify_keep_cr(str);
3494
3495 const int termlen = TERM_LEN(str);
3496 long olen = RSTRING_LEN(str);
3497
3498 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3499 rb_raise(rb_eArgError, "string sizes too big");
3500 }
3501
3502 long total = olen + len;
3503 long capa = str_capacity(str, termlen);
3504
3505 if (capa < total) {
3506 if (total >= LONG_MAX / 2) {
3507 capa = total;
3508 }
3509 while (total > capa) {
3510 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3511 }
3512 RESIZE_CAPA_TERM(str, capa, termlen);
3513 }
3514}
3515
3516static VALUE
3517str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3518{
3519 if (keep_cr) {
3520 str_modify_keep_cr(str);
3521 }
3522 else {
3523 rb_str_modify(str);
3524 }
3525 if (len == 0) return 0;
3526
3527 long total, olen, off = -1;
3528 char *sptr;
3529 const int termlen = TERM_LEN(str);
3530
3531 RSTRING_GETMEM(str, sptr, olen);
3532 if (ptr >= sptr && ptr <= sptr + olen) {
3533 off = ptr - sptr;
3534 }
3535
3536 long capa = str_capacity(str, termlen);
3537
3538 if (olen > LONG_MAX - len) {
3539 rb_raise(rb_eArgError, "string sizes too big");
3540 }
3541 total = olen + len;
3542 if (capa < total) {
3543 if (total >= LONG_MAX / 2) {
3544 capa = total;
3545 }
3546 while (total > capa) {
3547 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3548 }
3549 RESIZE_CAPA_TERM(str, capa, termlen);
3550 sptr = RSTRING_PTR(str);
3551 }
3552 if (off != -1) {
3553 ptr = sptr + off;
3554 }
3555 memcpy(sptr + olen, ptr, len);
3556 STR_SET_LEN(str, total);
3557 TERM_FILL(sptr + total, termlen); /* sentinel */
3558
3559 return str;
3560}
3561
3562#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3563#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3564
3565VALUE
3566rb_str_cat(VALUE str, const char *ptr, long len)
3567{
3568 if (len == 0) return str;
3569 if (len < 0) {
3570 rb_raise(rb_eArgError, "negative string size (or size too big)");
3571 }
3572 return str_buf_cat(str, ptr, len);
3573}
3574
3575VALUE
3576rb_str_cat_cstr(VALUE str, const char *ptr)
3577{
3578 must_not_null(ptr);
3579 return rb_str_buf_cat(str, ptr, strlen(ptr));
3580}
3581
3582static void
3583rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3584{
3585 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3586
3587 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3588 if (UNLIKELY(!str_independent(str))) {
3589 str_make_independent(str);
3590 }
3591
3592 long string_length = -1;
3593 const int null_terminator_length = 1;
3594 char *sptr;
3595 RSTRING_GETMEM(str, sptr, string_length);
3596
3597 // Ensure the resulting string wouldn't be too long.
3598 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3599 rb_raise(rb_eArgError, "string sizes too big");
3600 }
3601
3602 long string_capacity = str_capacity(str, null_terminator_length);
3603
3604 // Get the code range before any modifications since those might clear the code range.
3605 int cr = ENC_CODERANGE(str);
3606
3607 // Check if the string has spare string_capacity to write the new byte.
3608 if (LIKELY(string_capacity >= string_length + 1)) {
3609 // In fast path we can write the new byte and note the string's new length.
3610 sptr[string_length] = byte;
3611 STR_SET_LEN(str, string_length + 1);
3612 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3613 }
3614 else {
3615 // If there's not enough string_capacity, make a call into the general string concatenation function.
3616 str_buf_cat(str, (char *)&byte, 1);
3617 }
3618
3619 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3620 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3621 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3622 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3623 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3624 if (ISASCII(byte)) {
3626 }
3627 else {
3629
3630 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3631 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3632 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3633 }
3634 }
3635 }
3636}
3637
3638RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3639RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3640RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3641
3642static VALUE
3643rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3644 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3645{
3646 int str_encindex = ENCODING_GET(str);
3647 int res_encindex;
3648 int str_cr, res_cr;
3649 rb_encoding *str_enc, *ptr_enc;
3650
3651 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3652
3653 if (str_encindex == ptr_encindex) {
3654 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3655 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3656 }
3657 }
3658 else {
3659 str_enc = rb_enc_from_index(str_encindex);
3660 ptr_enc = rb_enc_from_index(ptr_encindex);
3661 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3662 if (len == 0)
3663 return str;
3664 if (RSTRING_LEN(str) == 0) {
3665 rb_str_buf_cat(str, ptr, len);
3666 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3667 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3668 return str;
3669 }
3670 goto incompatible;
3671 }
3672 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3673 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3674 }
3675 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3676 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3677 str_cr = rb_enc_str_coderange(str);
3678 }
3679 }
3680 }
3681 if (ptr_cr_ret)
3682 *ptr_cr_ret = ptr_cr;
3683
3684 if (str_encindex != ptr_encindex &&
3685 str_cr != ENC_CODERANGE_7BIT &&
3686 ptr_cr != ENC_CODERANGE_7BIT) {
3687 str_enc = rb_enc_from_index(str_encindex);
3688 ptr_enc = rb_enc_from_index(ptr_encindex);
3689 goto incompatible;
3690 }
3691
3692 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3693 res_encindex = str_encindex;
3694 res_cr = ENC_CODERANGE_UNKNOWN;
3695 }
3696 else if (str_cr == ENC_CODERANGE_7BIT) {
3697 if (ptr_cr == ENC_CODERANGE_7BIT) {
3698 res_encindex = str_encindex;
3699 res_cr = ENC_CODERANGE_7BIT;
3700 }
3701 else {
3702 res_encindex = ptr_encindex;
3703 res_cr = ptr_cr;
3704 }
3705 }
3706 else if (str_cr == ENC_CODERANGE_VALID) {
3707 res_encindex = str_encindex;
3708 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3709 res_cr = str_cr;
3710 else
3711 res_cr = ptr_cr;
3712 }
3713 else { /* str_cr == ENC_CODERANGE_BROKEN */
3714 res_encindex = str_encindex;
3715 res_cr = str_cr;
3716 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3717 }
3718
3719 if (len < 0) {
3720 rb_raise(rb_eArgError, "negative string size (or size too big)");
3721 }
3722 str_buf_cat(str, ptr, len);
3723 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3724 return str;
3725
3726 incompatible:
3727 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3728 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3730}
3731
3732VALUE
3733rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3734{
3735 return rb_enc_cr_str_buf_cat(str, ptr, len,
3736 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3737}
3738
3739VALUE
3741{
3742 /* ptr must reference NUL terminated ASCII string. */
3743 int encindex = ENCODING_GET(str);
3744 rb_encoding *enc = rb_enc_from_index(encindex);
3745 if (rb_enc_asciicompat(enc)) {
3746 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3747 encindex, ENC_CODERANGE_7BIT, 0);
3748 }
3749 else {
3750 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3751 while (*ptr) {
3752 unsigned int c = (unsigned char)*ptr;
3753 int len = rb_enc_codelen(c, enc);
3754 rb_enc_mbcput(c, buf, enc);
3755 rb_enc_cr_str_buf_cat(str, buf, len,
3756 encindex, ENC_CODERANGE_VALID, 0);
3757 ptr++;
3758 }
3759 return str;
3760 }
3761}
3762
3763VALUE
3765{
3766 int str2_cr = rb_enc_str_coderange(str2);
3767
3768 if (str_enc_fastpath(str)) {
3769 switch (str2_cr) {
3770 case ENC_CODERANGE_7BIT:
3771 // If RHS is 7bit we can do simple concatenation
3772 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3773 RB_GC_GUARD(str2);
3774 return str;
3776 // If RHS is valid, we can do simple concatenation if encodings are the same
3777 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3778 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3779 int str_cr = ENC_CODERANGE(str);
3780 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3781 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3782 }
3783 RB_GC_GUARD(str2);
3784 return str;
3785 }
3786 }
3787 }
3788
3789 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3790 ENCODING_GET(str2), str2_cr, &str2_cr);
3791
3792 ENC_CODERANGE_SET(str2, str2_cr);
3793
3794 return str;
3795}
3796
3797VALUE
3799{
3800 StringValue(str2);
3801 return rb_str_buf_append(str, str2);
3802}
3803
3804VALUE
3805rb_str_concat_literals(size_t num, const VALUE *strary)
3806{
3807 VALUE str;
3808 size_t i, s = 0;
3809 unsigned long len = 1;
3810
3811 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3812 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3813
3814 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3815 str = rb_str_buf_new(len);
3816 str_enc_copy_direct(str, strary[0]);
3817
3818 for (i = s; i < num; ++i) {
3819 const VALUE v = strary[i];
3820 int encidx = ENCODING_GET(v);
3821
3822 rb_str_buf_append(str, v);
3823 if (encidx != ENCINDEX_US_ASCII) {
3824 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3825 rb_enc_set_index(str, encidx);
3826 }
3827 }
3828 return str;
3829}
3830
3831/*
3832 * call-seq:
3833 * concat(*objects) -> string
3834 *
3835 * :include: doc/string/concat.rdoc
3836 */
3837static VALUE
3838rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3839{
3840 str_modifiable(str);
3841
3842 if (argc == 1) {
3843 return rb_str_concat(str, argv[0]);
3844 }
3845 else if (argc > 1) {
3846 int i;
3847 VALUE arg_str = rb_str_tmp_new(0);
3848 rb_enc_copy(arg_str, str);
3849 for (i = 0; i < argc; i++) {
3850 rb_str_concat(arg_str, argv[i]);
3851 }
3852 rb_str_buf_append(str, arg_str);
3853 }
3854
3855 return str;
3856}
3857
3858/*
3859 * call-seq:
3860 * append_as_bytes(*objects) -> self
3861 *
3862 * Concatenates each object in +objects+ into +self+; returns +self+;
3863 * performs no encoding validation or conversion:
3864 *
3865 * s = 'foo'
3866 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3867 * s.valid_encoding? # => false
3868 * s.append_as_bytes("\xAC 12")
3869 * s.valid_encoding? # => true
3870 *
3871 * When a given object is an integer,
3872 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3874 * appends only the low-order byte (similar to String#setbyte):
3875 *
3876 * s = ""
3877 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3878 * s.bytesize # => 2
3879 *
3880 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3881 */
3882
3883VALUE
3884rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3885{
3886 long needed_capacity = 0;
3887 volatile VALUE t0;
3888 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3889
3890 for (int index = 0; index < argc; index++) {
3891 VALUE obj = argv[index];
3892 enum ruby_value_type type = types[index] = rb_type(obj);
3893 switch (type) {
3894 case T_FIXNUM:
3895 case T_BIGNUM:
3896 needed_capacity++;
3897 break;
3898 case T_STRING:
3899 needed_capacity += RSTRING_LEN(obj);
3900 break;
3901 default:
3902 rb_raise(
3904 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3905 rb_obj_class(obj)
3906 );
3907 break;
3908 }
3909 }
3910
3911 str_ensure_available_capa(str, needed_capacity);
3912 char *sptr = RSTRING_END(str);
3913
3914 for (int index = 0; index < argc; index++) {
3915 VALUE obj = argv[index];
3916 enum ruby_value_type type = types[index];
3917 switch (type) {
3918 case T_FIXNUM:
3919 case T_BIGNUM: {
3920 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3921 char byte = (char)(NUM2INT(obj) & 0xFF);
3922 *sptr = byte;
3923 sptr++;
3924 break;
3925 }
3926 case T_STRING: {
3927 const char *ptr;
3928 long len;
3929 RSTRING_GETMEM(obj, ptr, len);
3930 memcpy(sptr, ptr, len);
3931 sptr += len;
3932 break;
3933 }
3934 default:
3935 rb_bug("append_as_bytes arguments should have been validated");
3936 }
3937 }
3938
3939 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3940 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3941
3942 int cr = ENC_CODERANGE(str);
3943 switch (cr) {
3944 case ENC_CODERANGE_7BIT: {
3945 for (int index = 0; index < argc; index++) {
3946 VALUE obj = argv[index];
3947 enum ruby_value_type type = types[index];
3948 switch (type) {
3949 case T_FIXNUM:
3950 case T_BIGNUM: {
3951 if (!ISASCII(NUM2INT(obj))) {
3952 goto clear_cr;
3953 }
3954 break;
3955 }
3956 case T_STRING: {
3957 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3958 goto clear_cr;
3959 }
3960 break;
3961 }
3962 default:
3963 rb_bug("append_as_bytes arguments should have been validated");
3964 }
3965 }
3966 break;
3967 }
3969 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3970 goto keep_cr;
3971 }
3972 else {
3973 goto clear_cr;
3974 }
3975 break;
3976 default:
3977 goto clear_cr;
3978 break;
3979 }
3980
3981 RB_GC_GUARD(t0);
3982
3983 clear_cr:
3984 // If no fast path was hit, we clear the coderange.
3985 // append_as_bytes is predominantly meant to be used in
3986 // buffering situation, hence it's likely the coderange
3987 // will never be scanned, so it's not worth spending time
3988 // precomputing the coderange except for simple and common
3989 // situations.
3991 keep_cr:
3992 return str;
3993}
3994
3995/*
3996 * call-seq:
3997 * self << object -> self
3998 *
3999 * Appends a string representation of +object+ to +self+;
4000 * returns +self+.
4001 *
4002 * If +object+ is a string, appends it to +self+:
4003 *
4004 * s = 'foo'
4005 * s << 'bar' # => "foobar"
4006 * s # => "foobar"
4007 *
4008 * If +object+ is an integer,
4009 * its value is considered a codepoint;
4010 * converts the value to a character before concatenating:
4011 *
4012 * s = 'foo'
4013 * s << 33 # => "foo!"
4014 *
 *  Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
 *  and the encoding of +self+ is Encoding::US_ASCII,
 *  changes the encoding to Encoding::ASCII_8BIT:
4018 *
4019 * s = 'foo'.encode(Encoding::US_ASCII)
4020 * s.encoding # => #<Encoding:US-ASCII>
4021 * s << 0xff # => "foo\xFF"
4022 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4023 *
4024 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4025 *
4026 * s = 'foo'
4027 * s.encoding # => <Encoding:UTF-8>
4028 * s << 0x00110000 # 1114112 out of char range (RangeError)
4029 * s = 'foo'.encode(Encoding::EUC_JP)
4030 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4031 *
4032 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4033 */
4034VALUE
4036{
4037 unsigned int code;
4038 rb_encoding *enc = STR_ENC_GET(str1);
4039 int encidx;
4040
4041 if (RB_INTEGER_TYPE_P(str2)) {
4042 if (rb_num_to_uint(str2, &code) == 0) {
4043 }
4044 else if (FIXNUM_P(str2)) {
4045 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4046 }
4047 else {
4048 rb_raise(rb_eRangeError, "bignum out of char range");
4049 }
4050 }
4051 else {
4052 return rb_str_append(str1, str2);
4053 }
4054
4055 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4056
4057 if (encidx >= 0) {
4058 rb_str_buf_cat_byte(str1, (unsigned char)code);
4059 }
4060 else {
4061 long pos = RSTRING_LEN(str1);
4062 int cr = ENC_CODERANGE(str1);
4063 int len;
4064 char *buf;
4065
4066 switch (len = rb_enc_codelen(code, enc)) {
4067 case ONIGERR_INVALID_CODE_POINT_VALUE:
4068 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4069 break;
4070 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4071 case 0:
4072 rb_raise(rb_eRangeError, "%u out of char range", code);
4073 break;
4074 }
4075 buf = ALLOCA_N(char, len + 1);
4076 rb_enc_mbcput(code, buf, enc);
4077 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4078 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4079 }
4080 rb_str_resize(str1, pos+len);
4081 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4082 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4084 }
4085 else if (cr == ENC_CODERANGE_BROKEN) {
4087 }
4088 ENC_CODERANGE_SET(str1, cr);
4089 }
4090 return str1;
4091}
4092
4093int
4094rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4095{
4096 int encidx = rb_enc_to_index(enc);
4097
4098 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4099 /* US-ASCII automatically extended to ASCII-8BIT */
4100 if (code > 0xFF) {
4101 rb_raise(rb_eRangeError, "%u out of char range", code);
4102 }
4103 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4104 return ENCINDEX_ASCII_8BIT;
4105 }
4106 return encidx;
4107 }
4108 else {
4109 return -1;
4110 }
4111}
4112
4113/*
4114 * call-seq:
 *    prepend(*other_strings) -> self
4116 *
4117 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4118 *
4119 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4120 *
4121 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4122 *
4123 */
4124
4125static VALUE
4126rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4127{
4128 str_modifiable(str);
4129
4130 if (argc == 1) {
4131 rb_str_update(str, 0L, 0L, argv[0]);
4132 }
4133 else if (argc > 1) {
4134 int i;
4135 VALUE arg_str = rb_str_tmp_new(0);
4136 rb_enc_copy(arg_str, str);
4137 for (i = 0; i < argc; i++) {
4138 rb_str_append(arg_str, argv[i]);
4139 }
4140 rb_str_update(str, 0L, 0L, arg_str);
4141 }
4142
4143 return str;
4144}
4145
4146st_index_t
4148{
4149 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4150 st_index_t precomputed_hash;
4151 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4152
4153 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4154 return precomputed_hash;
4155 }
4156
4157 return str_do_hash(str);
4158}
4159
4160int
4162{
4163 long len1, len2;
4164 const char *ptr1, *ptr2;
4165 RSTRING_GETMEM(str1, ptr1, len1);
4166 RSTRING_GETMEM(str2, ptr2, len2);
4167 return (len1 != len2 ||
4168 !rb_str_comparable(str1, str2) ||
4169 memcmp(ptr1, ptr2, len1) != 0);
4170}
4171
4172/*
4173 * call-seq:
4174 * hash -> integer
4175 *
4176 * :include: doc/string/hash.rdoc
4177 *
4178 */
4179
4180static VALUE
4181rb_str_hash_m(VALUE str)
4182{
4183 st_index_t hval = rb_str_hash(str);
4184 return ST2FIX(hval);
4185}
4186
/* Smaller of the two arguments; yields (a) when they compare equal.
 * Both arguments may be evaluated more than once. */
#define lesser(a,b) (((b)<(a))?(b):(a))
4188
4189int
4191{
4192 int idx1, idx2;
4193 int rc1, rc2;
4194
4195 if (RSTRING_LEN(str1) == 0) return TRUE;
4196 if (RSTRING_LEN(str2) == 0) return TRUE;
4197 idx1 = ENCODING_GET(str1);
4198 idx2 = ENCODING_GET(str2);
4199 if (idx1 == idx2) return TRUE;
4200 rc1 = rb_enc_str_coderange(str1);
4201 rc2 = rb_enc_str_coderange(str2);
4202 if (rc1 == ENC_CODERANGE_7BIT) {
4203 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4204 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4205 return TRUE;
4206 }
4207 if (rc2 == ENC_CODERANGE_7BIT) {
4208 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4209 return TRUE;
4210 }
4211 return FALSE;
4212}
4213
4214int
4216{
4217 long len1, len2;
4218 const char *ptr1, *ptr2;
4219 int retval;
4220
4221 if (str1 == str2) return 0;
4222 RSTRING_GETMEM(str1, ptr1, len1);
4223 RSTRING_GETMEM(str2, ptr2, len2);
4224 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4225 if (len1 == len2) {
4226 if (!rb_str_comparable(str1, str2)) {
4227 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4228 return 1;
4229 return -1;
4230 }
4231 return 0;
4232 }
4233 if (len1 > len2) return 1;
4234 return -1;
4235 }
4236 if (retval > 0) return 1;
4237 return -1;
4238}
4239
4240/*
4241 * call-seq:
4242 * self == object -> true or false
4243 *
4244 * Returns whether +object+ is equal to +self+.
4245 *
4246 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4247 *
4248 * s = 'foo'
4249 * s == 'foo' # => true
4250 * s == 'food' # => false
4251 * s == 'FOO' # => false
4252 *
4253 * Returns +false+ if the two strings' encodings are not compatible:
4254 *
4255 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4256 *
4257 * When +object+ is not a string:
4258 *
4259 * - If +object+ responds to method <tt>to_str</tt>,
4260 * <tt>object == self</tt> is called and its return value is returned.
4261 * - If +object+ does not respond to <tt>to_str</tt>,
4262 * +false+ is returned.
4263 *
4264 * Related: {Comparing}[rdoc-ref:String@Comparing].
4265 */
4266
4267VALUE
4269{
4270 if (str1 == str2) return Qtrue;
4271 if (!RB_TYPE_P(str2, T_STRING)) {
4272 if (!rb_respond_to(str2, idTo_str)) {
4273 return Qfalse;
4274 }
4275 return rb_equal(str2, str1);
4276 }
4277 return rb_str_eql_internal(str1, str2);
4278}
4279
4280/*
4281 * call-seq:
4282 * eql?(object) -> true or false
4283 *
4284 * :include: doc/string/eql_p.rdoc
4285 *
4286 */
4287
4288VALUE
4289rb_str_eql(VALUE str1, VALUE str2)
4290{
4291 if (str1 == str2) return Qtrue;
4292 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4293 return rb_str_eql_internal(str1, str2);
4294}
4295
4296/*
4297 * call-seq:
4298 * self <=> other -> -1, 0, 1, or nil
4299 *
4300 * Compares +self+ and +other+,
4301 * evaluating their _contents_, not their _lengths_.
4302 *
4303 * Returns:
4304 *
4305 * - +-1+, if +self+ is smaller.
4306 * - +0+, if the two are equal.
4307 * - +1+, if +self+ is larger.
4308 * - +nil+, if the two are incomparable.
4309 *
4310 * Examples:
4311 *
4312 * 'a' <=> 'b' # => -1
4313 * 'a' <=> 'ab' # => -1
4314 * 'a' <=> 'a' # => 0
4315 * 'b' <=> 'a' # => 1
4316 * 'ab' <=> 'a' # => 1
4317 * 'a' <=> :a # => nil
4318 *
4319 * \Class \String includes module Comparable,
4320 * each of whose methods uses String#<=> for comparison.
4321 *
4322 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4323 */
4324
4325static VALUE
4326rb_str_cmp_m(VALUE str1, VALUE str2)
4327{
4328 int result;
4329 VALUE s = rb_check_string_type(str2);
4330 if (NIL_P(s)) {
4331 return rb_invcmp(str1, str2);
4332 }
4333 result = rb_str_cmp(str1, s);
4334 return INT2FIX(result);
4335}
4336
/* Forward declarations: both helpers are defined after the method
 * functions that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4339
4340/*
4341 * call-seq:
4342 * casecmp(other_string) -> -1, 0, 1, or nil
4343 *
4344 * Ignoring case, compares +self+ and +other_string+; returns:
4345 *
4346 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4347 * - 0 if the two are equal.
4348 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4349 * - +nil+ if the two are incomparable.
4350 *
4351 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4352 *
4353 * Examples:
4354 *
4355 * 'foo'.casecmp('goo') # => -1
4356 * 'goo'.casecmp('foo') # => 1
4357 * 'foo'.casecmp('food') # => -1
4358 * 'food'.casecmp('foo') # => 1
4359 * 'FOO'.casecmp('foo') # => 0
4360 * 'foo'.casecmp('FOO') # => 0
4361 * 'foo'.casecmp(1) # => nil
4362 *
4363 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4364 */
4365
4366static VALUE
4367rb_str_casecmp(VALUE str1, VALUE str2)
4368{
4369 VALUE s = rb_check_string_type(str2);
4370 if (NIL_P(s)) {
4371 return Qnil;
4372 }
4373 return str_casecmp(str1, s);
4374}
4375
4376static VALUE
4377str_casecmp(VALUE str1, VALUE str2)
4378{
4379 long len;
4380 rb_encoding *enc;
4381 const char *p1, *p1end, *p2, *p2end;
4382
4383 enc = rb_enc_compatible(str1, str2);
4384 if (!enc) {
4385 return Qnil;
4386 }
4387
4388 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4389 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4390 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4391 while (p1 < p1end && p2 < p2end) {
4392 if (*p1 != *p2) {
4393 unsigned int c1 = TOLOWER(*p1 & 0xff);
4394 unsigned int c2 = TOLOWER(*p2 & 0xff);
4395 if (c1 != c2)
4396 return INT2FIX(c1 < c2 ? -1 : 1);
4397 }
4398 p1++;
4399 p2++;
4400 }
4401 }
4402 else {
4403 while (p1 < p1end && p2 < p2end) {
4404 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4405 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4406
4407 if (0 <= c1 && 0 <= c2) {
4408 c1 = TOLOWER(c1);
4409 c2 = TOLOWER(c2);
4410 if (c1 != c2)
4411 return INT2FIX(c1 < c2 ? -1 : 1);
4412 }
4413 else {
4414 int r;
4415 l1 = rb_enc_mbclen(p1, p1end, enc);
4416 l2 = rb_enc_mbclen(p2, p2end, enc);
4417 len = l1 < l2 ? l1 : l2;
4418 r = memcmp(p1, p2, len);
4419 if (r != 0)
4420 return INT2FIX(r < 0 ? -1 : 1);
4421 if (l1 != l2)
4422 return INT2FIX(l1 < l2 ? -1 : 1);
4423 }
4424 p1 += l1;
4425 p2 += l2;
4426 }
4427 }
4428 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4429 if (p1 == p1end) return INT2FIX(-1);
4430 return INT2FIX(1);
4431}
4432
4433/*
4434 * call-seq:
4435 * casecmp?(other_string) -> true, false, or nil
4436 *
4437 * Returns +true+ if +self+ and +other_string+ are equal after
4438 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4439 *
4440 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4441 *
4442 * Examples:
4443 *
4444 * 'foo'.casecmp?('goo') # => false
4445 * 'goo'.casecmp?('foo') # => false
4446 * 'foo'.casecmp?('food') # => false
4447 * 'food'.casecmp?('foo') # => false
4448 * 'FOO'.casecmp?('foo') # => true
4449 * 'foo'.casecmp?('FOO') # => true
4450 * 'foo'.casecmp?(1) # => nil
4451 *
4452 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4453 */
4454
4455static VALUE
4456rb_str_casecmp_p(VALUE str1, VALUE str2)
4457{
4458 VALUE s = rb_check_string_type(str2);
4459 if (NIL_P(s)) {
4460 return Qnil;
4461 }
4462 return str_casecmp_p(str1, s);
4463}
4464
4465static VALUE
4466str_casecmp_p(VALUE str1, VALUE str2)
4467{
4468 rb_encoding *enc;
4469 VALUE folded_str1, folded_str2;
4470 VALUE fold_opt = sym_fold;
4471
4472 enc = rb_enc_compatible(str1, str2);
4473 if (!enc) {
4474 return Qnil;
4475 }
4476
4477 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4478 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4479
4480 return rb_str_eql(folded_str1, folded_str2);
4481}
4482
4483static long
4484strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4485 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4486{
4487 const char *search_start = str_ptr;
4488 long pos, search_len = str_len - offset;
4489
4490 for (;;) {
4491 const char *t;
4492 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4493 if (pos < 0) return pos;
4494 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4495 if (t == search_start + pos) break;
4496 search_len -= t - search_start;
4497 if (search_len <= 0) return -1;
4498 offset += t - search_start;
4499 search_start = t;
4500 }
4501 return pos + offset;
4502}
4503
/* Both wrappers call rb_strseq_index(); the result is a position in bytes.
 * The last argument is its `in_byte` flag: rb_str_index() takes +offset+ in
 * characters (0), rb_str_byteindex() takes it in bytes (1). */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4507
4508static long
4509rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4510{
4511 const char *str_ptr, *str_ptr_end, *sub_ptr;
4512 long str_len, sub_len;
4513 rb_encoding *enc;
4514
4515 enc = rb_enc_check(str, sub);
4516 if (is_broken_string(sub)) return -1;
4517
4518 str_ptr = RSTRING_PTR(str);
4519 str_ptr_end = RSTRING_END(str);
4520 str_len = RSTRING_LEN(str);
4521 sub_ptr = RSTRING_PTR(sub);
4522 sub_len = RSTRING_LEN(sub);
4523
4524 if (str_len < sub_len) return -1;
4525
4526 if (offset != 0) {
4527 long str_len_char, sub_len_char;
4528 int single_byte = single_byte_optimizable(str);
4529 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4530 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4531 if (offset < 0) {
4532 offset += str_len_char;
4533 if (offset < 0) return -1;
4534 }
4535 if (str_len_char - offset < sub_len_char) return -1;
4536 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4537 str_ptr += offset;
4538 }
4539 if (sub_len == 0) return offset;
4540
4541 /* need proceed one character at a time */
4542 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4543}
4544
4545
4546/*
4547 * call-seq:
4548 * index(pattern, offset = 0) -> integer or nil
4549 *
4550 * :include: doc/string/index.rdoc
4551 *
4552 */
4553
4554static VALUE
4555rb_str_index_m(int argc, VALUE *argv, VALUE str)
4556{
4557 VALUE sub;
4558 VALUE initpos;
4559 rb_encoding *enc = STR_ENC_GET(str);
4560 long pos;
4561
4562 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4563 long slen = str_strlen(str, enc); /* str's enc */
4564 pos = NUM2LONG(initpos);
4565 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4566 if (RB_TYPE_P(sub, T_REGEXP)) {
4568 }
4569 return Qnil;
4570 }
4571 }
4572 else {
4573 pos = 0;
4574 }
4575
4576 if (RB_TYPE_P(sub, T_REGEXP)) {
4577 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4578 enc, single_byte_optimizable(str));
4579
4580 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4581 VALUE match = rb_backref_get();
4582 struct re_registers *regs = RMATCH_REGS(match);
4583 pos = rb_str_sublen(str, BEG(0));
4584 return LONG2NUM(pos);
4585 }
4586 }
4587 else {
4588 StringValue(sub);
4589 pos = rb_str_index(str, sub, pos);
4590 if (pos >= 0) {
4591 pos = rb_str_sublen(str, pos);
4592 return LONG2NUM(pos);
4593 }
4594 }
4595 return Qnil;
4596}
4597
4598/* Ensure that the given pos is a valid character boundary.
4599 * Note that in this function, "character" means a code point
4600 * (Unicode scalar value), not a grapheme cluster.
4601 */
4602static void
4603str_ensure_byte_pos(VALUE str, long pos)
4604{
4605 if (!single_byte_optimizable(str)) {
4606 const char *s = RSTRING_PTR(str);
4607 const char *e = RSTRING_END(str);
4608 const char *p = s + pos;
4609 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4610 rb_raise(rb_eIndexError,
4611 "offset %ld does not land on character boundary", pos);
4612 }
4613 }
4614}
4615
4616/*
4617 * call-seq:
4618 * byteindex(object, offset = 0) -> integer or nil
4619 *
4620 * Returns the 0-based integer index of a substring of +self+
4621 * specified by +object+ (a string or Regexp) and +offset+,
4622 * or +nil+ if there is no such substring;
4623 * the returned index is the count of _bytes_ (not characters).
4624 *
4625 * When +object+ is a string,
4626 * returns the index of the first found substring equal to +object+:
4627 *
4628 * s = 'foo' # => "foo"
4629 * s.size # => 3 # Three 1-byte characters.
4630 * s.bytesize # => 3 # Three bytes.
4631 * s.byteindex('f') # => 0
4632 * s.byteindex('o') # => 1
4633 * s.byteindex('oo') # => 1
4634 * s.byteindex('ooo') # => nil
4635 *
4636 * When +object+ is a Regexp,
4637 * returns the index of the first found substring matching +object+;
4638 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4639 *
4640 * s = 'foo'
4641 * s.byteindex(/f/) # => 0
4642 * $~ # => #<MatchData "f">
4643 * s.byteindex(/o/) # => 1
4644 * s.byteindex(/oo/) # => 1
4645 * s.byteindex(/ooo/) # => nil
4646 * $~ # => nil
4647 *
4648 * \Integer argument +offset+, if given, specifies the 0-based index
4649 * of the byte where searching is to begin.
4650 *
4651 * When +offset+ is non-negative,
4652 * searching begins at byte position +offset+:
4653 *
4654 * s = 'foo'
4655 * s.byteindex('o', 1) # => 1
4656 * s.byteindex('o', 2) # => 2
4657 * s.byteindex('o', 3) # => nil
4658 *
4659 * When +offset+ is negative, counts backward from the end of +self+:
4660 *
4661 * s = 'foo'
4662 * s.byteindex('o', -1) # => 2
4663 * s.byteindex('o', -2) # => 1
4664 * s.byteindex('o', -3) # => 1
4665 * s.byteindex('o', -4) # => nil
4666 *
4667 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4668 *
4669 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4670 * s.size # => 2 # Two 3-byte characters.
4671 * s.bytesize # => 6 # Six bytes.
4672 * s.byteindex("\uFFFF") # => 0
4673 * s.byteindex("\uFFFF", 1) # Raises IndexError
4674 * s.byteindex("\uFFFF", 2) # Raises IndexError
4675 * s.byteindex("\uFFFF", 3) # => 3
4676 * s.byteindex("\uFFFF", 4) # Raises IndexError
4677 * s.byteindex("\uFFFF", 5) # Raises IndexError
4678 * s.byteindex("\uFFFF", 6) # => nil
4679 *
4680 * Related: see {Querying}[rdoc-ref:String@Querying].
4681 */
4682
4683static VALUE
4684rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4685{
4686 VALUE sub;
4687 VALUE initpos;
4688 long pos;
4689
4690 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4691 long slen = RSTRING_LEN(str);
4692 pos = NUM2LONG(initpos);
4693 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4694 if (RB_TYPE_P(sub, T_REGEXP)) {
4696 }
4697 return Qnil;
4698 }
4699 }
4700 else {
4701 pos = 0;
4702 }
4703
4704 str_ensure_byte_pos(str, pos);
4705
4706 if (RB_TYPE_P(sub, T_REGEXP)) {
4707 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4708 VALUE match = rb_backref_get();
4709 struct re_registers *regs = RMATCH_REGS(match);
4710 pos = BEG(0);
4711 return LONG2NUM(pos);
4712 }
4713 }
4714 else {
4715 StringValue(sub);
4716 pos = rb_str_byteindex(str, sub, pos);
4717 if (pos >= 0) return LONG2NUM(pos);
4718 }
4719 return Qnil;
4720}
4721
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: returns a pointer to the
 * last byte equal to +chr+ within the first +search_len+ bytes of
 * +search_str+, or a null pointer when there is none.  Bytes are compared
 * as unsigned char.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long i = search_len - 1; i >= 0; i--) {
        if ((unsigned char)search_str[i] == chr) {
            return (void *)(search_str + i);
        }
    }

    return NULL;
}
#endif
4734
4735static long
4736str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4737{
4738 char *hit, *adjusted;
4739 int c;
4740 long slen, searchlen;
4741 char *sbeg, *e, *t;
4742
4743 sbeg = RSTRING_PTR(str);
4744 slen = RSTRING_LEN(sub);
4745 if (slen == 0) return s - sbeg;
4746 e = RSTRING_END(str);
4747 t = RSTRING_PTR(sub);
4748 c = *t & 0xff;
4749 searchlen = s - sbeg + 1;
4750
4751 if (memcmp(s, t, slen) == 0) {
4752 return s - sbeg;
4753 }
4754
4755 do {
4756 hit = memrchr(sbeg, c, searchlen);
4757 if (!hit) break;
4758 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4759 if (hit != adjusted) {
4760 searchlen = adjusted - sbeg;
4761 continue;
4762 }
4763 if (memcmp(hit, t, slen) == 0)
4764 return hit - sbeg;
4765 searchlen = adjusted - sbeg;
4766 } while (searchlen > 0);
4767
4768 return -1;
4769}
4770
4771/* found index in byte */
4772static long
4773rb_str_rindex(VALUE str, VALUE sub, long pos)
4774{
4775 long len, slen;
4776 char *sbeg, *s;
4777 rb_encoding *enc;
4778 int singlebyte;
4779
4780 enc = rb_enc_check(str, sub);
4781 if (is_broken_string(sub)) return -1;
4782 singlebyte = single_byte_optimizable(str);
4783 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4784 slen = str_strlen(sub, enc); /* rb_enc_check */
4785
4786 /* substring longer than string */
4787 if (len < slen) return -1;
4788 if (len - pos < slen) pos = len - slen;
4789 if (len == 0) return pos;
4790
4791 sbeg = RSTRING_PTR(str);
4792
4793 if (pos == 0) {
4794 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4795 return 0;
4796 else
4797 return -1;
4798 }
4799
4800 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4801 return str_rindex(str, sub, s, enc);
4802}
4803
4804/*
4805 * call-seq:
4806 * rindex(pattern, offset = self.length) -> integer or nil
4807 *
4808 * :include:doc/string/rindex.rdoc
4809 *
4810 */
4811
4812static VALUE
4813rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4814{
4815 VALUE sub;
4816 VALUE initpos;
4817 rb_encoding *enc = STR_ENC_GET(str);
4818 long pos, len = str_strlen(str, enc); /* str's enc */
4819
4820 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4821 pos = NUM2LONG(initpos);
4822 if (pos < 0 && (pos += len) < 0) {
4823 if (RB_TYPE_P(sub, T_REGEXP)) {
4825 }
4826 return Qnil;
4827 }
4828 if (pos > len) pos = len;
4829 }
4830 else {
4831 pos = len;
4832 }
4833
4834 if (RB_TYPE_P(sub, T_REGEXP)) {
4835 /* enc = rb_enc_check(str, sub); */
4836 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4837 enc, single_byte_optimizable(str));
4838
4839 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4840 VALUE match = rb_backref_get();
4841 struct re_registers *regs = RMATCH_REGS(match);
4842 pos = rb_str_sublen(str, BEG(0));
4843 return LONG2NUM(pos);
4844 }
4845 }
4846 else {
4847 StringValue(sub);
4848 pos = rb_str_rindex(str, sub, pos);
4849 if (pos >= 0) {
4850 pos = rb_str_sublen(str, pos);
4851 return LONG2NUM(pos);
4852 }
4853 }
4854 return Qnil;
4855}
4856
4857static long
4858rb_str_byterindex(VALUE str, VALUE sub, long pos)
4859{
4860 long len, slen;
4861 char *sbeg, *s;
4862 rb_encoding *enc;
4863
4864 enc = rb_enc_check(str, sub);
4865 if (is_broken_string(sub)) return -1;
4866 len = RSTRING_LEN(str);
4867 slen = RSTRING_LEN(sub);
4868
4869 /* substring longer than string */
4870 if (len < slen) return -1;
4871 if (len - pos < slen) pos = len - slen;
4872 if (len == 0) return pos;
4873
4874 sbeg = RSTRING_PTR(str);
4875
4876 if (pos == 0) {
4877 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4878 return 0;
4879 else
4880 return -1;
4881 }
4882
4883 s = sbeg + pos;
4884 return str_rindex(str, sub, s, enc);
4885}
4886
4887/*
4888 * call-seq:
4889 * byterindex(object, offset = self.bytesize) -> integer or nil
4890 *
4891 * Returns the 0-based integer index of a substring of +self+
4892 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4893 * or +nil+ if there is no such substring;
4894 * the returned index is the count of _bytes_ (not characters).
4895 *
4896 * When +object+ is a string,
4897 * returns the index of the _last_ found substring equal to +object+:
4898 *
4899 * s = 'foo' # => "foo"
4900 * s.size # => 3 # Three 1-byte characters.
4901 * s.bytesize # => 3 # Three bytes.
4902 * s.byterindex('f') # => 0
4903 * s.byterindex('o') # => 2
4904 * s.byterindex('oo') # => 1
4905 * s.byterindex('ooo') # => nil
4906 *
4907 * When +object+ is a Regexp,
4908 * returns the index of the last found substring matching +object+;
4909 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4910 *
4911 * s = 'foo'
4912 * s.byterindex(/f/) # => 0
4913 * $~ # => #<MatchData "f">
4914 * s.byterindex(/o/) # => 2
4915 * s.byterindex(/oo/) # => 1
4916 * s.byterindex(/ooo/) # => nil
4917 * $~ # => nil
4918 *
4919 * The last match means starting at the possible last position,
4920 * not the last of the longest matches:
4921 *
4922 * s = 'foo'
4923 * s.byterindex(/o+/) # => 2
4924 * $~ # => #<MatchData "o">
4925 *
4926 * To get the last longest match, use a negative lookbehind:
4927 *
4928 * s = 'foo'
4929 * s.byterindex(/(?<!o)o+/) # => 1
4930 * $~ # => #<MatchData "oo">
4931 *
4932 * Or use method #byteindex with negative lookahead:
4933 *
4934 * s = 'foo'
4935 * s.byteindex(/o+(?!.*o)/) # => 1
4936 * $~ # => #<MatchData "oo">
4937 *
4938 * \Integer argument +offset+, if given, specifies the 0-based index
4939 * of the byte where searching is to end.
4940 *
4941 * When +offset+ is non-negative,
4942 * searching ends at byte position +offset+:
4943 *
4944 * s = 'foo'
4945 * s.byterindex('o', 0) # => nil
4946 * s.byterindex('o', 1) # => 1
4947 * s.byterindex('o', 2) # => 2
4948 * s.byterindex('o', 3) # => 2
4949 *
4950 * When +offset+ is negative, counts backward from the end of +self+:
4951 *
4952 * s = 'foo'
4953 * s.byterindex('o', -1) # => 2
4954 * s.byterindex('o', -2) # => 1
4955 * s.byterindex('o', -3) # => nil
4956 *
4957 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4958 *
4959 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4960 * s.size # => 2 # Two 3-byte characters.
4961 * s.bytesize # => 6 # Six bytes.
4962 * s.byterindex("\uFFFF") # => 3
4963 * s.byterindex("\uFFFF", 1) # Raises IndexError
4964 * s.byterindex("\uFFFF", 2) # Raises IndexError
4965 * s.byterindex("\uFFFF", 3) # => 3
4966 * s.byterindex("\uFFFF", 4) # Raises IndexError
4967 * s.byterindex("\uFFFF", 5) # Raises IndexError
4968 * s.byterindex("\uFFFF", 6) # => nil
4969 *
4970 * Related: see {Querying}[rdoc-ref:String@Querying].
4971 */
4972
4973static VALUE
4974rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4975{
4976 VALUE sub;
4977 VALUE initpos;
4978 long pos, len = RSTRING_LEN(str);
4979
4980 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4981 pos = NUM2LONG(initpos);
4982 if (pos < 0 && (pos += len) < 0) {
4983 if (RB_TYPE_P(sub, T_REGEXP)) {
4985 }
4986 return Qnil;
4987 }
4988 if (pos > len) pos = len;
4989 }
4990 else {
4991 pos = len;
4992 }
4993
4994 str_ensure_byte_pos(str, pos);
4995
4996 if (RB_TYPE_P(sub, T_REGEXP)) {
4997 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4998 VALUE match = rb_backref_get();
4999 struct re_registers *regs = RMATCH_REGS(match);
5000 pos = BEG(0);
5001 return LONG2NUM(pos);
5002 }
5003 }
5004 else {
5005 StringValue(sub);
5006 pos = rb_str_byterindex(str, sub, pos);
5007 if (pos >= 0) return LONG2NUM(pos);
5008 }
5009 return Qnil;
5010}
5011
5012/*
5013 * call-seq:
5014 * self =~ object -> integer or nil
5015 *
5016 * When +object+ is a Regexp, returns the index of the first substring in +self+
5017 * matched by +object+,
5018 * or +nil+ if no match is found;
5019 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5020 *
5021 * 'foo' =~ /f/ # => 0
5022 * $~ # => #<MatchData "f">
5023 * 'foo' =~ /o/ # => 1
5024 * $~ # => #<MatchData "o">
5025 * 'foo' =~ /x/ # => nil
5026 * $~ # => nil
5027 *
5028 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5029 * (see Regexp#=~):
5030 *
5031 * number = nil
5032 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5033 * number # => nil # Not assigned.
5034 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5035 * number # => "9" # Assigned.
5036 *
5037 * If +object+ is not a Regexp, returns the value
5038 * returned by <tt>object =~ self</tt>.
5039 *
5040 * Related: see {Querying}[rdoc-ref:String@Querying].
5041 */
5042
5043static VALUE
5044rb_str_match(VALUE x, VALUE y)
5045{
5046 switch (OBJ_BUILTIN_TYPE(y)) {
5047 case T_STRING:
5048 rb_raise(rb_eTypeError, "type mismatch: String given");
5049
5050 case T_REGEXP:
5051 return rb_reg_match(y, x);
5052
5053 default:
5054 return rb_funcall(y, idEqTilde, 1, x);
5055 }
5056}
5057
5058
5059static VALUE get_pat(VALUE);
5060
5061
5062/*
5063 * call-seq:
5064 * match(pattern, offset = 0) -> matchdata or nil
5065 * match(pattern, offset = 0) {|matchdata| ... } -> object
5066 *
5067 * Creates a MatchData object based on +self+ and the given arguments;
5068 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5069 *
5070 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5071 *
5072 * regexp = Regexp.new(pattern)
5073 *
5074 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5075 * (see Regexp#match):
5076 *
5077 * matchdata = regexp.match(self[offset..])
5078 *
5079 * With no block given, returns the computed +matchdata+ or +nil+:
5080 *
5081 * 'foo'.match('f') # => #<MatchData "f">
5082 * 'foo'.match('o') # => #<MatchData "o">
5083 * 'foo'.match('x') # => nil
5084 * 'foo'.match('f', 1) # => nil
5085 * 'foo'.match('o', 1) # => #<MatchData "o">
5086 *
5087 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5088 * returns the block's return value:
5089 *
5090 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5091 *
5092 * With a block given and +nil+ +matchdata+, does not call the block:
5093 *
5094 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5095 *
5096 * Related: see {Querying}[rdoc-ref:String@Querying].
5097 */
5098
5099static VALUE
5100rb_str_match_m(int argc, VALUE *argv, VALUE str)
5101{
5102 VALUE re, result;
5103 if (argc < 1)
5104 rb_check_arity(argc, 1, 2);
5105 re = argv[0];
5106 argv[0] = str;
5107 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5108 if (!NIL_P(result) && rb_block_given_p()) {
5109 return rb_yield(result);
5110 }
5111 return result;
5112}
5113
5114/*
5115 * call-seq:
5116 * match?(pattern, offset = 0) -> true or false
5117 *
5118 * Returns whether a match is found for +self+ and the given arguments;
5119 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5120 *
5121 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5122 *
5123 * regexp = Regexp.new(pattern)
5124 *
5125 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5126 * +false+ otherwise:
5127 *
5128 * 'foo'.match?(/o/) # => true
5129 * 'foo'.match?('o') # => true
5130 * 'foo'.match?(/x/) # => false
5131 * 'foo'.match?('f', 1) # => false
5132 * 'foo'.match?('o', 1) # => true
5133 *
5134 * Related: see {Querying}[rdoc-ref:String@Querying].
5135 */
5136
5137static VALUE
5138rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5139{
5140 VALUE re;
5141 rb_check_arity(argc, 1, 2);
5142 re = get_pat(argv[0]);
5143 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5144}
5145
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no valid neighboring character could be produced */
    NEIGHBOR_FOUND,     /* the buffer now holds the neighboring character */
    NEIGHBOR_WRAPPED    /* stepping ran past the end of the range */
};
5151
5152static enum neighbor_char
5153enc_succ_char(char *p, long len, rb_encoding *enc)
5154{
5155 long i;
5156 int l;
5157
5158 if (rb_enc_mbminlen(enc) > 1) {
5159 /* wchar, trivial case */
5160 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5161 if (!MBCLEN_CHARFOUND_P(r)) {
5162 return NEIGHBOR_NOT_CHAR;
5163 }
5164 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5165 l = rb_enc_code_to_mbclen(c, enc);
5166 if (!l) return NEIGHBOR_NOT_CHAR;
5167 if (l != len) return NEIGHBOR_WRAPPED;
5168 rb_enc_mbcput(c, p, enc);
5169 r = rb_enc_precise_mbclen(p, p + len, enc);
5170 if (!MBCLEN_CHARFOUND_P(r)) {
5171 return NEIGHBOR_NOT_CHAR;
5172 }
5173 return NEIGHBOR_FOUND;
5174 }
5175 while (1) {
5176 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5177 p[i] = '\0';
5178 if (i < 0)
5179 return NEIGHBOR_WRAPPED;
5180 ++((unsigned char*)p)[i];
5181 l = rb_enc_precise_mbclen(p, p+len, enc);
5182 if (MBCLEN_CHARFOUND_P(l)) {
5183 l = MBCLEN_CHARFOUND_LEN(l);
5184 if (l == len) {
5185 return NEIGHBOR_FOUND;
5186 }
5187 else {
5188 memset(p+l, 0xff, len-l);
5189 }
5190 }
5191 if (MBCLEN_INVALID_P(l) && i < len-1) {
5192 long len2;
5193 int l2;
5194 for (len2 = len-1; 0 < len2; len2--) {
5195 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5196 if (!MBCLEN_INVALID_P(l2))
5197 break;
5198 }
5199 memset(p+len2+1, 0xff, len-(len2+1));
5200 }
5201 }
5202}
5203
5204static enum neighbor_char
5205enc_pred_char(char *p, long len, rb_encoding *enc)
5206{
5207 long i;
5208 int l;
5209 if (rb_enc_mbminlen(enc) > 1) {
5210 /* wchar, trivial case */
5211 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5212 if (!MBCLEN_CHARFOUND_P(r)) {
5213 return NEIGHBOR_NOT_CHAR;
5214 }
5215 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5216 if (!c) return NEIGHBOR_NOT_CHAR;
5217 --c;
5218 l = rb_enc_code_to_mbclen(c, enc);
5219 if (!l) return NEIGHBOR_NOT_CHAR;
5220 if (l != len) return NEIGHBOR_WRAPPED;
5221 rb_enc_mbcput(c, p, enc);
5222 r = rb_enc_precise_mbclen(p, p + len, enc);
5223 if (!MBCLEN_CHARFOUND_P(r)) {
5224 return NEIGHBOR_NOT_CHAR;
5225 }
5226 return NEIGHBOR_FOUND;
5227 }
5228 while (1) {
5229 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5230 p[i] = '\xff';
5231 if (i < 0)
5232 return NEIGHBOR_WRAPPED;
5233 --((unsigned char*)p)[i];
5234 l = rb_enc_precise_mbclen(p, p+len, enc);
5235 if (MBCLEN_CHARFOUND_P(l)) {
5236 l = MBCLEN_CHARFOUND_LEN(l);
5237 if (l == len) {
5238 return NEIGHBOR_FOUND;
5239 }
5240 else {
5241 memset(p+l, 0, len-l);
5242 }
5243 }
5244 if (MBCLEN_INVALID_P(l) && i < len-1) {
5245 long len2;
5246 int l2;
5247 for (len2 = len-1; 0 < len2; len2--) {
5248 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5249 if (!MBCLEN_INVALID_P(l2))
5250 break;
5251 }
5252 memset(p+len2+1, 0, len-(len2+1));
5253 }
5254 }
5255}
5256
5257/*
5258 overwrite +p+ by succeeding letter in +enc+ and returns
5259 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5260 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5261 assuming each ranges are successive, and mbclen
5262 never change in each ranges.
5263 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5264 character.
5265 */
5266static enum neighbor_char
5267enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5268{
5269 enum neighbor_char ret;
5270 unsigned int c;
5271 int ctype;
5272 int range;
5273 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5274
5275 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5276 int try;
5277 const int max_gaps = 1;
5278
5279 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5280 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5281 ctype = ONIGENC_CTYPE_DIGIT;
5282 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5283 ctype = ONIGENC_CTYPE_ALPHA;
5284 else
5285 return NEIGHBOR_NOT_CHAR;
5286
5287 MEMCPY(save, p, char, len);
5288 for (try = 0; try <= max_gaps; ++try) {
5289 ret = enc_succ_char(p, len, enc);
5290 if (ret == NEIGHBOR_FOUND) {
5291 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5292 if (rb_enc_isctype(c, ctype, enc))
5293 return NEIGHBOR_FOUND;
5294 }
5295 }
5296 MEMCPY(p, save, char, len);
5297 range = 1;
5298 while (1) {
5299 MEMCPY(save, p, char, len);
5300 ret = enc_pred_char(p, len, enc);
5301 if (ret == NEIGHBOR_FOUND) {
5302 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5303 if (!rb_enc_isctype(c, ctype, enc)) {
5304 MEMCPY(p, save, char, len);
5305 break;
5306 }
5307 }
5308 else {
5309 MEMCPY(p, save, char, len);
5310 break;
5311 }
5312 range++;
5313 }
5314 if (range == 1) {
5315 return NEIGHBOR_NOT_CHAR;
5316 }
5317
5318 if (ctype != ONIGENC_CTYPE_DIGIT) {
5319 MEMCPY(carry, p, char, len);
5320 return NEIGHBOR_WRAPPED;
5321 }
5322
5323 MEMCPY(carry, p, char, len);
5324 enc_succ_char(carry, len, enc);
5325 return NEIGHBOR_WRAPPED;
5326}
5327
5328
5329static VALUE str_succ(VALUE str);
5330
5331/*
5332 * call-seq:
5333 * succ -> new_str
5334 *
5335 * :include: doc/string/succ.rdoc
5336 *
5337 */
5338
5339VALUE
5341{
5342 VALUE str;
5343 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5344 rb_enc_cr_str_copy_for_substr(str, orig);
5345 return str_succ(str);
5346}
5347
5348static VALUE
5349str_succ(VALUE str)
5350{
5351 rb_encoding *enc;
5352 char *sbeg, *s, *e, *last_alnum = 0;
5353 int found_alnum = 0;
5354 long l, slen;
5355 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5356 long carry_pos = 0, carry_len = 1;
5357 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5358
5359 slen = RSTRING_LEN(str);
5360 if (slen == 0) return str;
5361
5362 enc = STR_ENC_GET(str);
5363 sbeg = RSTRING_PTR(str);
5364 s = e = sbeg + slen;
5365
5366 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5367 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5368 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5369 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5370 break;
5371 }
5372 }
5373 l = rb_enc_precise_mbclen(s, e, enc);
5374 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5375 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5376 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5377 switch (neighbor) {
5378 case NEIGHBOR_NOT_CHAR:
5379 continue;
5380 case NEIGHBOR_FOUND:
5381 return str;
5382 case NEIGHBOR_WRAPPED:
5383 last_alnum = s;
5384 break;
5385 }
5386 found_alnum = 1;
5387 carry_pos = s - sbeg;
5388 carry_len = l;
5389 }
5390 if (!found_alnum) { /* str contains no alnum */
5391 s = e;
5392 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5393 enum neighbor_char neighbor;
5394 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5395 l = rb_enc_precise_mbclen(s, e, enc);
5396 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5397 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5398 MEMCPY(tmp, s, char, l);
5399 neighbor = enc_succ_char(tmp, l, enc);
5400 switch (neighbor) {
5401 case NEIGHBOR_FOUND:
5402 MEMCPY(s, tmp, char, l);
5403 return str;
5404 break;
5405 case NEIGHBOR_WRAPPED:
5406 MEMCPY(s, tmp, char, l);
5407 break;
5408 case NEIGHBOR_NOT_CHAR:
5409 break;
5410 }
5411 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5412 /* wrapped to \0...\0. search next valid char. */
5413 enc_succ_char(s, l, enc);
5414 }
5415 if (!rb_enc_asciicompat(enc)) {
5416 MEMCPY(carry, s, char, l);
5417 carry_len = l;
5418 }
5419 carry_pos = s - sbeg;
5420 }
5422 }
5423 RESIZE_CAPA(str, slen + carry_len);
5424 sbeg = RSTRING_PTR(str);
5425 s = sbeg + carry_pos;
5426 memmove(s + carry_len, s, slen - carry_pos);
5427 memmove(s, carry, carry_len);
5428 slen += carry_len;
5429 STR_SET_LEN(str, slen);
5430 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5432 return str;
5433}
5434
5435
5436/*
5437 * call-seq:
5438 * succ! -> self
5439 *
5440 * Like String#succ, but modifies +self+ in place; returns +self+.
5441 *
5442 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5443 */
5444
5445static VALUE
5446rb_str_succ_bang(VALUE str)
5447{
5448 rb_str_modify(str);
5449 str_succ(str);
5450 return str;
5451}
5452
/*
 * Returns 1 when each of the first +len+ bytes at +s+ is an ASCII digit
 * (trivially so when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5462
5463static int
5464str_upto_i(VALUE str, VALUE arg)
5465{
5466 rb_yield(str);
5467 return 0;
5468}
5469
5470/*
5471 * call-seq:
5472 * upto(other_string, exclusive = false) {|string| ... } -> self
5473 * upto(other_string, exclusive = false) -> new_enumerator
5474 *
5475 * :include: doc/string/upto.rdoc
5476 *
5477 */
5478
5479static VALUE
5480rb_str_upto(int argc, VALUE *argv, VALUE beg)
5481{
5482 VALUE end, exclusive;
5483
5484 rb_scan_args(argc, argv, "11", &end, &exclusive);
5485 RETURN_ENUMERATOR(beg, argc, argv);
5486 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5487}
5488
5489VALUE
5490rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5491{
5492 VALUE current, after_end;
5493 ID succ;
5494 int n, ascii;
5495 rb_encoding *enc;
5496
5497 CONST_ID(succ, "succ");
5498 StringValue(end);
5499 enc = rb_enc_check(beg, end);
5500 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5501 /* single character */
5502 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5503 char c = RSTRING_PTR(beg)[0];
5504 char e = RSTRING_PTR(end)[0];
5505
5506 if (c > e || (excl && c == e)) return beg;
5507 for (;;) {
5508 VALUE str = rb_enc_str_new(&c, 1, enc);
5510 if ((*each)(str, arg)) break;
5511 if (!excl && c == e) break;
5512 c++;
5513 if (excl && c == e) break;
5514 }
5515 return beg;
5516 }
5517 /* both edges are all digits */
5518 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5519 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5520 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5521 VALUE b, e;
5522 int width;
5523
5524 width = RSTRING_LENINT(beg);
5525 b = rb_str_to_inum(beg, 10, FALSE);
5526 e = rb_str_to_inum(end, 10, FALSE);
5527 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5528 long bi = FIX2LONG(b);
5529 long ei = FIX2LONG(e);
5530 rb_encoding *usascii = rb_usascii_encoding();
5531
5532 while (bi <= ei) {
5533 if (excl && bi == ei) break;
5534 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5535 bi++;
5536 }
5537 }
5538 else {
5539 ID op = excl ? '<' : idLE;
5540 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5541
5542 args[0] = INT2FIX(width);
5543 while (rb_funcall(b, op, 1, e)) {
5544 args[1] = b;
5545 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5546 b = rb_funcallv(b, succ, 0, 0);
5547 }
5548 }
5549 return beg;
5550 }
5551 /* normal case */
5552 n = rb_str_cmp(beg, end);
5553 if (n > 0 || (excl && n == 0)) return beg;
5554
5555 after_end = rb_funcallv(end, succ, 0, 0);
5556 current = str_duplicate(rb_cString, beg);
5557 while (!rb_str_equal(current, after_end)) {
5558 VALUE next = Qnil;
5559 if (excl || !rb_str_equal(current, end))
5560 next = rb_funcallv(current, succ, 0, 0);
5561 if ((*each)(current, arg)) break;
5562 if (NIL_P(next)) break;
5563 current = next;
5564 StringValue(current);
5565 if (excl && rb_str_equal(current, end)) break;
5566 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5567 break;
5568 }
5569
5570 return beg;
5571}
5572
5573VALUE
5574rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5575{
5576 VALUE current;
5577 ID succ;
5578
5579 CONST_ID(succ, "succ");
5580 /* both edges are all digits */
5581 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5582 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5583 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5584 int width = RSTRING_LENINT(beg);
5585 b = rb_str_to_inum(beg, 10, FALSE);
5586 if (FIXNUM_P(b)) {
5587 long bi = FIX2LONG(b);
5588 rb_encoding *usascii = rb_usascii_encoding();
5589
5590 while (FIXABLE(bi)) {
5591 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5592 bi++;
5593 }
5594 b = LONG2NUM(bi);
5595 }
5596 args[0] = INT2FIX(width);
5597 while (1) {
5598 args[1] = b;
5599 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5600 b = rb_funcallv(b, succ, 0, 0);
5601 }
5602 }
5603 /* normal case */
5604 current = str_duplicate(rb_cString, beg);
5605 while (1) {
5606 VALUE next = rb_funcallv(current, succ, 0, 0);
5607 if ((*each)(current, arg)) break;
5608 current = next;
5609 StringValue(current);
5610 if (RSTRING_LEN(current) == 0)
5611 break;
5612 }
5613
5614 return beg;
5615}
5616
5617static int
5618include_range_i(VALUE str, VALUE arg)
5619{
5620 VALUE *argp = (VALUE *)arg;
5621 if (!rb_equal(str, *argp)) return 0;
5622 *argp = Qnil;
5623 return 1;
5624}
5625
5626VALUE
5627rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5628{
5629 beg = rb_str_new_frozen(beg);
5630 StringValue(end);
5631 end = rb_str_new_frozen(end);
5632 if (NIL_P(val)) return Qfalse;
5633 val = rb_check_string_type(val);
5634 if (NIL_P(val)) return Qfalse;
5635 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5636 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5637 rb_enc_asciicompat(STR_ENC_GET(val))) {
5638 const char *bp = RSTRING_PTR(beg);
5639 const char *ep = RSTRING_PTR(end);
5640 const char *vp = RSTRING_PTR(val);
5641 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5642 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5643 return Qfalse;
5644 else {
5645 char b = *bp;
5646 char e = *ep;
5647 char v = *vp;
5648
5649 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5650 if (b <= v && v < e) return Qtrue;
5651 return RBOOL(!RTEST(exclusive) && v == e);
5652 }
5653 }
5654 }
5655#if 0
5656 /* both edges are all digits */
5657 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5658 all_digits_p(bp, RSTRING_LEN(beg)) &&
5659 all_digits_p(ep, RSTRING_LEN(end))) {
5660 /* TODO */
5661 }
5662#endif
5663 }
5664 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5665
5666 return RBOOL(NIL_P(val));
5667}
5668
5669static VALUE
5670rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5671{
5672 if (rb_reg_search(re, str, 0, 0) >= 0) {
5673 VALUE match = rb_backref_get();
5674 int nth = rb_reg_backref_number(match, backref);
5675 return rb_reg_nth_match(nth, match);
5676 }
5677 return Qnil;
5678}
5679
5680static VALUE
5681rb_str_aref(VALUE str, VALUE indx)
5682{
5683 long idx;
5684
5685 if (FIXNUM_P(indx)) {
5686 idx = FIX2LONG(indx);
5687 }
5688 else if (RB_TYPE_P(indx, T_REGEXP)) {
5689 return rb_str_subpat(str, indx, INT2FIX(0));
5690 }
5691 else if (RB_TYPE_P(indx, T_STRING)) {
5692 if (rb_str_index(str, indx, 0) != -1)
5693 return str_duplicate(rb_cString, indx);
5694 return Qnil;
5695 }
5696 else {
5697 /* check if indx is Range */
5698 long beg, len = str_strlen(str, NULL);
5699 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5700 case Qfalse:
5701 break;
5702 case Qnil:
5703 return Qnil;
5704 default:
5705 return rb_str_substr(str, beg, len);
5706 }
5707 idx = NUM2LONG(indx);
5708 }
5709
5710 return str_substr(str, idx, 1, FALSE);
5711}
5712
5713
5714/*
5715 * call-seq:
5716 * self[index] -> new_string or nil
5717 * self[start, length] -> new_string or nil
5718 * self[range] -> new_string or nil
5719 * self[regexp, capture = 0] -> new_string or nil
5720 * self[substring] -> new_string or nil
5721 *
5722 * :include: doc/string/aref.rdoc
5723 *
5724 */
5725
5726static VALUE
5727rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5728{
5729 if (argc == 2) {
5730 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5731 return rb_str_subpat(str, argv[0], argv[1]);
5732 }
5733 else {
5734 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5735 }
5736 }
5737 rb_check_arity(argc, 1, 2);
5738 return rb_str_aref(str, argv[0]);
5739}
5740
5741VALUE
5743{
5744 char *ptr = RSTRING_PTR(str);
5745 long olen = RSTRING_LEN(str), nlen;
5746
5747 str_modifiable(str);
5748 if (len > olen) len = olen;
5749 nlen = olen - len;
5750 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5751 char *oldptr = ptr;
5752 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5753 STR_SET_EMBED(str);
5754 ptr = RSTRING(str)->as.embed.ary;
5755 memmove(ptr, oldptr + len, nlen);
5756 if (fl == STR_NOEMBED) xfree(oldptr);
5757 }
5758 else {
5759 if (!STR_SHARED_P(str)) {
5760 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5761 rb_enc_cr_str_exact_copy(shared, str);
5762 OBJ_FREEZE(shared);
5763 }
5764 ptr = RSTRING(str)->as.heap.ptr += len;
5765 }
5766 STR_SET_LEN(str, nlen);
5767
5768 if (!SHARABLE_MIDDLE_SUBSTRING) {
5769 TERM_FILL(ptr + nlen, TERM_LEN(str));
5770 }
5772 return str;
5773}
5774
5775static void
5776rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5777{
5778 char *sptr;
5779 long slen;
5780 int cr;
5781
5782 if (beg == 0 && vlen == 0) {
5783 rb_str_drop_bytes(str, len);
5784 return;
5785 }
5786
5787 str_modify_keep_cr(str);
5788 RSTRING_GETMEM(str, sptr, slen);
5789 if (len < vlen) {
5790 /* expand string */
5791 RESIZE_CAPA(str, slen + vlen - len);
5792 sptr = RSTRING_PTR(str);
5793 }
5794
5796 cr = rb_enc_str_coderange(val);
5797 else
5799
5800 if (vlen != len) {
5801 memmove(sptr + beg + vlen,
5802 sptr + beg + len,
5803 slen - (beg + len));
5804 }
5805 if (vlen < beg && len < 0) {
5806 MEMZERO(sptr + slen, char, -len);
5807 }
5808 if (vlen > 0) {
5809 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5810 }
5811 slen += vlen - len;
5812 STR_SET_LEN(str, slen);
5813 TERM_FILL(&sptr[slen], TERM_LEN(str));
5814 ENC_CODERANGE_SET(str, cr);
5815}
5816
5817static inline void
5818rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5819{
5820 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5821}
5822
5823void
5824rb_str_update(VALUE str, long beg, long len, VALUE val)
5825{
5826 long slen;
5827 char *p, *e;
5828 rb_encoding *enc;
5829 int singlebyte = single_byte_optimizable(str);
5830 int cr;
5831
5832 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5833
5834 StringValue(val);
5835 enc = rb_enc_check(str, val);
5836 slen = str_strlen(str, enc); /* rb_enc_check */
5837
5838 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5839 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5840 }
5841 if (beg < 0) {
5842 beg += slen;
5843 }
5844 RUBY_ASSERT(beg >= 0);
5845 RUBY_ASSERT(beg <= slen);
5846
5847 if (len > slen - beg) {
5848 len = slen - beg;
5849 }
5850 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5851 if (!p) p = RSTRING_END(str);
5852 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5853 if (!e) e = RSTRING_END(str);
5854 /* error check */
5855 beg = p - RSTRING_PTR(str); /* physical position */
5856 len = e - p; /* physical length */
5857 rb_str_update_0(str, beg, len, val);
5858 rb_enc_associate(str, enc);
5860 if (cr != ENC_CODERANGE_BROKEN)
5861 ENC_CODERANGE_SET(str, cr);
5862}
5863
5864static void
5865rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5866{
5867 int nth;
5868 VALUE match;
5869 long start, end, len;
5870 rb_encoding *enc;
5871 struct re_registers *regs;
5872
5873 if (rb_reg_search(re, str, 0, 0) < 0) {
5874 rb_raise(rb_eIndexError, "regexp not matched");
5875 }
5876 match = rb_backref_get();
5877 nth = rb_reg_backref_number(match, backref);
5878 regs = RMATCH_REGS(match);
5879 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5880 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5881 }
5882 if (nth < 0) {
5883 nth += regs->num_regs;
5884 }
5885
5886 start = BEG(nth);
5887 if (start == -1) {
5888 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5889 }
5890 end = END(nth);
5891 len = end - start;
5892 StringValue(val);
5893 enc = rb_enc_check_str(str, val);
5894 rb_str_update_0(str, start, len, val);
5895 rb_enc_associate(str, enc);
5896}
5897
5898static VALUE
5899rb_str_aset(VALUE str, VALUE indx, VALUE val)
5900{
5901 long idx, beg;
5902
5903 switch (TYPE(indx)) {
5904 case T_REGEXP:
5905 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5906 return val;
5907
5908 case T_STRING:
5909 beg = rb_str_index(str, indx, 0);
5910 if (beg < 0) {
5911 rb_raise(rb_eIndexError, "string not matched");
5912 }
5913 beg = rb_str_sublen(str, beg);
5914 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5915 return val;
5916
5917 default:
5918 /* check if indx is Range */
5919 {
5920 long beg, len;
5921 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5922 rb_str_update(str, beg, len, val);
5923 return val;
5924 }
5925 }
5926 /* FALLTHROUGH */
5927
5928 case T_FIXNUM:
5929 idx = NUM2LONG(indx);
5930 rb_str_update(str, idx, 1, val);
5931 return val;
5932 }
5933}
5934
5935/*
5936 * call-seq:
5937 * self[index] = other_string -> new_string
5938 * self[start, length] = other_string -> new_string
5939 * self[range] = other_string -> new_string
5940 * self[regexp, capture = 0] = other_string -> new_string
5941 * self[substring] = other_string -> new_string
5942 *
5943 * :include: doc/string/aset.rdoc
5944 *
5945 */
5946
5947static VALUE
5948rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5949{
5950 if (argc == 3) {
5951 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5952 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5953 }
5954 else {
5955 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5956 }
5957 return argv[2];
5958 }
5959 rb_check_arity(argc, 2, 3);
5960 return rb_str_aset(str, argv[0], argv[1]);
5961}
5962
5963/*
5964 * call-seq:
5965 * insert(offset, other_string) -> self
5966 *
5967 * :include: doc/string/insert.rdoc
5968 *
5969 */
5970
5971static VALUE
5972rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5973{
5974 long pos = NUM2LONG(idx);
5975
5976 if (pos == -1) {
5977 return rb_str_append(str, str2);
5978 }
5979 else if (pos < 0) {
5980 pos++;
5981 }
5982 rb_str_update(str, pos, 0, str2);
5983 return str;
5984}
5985
5986
5987/*
5988 * call-seq:
5989 * slice!(index) -> new_string or nil
5990 * slice!(start, length) -> new_string or nil
5991 * slice!(range) -> new_string or nil
5992 * slice!(regexp, capture = 0) -> new_string or nil
5993 * slice!(substring) -> new_string or nil
5994 *
5995 * Like String#[] (and its alias String#slice), except that:
5996 *
5997 * - Performs substitutions in +self+ (not in a copy of +self+).
5998 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
5999 *
6000 * A few examples:
6001 *
6002 * s = 'hello'
6003 * s.slice!('e') # => "e"
6004 * s # => "hllo"
6005 * s.slice!('e') # => nil
6006 * s # => "hllo"
6007 *
6008 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6009 */
6010
6011static VALUE
6012rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6013{
6014 VALUE result = Qnil;
6015 VALUE indx;
6016 long beg, len = 1;
6017 char *p;
6018
6019 rb_check_arity(argc, 1, 2);
6020 str_modify_keep_cr(str);
6021 indx = argv[0];
6022 if (RB_TYPE_P(indx, T_REGEXP)) {
6023 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6024 VALUE match = rb_backref_get();
6025 struct re_registers *regs = RMATCH_REGS(match);
6026 int nth = 0;
6027 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6028 if ((nth += regs->num_regs) <= 0) return Qnil;
6029 }
6030 else if (nth >= regs->num_regs) return Qnil;
6031 beg = BEG(nth);
6032 len = END(nth) - beg;
6033 goto subseq;
6034 }
6035 else if (argc == 2) {
6036 beg = NUM2LONG(indx);
6037 len = NUM2LONG(argv[1]);
6038 goto num_index;
6039 }
6040 else if (FIXNUM_P(indx)) {
6041 beg = FIX2LONG(indx);
6042 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6043 if (!len) return Qnil;
6044 beg = p - RSTRING_PTR(str);
6045 goto subseq;
6046 }
6047 else if (RB_TYPE_P(indx, T_STRING)) {
6048 beg = rb_str_index(str, indx, 0);
6049 if (beg == -1) return Qnil;
6050 len = RSTRING_LEN(indx);
6051 result = str_duplicate(rb_cString, indx);
6052 goto squash;
6053 }
6054 else {
6055 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6056 case Qnil:
6057 return Qnil;
6058 case Qfalse:
6059 beg = NUM2LONG(indx);
6060 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6061 if (!len) return Qnil;
6062 beg = p - RSTRING_PTR(str);
6063 goto subseq;
6064 default:
6065 goto num_index;
6066 }
6067 }
6068
6069 num_index:
6070 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6071 beg = p - RSTRING_PTR(str);
6072
6073 subseq:
6074 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6075 rb_enc_cr_str_copy_for_substr(result, str);
6076
6077 squash:
6078 if (len > 0) {
6079 if (beg == 0) {
6080 rb_str_drop_bytes(str, len);
6081 }
6082 else {
6083 char *sptr = RSTRING_PTR(str);
6084 long slen = RSTRING_LEN(str);
6085 if (beg + len > slen) /* pathological check */
6086 len = slen - beg;
6087 memmove(sptr + beg,
6088 sptr + beg + len,
6089 slen - (beg + len));
6090 slen -= len;
6091 STR_SET_LEN(str, slen);
6092 TERM_FILL(&sptr[slen], TERM_LEN(str));
6093 }
6094 }
6095 return result;
6096}
6097
6098static VALUE
6099get_pat(VALUE pat)
6100{
6101 VALUE val;
6102
6103 switch (OBJ_BUILTIN_TYPE(pat)) {
6104 case T_REGEXP:
6105 return pat;
6106
6107 case T_STRING:
6108 break;
6109
6110 default:
6111 val = rb_check_string_type(pat);
6112 if (NIL_P(val)) {
6113 Check_Type(pat, T_REGEXP);
6114 }
6115 pat = val;
6116 }
6117
6118 return rb_reg_regcomp(pat);
6119}
6120
6121static VALUE
6122get_pat_quoted(VALUE pat, int check)
6123{
6124 VALUE val;
6125
6126 switch (OBJ_BUILTIN_TYPE(pat)) {
6127 case T_REGEXP:
6128 return pat;
6129
6130 case T_STRING:
6131 break;
6132
6133 default:
6134 val = rb_check_string_type(pat);
6135 if (NIL_P(val)) {
6136 Check_Type(pat, T_REGEXP);
6137 }
6138 pat = val;
6139 }
6140 if (check && is_broken_string(pat)) {
6141 rb_exc_raise(rb_reg_check_preprocess(pat));
6142 }
6143 return pat;
6144}
6145
6146static long
6147rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6148{
6149 if (BUILTIN_TYPE(pat) == T_STRING) {
6150 pos = rb_str_byteindex(str, pat, pos);
6151 if (set_backref_str) {
6152 if (pos >= 0) {
6153 str = rb_str_new_frozen_String(str);
6154 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6155 if (match) {
6156 *match = match_data;
6157 }
6158 }
6159 else {
6161 }
6162 }
6163 return pos;
6164 }
6165 else {
6166 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6167 }
6168}
6169
6170static long
6171rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6172{
6173 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6174}
6175
6176
6177/*
6178 * call-seq:
6179 * sub!(pattern, replacement) -> self or nil
6180 * sub!(pattern) {|match| ... } -> self or nil
6181 *
6182 * Like String#sub, except that:
6183 *
 *  - Changes are made to +self+, not to a copy of +self+.
6185 * - Returns +self+ if any changes are made, +nil+ otherwise.
6186 *
6187 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6188 */
6189
6190static VALUE
6191rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6192{
6193 VALUE pat, repl, hash = Qnil;
6194 int iter = 0;
6195 long plen;
6196 int min_arity = rb_block_given_p() ? 1 : 2;
6197 long beg;
6198
6199 rb_check_arity(argc, min_arity, 2);
6200 if (argc == 1) {
6201 iter = 1;
6202 }
6203 else {
6204 repl = argv[1];
6205 hash = rb_check_hash_type(argv[1]);
6206 if (NIL_P(hash)) {
6207 StringValue(repl);
6208 }
6209 }
6210
6211 pat = get_pat_quoted(argv[0], 1);
6212
6213 str_modifiable(str);
6214 beg = rb_pat_search(pat, str, 0, 1);
6215 if (beg >= 0) {
6216 rb_encoding *enc;
6217 int cr = ENC_CODERANGE(str);
6218 long beg0, end0;
6219 VALUE match, match0 = Qnil;
6220 struct re_registers *regs;
6221 char *p, *rp;
6222 long len, rlen;
6223
6224 match = rb_backref_get();
6225 regs = RMATCH_REGS(match);
6226 if (RB_TYPE_P(pat, T_STRING)) {
6227 beg0 = beg;
6228 end0 = beg0 + RSTRING_LEN(pat);
6229 match0 = pat;
6230 }
6231 else {
6232 beg0 = BEG(0);
6233 end0 = END(0);
6234 if (iter) match0 = rb_reg_nth_match(0, match);
6235 }
6236
6237 if (iter || !NIL_P(hash)) {
6238 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6239
6240 if (iter) {
6241 repl = rb_obj_as_string(rb_yield(match0));
6242 }
6243 else {
6244 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6245 repl = rb_obj_as_string(repl);
6246 }
6247 str_mod_check(str, p, len);
6248 rb_check_frozen(str);
6249 }
6250 else {
6251 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6252 }
6253
6254 enc = rb_enc_compatible(str, repl);
6255 if (!enc) {
6256 rb_encoding *str_enc = STR_ENC_GET(str);
6257 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6258 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6259 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6260 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6261 rb_enc_inspect_name(str_enc),
6262 rb_enc_inspect_name(STR_ENC_GET(repl)));
6263 }
6264 enc = STR_ENC_GET(repl);
6265 }
6266 rb_str_modify(str);
6267 rb_enc_associate(str, enc);
6269 int cr2 = ENC_CODERANGE(repl);
6270 if (cr2 == ENC_CODERANGE_BROKEN ||
6271 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6273 else
6274 cr = cr2;
6275 }
6276 plen = end0 - beg0;
6277 rlen = RSTRING_LEN(repl);
6278 len = RSTRING_LEN(str);
6279 if (rlen > plen) {
6280 RESIZE_CAPA(str, len + rlen - plen);
6281 }
6282 p = RSTRING_PTR(str);
6283 if (rlen != plen) {
6284 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6285 }
6286 rp = RSTRING_PTR(repl);
6287 memmove(p + beg0, rp, rlen);
6288 len += rlen - plen;
6289 STR_SET_LEN(str, len);
6290 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6291 ENC_CODERANGE_SET(str, cr);
6292
6293 RB_GC_GUARD(match);
6294
6295 return str;
6296 }
6297 return Qnil;
6298}
6299
6300
6301/*
6302 * call-seq:
6303 * sub(pattern, replacement) -> new_string
6304 * sub(pattern) {|match| ... } -> new_string
6305 *
6306 * :include: doc/string/sub.rdoc
6307 */
6308
6309static VALUE
6310rb_str_sub(int argc, VALUE *argv, VALUE str)
6311{
6312 str = str_duplicate(rb_cString, str);
6313 rb_str_sub_bang(argc, argv, str);
6314 return str;
6315}
6316
6317static VALUE
6318str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6319{
6320 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6321 long beg, beg0, end0;
6322 long offset, blen, slen, len, last;
6323 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6324 char *sp, *cp;
6325 int need_backref_str = -1;
6326 rb_encoding *str_enc;
6327
6328 switch (argc) {
6329 case 1:
6330 RETURN_ENUMERATOR(str, argc, argv);
6331 mode = ITER;
6332 break;
6333 case 2:
6334 repl = argv[1];
6335 hash = rb_check_hash_type(argv[1]);
6336 if (NIL_P(hash)) {
6337 StringValue(repl);
6338 }
6339 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6340 mode = FAST_MAP;
6341 }
6342 else {
6343 mode = MAP;
6344 }
6345 break;
6346 default:
6347 rb_error_arity(argc, 1, 2);
6348 }
6349
6350 pat = get_pat_quoted(argv[0], 1);
6351 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6352
6353 if (beg < 0) {
6354 if (bang) return Qnil; /* no match, no substitution */
6355 return str_duplicate(rb_cString, str);
6356 }
6357
6358 offset = 0;
6359 blen = RSTRING_LEN(str) + 30; /* len + margin */
6360 dest = rb_str_buf_new(blen);
6361 sp = RSTRING_PTR(str);
6362 slen = RSTRING_LEN(str);
6363 cp = sp;
6364 str_enc = STR_ENC_GET(str);
6365 rb_enc_associate(dest, str_enc);
6366 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6367
6368 do {
6369 struct re_registers *regs = RMATCH_REGS(match);
6370 if (RB_TYPE_P(pat, T_STRING)) {
6371 beg0 = beg;
6372 end0 = beg0 + RSTRING_LEN(pat);
6373 match0 = pat;
6374 }
6375 else {
6376 beg0 = BEG(0);
6377 end0 = END(0);
6378 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6379 }
6380
6381 if (mode != STR) {
6382 if (mode == ITER) {
6383 val = rb_obj_as_string(rb_yield(match0));
6384 }
6385 else {
6386 struct RString fake_str = {RBASIC_INIT};
6387 VALUE key;
6388 if (mode == FAST_MAP) {
6389 // It is safe to use a fake_str here because we established that it won't escape,
6390 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6391 // default proc.
6392 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6393 }
6394 else {
6395 key = rb_str_subseq(str, beg0, end0 - beg0);
6396 }
6397 val = rb_hash_aref(hash, key);
6398 val = rb_obj_as_string(val);
6399 }
6400 str_mod_check(str, sp, slen);
6401 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6402 rb_raise(rb_eRuntimeError, "block should not cheat");
6403 }
6404 }
6405 else if (need_backref_str) {
6406 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6407 if (need_backref_str < 0) {
6408 need_backref_str = val != repl;
6409 }
6410 }
6411 else {
6412 val = repl;
6413 }
6414
6415 len = beg0 - offset; /* copy pre-match substr */
6416 if (len) {
6417 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6418 }
6419
6420 rb_str_buf_append(dest, val);
6421
6422 last = offset;
6423 offset = end0;
6424 if (beg0 == end0) {
6425 /*
6426 * Always consume at least one character of the input string
6427 * in order to prevent infinite loops.
6428 */
6429 if (RSTRING_LEN(str) <= end0) break;
6430 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6431 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6432 offset = end0 + len;
6433 }
6434 cp = RSTRING_PTR(str) + offset;
6435 if (offset > RSTRING_LEN(str)) break;
6436
6437 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6438 if (mode != FAST_MAP && mode != STR) {
6439 match = Qnil;
6440 }
6441 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6442
6443 RB_GC_GUARD(match);
6444 } while (beg >= 0);
6445
6446 if (RSTRING_LEN(str) > offset) {
6447 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6448 }
6449 rb_pat_search0(pat, str, last, 1, &match);
6450 if (bang) {
6451 str_shared_replace(str, dest);
6452 }
6453 else {
6454 str = dest;
6455 }
6456
6457 return str;
6458}
6459
6460
6461/*
6462 * call-seq:
6463 * gsub!(pattern, replacement) -> self or nil
6464 * gsub!(pattern) {|match| ... } -> self or nil
6465 * gsub!(pattern) -> an_enumerator
6466 *
6467 * Like String#gsub, except that:
6468 *
6469 * - Performs substitutions in +self+ (not in a copy of +self+).
 *  - Returns +self+ if any substitutions are made, +nil+ otherwise.
6471 *
6472 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6473 */
6474
6475static VALUE
6476rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6477{
6478 str_modify_keep_cr(str);
6479 return str_gsub(argc, argv, str, 1);
6480}
6481
6482
6483/*
6484 * call-seq:
6485 * gsub(pattern, replacement) -> new_string
6486 * gsub(pattern) {|match| ... } -> new_string
6487 * gsub(pattern) -> enumerator
6488 *
6489 * Returns a copy of +self+ with zero or more substrings replaced.
6490 *
6491 * Argument +pattern+ may be a string or a Regexp;
6492 * argument +replacement+ may be a string or a Hash.
6493 * Varying types for the argument values makes this method very versatile.
6494 *
6495 * Below are some simple examples;
6496 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6497 *
6498 * With arguments +pattern+ and string +replacement+ given,
6499 * replaces each matching substring with the given +replacement+ string:
6500 *
6501 * s = 'abracadabra'
6502 * s.gsub('ab', 'AB') # => "ABracadABra"
6503 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6504 *
6505 * With arguments +pattern+ and hash +replacement+ given,
6506 * replaces each matching substring with a value from the given +replacement+ hash,
6507 * or removes it:
6508 *
6509 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6510 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6511 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6512 *
6513 * With argument +pattern+ and a block given,
6514 * calls the block with each matching substring;
6515 * replaces that substring with the block's return value:
6516 *
6517 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6518 * # => "ABrACADABrA"
6519 *
6520 * With argument +pattern+ and no block given,
6521 * returns a new Enumerator.
6522 *
6523 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6524 */
6525
6526static VALUE
6527rb_str_gsub(int argc, VALUE *argv, VALUE str)
6528{
6529 return str_gsub(argc, argv, str, 0);
6530}
6531
6532
6533/*
6534 * call-seq:
6535 * replace(other_string) -> self
6536 *
6537 * Replaces the contents of +self+ with the contents of +other_string+;
6538 * returns +self+:
6539 *
6540 * s = 'foo' # => "foo"
6541 * s.replace('bar') # => "bar"
6542 *
6543 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6544 */
6545
6546VALUE
6548{
6549 str_modifiable(str);
6550 if (str == str2) return str;
6551
6552 StringValue(str2);
6553 str_discard(str);
6554 return str_replace(str, str2);
6555}
6556
6557/*
6558 * call-seq:
6559 * clear -> self
6560 *
6561 * Removes the contents of +self+:
6562 *
6563 * s = 'foo'
6564 * s.clear # => ""
6565 * s # => ""
6566 *
6567 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6568 */
6569
6570static VALUE
6571rb_str_clear(VALUE str)
6572{
6573 str_discard(str);
6574 STR_SET_EMBED(str);
6575 STR_SET_LEN(str, 0);
6576 RSTRING_PTR(str)[0] = 0;
6577 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6579 else
6581 return str;
6582}
6583
6584/*
6585 * call-seq:
6586 * chr -> string
6587 *
6588 * :include: doc/string/chr.rdoc
6589 *
6590 */
6591
6592static VALUE
6593rb_str_chr(VALUE str)
6594{
6595 return rb_str_substr(str, 0, 1);
6596}
6597
6598/*
6599 * call-seq:
6600 * getbyte(index) -> integer or nil
6601 *
6602 * :include: doc/string/getbyte.rdoc
6603 *
6604 */
6605VALUE
6606rb_str_getbyte(VALUE str, VALUE index)
6607{
6608 long pos = NUM2LONG(index);
6609
6610 if (pos < 0)
6611 pos += RSTRING_LEN(str);
6612 if (pos < 0 || RSTRING_LEN(str) <= pos)
6613 return Qnil;
6614
6615 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6616}
6617
6618/*
6619 * call-seq:
6620 * setbyte(index, integer) -> integer
6621 *
6622 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6623 * returns +integer+:
6624 *
6625 * s = 'xyzzy'
6626 * s.setbyte(2, 129) # => 129
6627 * s # => "xy\x81zy"
6628 *
6629 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6630 */
6631VALUE
6632rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6633{
6634 long pos = NUM2LONG(index);
6635 long len = RSTRING_LEN(str);
6636 char *ptr, *head, *left = 0;
6637 rb_encoding *enc;
6638 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6639
6640 if (pos < -len || len <= pos)
6641 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6642 if (pos < 0)
6643 pos += len;
6644
6645 VALUE v = rb_to_int(value);
6646 VALUE w = rb_int_and(v, INT2FIX(0xff));
6647 char byte = (char)(NUM2INT(w) & 0xFF);
6648
6649 if (!str_independent(str))
6650 str_make_independent(str);
6651 enc = STR_ENC_GET(str);
6652 head = RSTRING_PTR(str);
6653 ptr = &head[pos];
6654 if (!STR_EMBED_P(str)) {
6655 cr = ENC_CODERANGE(str);
6656 switch (cr) {
6657 case ENC_CODERANGE_7BIT:
6658 left = ptr;
6659 *ptr = byte;
6660 if (ISASCII(byte)) goto end;
6661 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6662 if (!MBCLEN_CHARFOUND_P(nlen))
6664 else
6666 goto end;
6668 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6669 width = rb_enc_precise_mbclen(left, head+len, enc);
6670 *ptr = byte;
6671 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6672 if (!MBCLEN_CHARFOUND_P(nlen))
6674 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6676 goto end;
6677 }
6678 }
6680 *ptr = byte;
6681
6682 end:
6683 return value;
6684}
6685
6686static VALUE
6687str_byte_substr(VALUE str, long beg, long len, int empty)
6688{
6689 long n = RSTRING_LEN(str);
6690
6691 if (beg > n || len < 0) return Qnil;
6692 if (beg < 0) {
6693 beg += n;
6694 if (beg < 0) return Qnil;
6695 }
6696 if (len > n - beg)
6697 len = n - beg;
6698 if (len <= 0) {
6699 if (!empty) return Qnil;
6700 len = 0;
6701 }
6702
6703 VALUE str2 = str_subseq(str, beg, len);
6704
6705 str_enc_copy_direct(str2, str);
6706
6707 if (RSTRING_LEN(str2) == 0) {
6708 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6710 else
6712 }
6713 else {
6714 switch (ENC_CODERANGE(str)) {
6715 case ENC_CODERANGE_7BIT:
6717 break;
6718 default:
6720 break;
6721 }
6722 }
6723
6724 return str2;
6725}
6726
6727VALUE
6728rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6729{
6730 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6731}
6732
6733static VALUE
6734str_byte_aref(VALUE str, VALUE indx)
6735{
6736 long idx;
6737 if (FIXNUM_P(indx)) {
6738 idx = FIX2LONG(indx);
6739 }
6740 else {
6741 /* check if indx is Range */
6742 long beg, len = RSTRING_LEN(str);
6743
6744 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6745 case Qfalse:
6746 break;
6747 case Qnil:
6748 return Qnil;
6749 default:
6750 return str_byte_substr(str, beg, len, TRUE);
6751 }
6752
6753 idx = NUM2LONG(indx);
6754 }
6755 return str_byte_substr(str, idx, 1, FALSE);
6756}
6757
6758/*
6759 * call-seq:
6760 * byteslice(offset, length = 1) -> string or nil
6761 * byteslice(range) -> string or nil
6762 *
6763 * :include: doc/string/byteslice.rdoc
6764 */
6765
6766static VALUE
6767rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6768{
6769 if (argc == 2) {
6770 long beg = NUM2LONG(argv[0]);
6771 long len = NUM2LONG(argv[1]);
6772 return str_byte_substr(str, beg, len, TRUE);
6773 }
6774 rb_check_arity(argc, 1, 2);
6775 return str_byte_aref(str, argv[0]);
6776}
6777
6778static void
6779str_check_beg_len(VALUE str, long *beg, long *len)
6780{
6781 long end, slen = RSTRING_LEN(str);
6782
6783 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6784 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6785 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6786 }
6787 if (*beg < 0) {
6788 *beg += slen;
6789 }
6790 RUBY_ASSERT(*beg >= 0);
6791 RUBY_ASSERT(*beg <= slen);
6792
6793 if (*len > slen - *beg) {
6794 *len = slen - *beg;
6795 }
6796 end = *beg + *len;
6797 str_ensure_byte_pos(str, *beg);
6798 str_ensure_byte_pos(str, end);
6799}
6800
6801/*
6802 * call-seq:
6803 * bytesplice(offset, length, str) -> self
6804 * bytesplice(offset, length, str, str_offset, str_length) -> self
6805 * bytesplice(range, str) -> self
6806 * bytesplice(range, str, str_range) -> self
6807 *
6808 * :include: doc/string/bytesplice.rdoc
6809 */
6810
6811static VALUE
6812rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6813{
6814 long beg, len, vbeg, vlen;
6815 VALUE val;
6816 int cr;
6817
6818 rb_check_arity(argc, 2, 5);
6819 if (!(argc == 2 || argc == 3 || argc == 5)) {
6820 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6821 }
6822 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6823 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6824 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6825 rb_builtin_class_name(argv[0]));
6826 }
6827 val = argv[1];
6828 StringValue(val);
6829 if (argc == 2) {
6830 /* bytesplice(range, str) */
6831 vbeg = 0;
6832 vlen = RSTRING_LEN(val);
6833 }
6834 else {
6835 /* bytesplice(range, str, str_range) */
6836 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6837 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6838 rb_builtin_class_name(argv[2]));
6839 }
6840 }
6841 }
6842 else {
6843 beg = NUM2LONG(argv[0]);
6844 len = NUM2LONG(argv[1]);
6845 val = argv[2];
6846 StringValue(val);
6847 if (argc == 3) {
6848 /* bytesplice(index, length, str) */
6849 vbeg = 0;
6850 vlen = RSTRING_LEN(val);
6851 }
6852 else {
6853 /* bytesplice(index, length, str, str_index, str_length) */
6854 vbeg = NUM2LONG(argv[3]);
6855 vlen = NUM2LONG(argv[4]);
6856 }
6857 }
6858 str_check_beg_len(str, &beg, &len);
6859 str_check_beg_len(val, &vbeg, &vlen);
6860 str_modify_keep_cr(str);
6861
6862 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6863 rb_enc_associate(str, rb_enc_check(str, val));
6864 }
6865
6866 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6868 if (cr != ENC_CODERANGE_BROKEN)
6869 ENC_CODERANGE_SET(str, cr);
6870 return str;
6871}
6872
6873/*
6874 * call-seq:
6875 * reverse -> new_string
6876 *
6877 * Returns a new string with the characters from +self+ in reverse order.
6878 *
6879 * 'drawer'.reverse # => "reward"
6880 * 'reviled'.reverse # => "deliver"
6881 * 'stressed'.reverse # => "desserts"
6882 * 'semordnilaps'.reverse # => "spalindromes"
6883 *
6884 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6885 */
6886
6887static VALUE
6888rb_str_reverse(VALUE str)
6889{
6890 rb_encoding *enc;
6891 VALUE rev;
6892 char *s, *e, *p;
6893 int cr;
6894
6895 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6896 enc = STR_ENC_GET(str);
6897 rev = rb_str_new(0, RSTRING_LEN(str));
6898 s = RSTRING_PTR(str); e = RSTRING_END(str);
6899 p = RSTRING_END(rev);
6900 cr = ENC_CODERANGE(str);
6901
6902 if (RSTRING_LEN(str) > 1) {
6903 if (single_byte_optimizable(str)) {
6904 while (s < e) {
6905 *--p = *s++;
6906 }
6907 }
6908 else if (cr == ENC_CODERANGE_VALID) {
6909 while (s < e) {
6910 int clen = rb_enc_fast_mbclen(s, e, enc);
6911
6912 p -= clen;
6913 memcpy(p, s, clen);
6914 s += clen;
6915 }
6916 }
6917 else {
6918 cr = rb_enc_asciicompat(enc) ?
6920 while (s < e) {
6921 int clen = rb_enc_mbclen(s, e, enc);
6922
6923 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6924 p -= clen;
6925 memcpy(p, s, clen);
6926 s += clen;
6927 }
6928 }
6929 }
6930 STR_SET_LEN(rev, RSTRING_LEN(str));
6931 str_enc_copy_direct(rev, str);
6932 ENC_CODERANGE_SET(rev, cr);
6933
6934 return rev;
6935}
6936
6937
6938/*
6939 * call-seq:
6940 * reverse! -> self
6941 *
6942 * Returns +self+ with its characters reversed:
6943 *
6944 * 'drawer'.reverse! # => "reward"
6945 * 'reviled'.reverse! # => "deliver"
6946 * 'stressed'.reverse! # => "desserts"
6947 * 'semordnilaps'.reverse! # => "spalindromes"
6948 *
6949 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6950 */
6951
6952static VALUE
6953rb_str_reverse_bang(VALUE str)
6954{
6955 if (RSTRING_LEN(str) > 1) {
6956 if (single_byte_optimizable(str)) {
6957 char *s, *e, c;
6958
6959 str_modify_keep_cr(str);
6960 s = RSTRING_PTR(str);
6961 e = RSTRING_END(str) - 1;
6962 while (s < e) {
6963 c = *s;
6964 *s++ = *e;
6965 *e-- = c;
6966 }
6967 }
6968 else {
6969 str_shared_replace(str, rb_str_reverse(str));
6970 }
6971 }
6972 else {
6973 str_modify_keep_cr(str);
6974 }
6975 return str;
6976}
6977
6978
6979/*
6980 * call-seq:
6981 * include?(other_string) -> true or false
6982 *
6983 * Returns whether +self+ contains +other_string+:
6984 *
6985 * s = 'bar'
6986 * s.include?('ba') # => true
6987 * s.include?('ar') # => true
6988 * s.include?('bar') # => true
6989 * s.include?('a') # => true
6990 * s.include?('') # => true
6991 * s.include?('foo') # => false
6992 *
6993 * Related: see {Querying}[rdoc-ref:String@Querying].
6994 */
6995
6996VALUE
6997rb_str_include(VALUE str, VALUE arg)
6998{
6999 long i;
7000
7001 StringValue(arg);
7002 i = rb_str_index(str, arg, 0);
7003
7004 return RBOOL(i != -1);
7005}
7006
7007
7008/*
7009 * call-seq:
7010 * to_i(base = 10) -> integer
7011 *
7012 * Returns the result of interpreting leading characters in +self+
7013 * as an integer in the given +base+;
7014 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7015 *
7016 * '123456'.to_i # => 123456
7017 * '123def'.to_i(16) # => 1195503
7018 *
 *  With +base+ zero given, +self+ may contain leading characters
7020 * to specify the actual base:
7021 *
7022 * '123def'.to_i(0) # => 123
7023 * '0123def'.to_i(0) # => 83
7024 * '0b123def'.to_i(0) # => 1
7025 * '0o123def'.to_i(0) # => 83
7026 * '0d123def'.to_i(0) # => 123
7027 * '0x123def'.to_i(0) # => 1195503
7028 *
7029 * Characters past a leading valid number (in the given +base+) are ignored:
7030 *
7031 * '12.345'.to_i # => 12
7032 * '12345'.to_i(2) # => 1
7033 *
7034 * Returns zero if there is no leading valid number:
7035 *
7036 * 'abcdef'.to_i # => 0
7037 * '2'.to_i(2) # => 0
7038 *
7039 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7040 */
7041
7042static VALUE
7043rb_str_to_i(int argc, VALUE *argv, VALUE str)
7044{
7045 int base = 10;
7046
7047 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7048 rb_raise(rb_eArgError, "invalid radix %d", base);
7049 }
7050 return rb_str_to_inum(str, base, FALSE);
7051}
7052
7053
7054/*
7055 * call-seq:
7056 * to_f -> float
7057 *
7058 * Returns the result of interpreting leading characters in +self+ as a Float:
7059 *
7060 * '3.14159'.to_f # => 3.14159
7061 * '1.234e-2'.to_f # => 0.01234
7062 *
7063 * Characters past a leading valid number are ignored:
7064 *
7065 * '3.14 (pi to two places)'.to_f # => 3.14
7066 *
7067 * Returns zero if there is no leading valid number:
7068 *
7069 * 'abcdef'.to_f # => 0.0
7070 *
7071 * See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7072 */
7073
7074static VALUE
7075rb_str_to_f(VALUE str)
7076{
7077 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7078}
7079
7080
7081/*
7082 * call-seq:
7083 * to_s -> self or new_string
7084 *
7085 * Returns +self+ if +self+ is a +String+,
7086 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7087 *
7088 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7089 */
7090
7091static VALUE
7092rb_str_to_s(VALUE str)
7093{
7094 if (rb_obj_class(str) != rb_cString) {
7095 return str_duplicate(rb_cString, str);
7096 }
7097 return str;
7098}
7099
#if 0
/* Compiled out.  Encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7111
7112#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7113
7114int
7115rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7116{
7117 char buf[CHAR_ESC_LEN + 1];
7118 int l;
7119
7120#if SIZEOF_INT > 4
7121 c &= 0xffffffff;
7122#endif
7123 if (unicode_p) {
7124 if (c < 0x7F && ISPRINT(c)) {
7125 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7126 }
7127 else if (c < 0x10000) {
7128 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7129 }
7130 else {
7131 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7132 }
7133 }
7134 else {
7135 if (c < 0x100) {
7136 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7137 }
7138 else {
7139 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7140 }
7141 }
7142 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7143 rb_str_buf_cat(result, buf, l);
7144 return l;
7145}
7146
/*
 * Returns the backslash escape for the control characters listed in the
 * table below, or NULL when +c+ has no such escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    const int count = (int)(sizeof(escapes) / sizeof(escapes[0]));
    int i;

    for (i = 0; i < count; i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
7164
7165VALUE
7166rb_str_escape(VALUE str)
7167{
7168 int encidx = ENCODING_GET(str);
7169 rb_encoding *enc = rb_enc_from_index(encidx);
7170 const char *p = RSTRING_PTR(str);
7171 const char *pend = RSTRING_END(str);
7172 const char *prev = p;
7173 char buf[CHAR_ESC_LEN + 1];
7174 VALUE result = rb_str_buf_new(0);
7175 int unicode_p = rb_enc_unicode_p(enc);
7176 int asciicompat = rb_enc_asciicompat(enc);
7177
7178 while (p < pend) {
7179 unsigned int c;
7180 const char *cc;
7181 int n = rb_enc_precise_mbclen(p, pend, enc);
7182 if (!MBCLEN_CHARFOUND_P(n)) {
7183 if (p > prev) str_buf_cat(result, prev, p - prev);
7184 n = rb_enc_mbminlen(enc);
7185 if (pend < p + n)
7186 n = (int)(pend - p);
7187 while (n--) {
7188 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7189 str_buf_cat(result, buf, strlen(buf));
7190 prev = ++p;
7191 }
7192 continue;
7193 }
7194 n = MBCLEN_CHARFOUND_LEN(n);
7195 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7196 p += n;
7197 cc = ruby_escaped_char(c);
7198 if (cc) {
7199 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7200 str_buf_cat(result, cc, strlen(cc));
7201 prev = p;
7202 }
7203 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7204 }
7205 else {
7206 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7207 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7208 prev = p;
7209 }
7210 }
7211 if (p > prev) str_buf_cat(result, prev, p - prev);
7212 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7213
7214 return result;
7215}
7216
7217/*
7218 * call-seq:
7219 * inspect -> string
7220 *
7221 * :include: doc/string/inspect.rdoc
7222 *
7223 */
7224
7225VALUE
7227{
7228 int encidx = ENCODING_GET(str);
7229 rb_encoding *enc = rb_enc_from_index(encidx);
7230 const char *p, *pend, *prev;
7231 char buf[CHAR_ESC_LEN + 1];
7232 VALUE result = rb_str_buf_new(0);
7233 rb_encoding *resenc = rb_default_internal_encoding();
7234 int unicode_p = rb_enc_unicode_p(enc);
7235 int asciicompat = rb_enc_asciicompat(enc);
7236
7237 if (resenc == NULL) resenc = rb_default_external_encoding();
7238 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7239 rb_enc_associate(result, resenc);
7240 str_buf_cat2(result, "\"");
7241
7242 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7243 prev = p;
7244 while (p < pend) {
7245 unsigned int c, cc;
7246 int n;
7247
7248 n = rb_enc_precise_mbclen(p, pend, enc);
7249 if (!MBCLEN_CHARFOUND_P(n)) {
7250 if (p > prev) str_buf_cat(result, prev, p - prev);
7251 n = rb_enc_mbminlen(enc);
7252 if (pend < p + n)
7253 n = (int)(pend - p);
7254 while (n--) {
7255 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7256 str_buf_cat(result, buf, strlen(buf));
7257 prev = ++p;
7258 }
7259 continue;
7260 }
7261 n = MBCLEN_CHARFOUND_LEN(n);
7262 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7263 p += n;
7264 if ((asciicompat || unicode_p) &&
7265 (c == '"'|| c == '\\' ||
7266 (c == '#' &&
7267 p < pend &&
7268 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7269 (cc = rb_enc_codepoint(p,pend,enc),
7270 (cc == '$' || cc == '@' || cc == '{'))))) {
7271 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7272 str_buf_cat2(result, "\\");
7273 if (asciicompat || enc == resenc) {
7274 prev = p - n;
7275 continue;
7276 }
7277 }
7278 switch (c) {
7279 case '\n': cc = 'n'; break;
7280 case '\r': cc = 'r'; break;
7281 case '\t': cc = 't'; break;
7282 case '\f': cc = 'f'; break;
7283 case '\013': cc = 'v'; break;
7284 case '\010': cc = 'b'; break;
7285 case '\007': cc = 'a'; break;
7286 case 033: cc = 'e'; break;
7287 default: cc = 0; break;
7288 }
7289 if (cc) {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 buf[0] = '\\';
7292 buf[1] = (char)cc;
7293 str_buf_cat(result, buf, 2);
7294 prev = p;
7295 continue;
7296 }
7297 /* The special casing of 0x85 (NEXT_LINE) here is because
7298 * Oniguruma historically treats it as printable, but it
7299 * doesn't match the print POSIX bracket class or character
7300 * property in regexps.
7301 *
7302 * See Ruby Bug #16842 for details:
7303 * https://bugs.ruby-lang.org/issues/16842
7304 */
7305 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7306 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7307 continue;
7308 }
7309 else {
7310 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7311 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7312 prev = p;
7313 continue;
7314 }
7315 }
7316 if (p > prev) str_buf_cat(result, prev, p - prev);
7317 str_buf_cat2(result, "\"");
7318
7319 return result;
7320}
7321
/*
 * True when p is still inside [p, e) and points at '$', '@' or '{',
 * i.e. one of the characters that make a preceding '#' need a backslash
 * in the dumped form.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '{' || *(p) == '@' || *(p) == '$'))
7323
7324/*
7325 * call-seq:
7326 * dump -> new_string
7327 *
7328 * :include: doc/string/dump.rdoc
7329 *
7330 */
7331
7332VALUE
7334{
7335 int encidx = rb_enc_get_index(str);
7336 rb_encoding *enc = rb_enc_from_index(encidx);
7337 long len;
7338 const char *p, *pend;
7339 char *q, *qend;
7340 VALUE result;
7341 int u8 = (encidx == rb_utf8_encindex());
7342 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7343
7344 len = 2; /* "" */
7345 if (!rb_enc_asciicompat(enc)) {
7346 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7347 len += strlen(enc->name);
7348 }
7349
7350 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7351 while (p < pend) {
7352 int clen;
7353 unsigned char c = *p++;
7354
7355 switch (c) {
7356 case '"': case '\\':
7357 case '\n': case '\r':
7358 case '\t': case '\f':
7359 case '\013': case '\010': case '\007': case '\033':
7360 clen = 2;
7361 break;
7362
7363 case '#':
7364 clen = IS_EVSTR(p, pend) ? 2 : 1;
7365 break;
7366
7367 default:
7368 if (ISPRINT(c)) {
7369 clen = 1;
7370 }
7371 else {
7372 if (u8 && c > 0x7F) { /* \u notation */
7373 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7374 if (MBCLEN_CHARFOUND_P(n)) {
7375 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7376 if (cc <= 0xFFFF)
7377 clen = 6; /* \uXXXX */
7378 else if (cc <= 0xFFFFF)
7379 clen = 9; /* \u{XXXXX} */
7380 else
7381 clen = 10; /* \u{XXXXXX} */
7382 p += MBCLEN_CHARFOUND_LEN(n)-1;
7383 break;
7384 }
7385 }
7386 clen = 4; /* \xNN */
7387 }
7388 break;
7389 }
7390
7391 if (clen > LONG_MAX - len) {
7392 rb_raise(rb_eRuntimeError, "string size too big");
7393 }
7394 len += clen;
7395 }
7396
7397 result = rb_str_new(0, len);
7398 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7399 q = RSTRING_PTR(result); qend = q + len + 1;
7400
7401 *q++ = '"';
7402 while (p < pend) {
7403 unsigned char c = *p++;
7404
7405 if (c == '"' || c == '\\') {
7406 *q++ = '\\';
7407 *q++ = c;
7408 }
7409 else if (c == '#') {
7410 if (IS_EVSTR(p, pend)) *q++ = '\\';
7411 *q++ = '#';
7412 }
7413 else if (c == '\n') {
7414 *q++ = '\\';
7415 *q++ = 'n';
7416 }
7417 else if (c == '\r') {
7418 *q++ = '\\';
7419 *q++ = 'r';
7420 }
7421 else if (c == '\t') {
7422 *q++ = '\\';
7423 *q++ = 't';
7424 }
7425 else if (c == '\f') {
7426 *q++ = '\\';
7427 *q++ = 'f';
7428 }
7429 else if (c == '\013') {
7430 *q++ = '\\';
7431 *q++ = 'v';
7432 }
7433 else if (c == '\010') {
7434 *q++ = '\\';
7435 *q++ = 'b';
7436 }
7437 else if (c == '\007') {
7438 *q++ = '\\';
7439 *q++ = 'a';
7440 }
7441 else if (c == '\033') {
7442 *q++ = '\\';
7443 *q++ = 'e';
7444 }
7445 else if (ISPRINT(c)) {
7446 *q++ = c;
7447 }
7448 else {
7449 *q++ = '\\';
7450 if (u8) {
7451 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7452 if (MBCLEN_CHARFOUND_P(n)) {
7453 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7454 p += n;
7455 if (cc <= 0xFFFF)
7456 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7457 else
7458 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7459 q += strlen(q);
7460 continue;
7461 }
7462 }
7463 snprintf(q, qend-q, "x%02X", c);
7464 q += 3;
7465 }
7466 }
7467 *q++ = '"';
7468 *q = '\0';
7469 if (!rb_enc_asciicompat(enc)) {
7470 snprintf(q, qend-q, nonascii_suffix, enc->name);
7471 encidx = rb_ascii8bit_encindex();
7472 }
7473 /* result from dump is ASCII */
7474 rb_enc_associate_index(result, encidx);
7476 return result;
7477}
7478
/*
 * Maps the letter of a single-character backslash escape (the 'n' of
 * "\n", and so on) to the character it stands for.
 *
 * Returns -1 for any other letter, so that no path falls off the end of
 * this value-returning function.  The caller in undump_after_backslash()
 * only passes the eight letters handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7502
7503static void
7504undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7505{
7506 const char *s = *ss;
7507 unsigned int c;
7508 int codelen;
7509 size_t hexlen;
7510 unsigned char buf[6];
7511 static rb_encoding *enc_utf8 = NULL;
7512
7513 switch (*s) {
7514 case '\\':
7515 case '"':
7516 case '#':
7517 rb_str_cat(undumped, s, 1); /* cat itself */
7518 s++;
7519 break;
7520 case 'n':
7521 case 'r':
7522 case 't':
7523 case 'f':
7524 case 'v':
7525 case 'b':
7526 case 'a':
7527 case 'e':
7528 *buf = unescape_ascii(*s);
7529 rb_str_cat(undumped, (char *)buf, 1);
7530 s++;
7531 break;
7532 case 'u':
7533 if (*binary) {
7534 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7535 }
7536 *utf8 = true;
7537 if (++s >= s_end) {
7538 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7539 }
7540 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7541 if (*penc != enc_utf8) {
7542 *penc = enc_utf8;
7543 rb_enc_associate(undumped, enc_utf8);
7544 }
7545 if (*s == '{') { /* handle \u{...} form */
7546 s++;
7547 for (;;) {
7548 if (s >= s_end) {
7549 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7550 }
7551 if (*s == '}') {
7552 s++;
7553 break;
7554 }
7555 if (ISSPACE(*s)) {
7556 s++;
7557 continue;
7558 }
7559 c = scan_hex(s, s_end-s, &hexlen);
7560 if (hexlen == 0 || hexlen > 6) {
7561 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7562 }
7563 if (c > 0x10ffff) {
7564 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7565 }
7566 if (0xd800 <= c && c <= 0xdfff) {
7567 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7568 }
7569 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7570 rb_str_cat(undumped, (char *)buf, codelen);
7571 s += hexlen;
7572 }
7573 }
7574 else { /* handle \uXXXX form */
7575 c = scan_hex(s, 4, &hexlen);
7576 if (hexlen != 4) {
7577 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7578 }
7579 if (0xd800 <= c && c <= 0xdfff) {
7580 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7581 }
7582 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7583 rb_str_cat(undumped, (char *)buf, codelen);
7584 s += hexlen;
7585 }
7586 break;
7587 case 'x':
7588 if (++s >= s_end) {
7589 rb_raise(rb_eRuntimeError, "invalid hex escape");
7590 }
7591 *buf = scan_hex(s, 2, &hexlen);
7592 if (hexlen != 2) {
7593 rb_raise(rb_eRuntimeError, "invalid hex escape");
7594 }
7595 if (!ISASCII(*buf)) {
7596 if (*utf8) {
7597 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7598 }
7599 *binary = true;
7600 }
7601 rb_str_cat(undumped, (char *)buf, 1);
7602 s += hexlen;
7603 break;
7604 default:
7605 rb_str_cat(undumped, s-1, 2);
7606 s++;
7607 }
7608
7609 *ss = s;
7610}
7611
7612static VALUE rb_str_is_ascii_only_p(VALUE str);
7613
7614/*
7615 * call-seq:
7616 * undump -> new_string
7617 *
7618 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7619 *
7620 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7621 */
7622
7623static VALUE
7624str_undump(VALUE str)
7625{
7626 const char *s = RSTRING_PTR(str);
7627 const char *s_end = RSTRING_END(str);
7628 rb_encoding *enc = rb_enc_get(str);
7629 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7630 bool utf8 = false;
7631 bool binary = false;
7632 int w;
7633
7635 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7636 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7637 }
7638 if (!str_null_check(str, &w)) {
7639 rb_raise(rb_eRuntimeError, "string contains null byte");
7640 }
7641 if (RSTRING_LEN(str) < 2) goto invalid_format;
7642 if (*s != '"') goto invalid_format;
7643
7644 /* strip '"' at the start */
7645 s++;
7646
7647 for (;;) {
7648 if (s >= s_end) {
7649 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7650 }
7651
7652 if (*s == '"') {
7653 /* epilogue */
7654 s++;
7655 if (s == s_end) {
7656 /* ascii compatible dumped string */
7657 break;
7658 }
7659 else {
7660 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7661 static const char dup_suffix[] = ".dup";
7662 const char *encname;
7663 int encidx;
7664 ptrdiff_t size;
7665
7666 /* check separately for strings dumped by older versions */
7667 size = sizeof(dup_suffix) - 1;
7668 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7669
7670 size = sizeof(force_encoding_suffix) - 1;
7671 if (s_end - s <= size) goto invalid_format;
7672 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7673 s += size;
7674
7675 if (utf8) {
7676 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7677 }
7678
7679 encname = s;
7680 s = memchr(s, '"', s_end-s);
7681 size = s - encname;
7682 if (!s) goto invalid_format;
7683 if (s_end - s != 2) goto invalid_format;
7684 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7685
7686 encidx = rb_enc_find_index2(encname, (long)size);
7687 if (encidx < 0) {
7688 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7689 }
7690 rb_enc_associate_index(undumped, encidx);
7691 }
7692 break;
7693 }
7694
7695 if (*s == '\\') {
7696 s++;
7697 if (s >= s_end) {
7698 rb_raise(rb_eRuntimeError, "invalid escape");
7699 }
7700 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7701 }
7702 else {
7703 rb_str_cat(undumped, s++, 1);
7704 }
7705 }
7706
7707 RB_GC_GUARD(str);
7708
7709 return undumped;
7710invalid_format:
7711 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7712}
7713
7714static void
7715rb_str_check_dummy_enc(rb_encoding *enc)
7716{
7717 if (rb_enc_dummy_p(enc)) {
7718 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7719 rb_enc_name(enc));
7720 }
7721}
7722
7723static rb_encoding *
7724str_true_enc(VALUE str)
7725{
7726 rb_encoding *enc = STR_ENC_GET(str);
7727 rb_str_check_dummy_enc(enc);
7728 return enc;
7729}
7730
7731static OnigCaseFoldType
7732check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7733{
7734 if (argc==0)
7735 return flags;
7736 if (argc>2)
7737 rb_raise(rb_eArgError, "too many options");
7738 if (argv[0]==sym_turkic) {
7739 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7740 if (argc==2) {
7741 if (argv[1]==sym_lithuanian)
7742 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7743 else
7744 rb_raise(rb_eArgError, "invalid second option");
7745 }
7746 }
7747 else if (argv[0]==sym_lithuanian) {
7748 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7749 if (argc==2) {
7750 if (argv[1]==sym_turkic)
7751 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7752 else
7753 rb_raise(rb_eArgError, "invalid second option");
7754 }
7755 }
7756 else if (argc>1)
7757 rb_raise(rb_eArgError, "too many options");
7758 else if (argv[0]==sym_ascii)
7759 flags |= ONIGENC_CASE_ASCII_ONLY;
7760 else if (argv[0]==sym_fold) {
7761 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7762 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7763 else
7764 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7765 }
7766 else
7767 rb_raise(rb_eArgError, "invalid option");
7768 return flags;
7769}
7770
7771static inline bool
7772case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7773{
7774 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7775 return true;
7776 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7777}
7778
/*
 * Extra bytes added to the capacity of each mapping buffer allocated by
 * rb_str_casemap().  16 should be long enough to absorb any kind of
 * single character length increase; the value actually used is 20.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Define CASEMAP_DEBUG to a non-zero value to print buffer allocation
 * diagnostics to stderr and to enable the length check in
 * rb_str_ascii_casemap(). */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
7784
7785struct mapping_buffer;
7786typedef struct mapping_buffer {
7787 size_t capa;
7788 size_t used;
7789 struct mapping_buffer *next;
7790 OnigUChar space[FLEX_ARY_LEN];
7792
7793static void
7794mapping_buffer_free(void *p)
7795{
7796 mapping_buffer *previous_buffer;
7797 mapping_buffer *current_buffer = p;
7798 while (current_buffer) {
7799 previous_buffer = current_buffer;
7800 current_buffer = current_buffer->next;
7801 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7802 }
7803}
7804
/*
 * Typed-data descriptor for the anchor object that rb_str_casemap() wraps
 * around its buffer chain.  The first callback slot is left empty and
 * mapping_buffer_free() is installed in the second, so the chain is
 * released together with the anchor.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7810
7811static VALUE
7812rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7813{
7814 VALUE target;
7815
7816 const OnigUChar *source_current, *source_end;
7817 int target_length = 0;
7818 VALUE buffer_anchor;
7819 mapping_buffer *current_buffer = 0;
7820 mapping_buffer **pre_buffer;
7821 size_t buffer_count = 0;
7822 int buffer_length_or_invalid;
7823
7824 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7825
7826 source_current = (OnigUChar*)RSTRING_PTR(source);
7827 source_end = (OnigUChar*)RSTRING_END(source);
7828
7829 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7830 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7831 while (source_current < source_end) {
7832 /* increase multiplier using buffer count to converge quickly */
7833 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7834 if (CASEMAP_DEBUG) {
7835 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7836 }
7837 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7838 *pre_buffer = current_buffer;
7839 pre_buffer = &current_buffer->next;
7840 current_buffer->next = NULL;
7841 current_buffer->capa = capa;
7842 buffer_length_or_invalid = enc->case_map(flags,
7843 &source_current, source_end,
7844 current_buffer->space,
7845 current_buffer->space+current_buffer->capa,
7846 enc);
7847 if (buffer_length_or_invalid < 0) {
7848 current_buffer = DATA_PTR(buffer_anchor);
7849 DATA_PTR(buffer_anchor) = 0;
7850 mapping_buffer_free(current_buffer);
7851 rb_raise(rb_eArgError, "input string invalid");
7852 }
7853 target_length += current_buffer->used = buffer_length_or_invalid;
7854 }
7855 if (CASEMAP_DEBUG) {
7856 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7857 }
7858
7859 if (buffer_count==1) {
7860 target = rb_str_new((const char*)current_buffer->space, target_length);
7861 }
7862 else {
7863 char *target_current;
7864
7865 target = rb_str_new(0, target_length);
7866 target_current = RSTRING_PTR(target);
7867 current_buffer = DATA_PTR(buffer_anchor);
7868 while (current_buffer) {
7869 memcpy(target_current, current_buffer->space, current_buffer->used);
7870 target_current += current_buffer->used;
7871 current_buffer = current_buffer->next;
7872 }
7873 }
7874 current_buffer = DATA_PTR(buffer_anchor);
7875 DATA_PTR(buffer_anchor) = 0;
7876 mapping_buffer_free(current_buffer);
7877
7878 RB_GC_GUARD(buffer_anchor);
7879
7880 /* TODO: check about string terminator character */
7881 str_enc_copy_direct(target, source);
7882 /*ENC_CODERANGE_SET(mapped, cr);*/
7883
7884 return target;
7885}
7886
7887static VALUE
7888rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7889{
7890 const OnigUChar *source_current, *source_end;
7891 OnigUChar *target_current, *target_end;
7892 long old_length = RSTRING_LEN(source);
7893 int length_or_invalid;
7894
7895 if (old_length == 0) return Qnil;
7896
7897 source_current = (OnigUChar*)RSTRING_PTR(source);
7898 source_end = (OnigUChar*)RSTRING_END(source);
7899 if (source == target) {
7900 target_current = (OnigUChar*)source_current;
7901 target_end = (OnigUChar*)source_end;
7902 }
7903 else {
7904 target_current = (OnigUChar*)RSTRING_PTR(target);
7905 target_end = (OnigUChar*)RSTRING_END(target);
7906 }
7907
7908 length_or_invalid = onigenc_ascii_only_case_map(flags,
7909 &source_current, source_end,
7910 target_current, target_end, enc);
7911 if (length_or_invalid < 0)
7912 rb_raise(rb_eArgError, "input string invalid");
7913 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7914 fprintf(stderr, "problem with rb_str_ascii_casemap"
7915 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7916 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7917 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7918 }
7919
7920 str_enc_copy(target, source);
7921
7922 return target;
7923}
7924
7925static bool
7926upcase_single(VALUE str)
7927{
7928 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7929 bool modified = false;
7930
7931 while (s < send) {
7932 unsigned int c = *(unsigned char*)s;
7933
7934 if ('a' <= c && c <= 'z') {
7935 *s = 'A' + (c - 'a');
7936 modified = true;
7937 }
7938 s++;
7939 }
7940 return modified;
7941}
7942
7943/*
7944 * call-seq:
7945 * upcase!(mapping) -> self or nil
7946 *
7947 * Like String#upcase, except that:
7948 *
7949 * - Changes character casings in +self+ (not in a copy of +self+).
7950 * - Returns +self+ if any changes are made, +nil+ otherwise.
7951 *
7952 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7953 */
7954
7955static VALUE
7956rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7957{
7958 rb_encoding *enc;
7959 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7960
7961 flags = check_case_options(argc, argv, flags);
7962 str_modify_keep_cr(str);
7963 enc = str_true_enc(str);
7964 if (case_option_single_p(flags, enc, str)) {
7965 if (upcase_single(str))
7966 flags |= ONIGENC_CASE_MODIFIED;
7967 }
7968 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7969 rb_str_ascii_casemap(str, str, &flags, enc);
7970 else
7971 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7972
7973 if (ONIGENC_CASE_MODIFIED&flags) return str;
7974 return Qnil;
7975}
7976
7977
7978/*
7979 * call-seq:
7980 * upcase(mapping = :ascii) -> new_string
7981 *
7982 * :include: doc/string/upcase.rdoc
7983 */
7984
7985static VALUE
7986rb_str_upcase(int argc, VALUE *argv, VALUE str)
7987{
7988 rb_encoding *enc;
7989 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7990 VALUE ret;
7991
7992 flags = check_case_options(argc, argv, flags);
7993 enc = str_true_enc(str);
7994 if (case_option_single_p(flags, enc, str)) {
7995 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7996 str_enc_copy_direct(ret, str);
7997 upcase_single(ret);
7998 }
7999 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8000 ret = rb_str_new(0, RSTRING_LEN(str));
8001 rb_str_ascii_casemap(str, ret, &flags, enc);
8002 }
8003 else {
8004 ret = rb_str_casemap(str, &flags, enc);
8005 }
8006
8007 return ret;
8008}
8009
8010static bool
8011downcase_single(VALUE str)
8012{
8013 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8014 bool modified = false;
8015
8016 while (s < send) {
8017 unsigned int c = *(unsigned char*)s;
8018
8019 if ('A' <= c && c <= 'Z') {
8020 *s = 'a' + (c - 'A');
8021 modified = true;
8022 }
8023 s++;
8024 }
8025
8026 return modified;
8027}
8028
8029/*
8030 * call-seq:
8031 * downcase!(mapping) -> self or nil
8032 *
8033 * Like String#downcase, except that:
8034 *
8035 * - Changes character casings in +self+ (not in a copy of +self+).
8036 * - Returns +self+ if any changes are made, +nil+ otherwise.
8037 *
8038 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8039 */
8040
8041static VALUE
8042rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8043{
8044 rb_encoding *enc;
8045 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8046
8047 flags = check_case_options(argc, argv, flags);
8048 str_modify_keep_cr(str);
8049 enc = str_true_enc(str);
8050 if (case_option_single_p(flags, enc, str)) {
8051 if (downcase_single(str))
8052 flags |= ONIGENC_CASE_MODIFIED;
8053 }
8054 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8055 rb_str_ascii_casemap(str, str, &flags, enc);
8056 else
8057 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8058
8059 if (ONIGENC_CASE_MODIFIED&flags) return str;
8060 return Qnil;
8061}
8062
8063
8064/*
8065 * call-seq:
8066 * downcase(mapping = :ascii) -> new_string
8067 *
8068 * :include: doc/string/downcase.rdoc
8069 *
8070 */
8071
8072static VALUE
8073rb_str_downcase(int argc, VALUE *argv, VALUE str)
8074{
8075 rb_encoding *enc;
8076 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8077 VALUE ret;
8078
8079 flags = check_case_options(argc, argv, flags);
8080 enc = str_true_enc(str);
8081 if (case_option_single_p(flags, enc, str)) {
8082 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8083 str_enc_copy_direct(ret, str);
8084 downcase_single(ret);
8085 }
8086 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8087 ret = rb_str_new(0, RSTRING_LEN(str));
8088 rb_str_ascii_casemap(str, ret, &flags, enc);
8089 }
8090 else {
8091 ret = rb_str_casemap(str, &flags, enc);
8092 }
8093
8094 return ret;
8095}
8096
8097
8098/*
8099 * call-seq:
8100 * capitalize!(mapping = :ascii) -> self or nil
8101 *
8102 * Like String#capitalize, except that:
8103 *
8104 * - Changes character casings in +self+ (not in a copy of +self+).
8105 * - Returns +self+ if any changes are made, +nil+ otherwise.
8106 *
8107 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8108 */
8109
8110static VALUE
8111rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8112{
8113 rb_encoding *enc;
8114 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8115
8116 flags = check_case_options(argc, argv, flags);
8117 str_modify_keep_cr(str);
8118 enc = str_true_enc(str);
8119 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8120 if (flags&ONIGENC_CASE_ASCII_ONLY)
8121 rb_str_ascii_casemap(str, str, &flags, enc);
8122 else
8123 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8124
8125 if (ONIGENC_CASE_MODIFIED&flags) return str;
8126 return Qnil;
8127}
8128
8129
8130/*
8131 * call-seq:
8132 * capitalize(mapping = :ascii) -> new_string
8133 *
8134 * :include: doc/string/capitalize.rdoc
8135 *
8136 */
8137
8138static VALUE
8139rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8140{
8141 rb_encoding *enc;
8142 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8143 VALUE ret;
8144
8145 flags = check_case_options(argc, argv, flags);
8146 enc = str_true_enc(str);
8147 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8148 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8149 ret = rb_str_new(0, RSTRING_LEN(str));
8150 rb_str_ascii_casemap(str, ret, &flags, enc);
8151 }
8152 else {
8153 ret = rb_str_casemap(str, &flags, enc);
8154 }
8155 return ret;
8156}
8157
8158
8159/*
8160 * call-seq:
8161 * swapcase!(mapping) -> self or nil
8162 *
8163 * Like String#swapcase, except that:
8164 *
8165 * - Changes are made to +self+, not to copy of +self+.
8166 * - Returns +self+ if any changes are made, +nil+ otherwise.
8167 *
8168 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8169 */
8170
8171static VALUE
8172rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8173{
8174 rb_encoding *enc;
8175 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8176
8177 flags = check_case_options(argc, argv, flags);
8178 str_modify_keep_cr(str);
8179 enc = str_true_enc(str);
8180 if (flags&ONIGENC_CASE_ASCII_ONLY)
8181 rb_str_ascii_casemap(str, str, &flags, enc);
8182 else
8183 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8184
8185 if (ONIGENC_CASE_MODIFIED&flags) return str;
8186 return Qnil;
8187}
8188
8189
8190/*
8191 * call-seq:
8192 * swapcase(mapping = :ascii) -> new_string
8193 *
8194 * :include: doc/string/swapcase.rdoc
8195 *
8196 */
8197
8198static VALUE
8199rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8200{
8201 rb_encoding *enc;
8202 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8203 VALUE ret;
8204
8205 flags = check_case_options(argc, argv, flags);
8206 enc = str_true_enc(str);
8207 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8208 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8209 ret = rb_str_new(0, RSTRING_LEN(str));
8210 rb_str_ascii_casemap(str, ret, &flags, enc);
8211 }
8212 else {
8213 ret = rb_str_casemap(str, &flags, enc);
8214 }
8215 return ret;
8216}
8217
/* Shorthand for a pointer to unsigned bytes. */
typedef unsigned char *USTR;
8219
/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;                    /* non-zero while a range "a-z" is being expanded */
    unsigned int now, max;      /* code point last produced / last code point of the current range */
    char *p, *pend;             /* read position in the specification / its end */
};
8225
/*
 * Produces the next code point described by the specification in +t+.
 *
 * Returns (unsigned int)-1 once the specification is exhausted.
 * Raises ArgumentError when a range runs backwards (e.g. "z-a").
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item from the specification */
            if (t->p == t->pend) return -1;
            /* a backslash that is not the last character escapes the next one */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' that is not the last character introduces a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* only print the bounds when both are ASCII */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        /* start expanding; later calls step now up to max */
                        t->gen = 1;
                        t->max = c;
                    }
                }
            }
            return t->now;
        }
        else {
            /* inside a range: advance, skipping code points the encoding cannot encode */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* last code point of the range: leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8281
8282static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8283
/*
 * Translates the characters of +str+ in place: characters selected by
 * +src+ are replaced by the corresponding characters of +repl+.
 *
 * str   - string to modify
 * src   - selector; a leading '^' (when more characters follow) negates it
 * repl  - replacements; when empty, the work is handed to
 *         rb_str_delete_bang()
 * sflag - non-zero to drop a translated character that equals the
 *         previously emitted translated character
 *
 * Returns +str+ if anything was modified, Qnil otherwise.  Raises
 * ArgumentError on an invalid byte sequence in +str+.
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    const unsigned int errc = -1;       /* "no translation" marker */
    unsigned int trans[256];            /* translation table for code points below 256 */
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    int cflag = 0;                      /* set when src starts with '^' */
    unsigned int c, c0, last = 0;
    int modify = 0, i, l;
    unsigned char *s, *send;
    VALUE hash = 0;                     /* translations for code points >= 256, created on demand */
    int singlebyte = single_byte_optimizable(str);
    int termlen;
    int cr;

/* downgrade cr from 7BIT to VALID as soon as a non-ASCII code point is written */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /* negated selector: everything NOT listed in src maps to the last replacement */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        while ((c = trnext(&trrepl, enc)) != errc)
            /* retrieve last replacer */;
        last = trrepl.now;
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        /* positional mapping: n-th selected character -> n-th replacement */
        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            /* repl exhausted: keep using its last character */
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
    termlen = rb_enc_mbminlen(enc);
    if (sflag) {
        /* squeezing variant: build the result in a fresh buffer */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        unsigned int save = -1;         /* last translated code point emitted, or -1 */
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                xfree(buf);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                /* NOTE(review): the non-squeezing branch below uses
                 * "cflag ? last : errc" here - confirm the difference is intended */
                c = errc;
            }
            if (c != (unsigned int)-1) {
                if (save == c) {
                    /* same translated character as the previous one: drop it */
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated: copy the original character */
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (send - s);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            /* NOTE(review): s was already advanced past this character above,
             * so this compares against the bytes of the following character -
             * confirm against the non-squeezing branch, which compares first */
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* install buf as the string's heap buffer */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* every character is one byte before and after: rewrite in place */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* general multibyte variant: build the result in a fresh buffer */
        int clen, tlen;
        long offset, max = (long)((send - s) * 1.2);
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                xfree(buf);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = cflag ? last : errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated: copy the original character */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (long)((send - s) * 1.2);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* install buf as the string's heap buffer */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
8544
8545
8546/*
8547 * call-seq:
8548 * tr!(selector, replacements) -> self or nil
8549 *
8550 * Like String#tr, except:
8551 *
8552 * - Performs substitutions in +self+ (not in a copy of +self+).
8553 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8554 *
8555 * Related: {Modifying}[rdoc-ref:String@Modifying].
8556 */
8557
8558static VALUE
8559rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8560{
8561 return tr_trans(str, src, repl, 0);
8562}
8563
8564
8565/*
8566 * call-seq:
8567 * tr(selector, replacements) -> new_string
8568 *
8569 * Returns a copy of +self+ with each character specified by string +selector+
8570 * translated to the corresponding character in string +replacements+.
8571 * The correspondence is _positional_:
8572 *
8573 * - Each occurrence of the first character specified by +selector+
8574 * is translated to the first character in +replacements+.
8575 * - Each occurrence of the second character specified by +selector+
8576 * is translated to the second character in +replacements+.
8577 * - And so on.
8578 *
8579 * Example:
8580 *
8581 * 'hello'.tr('el', 'ip') #=> "hippo"
8582 *
8583 * If +replacements+ is shorter than +selector+,
8584 * it is implicitly padded with its own last character:
8585 *
8586 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8587 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8588 *
8589 * Arguments +selector+ and +replacements+ must be valid character selectors
8590 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8591 * and may use any of its valid forms, including negation, ranges, and escapes:
8592 *
8593 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8594 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8595 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8596 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8597 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8598 *
8599 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8600 */
8601
8602static VALUE
8603rb_str_tr(VALUE str, VALUE src, VALUE repl)
8604{
8605 str = str_duplicate(rb_cString, str);
8606 tr_trans(str, src, repl, 0);
8607 return str;
8608}
8609
8610#define TR_TABLE_MAX (UCHAR_MAX+1)
8611#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8612static void
8613tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8614 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8615{
8616 const unsigned int errc = -1;
8617 char buf[TR_TABLE_MAX];
8618 struct tr tr;
8619 unsigned int c;
8620 VALUE table = 0, ptable = 0;
8621 int i, l, cflag = 0;
8622
8623 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8624 tr.gen = tr.now = tr.max = 0;
8625
8626 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8627 cflag = 1;
8628 tr.p += l;
8629 }
8630 if (first) {
8631 for (i=0; i<TR_TABLE_MAX; i++) {
8632 stable[i] = 1;
8633 }
8634 stable[TR_TABLE_MAX] = cflag;
8635 }
8636 else if (stable[TR_TABLE_MAX] && !cflag) {
8637 stable[TR_TABLE_MAX] = 0;
8638 }
8639 for (i=0; i<TR_TABLE_MAX; i++) {
8640 buf[i] = cflag;
8641 }
8642
8643 while ((c = trnext(&tr, enc)) != errc) {
8644 if (c < TR_TABLE_MAX) {
8645 buf[(unsigned char)c] = !cflag;
8646 }
8647 else {
8648 VALUE key = UINT2NUM(c);
8649
8650 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8651 if (cflag) {
8652 ptable = *ctablep;
8653 table = ptable ? ptable : rb_hash_new();
8654 *ctablep = table;
8655 }
8656 else {
8657 table = rb_hash_new();
8658 ptable = *tablep;
8659 *tablep = table;
8660 }
8661 }
8662 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8663 rb_hash_aset(table, key, Qtrue);
8664 }
8665 }
8666 }
8667 for (i=0; i<TR_TABLE_MAX; i++) {
8668 stable[i] = stable[i] && buf[i];
8669 }
8670 if (!table && !cflag) {
8671 *tablep = 0;
8672 }
8673}
8674
8675
8676static int
8677tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8678{
8679 if (c < TR_TABLE_MAX) {
8680 return table[c] != 0;
8681 }
8682 else {
8683 VALUE v = UINT2NUM(c);
8684
8685 if (del) {
8686 if (!NIL_P(rb_hash_lookup(del, v)) &&
8687 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8688 return TRUE;
8689 }
8690 }
8691 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8692 return FALSE;
8693 }
8694 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8695 }
8696}
8697
8698/*
8699 * call-seq:
8700 * delete!(*selectors) -> self or nil
8701 *
8702 * Like String#delete, but modifies +self+ in place;
8703 * returns +self+ if any characters were deleted, +nil+ otherwise.
8704 *
8705 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8706 */
8707
8708static VALUE
8709rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8710{
8711 char squeez[TR_TABLE_SIZE];
8712 rb_encoding *enc = 0;
8713 char *s, *send, *t;
8714 VALUE del = 0, nodel = 0;
8715 int modify = 0;
8716 int i, ascompat, cr;
8717
8718 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8720 for (i=0; i<argc; i++) {
8721 VALUE s = argv[i];
8722
8723 StringValue(s);
8724 enc = rb_enc_check(str, s);
8725 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8726 }
8727
8728 str_modify_keep_cr(str);
8729 ascompat = rb_enc_asciicompat(enc);
8730 s = t = RSTRING_PTR(str);
8731 send = RSTRING_END(str);
8732 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8733 while (s < send) {
8734 unsigned int c;
8735 int clen;
8736
8737 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8738 if (squeez[c]) {
8739 modify = 1;
8740 }
8741 else {
8742 if (t != s) *t = c;
8743 t++;
8744 }
8745 s++;
8746 }
8747 else {
8748 c = rb_enc_codepoint_len(s, send, &clen, enc);
8749
8750 if (tr_find(c, squeez, del, nodel)) {
8751 modify = 1;
8752 }
8753 else {
8754 if (t != s) rb_enc_mbcput(c, t, enc);
8755 t += clen;
8757 }
8758 s += clen;
8759 }
8760 }
8761 TERM_FILL(t, TERM_LEN(str));
8762 STR_SET_LEN(str, t - RSTRING_PTR(str));
8763 ENC_CODERANGE_SET(str, cr);
8764
8765 if (modify) return str;
8766 return Qnil;
8767}
8768
8769
8770/*
8771 * call-seq:
8772 * delete(*selectors) -> new_string
8773 *
8774 * :include: doc/string/delete.rdoc
8775 *
8776 */
8777
8778static VALUE
8779rb_str_delete(int argc, VALUE *argv, VALUE str)
8780{
8781 str = str_duplicate(rb_cString, str);
8782 rb_str_delete_bang(argc, argv, str);
8783 return str;
8784}
8785
8786
8787/*
8788 * call-seq:
8789 * squeeze!(*selectors) -> self or nil
8790 *
8791 * Like String#squeeze, except that:
8792 *
8793 * - Characters are squeezed in +self+ (not in a copy of +self+).
8794 * - Returns +self+ if any changes are made, +nil+ otherwise.
8795 *
8796 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8797 */
8798
8799static VALUE
8800rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8801{
8802 char squeez[TR_TABLE_SIZE];
8803 rb_encoding *enc = 0;
8804 VALUE del = 0, nodel = 0;
8805 unsigned char *s, *send, *t;
8806 int i, modify = 0;
8807 int ascompat, singlebyte = single_byte_optimizable(str);
8808 unsigned int save;
8809
8810 if (argc == 0) {
8811 enc = STR_ENC_GET(str);
8812 }
8813 else {
8814 for (i=0; i<argc; i++) {
8815 VALUE s = argv[i];
8816
8817 StringValue(s);
8818 enc = rb_enc_check(str, s);
8819 if (singlebyte && !single_byte_optimizable(s))
8820 singlebyte = 0;
8821 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8822 }
8823 }
8824
8825 str_modify_keep_cr(str);
8826 s = t = (unsigned char *)RSTRING_PTR(str);
8827 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8828 send = (unsigned char *)RSTRING_END(str);
8829 save = -1;
8830 ascompat = rb_enc_asciicompat(enc);
8831
8832 if (singlebyte) {
8833 while (s < send) {
8834 unsigned int c = *s++;
8835 if (c != save || (argc > 0 && !squeez[c])) {
8836 *t++ = save = c;
8837 }
8838 }
8839 }
8840 else {
8841 while (s < send) {
8842 unsigned int c;
8843 int clen;
8844
8845 if (ascompat && (c = *s) < 0x80) {
8846 if (c != save || (argc > 0 && !squeez[c])) {
8847 *t++ = save = c;
8848 }
8849 s++;
8850 }
8851 else {
8852 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8853
8854 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8855 if (t != s) rb_enc_mbcput(c, t, enc);
8856 save = c;
8857 t += clen;
8858 }
8859 s += clen;
8860 }
8861 }
8862 }
8863
8864 TERM_FILL((char *)t, TERM_LEN(str));
8865 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8866 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8867 modify = 1;
8868 }
8869
8870 if (modify) return str;
8871 return Qnil;
8872}
8873
8874
8875/*
8876 * call-seq:
8877 * squeeze(*selectors) -> new_string
8878 *
8879 * :include: doc/string/squeeze.rdoc
8880 *
8881 */
8882
8883static VALUE
8884rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8885{
8886 str = str_duplicate(rb_cString, str);
8887 rb_str_squeeze_bang(argc, argv, str);
8888 return str;
8889}
8890
8891
8892/*
8893 * call-seq:
8894 * tr_s!(selector, replacements) -> self or nil
8895 *
8896 * Like String#tr_s, except:
8897 *
8898 * - Modifies +self+ in place (not a copy of +self+).
8899 * - Returns +self+ if any changes were made, +nil+ otherwise.
8900 *
8901 * Related: {Modifying}[rdoc-ref:String@Modifying].
8902 */
8903
8904static VALUE
8905rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8906{
8907 return tr_trans(str, src, repl, 1);
8908}
8909
8910
8911/*
8912 * call-seq:
8913 * tr_s(selector, replacements) -> new_string
8914 *
8915 * Like String#tr, except:
8916 *
8917 * - Also squeezes the modified portions of the translated string;
8918 * see String#squeeze.
8919 * - Returns the translated and squeezed string.
8920 *
8921 * Examples:
8922 *
8923 * 'hello'.tr_s('l', 'r') #=> "hero"
8924 * 'hello'.tr_s('el', '-') #=> "h-o"
8925 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8926 *
8927 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8928 *
8929 */
8930
8931static VALUE
8932rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8933{
8934 str = str_duplicate(rb_cString, str);
8935 tr_trans(str, src, repl, 1);
8936 return str;
8937}
8938
8939
8940/*
8941 * call-seq:
8942 * count(*selectors) -> integer
8943 *
8944 * :include: doc/string/count.rdoc
8945 */
8946
8947static VALUE
8948rb_str_count(int argc, VALUE *argv, VALUE str)
8949{
8950 char table[TR_TABLE_SIZE];
8951 rb_encoding *enc = 0;
8952 VALUE del = 0, nodel = 0, tstr;
8953 char *s, *send;
8954 int i;
8955 int ascompat;
8956 size_t n = 0;
8957
8959
8960 tstr = argv[0];
8961 StringValue(tstr);
8962 enc = rb_enc_check(str, tstr);
8963 if (argc == 1) {
8964 const char *ptstr;
8965 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8966 (ptstr = RSTRING_PTR(tstr),
8967 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8968 !is_broken_string(str)) {
8969 int clen;
8970 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8971
8972 s = RSTRING_PTR(str);
8973 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8974 send = RSTRING_END(str);
8975 while (s < send) {
8976 if (*(unsigned char*)s++ == c) n++;
8977 }
8978 return SIZET2NUM(n);
8979 }
8980 }
8981
8982 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8983 for (i=1; i<argc; i++) {
8984 tstr = argv[i];
8985 StringValue(tstr);
8986 enc = rb_enc_check(str, tstr);
8987 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8988 }
8989
8990 s = RSTRING_PTR(str);
8991 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8992 send = RSTRING_END(str);
8993 ascompat = rb_enc_asciicompat(enc);
8994 while (s < send) {
8995 unsigned int c;
8996
8997 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8998 if (table[c]) {
8999 n++;
9000 }
9001 s++;
9002 }
9003 else {
9004 int clen;
9005 c = rb_enc_codepoint_len(s, send, &clen, enc);
9006 if (tr_find(c, table, del, nodel)) {
9007 n++;
9008 }
9009 s += clen;
9010 }
9011 }
9012
9013 return SIZET2NUM(n);
9014}
9015
9016static VALUE
9017rb_fs_check(VALUE val)
9018{
9019 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9020 val = rb_check_string_type(val);
9021 if (NIL_P(val)) return 0;
9022 }
9023 return val;
9024}
9025
/*
 * Byte-indexed lookup table: 1 for the bytes 0x09..0x0d and 0x20, 0 for
 * every other byte value (unlisted entries are zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1  /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9046
9047static long
9048split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9049{
9050 if (empty_count >= 0 && len == 0) {
9051 return empty_count + 1;
9052 }
9053 if (empty_count > 0) {
9054 /* make different substrings */
9055 if (result) {
9056 do {
9057 rb_ary_push(result, str_new_empty_String(str));
9058 } while (--empty_count > 0);
9059 }
9060 else {
9061 do {
9062 rb_yield(str_new_empty_String(str));
9063 } while (--empty_count > 0);
9064 }
9065 }
9066 str = rb_str_subseq(str, beg, len);
9067 if (result) {
9068 rb_ary_push(result, str);
9069 }
9070 else {
9071 rb_yield(str);
9072 }
9073 return empty_count;
9074}
9075
/* Splitting strategies chosen by rb_str_split_m. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9079
9080static split_type_t
9081literal_split_pattern(VALUE spat, split_type_t default_type)
9082{
9083 rb_encoding *enc = STR_ENC_GET(spat);
9084 const char *ptr;
9085 long len;
9086 RSTRING_GETMEM(spat, ptr, len);
9087 if (len == 0) {
9088 /* Special case - split into chars */
9089 return SPLIT_TYPE_CHARS;
9090 }
9091 else if (rb_enc_asciicompat(enc)) {
9092 if (len == 1 && ptr[0] == ' ') {
9093 return SPLIT_TYPE_AWK;
9094 }
9095 }
9096 else {
9097 int l;
9098 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9099 return SPLIT_TYPE_AWK;
9100 }
9101 }
9102 return default_type;
9103}
9104
9105/*
9106 * call-seq:
9107 * split(field_sep = $;, limit = 0) -> array_of_substrings
9108 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9109 *
9110 * :include: doc/string/split.rdoc
9111 *
9112 */
9113
9114static VALUE
9115rb_str_split_m(int argc, VALUE *argv, VALUE str)
9116{
9117 rb_encoding *enc;
9118 VALUE spat;
9119 VALUE limit;
9120 split_type_t split_type;
9121 long beg, end, i = 0, empty_count = -1;
9122 int lim = 0;
9123 VALUE result, tmp;
9124
9125 result = rb_block_given_p() ? Qfalse : Qnil;
9126 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9127 lim = NUM2INT(limit);
9128 if (lim <= 0) limit = Qnil;
9129 else if (lim == 1) {
9130 if (RSTRING_LEN(str) == 0)
9131 return result ? rb_ary_new2(0) : str;
9132 tmp = str_duplicate(rb_cString, str);
9133 if (!result) {
9134 rb_yield(tmp);
9135 return str;
9136 }
9137 return rb_ary_new3(1, tmp);
9138 }
9139 i = 1;
9140 }
9141 if (NIL_P(limit) && !lim) empty_count = 0;
9142
9143 enc = STR_ENC_GET(str);
9144 split_type = SPLIT_TYPE_REGEXP;
9145 if (!NIL_P(spat)) {
9146 spat = get_pat_quoted(spat, 0);
9147 }
9148 else if (NIL_P(spat = rb_fs)) {
9149 split_type = SPLIT_TYPE_AWK;
9150 }
9151 else if (!(spat = rb_fs_check(spat))) {
9152 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9153 }
9154 else {
9155 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9156 }
9157 if (split_type != SPLIT_TYPE_AWK) {
9158 switch (BUILTIN_TYPE(spat)) {
9159 case T_REGEXP:
9160 rb_reg_options(spat); /* check if uninitialized */
9161 tmp = RREGEXP_SRC(spat);
9162 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9163 if (split_type == SPLIT_TYPE_AWK) {
9164 spat = tmp;
9165 split_type = SPLIT_TYPE_STRING;
9166 }
9167 break;
9168
9169 case T_STRING:
9170 mustnot_broken(spat);
9171 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9172 break;
9173
9174 default:
9176 }
9177 }
9178
9179#define SPLIT_STR(beg, len) ( \
9180 empty_count = split_string(result, str, beg, len, empty_count), \
9181 str_mod_check(str, str_start, str_len))
9182
9183 beg = 0;
9184 char *ptr = RSTRING_PTR(str);
9185 char *const str_start = ptr;
9186 const long str_len = RSTRING_LEN(str);
9187 char *const eptr = str_start + str_len;
9188 if (split_type == SPLIT_TYPE_AWK) {
9189 char *bptr = ptr;
9190 int skip = 1;
9191 unsigned int c;
9192
9193 if (result) result = rb_ary_new();
9194 end = beg;
9195 if (is_ascii_string(str)) {
9196 while (ptr < eptr) {
9197 c = (unsigned char)*ptr++;
9198 if (skip) {
9199 if (ascii_isspace(c)) {
9200 beg = ptr - bptr;
9201 }
9202 else {
9203 end = ptr - bptr;
9204 skip = 0;
9205 if (!NIL_P(limit) && lim <= i) break;
9206 }
9207 }
9208 else if (ascii_isspace(c)) {
9209 SPLIT_STR(beg, end-beg);
9210 skip = 1;
9211 beg = ptr - bptr;
9212 if (!NIL_P(limit)) ++i;
9213 }
9214 else {
9215 end = ptr - bptr;
9216 }
9217 }
9218 }
9219 else {
9220 while (ptr < eptr) {
9221 int n;
9222
9223 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9224 ptr += n;
9225 if (skip) {
9226 if (rb_isspace(c)) {
9227 beg = ptr - bptr;
9228 }
9229 else {
9230 end = ptr - bptr;
9231 skip = 0;
9232 if (!NIL_P(limit) && lim <= i) break;
9233 }
9234 }
9235 else if (rb_isspace(c)) {
9236 SPLIT_STR(beg, end-beg);
9237 skip = 1;
9238 beg = ptr - bptr;
9239 if (!NIL_P(limit)) ++i;
9240 }
9241 else {
9242 end = ptr - bptr;
9243 }
9244 }
9245 }
9246 }
9247 else if (split_type == SPLIT_TYPE_STRING) {
9248 char *substr_start = ptr;
9249 char *sptr = RSTRING_PTR(spat);
9250 long slen = RSTRING_LEN(spat);
9251
9252 if (result) result = rb_ary_new();
9253 mustnot_broken(str);
9254 enc = rb_enc_check(str, spat);
9255 while (ptr < eptr &&
9256 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9257 /* Check we are at the start of a char */
9258 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9259 if (t != ptr + end) {
9260 ptr = t;
9261 continue;
9262 }
9263 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9264 str_mod_check(spat, sptr, slen);
9265 ptr += end + slen;
9266 substr_start = ptr;
9267 if (!NIL_P(limit) && lim <= ++i) break;
9268 }
9269 beg = ptr - str_start;
9270 }
9271 else if (split_type == SPLIT_TYPE_CHARS) {
9272 int n;
9273
9274 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9275 mustnot_broken(str);
9276 enc = rb_enc_get(str);
9277 while (ptr < eptr &&
9278 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9279 SPLIT_STR(ptr - str_start, n);
9280 ptr += n;
9281 if (!NIL_P(limit) && lim <= ++i) break;
9282 }
9283 beg = ptr - str_start;
9284 }
9285 else {
9286 if (result) result = rb_ary_new();
9287 long len = RSTRING_LEN(str);
9288 long start = beg;
9289 long idx;
9290 int last_null = 0;
9291 struct re_registers *regs;
9292 VALUE match = 0;
9293
9294 for (; rb_reg_search(spat, str, start, 0) >= 0;
9295 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9296 match = rb_backref_get();
9297 if (!result) rb_match_busy(match);
9298 regs = RMATCH_REGS(match);
9299 end = BEG(0);
9300 if (start == end && BEG(0) == END(0)) {
9301 if (!ptr) {
9302 SPLIT_STR(0, 0);
9303 break;
9304 }
9305 else if (last_null == 1) {
9306 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9307 beg = start;
9308 }
9309 else {
9310 if (start == len)
9311 start++;
9312 else
9313 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9314 last_null = 1;
9315 continue;
9316 }
9317 }
9318 else {
9319 SPLIT_STR(beg, end-beg);
9320 beg = start = END(0);
9321 }
9322 last_null = 0;
9323
9324 for (idx=1; idx < regs->num_regs; idx++) {
9325 if (BEG(idx) == -1) continue;
9326 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9327 }
9328 if (!NIL_P(limit) && lim <= ++i) break;
9329 }
9330 if (match) rb_match_unbusy(match);
9331 }
9332 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9333 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9334 }
9335
9336 return result ? result : str;
9337}
9338
9339VALUE
9340rb_str_split(VALUE str, const char *sep0)
9341{
9342 VALUE sep;
9343
9344 StringValue(str);
9345 sep = rb_str_new_cstr(sep0);
9346 return rb_str_split_m(1, &sep, str);
9347}
9348
9349#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9350
9351static inline int
9352enumerator_element(VALUE ary, VALUE e)
9353{
9354 if (ary) {
9355 rb_ary_push(ary, e);
9356 return 0;
9357 }
9358 else {
9359 rb_yield(e);
9360 return 1;
9361 }
9362}
9363
9364#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9365
9366static const char *
9367chomp_newline(const char *p, const char *e, rb_encoding *enc)
9368{
9369 const char *prev = rb_enc_prev_char(p, e, e, enc);
9370 if (rb_enc_is_newline(prev, e, enc)) {
9371 e = prev;
9372 prev = rb_enc_prev_char(p, e, e, enc);
9373 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9374 e = prev;
9375 }
9376 return e;
9377}
9378
9379static VALUE
9380get_rs(void)
9381{
9382 VALUE rs = rb_rs;
9383 if (!NIL_P(rs) &&
9384 (!RB_TYPE_P(rs, T_STRING) ||
9385 RSTRING_LEN(rs) != 1 ||
9386 RSTRING_PTR(rs)[0] != '\n')) {
9387 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9388 }
9389 return rs;
9390}
9391
9392#define rb_rs get_rs()
9393
/*
 * Shared worker for String#each_line and String#lines.
 *
 * Cuts +str+ into records at the separator taken from argv (or from rb_rs
 * when none is given) and delivers each record through ENUM_ELEM: pushed
 * onto +ary+ when it is non-zero, yielded otherwise.  The +chomp+ keyword
 * removes the separator from each record.
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* No positional separator given: fall back to rb_rs. */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* Collapse the keyword value to a plain C truth value. */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the only record. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Scan a frozen string; ptr/len are rechecked via str_mod_check after each yield. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of an optional '\r' in front of the newline. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the previous one ends the paragraph. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* The separator is exactly one minimum-width newline character. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* Default separator with a non-ASCII-compatible string: re-encode the separator. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Ignore byte matches that do not start on a character boundary. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Remaining text after the last separator forms the final record. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9533
9534/*
9535 * call-seq:
9536 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9537 * each_line(record_separator = $/, chomp: false) -> enumerator
9538 *
9539 * :include: doc/string/each_line.rdoc
9540 *
9541 */
9542
9543static VALUE
9544rb_str_each_line(int argc, VALUE *argv, VALUE str)
9545{
9546 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9547 return rb_str_enumerate_lines(argc, argv, str, 0);
9548}
9549
9550/*
9551 * call-seq:
9552 * lines(record_separator = $/, chomp: false) -> array_of_strings
9553 *
9554 * Returns substrings ("lines") of +self+
9555 * according to the given arguments:
9556 *
9557 * s = <<~EOT
9558 * This is the first line.
9559 * This is line two.
9560 *
9561 * This is line four.
9562 * This is line five.
9563 * EOT
9564 *
9565 * With the default argument values:
9566 *
9567 * $/ # => "\n"
9568 * s.lines
9569 * # =>
9570 * ["This is the first line.\n",
9571 * "This is line two.\n",
9572 * "\n",
9573 * "This is line four.\n",
9574 * "This is line five.\n"]
9575 *
9576 * With a different +record_separator+:
9577 *
9578 * record_separator = ' is '
9579 * s.lines(record_separator)
9580 * # =>
9581 * ["This is ",
9582 * "the first line.\nThis is ",
9583 * "line two.\n\nThis is ",
9584 * "line four.\nThis is ",
9585 * "line five.\n"]
9586 *
9587 * With keyword argument +chomp+ as +true+,
9588 * removes the trailing newline from each line:
9589 *
9590 * s.lines(chomp: true)
9591 * # =>
9592 * ["This is the first line.",
9593 * "This is line two.",
9594 * "",
9595 * "This is line four.",
9596 * "This is line five."]
9597 *
9598 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9599 */
9600
9601static VALUE
9602rb_str_lines(int argc, VALUE *argv, VALUE str)
9603{
9604 VALUE ary = WANTARRAY("lines", 0);
9605 return rb_str_enumerate_lines(argc, argv, str, ary);
9606}
9607
9608static VALUE
9609rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9610{
9611 return LONG2FIX(RSTRING_LEN(str));
9612}
9613
9614static VALUE
9615rb_str_enumerate_bytes(VALUE str, VALUE ary)
9616{
9617 long i;
9618
9619 for (i=0; i<RSTRING_LEN(str); i++) {
9620 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9621 }
9622 if (ary)
9623 return ary;
9624 else
9625 return str;
9626}
9627
9628/*
9629 * call-seq:
9630 * each_byte {|byte| ... } -> self
9631 * each_byte -> enumerator
9632 *
9633 * :include: doc/string/each_byte.rdoc
9634 *
9635 */
9636
9637static VALUE
9638rb_str_each_byte(VALUE str)
9639{
9640 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9641 return rb_str_enumerate_bytes(str, 0);
9642}
9643
9644/*
9645 * call-seq:
9646 * bytes -> array_of_bytes
9647 *
9648 * :include: doc/string/bytes.rdoc
9649 *
9650 */
9651
9652static VALUE
9653rb_str_bytes(VALUE str)
9654{
9655 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9656 return rb_str_enumerate_bytes(str, ary);
9657}
9658
9659static VALUE
9660rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9661{
9662 return rb_str_length(str);
9663}
9664
9665static VALUE
9666rb_str_enumerate_chars(VALUE str, VALUE ary)
9667{
9668 VALUE orig = str;
9669 long i, len, n;
9670 const char *ptr;
9671 rb_encoding *enc;
9672
9673 str = rb_str_new_frozen(str);
9674 ptr = RSTRING_PTR(str);
9675 len = RSTRING_LEN(str);
9676 enc = rb_enc_get(str);
9677
9679 for (i = 0; i < len; i += n) {
9680 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9681 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9682 }
9683 }
9684 else {
9685 for (i = 0; i < len; i += n) {
9686 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9687 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9688 }
9689 }
9690 RB_GC_GUARD(str);
9691 if (ary)
9692 return ary;
9693 else
9694 return orig;
9695}
9696
9697/*
9698 * call-seq:
9699 * each_char {|char| ... } -> self
9700 * each_char -> enumerator
9701 *
9702 * :include: doc/string/each_char.rdoc
9703 *
9704 */
9705
9706static VALUE
9707rb_str_each_char(VALUE str)
9708{
9709 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9710 return rb_str_enumerate_chars(str, 0);
9711}
9712
9713/*
9714 * call-seq:
9715 * chars -> array_of_characters
9716 *
9717 * :include: doc/string/chars.rdoc
9718 *
9719 */
9720
9721static VALUE
9722rb_str_chars(VALUE str)
9723{
9724 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9725 return rb_str_enumerate_chars(str, ary);
9726}
9727
9728static VALUE
9729rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9730{
9731 VALUE orig = str;
9732 int n;
9733 unsigned int c;
9734 const char *ptr, *end;
9735 rb_encoding *enc;
9736
9737 if (single_byte_optimizable(str))
9738 return rb_str_enumerate_bytes(str, ary);
9739
9740 str = rb_str_new_frozen(str);
9741 ptr = RSTRING_PTR(str);
9742 end = RSTRING_END(str);
9743 enc = STR_ENC_GET(str);
9744
9745 while (ptr < end) {
9746 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9747 ENUM_ELEM(ary, UINT2NUM(c));
9748 ptr += n;
9749 }
9750 RB_GC_GUARD(str);
9751 if (ary)
9752 return ary;
9753 else
9754 return orig;
9755}
9756
9757/*
9758 * call-seq:
9759 * each_codepoint {|codepoint| ... } -> self
9760 * each_codepoint -> enumerator
9761 *
9762 * :include: doc/string/each_codepoint.rdoc
9763 *
9764 */
9765
9766static VALUE
9767rb_str_each_codepoint(VALUE str)
9768{
9769 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9770 return rb_str_enumerate_codepoints(str, 0);
9771}
9772
9773/*
9774 * call-seq:
9775 * codepoints -> array_of_integers
9776 *
9777 * :include: doc/string/codepoints.rdoc
9778 *
9779 */
9780
9781static VALUE
9782rb_str_codepoints(VALUE str)
9783{
9784 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9785 return rb_str_enumerate_codepoints(str, ary);
9786}
9787
9788static regex_t *
9789get_reg_grapheme_cluster(rb_encoding *enc)
9790{
9791 int encidx = rb_enc_to_index(enc);
9792
9793 const OnigUChar source_ascii[] = "\\X";
9794 const OnigUChar *source = source_ascii;
9795 size_t source_len = sizeof(source_ascii) - 1;
9796
9797 switch (encidx) {
9798#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9799#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9800#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9801#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9802#define CASE_UTF(e) \
9803 case ENCINDEX_UTF_##e: { \
9804 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9805 source = source_UTF_##e; \
9806 source_len = sizeof(source_UTF_##e); \
9807 break; \
9808 }
9809 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9810#undef CASE_UTF
9811#undef CHARS_16BE
9812#undef CHARS_16LE
9813#undef CHARS_32BE
9814#undef CHARS_32LE
9815 }
9816
9817 regex_t *reg_grapheme_cluster;
9818 OnigErrorInfo einfo;
9819 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9820 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9821 if (r) {
9822 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9823 onig_error_code_to_str(message, r, &einfo);
9824 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9825 }
9826
9827 return reg_grapheme_cluster;
9828}
9829
9830static regex_t *
9831get_cached_reg_grapheme_cluster(rb_encoding *enc)
9832{
9833 int encidx = rb_enc_to_index(enc);
9834 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9835
9836 if (encidx == rb_utf8_encindex()) {
9837 if (!reg_grapheme_cluster_utf8) {
9838 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9839 }
9840
9841 return reg_grapheme_cluster_utf8;
9842 }
9843
9844 return NULL;
9845}
9846
9847static VALUE
9848rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9849{
9850 size_t grapheme_cluster_count = 0;
9851 rb_encoding *enc = get_encoding(str);
9852 const char *ptr, *end;
9853
9854 if (!rb_enc_unicode_p(enc)) {
9855 return rb_str_length(str);
9856 }
9857
9858 bool cached_reg_grapheme_cluster = true;
9859 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9860 if (!reg_grapheme_cluster) {
9861 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9862 cached_reg_grapheme_cluster = false;
9863 }
9864
9865 ptr = RSTRING_PTR(str);
9866 end = RSTRING_END(str);
9867
9868 while (ptr < end) {
9869 OnigPosition len = onig_match(reg_grapheme_cluster,
9870 (const OnigUChar *)ptr, (const OnigUChar *)end,
9871 (const OnigUChar *)ptr, NULL, 0);
9872 if (len <= 0) break;
9873 grapheme_cluster_count++;
9874 ptr += len;
9875 }
9876
9877 if (!cached_reg_grapheme_cluster) {
9878 onig_free(reg_grapheme_cluster);
9879 }
9880
9881 return SIZET2NUM(grapheme_cluster_count);
9882}
9883
9884static VALUE
9885rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9886{
9887 VALUE orig = str;
9888 rb_encoding *enc = get_encoding(str);
9889 const char *ptr0, *ptr, *end;
9890
9891 if (!rb_enc_unicode_p(enc)) {
9892 return rb_str_enumerate_chars(str, ary);
9893 }
9894
9895 if (!ary) str = rb_str_new_frozen(str);
9896
9897 bool cached_reg_grapheme_cluster = true;
9898 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9899 if (!reg_grapheme_cluster) {
9900 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9901 cached_reg_grapheme_cluster = false;
9902 }
9903
9904 ptr0 = ptr = RSTRING_PTR(str);
9905 end = RSTRING_END(str);
9906
9907 while (ptr < end) {
9908 OnigPosition len = onig_match(reg_grapheme_cluster,
9909 (const OnigUChar *)ptr, (const OnigUChar *)end,
9910 (const OnigUChar *)ptr, NULL, 0);
9911 if (len <= 0) break;
9912 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9913 ptr += len;
9914 }
9915
9916 if (!cached_reg_grapheme_cluster) {
9917 onig_free(reg_grapheme_cluster);
9918 }
9919
9920 RB_GC_GUARD(str);
9921 if (ary)
9922 return ary;
9923 else
9924 return orig;
9925}
9926
9927/*
9928 * call-seq:
9929 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9930 * each_grapheme_cluster -> enumerator
9931 *
9932 * :include: doc/string/each_grapheme_cluster.rdoc
9933 *
9934 */
9935
9936static VALUE
9937rb_str_each_grapheme_cluster(VALUE str)
9938{
9939 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9940 return rb_str_enumerate_grapheme_clusters(str, 0);
9941}
9942
9943/*
9944 * call-seq:
9945 * grapheme_clusters -> array_of_grapheme_clusters
9946 *
9947 * :include: doc/string/grapheme_clusters.rdoc
9948 *
9949 */
9950
9951static VALUE
9952rb_str_grapheme_clusters(VALUE str)
9953{
9954 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9955 return rb_str_enumerate_grapheme_clusters(str, ary);
9956}
9957
9958static long
9959chopped_length(VALUE str)
9960{
9961 rb_encoding *enc = STR_ENC_GET(str);
9962 const char *p, *p2, *beg, *end;
9963
9964 beg = RSTRING_PTR(str);
9965 end = beg + RSTRING_LEN(str);
9966 if (beg >= end) return 0;
9967 p = rb_enc_prev_char(beg, end, end, enc);
9968 if (!p) return 0;
9969 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9970 p2 = rb_enc_prev_char(beg, p, end, enc);
9971 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9972 }
9973 return p - beg;
9974}
9975
9976/*
9977 * call-seq:
9978 * chop! -> self or nil
9979 *
9980 * Like String#chop, except that:
9981 *
9982 * - Removes trailing characters from +self+ (not from a copy of +self+).
9983 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9984 *
9985 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9986 */
9987
9988static VALUE
9989rb_str_chop_bang(VALUE str)
9990{
9991 str_modify_keep_cr(str);
9992 if (RSTRING_LEN(str) > 0) {
9993 long len;
9994 len = chopped_length(str);
9995 STR_SET_LEN(str, len);
9996 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9997 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9999 }
10000 return str;
10001 }
10002 return Qnil;
10003}
10004
10005
10006/*
10007 * call-seq:
10008 * chop -> new_string
10009 *
10010 * :include: doc/string/chop.rdoc
10011 *
10012 */
10013
10014static VALUE
10015rb_str_chop(VALUE str)
10016{
10017 return rb_str_subseq(str, 0, chopped_length(str));
10018}
10019
10020static long
10021smart_chomp(VALUE str, const char *e, const char *p)
10022{
10023 rb_encoding *enc = rb_enc_get(str);
10024 if (rb_enc_mbminlen(enc) > 1) {
10025 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10026 if (rb_enc_is_newline(pp, e, enc)) {
10027 e = pp;
10028 }
10029 pp = e - rb_enc_mbminlen(enc);
10030 if (pp >= p) {
10031 pp = rb_enc_left_char_head(p, pp, e, enc);
10032 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10033 e = pp;
10034 }
10035 }
10036 }
10037 else {
10038 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10039 case '\n':
10040 if (--e > p && *(e-1) == '\r') {
10041 --e;
10042 }
10043 break;
10044 case '\r':
10045 --e;
10046 break;
10047 }
10048 }
10049 return e - p;
10050}
10051
/*
 * Computes the byte length +str+ would have after removing the record
 * separator +rs+ from its end.  +str+ itself is not modified.
 *
 * - Empty +str+: returns 0.
 * - +rs+ is rb_default_rs, or a single newline character: handled by
 *   smart_chomp().
 * - Empty +rs+: removes every trailing "\n" / "\r\n".
 * - Otherwise: removes one trailing copy of +rs+ if present and aligned on
 *   a character boundary.
 *
 * Returns the original length when nothing is to be removed.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* Empty separator: strip all trailing newlines. */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* Also drop a "\r" immediately before the newline. */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* A separator longer than the string cannot be a suffix of it. */
    if (rslen > len) return len;

    /* From here on enc is the separator's encoding. */
    enc = rb_enc_get(rs);
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* The separator is one minimum-length character; if it is a
         * newline, use the same handling as the default separator. */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* General case; enc is now the result of rb_enc_check(str, rs). */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    /* Compare the last byte first, then the whole separator. */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
10123
10129static VALUE
10130chomp_rs(int argc, const VALUE *argv)
10131{
10132 rb_check_arity(argc, 0, 1);
10133 if (argc > 0) {
10134 VALUE rs = argv[0];
10135 if (!NIL_P(rs)) StringValue(rs);
10136 return rs;
10137 }
10138 else {
10139 return rb_rs;
10140 }
10141}
10142
10143VALUE
10144rb_str_chomp_string(VALUE str, VALUE rs)
10145{
10146 long olen = RSTRING_LEN(str);
10147 long len = chompped_length(str, rs);
10148 if (len >= olen) return Qnil;
10149 str_modify_keep_cr(str);
10150 STR_SET_LEN(str, len);
10151 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10152 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10154 }
10155 return str;
10156}
10157
10158/*
10159 * call-seq:
10160 * chomp!(line_sep = $/) -> self or nil
10161 *
10162 * Like String#chomp, except that:
10163 *
10164 * - Removes trailing characters from +self+ (not from a copy of +self+).
10165 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10166 *
10167 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10168 */
10169
10170static VALUE
10171rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10172{
10173 VALUE rs;
10174 str_modifiable(str);
10175 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10176 rs = chomp_rs(argc, argv);
10177 if (NIL_P(rs)) return Qnil;
10178 return rb_str_chomp_string(str, rs);
10179}
10180
10181
10182/*
10183 * call-seq:
10184 * chomp(line_sep = $/) -> new_string
10185 *
10186 * :include: doc/string/chomp.rdoc
10187 *
10188 */
10189
10190static VALUE
10191rb_str_chomp(int argc, VALUE *argv, VALUE str)
10192{
10193 VALUE rs = chomp_rs(argc, argv);
10194 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10195 return rb_str_subseq(str, 0, chompped_length(str, rs));
10196}
10197
10198static void
10199tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10200 VALUE str, int num_selectors, VALUE *selectors)
10201{
10202 int i;
10203
10204 for (i=0; i<num_selectors; i++) {
10205 VALUE selector = selectors[i];
10206 rb_encoding *enc;
10207
10208 StringValue(selector);
10209 enc = rb_enc_check(str, selector);
10210 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10211 }
10212}
10213
10214static long
10215lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10216{
10217 const char *const start = s;
10218
10219 if (!s || s >= e) return 0;
10220
10221 /* remove spaces at head */
10222 if (single_byte_optimizable(str)) {
10223 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10224 }
10225 else {
10226 while (s < e) {
10227 int n;
10228 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10229
10230 if (cc && !rb_isspace(cc)) break;
10231 s += n;
10232 }
10233 }
10234 return s - start;
10235}
10236
10237static long
10238lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10239 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10240{
10241 const char *const start = s;
10242
10243 if (!s || s >= e) return 0;
10244
10245 /* remove leading characters in the table */
10246 while (s < e) {
10247 int n;
10248 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10249
10250 if (!tr_find(cc, table, del, nodel)) break;
10251 s += n;
10252 }
10253 return s - start;
10254}
10255
10256/*
10257 * call-seq:
10258 * lstrip!(*selectors) -> self or nil
10259 *
10260 * Like String#lstrip, except that:
10261 *
10262 * - Performs stripping in +self+ (not in a copy of +self+).
10263 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10264 *
10265 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10266 */
10267
10268static VALUE
10269rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10270{
10271 rb_encoding *enc;
10272 char *start, *s;
10273 long olen, loffset;
10274
10275 str_modify_keep_cr(str);
10276 enc = STR_ENC_GET(str);
10277 RSTRING_GETMEM(str, start, olen);
10278 if (argc > 0) {
10279 char table[TR_TABLE_SIZE];
10280 VALUE del = 0, nodel = 0;
10281
10282 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10283 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10284 }
10285 else {
10286 loffset = lstrip_offset(str, start, start+olen, enc);
10287 }
10288
10289 if (loffset > 0) {
10290 long len = olen-loffset;
10291 s = start + loffset;
10292 memmove(start, s, len);
10293 STR_SET_LEN(str, len);
10294 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10295 return str;
10296 }
10297 return Qnil;
10298}
10299
10300
10301/*
10302 * call-seq:
10303 * lstrip(*selectors) -> new_string
10304 *
10305 * Returns a copy of +self+ with leading whitespace removed;
10306 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10307 *
10308 * whitespace = "\x00\t\n\v\f\r "
10309 * s = whitespace + 'abc' + whitespace
10310 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10311 * s.lstrip
10312 * # => "abc\u0000\t\n\v\f\r "
10313 *
10314 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10315 *
10316 * s = "---abc+++"
10317 * s.lstrip("-") # => "abc+++"
10318 *
10319 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10320 * and may use any of its valid forms, including negation, ranges, and escapes:
10321 *
10322 * "01234abc56789".lstrip("0-9") # "abc56789"
10323 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10324 *
10325 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10326 */
10327
10328static VALUE
10329rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10330{
10331 char *start;
10332 long len, loffset;
10333
10334 RSTRING_GETMEM(str, start, len);
10335 if (argc > 0) {
10336 char table[TR_TABLE_SIZE];
10337 VALUE del = 0, nodel = 0;
10338
10339 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10340 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10341 }
10342 else {
10343 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10344 }
10345 if (loffset <= 0) return str_duplicate(rb_cString, str);
10346 return rb_str_subseq(str, loffset, len - loffset);
10347}
10348
10349static long
10350rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10351{
10352 const char *t;
10353
10354 rb_str_check_dummy_enc(enc);
10356 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10357 }
10358 if (!s || s >= e) return 0;
10359 t = e;
10360
10361 /* remove trailing spaces or '\0's */
10362 if (single_byte_optimizable(str)) {
10363 unsigned char c;
10364 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10365 }
10366 else {
10367 char *tp;
10368
10369 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10370 unsigned int c = rb_enc_codepoint(tp, e, enc);
10371 if (c && !rb_isspace(c)) break;
10372 t = tp;
10373 }
10374 }
10375 return e - t;
10376}
10377
10378static long
10379rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10380 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10381{
10382 const char *t;
10383 char *tp;
10384
10385 rb_str_check_dummy_enc(enc);
10387 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10388 }
10389 if (!s || s >= e) return 0;
10390 t = e;
10391
10392 /* remove trailing characters in the table */
10393 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10394 unsigned int c = rb_enc_codepoint(tp, e, enc);
10395 if (!tr_find(c, table, del, nodel)) break;
10396 t = tp;
10397 }
10398
10399 return e - t;
10400}
10401
10402/*
10403 * call-seq:
10404 * rstrip!(*selectors) -> self or nil
10405 *
10406 * Like String#rstrip, except that:
10407 *
10408 * - Performs stripping in +self+ (not in a copy of +self+).
10409 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10410 *
10411 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10412 */
10413
10414static VALUE
10415rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10416{
10417 rb_encoding *enc;
10418 char *start;
10419 long olen, roffset;
10420
10421 str_modify_keep_cr(str);
10422 enc = STR_ENC_GET(str);
10423 RSTRING_GETMEM(str, start, olen);
10424 if (argc > 0) {
10425 char table[TR_TABLE_SIZE];
10426 VALUE del = 0, nodel = 0;
10427
10428 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10429 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10430 }
10431 else {
10432 roffset = rstrip_offset(str, start, start+olen, enc);
10433 }
10434 if (roffset > 0) {
10435 long len = olen - roffset;
10436
10437 STR_SET_LEN(str, len);
10438 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10439 return str;
10440 }
10441 return Qnil;
10442}
10443
10444
10445/*
10446 * call-seq:
10447 * rstrip(*selectors) -> new_string
10448 *
10449 * Returns a copy of +self+ with trailing whitespace removed;
10450 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10451 *
10452 * whitespace = "\x00\t\n\v\f\r "
10453 * s = whitespace + 'abc' + whitespace
10454 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10455 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10456 *
10457 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10458 *
10459 * s = "---abc+++"
10460 * s.rstrip("+") # => "---abc"
10461 *
10462 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10463 * and may use any of its valid forms, including negation, ranges, and escapes:
10464 *
10465 * "01234abc56789".rstrip("0-9") # "01234abc"
10466 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10467 *
10468 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10469 */
10470
10471static VALUE
10472rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10473{
10474 rb_encoding *enc;
10475 char *start;
10476 long olen, roffset;
10477
10478 enc = STR_ENC_GET(str);
10479 RSTRING_GETMEM(str, start, olen);
10480 if (argc > 0) {
10481 char table[TR_TABLE_SIZE];
10482 VALUE del = 0, nodel = 0;
10483
10484 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10485 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10486 }
10487 else {
10488 roffset = rstrip_offset(str, start, start+olen, enc);
10489 }
10490 if (roffset <= 0) return str_duplicate(rb_cString, str);
10491 return rb_str_subseq(str, 0, olen-roffset);
10492}
10493
10494
10495/*
10496 * call-seq:
10497 * strip!(*selectors) -> self or nil
10498 *
10499 * Like String#strip, except that:
10500 *
10501 * - Any modifications are made to +self+.
 *  - Returns +self+ if any modifications are made, +nil+ otherwise.
10503 *
10504 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10505 */
10506
10507static VALUE
10508rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10509{
10510 char *start;
10511 long olen, loffset, roffset;
10512 rb_encoding *enc;
10513
10514 str_modify_keep_cr(str);
10515 enc = STR_ENC_GET(str);
10516 RSTRING_GETMEM(str, start, olen);
10517
10518 if (argc > 0) {
10519 char table[TR_TABLE_SIZE];
10520 VALUE del = 0, nodel = 0;
10521
10522 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10523 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10524 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10525 }
10526 else {
10527 loffset = lstrip_offset(str, start, start+olen, enc);
10528 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10529 }
10530
10531 if (loffset > 0 || roffset > 0) {
10532 long len = olen-roffset;
10533 if (loffset > 0) {
10534 len -= loffset;
10535 memmove(start, start + loffset, len);
10536 }
10537 STR_SET_LEN(str, len);
10538 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10539 return str;
10540 }
10541 return Qnil;
10542}
10543
10544
10545/*
10546 * call-seq:
10547 * strip(*selectors) -> new_string
10548 *
10549 * Returns a copy of +self+ with leading and trailing whitespace removed;
10550 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10551 *
10552 * whitespace = "\x00\t\n\v\f\r "
10553 * s = whitespace + 'abc' + whitespace
10554 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10555 * s.strip # => "abc"
10556 *
10557 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10558 *
10559 * s = "---abc+++"
10560 * s.strip("-+") # => "abc"
10561 * s.strip("+-") # => "abc"
10562 *
10563 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10564 * and may use any of its valid forms, including negation, ranges, and escapes:
10565 *
10566 * "01234abc56789".strip("0-9") # "abc"
10567 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10568 *
10569 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10570 */
10571
10572static VALUE
10573rb_str_strip(int argc, VALUE *argv, VALUE str)
10574{
10575 char *start;
10576 long olen, loffset, roffset;
10577 rb_encoding *enc = STR_ENC_GET(str);
10578
10579 RSTRING_GETMEM(str, start, olen);
10580
10581 if (argc > 0) {
10582 char table[TR_TABLE_SIZE];
10583 VALUE del = 0, nodel = 0;
10584
10585 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10586 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10587 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10588 }
10589 else {
10590 loffset = lstrip_offset(str, start, start+olen, enc);
10591 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10592 }
10593
10594 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10595 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10596}
10597
10598static VALUE
10599scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10600{
10601 VALUE result = Qnil;
10602 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10603 if (pos >= 0) {
10604 VALUE match;
10605 struct re_registers *regs;
10606 if (BUILTIN_TYPE(pat) == T_STRING) {
10607 regs = NULL;
10608 end = pos + RSTRING_LEN(pat);
10609 }
10610 else {
10611 match = rb_backref_get();
10612 regs = RMATCH_REGS(match);
10613 pos = BEG(0);
10614 end = END(0);
10615 }
10616
10617 if (pos == end) {
10618 rb_encoding *enc = STR_ENC_GET(str);
10619 /*
10620 * Always consume at least one character of the input string
10621 */
10622 if (RSTRING_LEN(str) > end)
10623 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10624 RSTRING_END(str), enc);
10625 else
10626 *start = end + 1;
10627 }
10628 else {
10629 *start = end;
10630 }
10631
10632 if (!regs || regs->num_regs == 1) {
10633 result = rb_str_subseq(str, pos, end - pos);
10634 return result;
10635 }
10636 else {
10637 result = rb_ary_new2(regs->num_regs);
10638 for (int i = 1; i < regs->num_regs; i++) {
10639 VALUE s = Qnil;
10640 if (BEG(i) >= 0) {
10641 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10642 }
10643
10644 rb_ary_push(result, s);
10645 }
10646 }
10647
10648 RB_GC_GUARD(match);
10649 }
10650
10651 return result;
10652}
10653
10654
10655/*
10656 * call-seq:
10657 * scan(pattern) -> array_of_results
10658 * scan(pattern) {|result| ... } -> self
10659 *
10660 * :include: doc/string/scan.rdoc
10661 *
10662 */
10663
10664static VALUE
10665rb_str_scan(VALUE str, VALUE pat)
10666{
10667 VALUE result;
10668 long start = 0;
10669 long last = -1, prev = 0;
10670 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10671
10672 pat = get_pat_quoted(pat, 1);
10673 mustnot_broken(str);
10674 if (!rb_block_given_p()) {
10675 VALUE ary = rb_ary_new();
10676
10677 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10678 last = prev;
10679 prev = start;
10680 rb_ary_push(ary, result);
10681 }
10682 if (last >= 0) rb_pat_search(pat, str, last, 1);
10683 else rb_backref_set(Qnil);
10684 return ary;
10685 }
10686
10687 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10688 last = prev;
10689 prev = start;
10690 rb_yield(result);
10691 str_mod_check(str, p, len);
10692 }
10693 if (last >= 0) rb_pat_search(pat, str, last, 1);
10694 return str;
10695}
10696
10697
10698/*
10699 * call-seq:
10700 * hex -> integer
10701 *
10702 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10703 * returns its value as an integer.
10704 *
10705 * The leading substring is interpreted as hexadecimal when it begins with:
10706 *
 *  - One or more characters representing hexadecimal digits
10708 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10709 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10710 *
10711 * 'f'.hex # => 15
10712 * '11'.hex # => 17
10713 * 'FFF'.hex # => 4095
10714 * 'fffg'.hex # => 4095
10715 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10716 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10717 * 'deadbeef'.hex # => 3735928559
10718 *
10719 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10720 *
10721 * '0xfff'.hex # => 4095
10722 * '0xfffg'.hex # => 4095
10723 *
 *  Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10725 *
10726 * '-fff'.hex # => -4095
10727 * '-0xFFF'.hex # => -4095
10728 *
10729 * For any substring not described above, returns zero:
10730 *
10731 * 'xxx'.hex # => 0
10732 * ''.hex # => 0
10733 *
10734 * Note that, unlike #oct, this method interprets only hexadecimal,
10735 * and not binary, octal, or decimal notations:
10736 *
10737 * '0b111'.hex # => 45329
10738 * '0o777'.hex # => 0
10739 * '0d999'.hex # => 55705
10740 *
10741 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10742 */
10743
10744static VALUE
10745rb_str_hex(VALUE str)
10746{
10747 return rb_str_to_inum(str, 16, FALSE);
10748}
10749
10750
10751/*
10752 * call-seq:
10753 * oct -> integer
10754 *
10755 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10756 * returns their value as an integer.
10757 *
10758 * In brief:
10759 *
10760 * # Interpreted as octal.
10761 * '777'.oct # => 511
10762 * '777x'.oct # => 511
10763 * '0777'.oct # => 511
10764 * '0o777'.oct # => 511
10765 * '-777'.oct # => -511
10766 * # Not interpreted as octal.
10767 * '0b111'.oct # => 7 # Interpreted as binary.
10768 * '0d999'.oct # => 999 # Interpreted as decimal.
10769 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10770 *
10771 * The leading substring is interpreted as octal when it begins with:
10772 *
 *  - One or more characters representing octal digits
10774 * (each in the range <tt>'0'..'7'</tt>);
10775 * the string to be interpreted ends at the first character that does not represent an octal digit:
10776 *
 *      '7'.oct      # => 7
10778 * '11'.oct # => 9
10779 * '777'.oct # => 511
10780 * '0777'.oct # => 511
10781 * '7778'.oct # => 511
10782 * '777x'.oct # => 511
10783 *
10784 * - <tt>'0o'</tt>, followed by one or more octal digits:
10785 *
10786 * '0o777'.oct # => 511
10787 * '0o7778'.oct # => 511
10788 *
10789 * The leading substring is _not_ interpreted as octal when it begins with:
10790 *
10791 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10792 * (each in the range <tt>'0'..'1'</tt>);
10793 * the string to be interpreted ends at the first character that does not represent a binary digit.
10794 * the string is interpreted as binary digits (base 2):
10795 *
10796 * '0b111'.oct # => 7
10797 * '0b1112'.oct # => 7
10798 *
10799 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10800 * (each in the range <tt>'0'..'9'</tt>);
10801 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10802 * the string is interpreted as decimal digits (base 10):
10803 *
10804 * '0d999'.oct # => 999
10805 * '0d999x'.oct # => 999
10806 *
10807 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10808 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10809 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10810 * the string is interpreted as hexadecimal digits (base 16):
10811 *
10812 * '0xfff'.oct # => 4095
10813 * '0xfffg'.oct # => 4095
10814 *
 *  Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10816 *
10817 * '-777'.oct # => -511
10818 * '-0777'.oct # => -511
10819 * '-0b111'.oct # => -7
10820 * '-0xfff'.oct # => -4095
10821 *
10822 * For any substring not described above, returns zero:
10823 *
10824 * 'foo'.oct # => 0
10825 * ''.oct # => 0
10826 *
10827 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10828 */
10829
10830static VALUE
10831rb_str_oct(VALUE str)
10832{
10833 return rb_str_to_inum(str, -8, FALSE);
10834}
10835
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/*
 * Lock that rb_str_crypt holds around its crypt() call when crypt_r() is
 * not available; it is released through that function's CRYPT_END().
 */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
#endif
10844
10845/*
10846 * call-seq:
10847 * crypt(salt_str) -> new_string
10848 *
10849 * Returns the string generated by calling <code>crypt(3)</code>
10850 * standard library function with <code>str</code> and
10851 * <code>salt_str</code>, in this order, as its arguments. Please do
10852 * not use this method any longer. It is legacy; provided only for
10853 * backward compatibility with ruby scripts in earlier days. It is
10854 * bad to use in contemporary programs for several reasons:
10855 *
10856 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10857 * run. The generated string lacks data portability.
10858 *
10859 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10860 * (i.e. silently ends up in unexpected results).
10861 *
10862 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10863 * thread safe.
10864 *
10865 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10866 * very very weak. According to its manpage, Linux's traditional
10867 * <code>crypt(3)</code> output has only 2**56 variations; too
10868 * easy to brute force today. And this is the default behaviour.
10869 *
10870 * * In order to make things robust some OSes implement so-called
10871 * "modular" usage. To go through, you have to do a complex
10872 * build-up of the <code>salt_str</code> parameter, by hand.
10873 * Failure in generation of a proper salt string tends not to
10874 * yield any errors; typos in parameters are normally not
10875 * detectable.
10876 *
10877 * * For instance, in the following example, the second invocation
10878 * of String#crypt is wrong; it has a typo in "round=" (lacks
10879 * "s"). However the call does not fail and something unexpected
10880 * is generated.
10881 *
10882 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10883 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10884 *
10885 * * Even in the "modular" mode, some hash functions are considered
10886 * archaic and no longer recommended at all; for instance module
10887 * <code>$1$</code> is officially abandoned by its author: see
10888 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10889 * instance module <code>$3$</code> is considered completely
10890 * broken: see the manpage of FreeBSD.
10891 *
10892 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10893 * written above, <code>crypt(3)</code> on Mac OS never fails.
10894 * This means even if you build up a proper salt string it
10895 * generates a traditional DES hash anyways, and there is no way
10896 * for you to be aware of.
10897 *
10898 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10899 *
10900 * If for some reason you cannot migrate to other secure contemporary
10901 * password hashing algorithms, install the string-crypt gem and
10902 * <code>require 'string/crypt'</code> to continue using it.
10903 */
10904
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /* CRYPT_END() undoes the setup of whichever path is compiled in:
     * it ends the ALLOCV buffer on the crypt_r() path and unlocks
     * crypt_mutex on the crypt() path. */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    char *tmp_buf;
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    /* presumably rejects wide-character encodings, whose bytes cannot be
     * handed to crypt as a C string -- TODO confirm */
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* Need two non-NUL salt bytes. */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* Use a copy of the first two salt bytes with the high bit cleared
     * when either of them is not ASCII. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* Save errno first: CRYPT_END() runs before it is reported. */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
#ifdef HAVE_CRYPT_R
    /* Copy the result into a Ruby string before the buffer is released. */
    result = rb_str_new_cstr(res);
    CRYPT_END();
#else
    // We need to copy this buffer because it's static and we need to unlock the mutex
    // before allocating a new object (the string to be returned). If we allocate while
    // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
    // if other ractors are waiting on this lock.
    size_t res_size = strlen(res)+1;
    tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
    memcpy(tmp_buf, res, res_size);
    res = tmp_buf;
    CRYPT_END();
    result = rb_str_new_cstr(res);
#endif
    return result;
}
10973
10974
10975/*
10976 * call-seq:
10977 * ord -> integer
10978 *
10979 * :include: doc/string/ord.rdoc
10980 *
10981 */
10982
10983static VALUE
10984rb_str_ord(VALUE s)
10985{
10986 unsigned int c;
10987
10988 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10989 return UINT2NUM(c);
10990}
10991/*
10992 * call-seq:
10993 * sum(n = 16) -> integer
10994 *
10995 * :include: doc/string/sum.rdoc
10996 *
10997 */
10998
10999static VALUE
11000rb_str_sum(int argc, VALUE *argv, VALUE str)
11001{
11002 int bits = 16;
11003 char *ptr, *p, *pend;
11004 long len;
11005 VALUE sum = INT2FIX(0);
11006 unsigned long sum0 = 0;
11007
11008 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11009 bits = 0;
11010 }
11011 ptr = p = RSTRING_PTR(str);
11012 len = RSTRING_LEN(str);
11013 pend = p + len;
11014
11015 while (p < pend) {
11016 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11017 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11018 str_mod_check(str, ptr, len);
11019 sum0 = 0;
11020 }
11021 sum0 += (unsigned char)*p;
11022 p++;
11023 }
11024
11025 if (bits == 0) {
11026 if (sum0) {
11027 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11028 }
11029 }
11030 else {
11031 if (sum == INT2FIX(0)) {
11032 if (bits < (int)sizeof(long)*CHAR_BIT) {
11033 sum0 &= (((unsigned long)1)<<bits)-1;
11034 }
11035 sum = LONG2FIX(sum0);
11036 }
11037 else {
11038 VALUE mod;
11039
11040 if (sum0) {
11041 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11042 }
11043
11044 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11045 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11046 sum = rb_funcall(sum, '&', 1, mod);
11047 }
11048 }
11049 return sum;
11050}
11051
11052static VALUE
11053rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11054{
11055 rb_encoding *enc;
11056 VALUE w;
11057 long width, len, flen = 1, fclen = 1;
11058 VALUE res;
11059 char *p;
11060 const char *f = " ";
11061 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11062 VALUE pad;
11063 int singlebyte = 1, cr;
11064 int termlen;
11065
11066 rb_scan_args(argc, argv, "11", &w, &pad);
11067 enc = STR_ENC_GET(str);
11068 termlen = rb_enc_mbminlen(enc);
11069 width = NUM2LONG(w);
11070 if (argc == 2) {
11071 StringValue(pad);
11072 enc = rb_enc_check(str, pad);
11073 f = RSTRING_PTR(pad);
11074 flen = RSTRING_LEN(pad);
11075 fclen = str_strlen(pad, enc); /* rb_enc_check */
11076 singlebyte = single_byte_optimizable(pad);
11077 if (flen == 0 || fclen == 0) {
11078 rb_raise(rb_eArgError, "zero width padding");
11079 }
11080 }
11081 len = str_strlen(str, enc); /* rb_enc_check */
11082 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11083 n = width - len;
11084 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11085 rlen = n - llen;
11086 cr = ENC_CODERANGE(str);
11087 if (flen > 1) {
11088 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11089 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11090 }
11091 size = RSTRING_LEN(str);
11092 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11093 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11094 (len += llen2 + rlen2) >= LONG_MAX - size) {
11095 rb_raise(rb_eArgError, "argument too big");
11096 }
11097 len += size;
11098 res = str_enc_new(rb_cString, 0, len, enc);
11099 p = RSTRING_PTR(res);
11100 if (flen <= 1) {
11101 memset(p, *f, llen);
11102 p += llen;
11103 }
11104 else {
11105 while (llen >= fclen) {
11106 memcpy(p,f,flen);
11107 p += flen;
11108 llen -= fclen;
11109 }
11110 if (llen > 0) {
11111 memcpy(p, f, llen2);
11112 p += llen2;
11113 }
11114 }
11115 memcpy(p, RSTRING_PTR(str), size);
11116 p += size;
11117 if (flen <= 1) {
11118 memset(p, *f, rlen);
11119 p += rlen;
11120 }
11121 else {
11122 while (rlen >= fclen) {
11123 memcpy(p,f,flen);
11124 p += flen;
11125 rlen -= fclen;
11126 }
11127 if (rlen > 0) {
11128 memcpy(p, f, rlen2);
11129 p += rlen2;
11130 }
11131 }
11132 TERM_FILL(p, termlen);
11133 STR_SET_LEN(res, p-RSTRING_PTR(res));
11134
11135 if (argc == 2)
11136 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11137 if (cr != ENC_CODERANGE_BROKEN)
11138 ENC_CODERANGE_SET(res, cr);
11139
11140 RB_GC_GUARD(pad);
11141 return res;
11142}
11143
11144
11145/*
11146 * call-seq:
11147 * ljust(width, pad_string = ' ') -> new_string
11148 *
11149 * :include: doc/string/ljust.rdoc
11150 *
11151 */
11152
11153static VALUE
11154rb_str_ljust(int argc, VALUE *argv, VALUE str)
11155{
11156 return rb_str_justify(argc, argv, str, 'l');
11157}
11158
11159/*
11160 * call-seq:
11161 * rjust(width, pad_string = ' ') -> new_string
11162 *
11163 * :include: doc/string/rjust.rdoc
11164 *
11165 */
11166
11167static VALUE
11168rb_str_rjust(int argc, VALUE *argv, VALUE str)
11169{
11170 return rb_str_justify(argc, argv, str, 'r');
11171}
11172
11173
11174/*
11175 * call-seq:
11176 * center(size, pad_string = ' ') -> new_string
11177 *
11178 * :include: doc/string/center.rdoc
11179 *
11180 */
11181
11182static VALUE
11183rb_str_center(int argc, VALUE *argv, VALUE str)
11184{
11185 return rb_str_justify(argc, argv, str, 'c');
11186}
11187
11188/*
11189 * call-seq:
11190 * partition(pattern) -> [pre_match, first_match, post_match]
11191 *
11192 * :include: doc/string/partition.rdoc
11193 *
11194 */
11195
11196static VALUE
11197rb_str_partition(VALUE str, VALUE sep)
11198{
11199 long pos;
11200
11201 sep = get_pat_quoted(sep, 0);
11202 if (RB_TYPE_P(sep, T_REGEXP)) {
11203 if (rb_reg_search(sep, str, 0, 0) < 0) {
11204 goto failed;
11205 }
11206 VALUE match = rb_backref_get();
11207 struct re_registers *regs = RMATCH_REGS(match);
11208
11209 pos = BEG(0);
11210 sep = rb_str_subseq(str, pos, END(0) - pos);
11211 }
11212 else {
11213 pos = rb_str_index(str, sep, 0);
11214 if (pos < 0) goto failed;
11215 }
11216 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11217 sep,
11218 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11219 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11220
11221 failed:
11222 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11223}
11224
11225/*
11226 * call-seq:
11227 * rpartition(pattern) -> [pre_match, last_match, post_match]
11228 *
11229 * :include: doc/string/rpartition.rdoc
11230 *
11231 */
11232
11233static VALUE
11234rb_str_rpartition(VALUE str, VALUE sep)
11235{
11236 long pos = RSTRING_LEN(str);
11237
11238 sep = get_pat_quoted(sep, 0);
11239 if (RB_TYPE_P(sep, T_REGEXP)) {
11240 if (rb_reg_search(sep, str, pos, 1) < 0) {
11241 goto failed;
11242 }
11243 VALUE match = rb_backref_get();
11244 struct re_registers *regs = RMATCH_REGS(match);
11245
11246 pos = BEG(0);
11247 sep = rb_str_subseq(str, pos, END(0) - pos);
11248 }
11249 else {
11250 pos = rb_str_sublen(str, pos);
11251 pos = rb_str_rindex(str, sep, pos);
11252 if (pos < 0) {
11253 goto failed;
11254 }
11255 }
11256
11257 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11258 sep,
11259 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11260 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11261 failed:
11262 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11263}
11264
11265/*
11266 * call-seq:
11267 * start_with?(*patterns) -> true or false
11268 *
11269 * :include: doc/string/start_with_p.rdoc
11270 *
11271 */
11272
11273static VALUE
11274rb_str_start_with(int argc, VALUE *argv, VALUE str)
11275{
11276 int i;
11277
11278 for (i=0; i<argc; i++) {
11279 VALUE tmp = argv[i];
11280 if (RB_TYPE_P(tmp, T_REGEXP)) {
11281 if (rb_reg_start_with_p(tmp, str))
11282 return Qtrue;
11283 }
11284 else {
11285 const char *p, *s, *e;
11286 long slen, tlen;
11287 rb_encoding *enc;
11288
11289 StringValue(tmp);
11290 enc = rb_enc_check(str, tmp);
11291 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11292 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11293 p = RSTRING_PTR(str);
11294 e = p + slen;
11295 s = p + tlen;
11296 if (!at_char_right_boundary(p, s, e, enc))
11297 continue;
11298 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11299 return Qtrue;
11300 }
11301 }
11302 return Qfalse;
11303}
11304
11305/*
11306 * call-seq:
11307 * end_with?(*strings) -> true or false
11308 *
11309 * :include: doc/string/end_with_p.rdoc
11310 *
11311 */
11312
11313static VALUE
11314rb_str_end_with(int argc, VALUE *argv, VALUE str)
11315{
11316 int i;
11317
11318 for (i=0; i<argc; i++) {
11319 VALUE tmp = argv[i];
11320 const char *p, *s, *e;
11321 long slen, tlen;
11322 rb_encoding *enc;
11323
11324 StringValue(tmp);
11325 enc = rb_enc_check(str, tmp);
11326 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11327 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11328 p = RSTRING_PTR(str);
11329 e = p + slen;
11330 s = e - tlen;
11331 if (!at_char_boundary(p, s, e, enc))
11332 continue;
11333 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11334 return Qtrue;
11335 }
11336 return Qfalse;
11337}
11338
11348static long
11349deleted_prefix_length(VALUE str, VALUE prefix)
11350{
11351 const char *strptr, *prefixptr;
11352 long olen, prefixlen;
11353 rb_encoding *enc = rb_enc_get(str);
11354
11355 StringValue(prefix);
11356
11357 if (!is_broken_string(prefix) ||
11358 !rb_enc_asciicompat(enc) ||
11359 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11360 enc = rb_enc_check(str, prefix);
11361 }
11362
11363 /* return 0 if not start with prefix */
11364 prefixlen = RSTRING_LEN(prefix);
11365 if (prefixlen <= 0) return 0;
11366 olen = RSTRING_LEN(str);
11367 if (olen < prefixlen) return 0;
11368 strptr = RSTRING_PTR(str);
11369 prefixptr = RSTRING_PTR(prefix);
11370 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11371 if (is_broken_string(prefix)) {
11372 if (!is_broken_string(str)) {
11373 /* prefix in a valid string cannot be broken */
11374 return 0;
11375 }
11376 const char *strend = strptr + olen;
11377 const char *after_prefix = strptr + prefixlen;
11378 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11379 /* prefix does not end at char-boundary */
11380 return 0;
11381 }
11382 }
11383 /* prefix part in `str` also should be valid. */
11384
11385 return prefixlen;
11386}
11387
11388/*
11389 * call-seq:
11390 * delete_prefix!(prefix) -> self or nil
11391 *
11392 * Like String#delete_prefix, except that +self+ is modified in place;
11393 * returns +self+ if the prefix is removed, +nil+ otherwise.
11394 *
11395 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11396 */
11397
11398static VALUE
11399rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11400{
11401 long prefixlen;
11402 str_modify_keep_cr(str);
11403
11404 prefixlen = deleted_prefix_length(str, prefix);
11405 if (prefixlen <= 0) return Qnil;
11406
11407 return rb_str_drop_bytes(str, prefixlen);
11408}
11409
11410/*
11411 * call-seq:
11412 * delete_prefix(prefix) -> new_string
11413 *
11414 * :include: doc/string/delete_prefix.rdoc
11415 *
11416 */
11417
11418static VALUE
11419rb_str_delete_prefix(VALUE str, VALUE prefix)
11420{
11421 long prefixlen;
11422
11423 prefixlen = deleted_prefix_length(str, prefix);
11424 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11425
11426 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11427}
11428
11438static long
11439deleted_suffix_length(VALUE str, VALUE suffix)
11440{
11441 const char *strptr, *suffixptr;
11442 long olen, suffixlen;
11443 rb_encoding *enc;
11444
11445 StringValue(suffix);
11446 if (is_broken_string(suffix)) return 0;
11447 enc = rb_enc_check(str, suffix);
11448
11449 /* return 0 if not start with suffix */
11450 suffixlen = RSTRING_LEN(suffix);
11451 if (suffixlen <= 0) return 0;
11452 olen = RSTRING_LEN(str);
11453 if (olen < suffixlen) return 0;
11454 strptr = RSTRING_PTR(str);
11455 suffixptr = RSTRING_PTR(suffix);
11456 const char *strend = strptr + olen;
11457 const char *before_suffix = strend - suffixlen;
11458 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11459 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11460
11461 return suffixlen;
11462}
11463
11464/*
11465 * call-seq:
11466 * delete_suffix!(suffix) -> self or nil
11467 *
11468 * Like String#delete_suffix, except that +self+ is modified in place;
11469 * returns +self+ if the suffix is removed, +nil+ otherwise.
11470 *
11471 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11472 */
11473
11474static VALUE
11475rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11476{
11477 long olen, suffixlen, len;
11478 str_modifiable(str);
11479
11480 suffixlen = deleted_suffix_length(str, suffix);
11481 if (suffixlen <= 0) return Qnil;
11482
11483 olen = RSTRING_LEN(str);
11484 str_modify_keep_cr(str);
11485 len = olen - suffixlen;
11486 STR_SET_LEN(str, len);
11487 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11488 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11490 }
11491 return str;
11492}
11493
11494/*
11495 * call-seq:
11496 * delete_suffix(suffix) -> new_string
11497 *
11498 * :include: doc/string/delete_suffix.rdoc
11499 *
11500 */
11501
11502static VALUE
11503rb_str_delete_suffix(VALUE str, VALUE suffix)
11504{
11505 long suffixlen;
11506
11507 suffixlen = deleted_suffix_length(str, suffix);
11508 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11509
11510 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11511}
11512
11513void
11514rb_str_setter(VALUE val, ID id, VALUE *var)
11515{
11516 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11517 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11518 }
11519 *var = val;
11520}
11521
11522static void
11523nil_setter_warning(ID id)
11524{
11525 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11526}
11527
11528void
11529rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11530{
11531 rb_str_setter(val, id, var);
11532 if (!NIL_P(*var)) {
11533 nil_setter_warning(id);
11534 }
11535}
11536
11537static void
11538rb_fs_setter(VALUE val, ID id, VALUE *var)
11539{
11540 val = rb_fs_check(val);
11541 if (!val) {
11542 rb_raise(rb_eTypeError,
11543 "value of %"PRIsVALUE" must be String or Regexp",
11544 rb_id2str(id));
11545 }
11546 if (!NIL_P(val)) {
11547 nil_setter_warning(id);
11548 }
11549 *var = val;
11550}
11551
11552
11553/*
11554 * call-seq:
11555 * force_encoding(encoding) -> self
11556 *
11557 * :include: doc/string/force_encoding.rdoc
11558 *
11559 */
11560
11561static VALUE
11562rb_str_force_encoding(VALUE str, VALUE enc)
11563{
11564 str_modifiable(str);
11565
11566 rb_encoding *encoding = rb_to_encoding(enc);
11567 int idx = rb_enc_to_index(encoding);
11568
11569 // If the encoding is unchanged, we do nothing.
11570 if (ENCODING_GET(str) == idx) {
11571 return str;
11572 }
11573
11574 rb_enc_associate_index(str, idx);
11575
11576 // If the coderange was 7bit and the new encoding is ASCII-compatible
11577 // we can keep the coderange.
11578 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11579 return str;
11580 }
11581
11583 return str;
11584}
11585
11586/*
11587 * call-seq:
11588 * b -> new_string
11589 *
11590 * :include: doc/string/b.rdoc
11591 *
11592 */
11593
11594static VALUE
11595rb_str_b(VALUE str)
11596{
11597 VALUE str2;
11598 if (STR_EMBED_P(str)) {
11599 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11600 }
11601 else {
11602 str2 = str_alloc_heap(rb_cString);
11603 }
11604 str_replace_shared_without_enc(str2, str);
11605
11606 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11607 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11608 // If we know the receiver's code range then we know the result's code range.
11609 int cr = ENC_CODERANGE(str);
11610 switch (cr) {
11611 case ENC_CODERANGE_7BIT:
11613 break;
11617 break;
11618 default:
11619 ENC_CODERANGE_CLEAR(str2);
11620 break;
11621 }
11622 }
11623
11624 return str2;
11625}
11626
11627/*
11628 * call-seq:
11629 * valid_encoding? -> true or false
11630 *
11631 * :include: doc/string/valid_encoding_p.rdoc
11632 *
11633 */
11634
11635static VALUE
11636rb_str_valid_encoding_p(VALUE str)
11637{
11638 int cr = rb_enc_str_coderange(str);
11639
11640 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11641}
11642
11643/*
11644 * call-seq:
11645 * ascii_only? -> true or false
11646 *
11647 * Returns whether +self+ contains only ASCII characters:
11648 *
11649 * 'abc'.ascii_only? # => true
11650 * "abc\u{6666}".ascii_only? # => false
11651 *
11652 * Related: see {Querying}[rdoc-ref:String@Querying].
11653 */
11654
11655static VALUE
11656rb_str_is_ascii_only_p(VALUE str)
11657{
11658 int cr = rb_enc_str_coderange(str);
11659
11660 return RBOOL(cr == ENC_CODERANGE_7BIT);
11661}
11662
11663VALUE
11665{
11666 static const char ellipsis[] = "...";
11667 const long ellipsislen = sizeof(ellipsis) - 1;
11668 rb_encoding *const enc = rb_enc_get(str);
11669 const long blen = RSTRING_LEN(str);
11670 const char *const p = RSTRING_PTR(str), *e = p + blen;
11671 VALUE estr, ret = 0;
11672
11673 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11674 if (len * rb_enc_mbminlen(enc) >= blen ||
11675 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11676 ret = str;
11677 }
11678 else if (len <= ellipsislen ||
11679 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11680 if (rb_enc_asciicompat(enc)) {
11681 ret = rb_str_new(ellipsis, len);
11682 rb_enc_associate(ret, enc);
11683 }
11684 else {
11685 estr = rb_usascii_str_new(ellipsis, len);
11686 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11687 }
11688 }
11689 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11690 rb_str_cat(ret, ellipsis, ellipsislen);
11691 }
11692 else {
11693 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11694 rb_enc_from_encoding(enc), 0, Qnil);
11695 rb_str_append(ret, estr);
11696 }
11697 return ret;
11698}
11699
11700static VALUE
11701str_compat_and_valid(VALUE str, rb_encoding *enc)
11702{
11703 int cr;
11704 str = StringValue(str);
11705 cr = rb_enc_str_coderange(str);
11706 if (cr == ENC_CODERANGE_BROKEN) {
11707 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11708 }
11709 else {
11710 rb_encoding *e = STR_ENC_GET(str);
11711 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11712 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11713 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11714 }
11715 }
11716 return str;
11717}
11718
11719static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11720
11721VALUE
11723{
11724 rb_encoding *enc = STR_ENC_GET(str);
11725 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11726}
11727
11728VALUE
11729rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11730{
11731 int cr = ENC_CODERANGE_UNKNOWN;
11732 if (enc == STR_ENC_GET(str)) {
11733 /* cached coderange makes sense only when enc equals the
11734 * actual encoding of str */
11735 cr = ENC_CODERANGE(str);
11736 }
11737 return enc_str_scrub(enc, str, repl, cr);
11738}
11739
11740static VALUE
11741enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11742{
11743 int encidx;
11744 VALUE buf = Qnil;
11745 const char *rep, *p, *e, *p1, *sp;
11746 long replen = -1;
11747 long slen;
11748
11749 if (rb_block_given_p()) {
11750 if (!NIL_P(repl))
11751 rb_raise(rb_eArgError, "both of block and replacement given");
11752 replen = 0;
11753 }
11754
11755 if (ENC_CODERANGE_CLEAN_P(cr))
11756 return Qnil;
11757
11758 if (!NIL_P(repl)) {
11759 repl = str_compat_and_valid(repl, enc);
11760 }
11761
11762 if (rb_enc_dummy_p(enc)) {
11763 return Qnil;
11764 }
11765 encidx = rb_enc_to_index(enc);
11766
11767#define DEFAULT_REPLACE_CHAR(str) do { \
11768 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11769 rep = replace; replen = (int)sizeof(replace); \
11770 } while (0)
11771
11772 slen = RSTRING_LEN(str);
11773 p = RSTRING_PTR(str);
11774 e = RSTRING_END(str);
11775 p1 = p;
11776 sp = p;
11777
11778 if (rb_enc_asciicompat(enc)) {
11779 int rep7bit_p;
11780 if (!replen) {
11781 rep = NULL;
11782 rep7bit_p = FALSE;
11783 }
11784 else if (!NIL_P(repl)) {
11785 rep = RSTRING_PTR(repl);
11786 replen = RSTRING_LEN(repl);
11787 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11788 }
11789 else if (encidx == rb_utf8_encindex()) {
11790 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11791 rep7bit_p = FALSE;
11792 }
11793 else {
11794 DEFAULT_REPLACE_CHAR("?");
11795 rep7bit_p = TRUE;
11796 }
11797 cr = ENC_CODERANGE_7BIT;
11798
11799 p = search_nonascii(p, e);
11800 if (!p) {
11801 p = e;
11802 }
11803 while (p < e) {
11804 int ret = rb_enc_precise_mbclen(p, e, enc);
11805 if (MBCLEN_NEEDMORE_P(ret)) {
11806 break;
11807 }
11808 else if (MBCLEN_CHARFOUND_P(ret)) {
11810 p += MBCLEN_CHARFOUND_LEN(ret);
11811 }
11812 else if (MBCLEN_INVALID_P(ret)) {
11813 /*
11814 * p1~p: valid ascii/multibyte chars
11815 * p ~e: invalid bytes + unknown bytes
11816 */
11817 long clen = rb_enc_mbmaxlen(enc);
11818 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11819 if (p > p1) {
11820 rb_str_buf_cat(buf, p1, p - p1);
11821 }
11822
11823 if (e - p < clen) clen = e - p;
11824 if (clen <= 2) {
11825 clen = 1;
11826 }
11827 else {
11828 const char *q = p;
11829 clen--;
11830 for (; clen > 1; clen--) {
11831 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11832 if (MBCLEN_NEEDMORE_P(ret)) break;
11833 if (MBCLEN_INVALID_P(ret)) continue;
11835 }
11836 }
11837 if (rep) {
11838 rb_str_buf_cat(buf, rep, replen);
11839 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11840 }
11841 else {
11842 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11843 str_mod_check(str, sp, slen);
11844 repl = str_compat_and_valid(repl, enc);
11845 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11848 }
11849 p += clen;
11850 p1 = p;
11851 p = search_nonascii(p, e);
11852 if (!p) {
11853 p = e;
11854 break;
11855 }
11856 }
11857 else {
11859 }
11860 }
11861 if (NIL_P(buf)) {
11862 if (p == e) {
11863 ENC_CODERANGE_SET(str, cr);
11864 return Qnil;
11865 }
11866 buf = rb_str_buf_new(RSTRING_LEN(str));
11867 }
11868 if (p1 < p) {
11869 rb_str_buf_cat(buf, p1, p - p1);
11870 }
11871 if (p < e) {
11872 if (rep) {
11873 rb_str_buf_cat(buf, rep, replen);
11874 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11875 }
11876 else {
11877 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11878 str_mod_check(str, sp, slen);
11879 repl = str_compat_and_valid(repl, enc);
11880 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11883 }
11884 }
11885 }
11886 else {
11887 /* ASCII incompatible */
11888 long mbminlen = rb_enc_mbminlen(enc);
11889 if (!replen) {
11890 rep = NULL;
11891 }
11892 else if (!NIL_P(repl)) {
11893 rep = RSTRING_PTR(repl);
11894 replen = RSTRING_LEN(repl);
11895 }
11896 else if (encidx == ENCINDEX_UTF_16BE) {
11897 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11898 }
11899 else if (encidx == ENCINDEX_UTF_16LE) {
11900 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11901 }
11902 else if (encidx == ENCINDEX_UTF_32BE) {
11903 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11904 }
11905 else if (encidx == ENCINDEX_UTF_32LE) {
11906 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11907 }
11908 else {
11909 DEFAULT_REPLACE_CHAR("?");
11910 }
11911
11912 while (p < e) {
11913 int ret = rb_enc_precise_mbclen(p, e, enc);
11914 if (MBCLEN_NEEDMORE_P(ret)) {
11915 break;
11916 }
11917 else if (MBCLEN_CHARFOUND_P(ret)) {
11918 p += MBCLEN_CHARFOUND_LEN(ret);
11919 }
11920 else if (MBCLEN_INVALID_P(ret)) {
11921 const char *q = p;
11922 long clen = rb_enc_mbmaxlen(enc);
11923 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11924 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11925
11926 if (e - p < clen) clen = e - p;
11927 if (clen <= mbminlen * 2) {
11928 clen = mbminlen;
11929 }
11930 else {
11931 clen -= mbminlen;
11932 for (; clen > mbminlen; clen-=mbminlen) {
11933 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11934 if (MBCLEN_NEEDMORE_P(ret)) break;
11935 if (MBCLEN_INVALID_P(ret)) continue;
11937 }
11938 }
11939 if (rep) {
11940 rb_str_buf_cat(buf, rep, replen);
11941 }
11942 else {
11943 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11944 str_mod_check(str, sp, slen);
11945 repl = str_compat_and_valid(repl, enc);
11946 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11947 }
11948 p += clen;
11949 p1 = p;
11950 }
11951 else {
11953 }
11954 }
11955 if (NIL_P(buf)) {
11956 if (p == e) {
11958 return Qnil;
11959 }
11960 buf = rb_str_buf_new(RSTRING_LEN(str));
11961 }
11962 if (p1 < p) {
11963 rb_str_buf_cat(buf, p1, p - p1);
11964 }
11965 if (p < e) {
11966 if (rep) {
11967 rb_str_buf_cat(buf, rep, replen);
11968 }
11969 else {
11970 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11971 str_mod_check(str, sp, slen);
11972 repl = str_compat_and_valid(repl, enc);
11973 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11974 }
11975 }
11977 }
11978 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11979 return buf;
11980}
11981
11982/*
11983 * call-seq:
11984 * scrub(replacement_string = default_replacement_string) -> new_string
11985 * scrub{|sequence| ... } -> new_string
11986 *
11987 * :include: doc/string/scrub.rdoc
11988 *
11989 */
11990static VALUE
11991str_scrub(int argc, VALUE *argv, VALUE str)
11992{
11993 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11994 VALUE new = rb_str_scrub(str, repl);
11995 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11996}
11997
11998/*
11999 * call-seq:
12000 * scrub!(replacement_string = default_replacement_string) -> self
12001 * scrub!{|sequence| ... } -> self
12002 *
12003 * Like String#scrub, except that:
12004 *
12005 * - Any replacements are made in +self+.
12006 * - Returns +self+.
12007 *
12008 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12009 *
12010 */
12011static VALUE
12012str_scrub_bang(int argc, VALUE *argv, VALUE str)
12013{
12014 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12015 VALUE new = rb_str_scrub(str, repl);
12016 if (!NIL_P(new)) rb_str_replace(str, new);
12017 return str;
12018}
12019
12020static ID id_normalize;
12021static ID id_normalized_p;
12022static VALUE mUnicodeNormalize;
12023
12024static VALUE
12025unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12026{
12027 static int UnicodeNormalizeRequired = 0;
12028 VALUE argv2[2];
12029
12030 if (!UnicodeNormalizeRequired) {
12031 rb_require("unicode_normalize/normalize.rb");
12032 UnicodeNormalizeRequired = 1;
12033 }
12034 argv2[0] = str;
12035 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12036 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12037}
12038
12039/*
12040 * call-seq:
12041 * unicode_normalize(form = :nfc) -> string
12042 *
12043 * :include: doc/string/unicode_normalize.rdoc
12044 *
12045 */
12046static VALUE
12047rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12048{
12049 return unicode_normalize_common(argc, argv, str, id_normalize);
12050}
12051
12052/*
12053 * call-seq:
12054 * unicode_normalize!(form = :nfc) -> self
12055 *
12056 * Like String#unicode_normalize, except that the normalization
12057 * is performed on +self+ (not on a copy of +self+).
12058 *
12059 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12060 *
12061 */
12062static VALUE
12063rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12064{
12065 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12066}
12067
12068/* call-seq:
12069 * unicode_normalized?(form = :nfc) -> true or false
12070 *
12071 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12072 * see String#unicode_normalize.
12073 *
12074 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12075 *
12076 * Examples:
12077 *
12078 * "a\u0300".unicode_normalized? # => false
12079 * "a\u0300".unicode_normalized?(:nfd) # => true
12080 * "\u00E0".unicode_normalized? # => true
12081 * "\u00E0".unicode_normalized?(:nfd) # => false
12082 *
12083 *
12084 * Raises an exception if +self+ is not in a Unicode encoding:
12085 *
12086 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12087 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12088 *
12089 * Related: see {Querying}[rdoc-ref:String@Querying].
12090 */
12091static VALUE
12092rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12093{
12094 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12095}
12096
12097/**********************************************************************
12098 * Document-class: Symbol
12099 *
12100 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12101 *
12102 * You can create a +Symbol+ object explicitly with:
12103 *
12104 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12105 *
12106 * The same +Symbol+ object will be
12107 * created for a given name or string for the duration of a program's
12108 * execution, regardless of the context or meaning of that name. Thus
12109 * if <code>Fred</code> is a constant in one context, a method in
12110 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12111 * will be the same object in all three contexts.
12112 *
12113 * module One
12114 * class Fred
12115 * end
12116 * $f1 = :Fred
12117 * end
12118 * module Two
12119 * Fred = 1
12120 * $f2 = :Fred
12121 * end
12122 * def Fred()
12123 * end
12124 * $f3 = :Fred
12125 * $f1.object_id #=> 2514190
12126 * $f2.object_id #=> 2514190
12127 * $f3.object_id #=> 2514190
12128 *
12129 * Constant, method, and variable names are returned as symbols:
12130 *
12131 * module One
12132 * Two = 2
12133 * def three; 3 end
12134 * @four = 4
12135 * @@five = 5
12136 * $six = 6
12137 * end
12138 * seven = 7
12139 *
12140 * One.constants
12141 * # => [:Two]
12142 * One.instance_methods(true)
12143 * # => [:three]
12144 * One.instance_variables
12145 * # => [:@four]
12146 * One.class_variables
12147 * # => [:@@five]
12148 * global_variables.grep(/six/)
12149 * # => [:$six]
12150 * local_variables
12151 * # => [:seven]
12152 *
12153 * A +Symbol+ object differs from a String object in that
12154 * a +Symbol+ object represents an identifier, while a String object
12155 * represents text or data.
12156 *
12157 * == What's Here
12158 *
12159 * First, what's elsewhere. Class +Symbol+:
12160 *
12161 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12162 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12163 *
12164 * Here, class +Symbol+ provides methods that are useful for:
12165 *
12166 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12167 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12168 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12169 *
12170 * === Methods for Querying
12171 *
12172 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12173 * - #=~: Returns the index of the first substring in symbol that matches a
12174 * given Regexp or other object; returns +nil+ if no match is found.
 * - #[], #slice: Returns a substring of symbol
 *   determined by a given index, start/length, range, regexp, or string.
12177 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12178 * - #encoding: Returns the Encoding object that represents the encoding
12179 * of symbol.
12180 * - #end_with?: Returns +true+ if symbol ends with
12181 * any of the given strings.
12182 * - #match: Returns a MatchData object if symbol
12183 * matches a given Regexp; +nil+ otherwise.
12184 * - #match?: Returns +true+ if symbol
12185 * matches a given Regexp; +false+ otherwise.
12186 * - #length, #size: Returns the number of characters in symbol.
12187 * - #start_with?: Returns +true+ if symbol starts with
12188 * any of the given strings.
12189 *
12190 * === Methods for Comparing
12191 *
 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
 *   or larger than a given symbol.
 * - #==, #===: Returns +true+ if a given symbol has the same content and
 *   encoding.
 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
 *   is smaller than, equal to, or larger than a given symbol.
12198 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12199 * after Unicode case folding; +false+ otherwise.
12200 *
12201 * === Methods for Converting
12202 *
12203 * - #capitalize: Returns symbol with the first character upcased
12204 * and all other characters downcased.
12205 * - #downcase: Returns symbol with all characters downcased.
12206 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12207 * - #name: Returns the frozen string corresponding to symbol.
12208 * - #succ, #next: Returns the symbol that is the successor to symbol.
12209 * - #swapcase: Returns symbol with all upcase characters downcased
12210 * and all downcase characters upcased.
12211 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12212 * - #to_s, #id2name: Returns the string corresponding to +self+.
12213 * - #to_sym, #intern: Returns +self+.
12214 * - #upcase: Returns symbol with all characters upcased.
12215 *
12216 */
12217
12218
12219/*
12220 * call-seq:
12221 * symbol == object -> true or false
12222 *
12223 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12224 */
12225
12226#define sym_equal rb_obj_equal /* plain alias; Init_String registers it as both Symbol#== and Symbol#=== */
12227
12228static int
12229sym_printable(const char *s, const char *send, rb_encoding *enc)
12230{
12231 while (s < send) {
12232 int n;
12233 int c = rb_enc_precise_mbclen(s, send, enc);
12234
12235 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12236 n = MBCLEN_CHARFOUND_LEN(c);
12237 c = rb_enc_mbc_to_codepoint(s, send, enc);
12238 if (!rb_enc_isprint(c, enc)) return FALSE;
12239 s += n;
12240 }
12241 return TRUE;
12242}
12243
12244int
12245rb_str_symname_p(VALUE sym)
12246{
12247 rb_encoding *enc;
12248 const char *ptr;
12249 long len;
12250 rb_encoding *resenc = rb_default_internal_encoding();
12251
12252 if (resenc == NULL) resenc = rb_default_external_encoding();
12253 enc = STR_ENC_GET(sym);
12254 ptr = RSTRING_PTR(sym);
12255 len = RSTRING_LEN(sym);
12256 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12257 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12258 return FALSE;
12259 }
12260 return TRUE;
12261}
12262
12263VALUE
12264rb_str_quote_unprintable(VALUE str)
12265{
12266 rb_encoding *enc;
12267 const char *ptr;
12268 long len;
12269 rb_encoding *resenc;
12270
12271 Check_Type(str, T_STRING);
12272 resenc = rb_default_internal_encoding();
12273 if (resenc == NULL) resenc = rb_default_external_encoding();
12274 enc = STR_ENC_GET(str);
12275 ptr = RSTRING_PTR(str);
12276 len = RSTRING_LEN(str);
12277 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12278 !sym_printable(ptr, ptr + len, enc)) {
12279 return rb_str_escape(str);
12280 }
12281 return str;
12282}
12283
12284VALUE
12285rb_id_quote_unprintable(ID id)
12286{
12287 VALUE str = rb_id2str(id);
12288 if (!rb_str_symname_p(str)) {
12289 return rb_str_escape(str);
12290 }
12291 return str;
12292}
12293
12294/*
12295 * call-seq:
12296 * inspect -> string
12297 *
12298 * Returns a string representation of +self+ (including the leading colon):
12299 *
12300 * :foo.inspect # => ":foo"
12301 *
12302 * Related: Symbol#to_s, Symbol#name.
12303 *
12304 */
12305
12306static VALUE
12307sym_inspect(VALUE sym)
12308{
    /*
     * Builds the inspect string in two ways, depending on whether the symbol's
     * name passes rb_str_symname_p():
     *   - no:  take rb_str_inspect() of the name, grow it by one byte and shift
     *          the content right by one;
     *   - yes: allocate a new string one byte longer, in the name's encoding,
     *          and copy the name in at offset 1.
     * Either way byte 0 is then set to ':'.
     */
12309 VALUE str = rb_sym2str(sym);
12310 const char *ptr;
12311 long len;
12312 char *dest;
12313
12314 if (!rb_str_symname_p(str)) {
12315 str = rb_str_inspect(str);
12316 len = RSTRING_LEN(str);
12317 rb_str_resize(str, len + 1);
    /* pointer is read only after the resize; source and destination overlap,
     * hence memmove rather than memcpy */
12318 dest = RSTRING_PTR(str);
12319 memmove(dest + 1, dest, len);
12320 }
12321 else {
12322 rb_encoding *enc = STR_ENC_GET(str);
    /* str is reassigned below, so the name string is kept in its own variable */
12323 VALUE orig_str = str;
12324
12325 len = RSTRING_LEN(orig_str);
12326 str = rb_enc_str_new(0, len + 1, enc);
12327
12328 // Get data pointer after allocation
12329 ptr = RSTRING_PTR(orig_str);
12330 dest = RSTRING_PTR(str);
12331 memcpy(dest + 1, ptr, len);
12332
    /* presumably keeps orig_str referenced while ptr is still in use -- confirm
     * against the RB_GC_GUARD documentation */
12333 RB_GC_GUARD(orig_str);
12334 }
    /* both branches left byte 0 free for the leading colon */
12335 dest[0] = ':';
12336
    /* NOTE(review): the embedded listing numbers jump from 12336 to 12338 here,
     * so a line appears to be missing from this copy -- check upstream. */
12338
12339 return str;
12340}
12341
12342VALUE
12344{
12345 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12346 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12347 return str;
12348}
12349
12350VALUE
12351rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12352{
12353 VALUE obj;
12354
12355 if (argc < 1) {
12356 rb_raise(rb_eArgError, "no receiver given");
12357 }
12358 obj = argv[0];
12359 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12360}
12361
12362/*
12363 * call-seq:
 *   succ -> symbol
12365 *
12366 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12367 *
12368 * :foo.succ # => :fop
12369 *
12370 * Related: String#succ.
12371 */
12372
12373static VALUE
12374sym_succ(VALUE sym)
12375{
12376 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12377}
12378
12379/*
12380 * call-seq:
12381 * self <=> other -> -1, 0, 1, or nil
12382 *
12383 * Compares +self+ and +other+, using String#<=>.
12384 *
12385 * Returns:
12386 *
12387 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12388 * - +nil+, otherwise.
12389 *
12390 * Examples:
12391 *
12392 * :bar <=> :foo # => -1
12393 * :foo <=> :foo # => 0
12394 * :foo <=> :bar # => 1
12395 * :foo <=> 'bar' # => nil
12396 *
12397 * \Class \Symbol includes module Comparable,
12398 * each of whose methods uses Symbol#<=> for comparison.
12399 *
12400 * Related: String#<=>.
12401 */
12402
12403static VALUE
12404sym_cmp(VALUE sym, VALUE other)
12405{
12406 if (!SYMBOL_P(other)) {
12407 return Qnil;
12408 }
12409 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12410}
12411
12412/*
12413 * call-seq:
12414 * casecmp(object) -> -1, 0, 1, or nil
12415 *
12416 * :include: doc/symbol/casecmp.rdoc
12417 *
12418 */
12419
12420static VALUE
12421sym_casecmp(VALUE sym, VALUE other)
12422{
12423 if (!SYMBOL_P(other)) {
12424 return Qnil;
12425 }
12426 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12427}
12428
12429/*
12430 * call-seq:
12431 * casecmp?(object) -> true, false, or nil
12432 *
12433 * :include: doc/symbol/casecmp_p.rdoc
12434 *
12435 */
12436
12437static VALUE
12438sym_casecmp_p(VALUE sym, VALUE other)
12439{
12440 if (!SYMBOL_P(other)) {
12441 return Qnil;
12442 }
12443 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12444}
12445
12446/*
12447 * call-seq:
12448 * symbol =~ object -> integer or nil
12449 *
12450 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12451 * including possible updates to global variables;
12452 * see String#=~.
12453 *
12454 */
12455
12456static VALUE
12457sym_match(VALUE sym, VALUE other)
12458{
12459 return rb_str_match(rb_sym2str(sym), other);
12460}
12461
12462/*
12463 * call-seq:
12464 * match(pattern, offset = 0) -> matchdata or nil
12465 * match(pattern, offset = 0) {|matchdata| } -> object
12466 *
12467 * Equivalent to <tt>self.to_s.match</tt>,
12468 * including possible updates to global variables;
12469 * see String#match.
12470 *
12471 */
12472
12473static VALUE
12474sym_match_m(int argc, VALUE *argv, VALUE sym)
12475{
12476 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12477}
12478
12479/*
12480 * call-seq:
12481 * match?(pattern, offset) -> true or false
12482 *
12483 * Equivalent to <tt>sym.to_s.match?</tt>;
12484 * see String#match.
12485 *
12486 */
12487
12488static VALUE
12489sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12490{
12491 return rb_str_match_m_p(argc, argv, sym);
12492}
12493
12494/*
12495 * call-seq:
12496 * symbol[index] -> string or nil
12497 * symbol[start, length] -> string or nil
12498 * symbol[range] -> string or nil
12499 * symbol[regexp, capture = 0] -> string or nil
12500 * symbol[substring] -> string or nil
12501 *
12502 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12503 *
12504 */
12505
12506static VALUE
12507sym_aref(int argc, VALUE *argv, VALUE sym)
12508{
12509 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12510}
12511
12512/*
12513 * call-seq:
12514 * length -> integer
12515 *
12516 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12517 */
12518
12519static VALUE
12520sym_length(VALUE sym)
12521{
12522 return rb_str_length(rb_sym2str(sym));
12523}
12524
12525/*
12526 * call-seq:
12527 * empty? -> true or false
12528 *
12529 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12530 *
12531 */
12532
12533static VALUE
12534sym_empty(VALUE sym)
12535{
12536 return rb_str_empty(rb_sym2str(sym));
12537}
12538
12539/*
12540 * call-seq:
12541 * upcase(mapping) -> symbol
12542 *
12543 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12544 *
12545 * See String#upcase.
12546 *
12547 */
12548
12549static VALUE
12550sym_upcase(int argc, VALUE *argv, VALUE sym)
12551{
12552 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12553}
12554
12555/*
12556 * call-seq:
12557 * downcase(mapping) -> symbol
12558 *
12559 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12560 *
12561 * See String#downcase.
12562 *
12563 * Related: Symbol#upcase.
12564 *
12565 */
12566
12567static VALUE
12568sym_downcase(int argc, VALUE *argv, VALUE sym)
12569{
12570 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12571}
12572
12573/*
12574 * call-seq:
12575 * capitalize(mapping) -> symbol
12576 *
12577 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12578 *
12579 * See String#capitalize.
12580 *
12581 */
12582
12583static VALUE
12584sym_capitalize(int argc, VALUE *argv, VALUE sym)
12585{
12586 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12587}
12588
12589/*
12590 * call-seq:
12591 * swapcase(mapping) -> symbol
12592 *
12593 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12594 *
12595 * See String#swapcase.
12596 *
12597 */
12598
12599static VALUE
12600sym_swapcase(int argc, VALUE *argv, VALUE sym)
12601{
12602 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12603}
12604
12605/*
12606 * call-seq:
12607 * start_with?(*string_or_regexp) -> true or false
12608 *
12609 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12610 *
12611 */
12612
12613static VALUE
12614sym_start_with(int argc, VALUE *argv, VALUE sym)
12615{
12616 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12617}
12618
12619/*
12620 * call-seq:
12621 * end_with?(*strings) -> true or false
12622 *
12623 *
12624 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12625 *
12626 */
12627
12628static VALUE
12629sym_end_with(int argc, VALUE *argv, VALUE sym)
12630{
12631 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12632}
12633
12634/*
12635 * call-seq:
12636 * encoding -> encoding
12637 *
12638 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12639 *
12640 */
12641
12642static VALUE
12643sym_encoding(VALUE sym)
12644{
12645 return rb_obj_encoding(rb_sym2str(sym));
12646}
12647
12648static VALUE
12649string_for_symbol(VALUE name)
12650{
12651 if (!RB_TYPE_P(name, T_STRING)) {
12652 VALUE tmp = rb_check_string_type(name);
12653 if (NIL_P(tmp)) {
12654 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12655 name);
12656 }
12657 name = tmp;
12658 }
12659 return name;
12660}
12661
12662ID
12664{
12665 if (SYMBOL_P(name)) {
12666 return SYM2ID(name);
12667 }
12668 name = string_for_symbol(name);
12669 return rb_intern_str(name);
12670}
12671
12672VALUE
12674{
12675 if (SYMBOL_P(name)) {
12676 return name;
12677 }
12678 name = string_for_symbol(name);
12679 return rb_str_intern(name);
12680}
12681
12682/*
12683 * call-seq:
12684 * Symbol.all_symbols -> array_of_symbols
12685 *
12686 * Returns an array of all symbols currently in Ruby's symbol table:
12687 *
12688 * Symbol.all_symbols.size # => 9334
12689 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12690 *
12691 */
12692
12693static VALUE
12694sym_all_symbols(VALUE _)
12695{
12696 return rb_sym_all_symbols();
12697}
12698
12699VALUE
12700rb_str_to_interned_str(VALUE str)
12701{
12702 return rb_fstring(str);
12703}
12704
12705VALUE
12706rb_interned_str(const char *ptr, long len)
12707{
12708 struct RString fake_str = {RBASIC_INIT};
12709 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12710}
12711
12712VALUE
12714{
12715 return rb_interned_str(ptr, strlen(ptr));
12716}
12717
12718VALUE
12719rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12720{
12721 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12722 rb_enc_autoload(enc);
12723 }
12724
12725 struct RString fake_str = {RBASIC_INIT};
12726 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12727}
12728
12729VALUE
12730rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12731{
12732 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12733 rb_enc_autoload(enc);
12734 }
12735
12736 struct RString fake_str = {RBASIC_INIT};
12737 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12738 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12739 return str;
12740}
12741
12742VALUE
12744{
12745 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12746}
12747
12748#if USE_YJIT || USE_ZJIT
12749void
12750rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
12751{
12752 if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
12753 ssize_t code = RB_NUM2SSIZE(codepoint);
12754
12755 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12756 rb_str_buf_cat_byte(str, (char) code);
12757 return;
12758 }
12759 }
12760
12761 rb_str_concat(str, codepoint);
12762}
12763#endif
12764
12765static int
12766fstring_set_class_i(VALUE *str, void *data)
12767{
12768 RBASIC_SET_CLASS(*str, rb_cString);
12769
12770 return ST_CONTINUE;
12771}
12772
12773void
12774Init_String(void)
12775{
    /*
     * Defines the String and Symbol classes and registers their C-implemented
     * methods.  Each rb_define_* call pairs a Ruby-visible name with a C
     * function; the trailing integer is the arity, with -1 used here for the
     * functions that take (argc, argv, self).
     *
     * NOTE(review): the numbers embedded at the start of each line skip values
     * in several places (for example 12779 -> 12781 and 12947 -> 12951), so
     * this copy appears to be missing statements -- check the upstream file
     * before treating this list as complete.
     */
12776 rb_cString = rb_define_class("String", rb_cObject);
12777
    /* Run fstring_set_class_i over every entry of fstring_table_obj; that
     * callback sets each entry's class to the String class created above. */
12778 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12779
12781 rb_define_alloc_func(rb_cString, empty_str_alloc);
12782 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12783 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12784 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12786 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12787 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12790 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12791 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12792 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12793 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12796 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12797 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12798 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12799 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12802 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12803 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12804 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12805 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12806 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12808 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12810 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12811 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12812 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12813 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12814 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12815 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12816 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12817 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12818 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12819 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12820 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12821 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12822 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12823 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12825 rb_define_method(rb_cString, "+@", str_uplus, 0);
12826 rb_define_method(rb_cString, "-@", str_uminus, 0);
12827 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
    /* dedup is an alias of unary minus (-@) */
12828 rb_define_alias(rb_cString, "dedup", "-@");
12829
    /* to_s and to_str share one implementation */
12830 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12831 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12832 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12833 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12836 rb_define_method(rb_cString, "undump", str_undump, 0);
12837
    /* Symbols :ascii, :turkic, :lithuanian and :fold, created once and kept in
     * file-level variables; presumably the option names accepted by the
     * case-mapping methods registered next -- TODO confirm. */
12838 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12839 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12840 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12841 sym_fold = ID2SYM(rb_intern_const("fold"));
12842
12843 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12844 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12845 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12846 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12847
12848 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12849 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12850 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12851 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12852
12853 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12854 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12855 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12856 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12857 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12858 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12859 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12860 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12861 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12862 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12863 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12864 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12866 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12867 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    /* intern and to_sym share one implementation */
12868 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12869 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12870 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12871
12872 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12873 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12874 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12875
12876 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12877
12878 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12879 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12880 rb_define_method(rb_cString, "center", rb_str_center, -1);
12881
12882 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12883 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12884 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12885 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12886 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12887 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12888 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12889 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12890 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12891
12892 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12893 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12894 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12895 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12896 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12897 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12898 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12899 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12900 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12901
12902 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12903 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12904 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12905 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12906 rb_define_method(rb_cString, "count", rb_str_count, -1);
12907
12908 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12909 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12910 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12911 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12912
12913 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12914 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12915 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12916 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12917 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12918
12919 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12920
    /* slice uses the same implementation as [] registered earlier */
12921 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12922 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12923
12924 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12925 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12926
12927 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12928 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12929 rb_define_method(rb_cString, "b", rb_str_b, 0);
12930 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12931 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12932
12933 /* define UnicodeNormalize module here so that we don't have to look it up */
12934 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
    /* ids later handed to unicode_normalize_common by the wrappers below */
12935 id_normalize = rb_intern_const("normalize");
12936 id_normalized_p = rb_intern_const("normalized?");
12937
12938 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12939 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12940 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12941
    /* $; and $-F are two names for one hooked variable: both are backed by
     * rb_fs and share rb_fs_setter.  rb_fs starts as nil and its address is
     * registered with the GC. */
12942 rb_fs = Qnil;
12943 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12944 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12945 rb_gc_register_address(&rb_fs);
12946
    /* Symbol: the sym_* functions registered below are the wrappers defined
     * earlier in this file. */
12947 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12951 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12952
    /* == and === share sym_equal; succ and next share sym_succ */
12953 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12954 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12955 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12956 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12957 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12958 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12959
12960 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12961 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12962 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12963 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12964
    /* [] and slice share sym_aref; length and size share sym_length */
12965 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12966 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12967 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12968 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12969 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12970 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12971 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12972
12973 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12974 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12975 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12976 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12977
12978 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12979 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12980
12981 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12982}
12983
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:707
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:415
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1798
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1591
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1704
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2958
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2770
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3248
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1010
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3037
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:132
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:135
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:133
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:130
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:127
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:124
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:129
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:131
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:128
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:136
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3896
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1422
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1418
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1425
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1416
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1420
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2208
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2226
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3622
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3306
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1343
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:948
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1208
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3029
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1227
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12719
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:254
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2335
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3733
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1156
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1448
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1349
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:967
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12743
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:832
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:715
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1232
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3721
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1486
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1923
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1753
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1513
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2488
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3798
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1424
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12343
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2561
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1400
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1747
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3057
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5340
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4161
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3154
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11664
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1783
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1789
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1190
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1002
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1519
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1997
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4147
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3566
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2424
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2015
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6547
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3162
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12713
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1430
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3764
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3104
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4268
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3388
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7226
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2791
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12706
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4215
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4035
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4190
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3740
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3279
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5824
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11722
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1703
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2951
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3251
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3370
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1202
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2745
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7333
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1412
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1719
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2438
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5742
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9340
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1196
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1851
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2017
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2096
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3399
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1647
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12673
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12663
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1862
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1442
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2928
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2810
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1436
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2823
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1780
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:495
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:208
Definition string.c:8220
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113