Ruby 3.5.0dev (2025-06-06 revision 347e581a4cbe2bbf7c13532038f2a68b0b37099a)
string.c (347e581a4cbe2bbf7c13532038f2a68b0b37099a)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/thread.h"
46#include "ruby/util.h"
47#include "ruby_assert.h"
48#include "shape.h"
49#include "vm_sync.h"
51
52#if defined HAVE_CRYPT_R
53# if defined HAVE_CRYPT_H
54# include <crypt.h>
55# endif
56#elif !defined HAVE_CRYPT
57# include "missing/crypt.h"
58# define HAVE_CRYPT_R 1
59#endif
60
/* Shorthands for slot `idx` of the beg[] / end[] arrays reachable through
 * whatever `regs` pointer is in scope where the macro is expanded. */
#define BEG(idx) (regs->beg[(idx)])
#define END(idx) (regs->end[(idx)])
63
64#undef rb_str_new
65#undef rb_usascii_str_new
66#undef rb_utf8_str_new
67#undef rb_enc_str_new
68#undef rb_str_new_cstr
69#undef rb_usascii_str_new_cstr
70#undef rb_utf8_str_new_cstr
71#undef rb_enc_str_new_cstr
72#undef rb_external_str_new_cstr
73#undef rb_locale_str_new_cstr
74#undef rb_str_dup_frozen
75#undef rb_str_buf_new_cstr
76#undef rb_str_buf_cat
77#undef rb_str_buf_cat2
78#undef rb_str_cat2
79#undef rb_str_cat_cstr
80#undef rb_fstring_cstr
81
84
/* Flags of RString
 *
 * 0:     STR_SHARED (equal to ELTS_SHARED)
 *          The string is shared.  The buffer this string points to is owned by
 *          another string (the shared root).
 * 1:     RSTRING_NOEMBED
 *          The string is not embedded.  When a string is embedded, the contents
 *          follow the header.  When a string is not embedded, the contents are
 *          in a separately allocated buffer.
 * 2:     STR_CHILLED_LITERAL (will be frozen in a future version)
 *          The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
 *          It emits a deprecation warning when mutated for the first time.
 * 3:     STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
 *          The string was allocated by the `Symbol#to_s` method.
 *          It emits a deprecation warning when mutated for the first time.
 * 4:     STR_PRECOMPUTED_HASH
 *          The string is embedded and has its precomputed hashcode stored
 *          after the terminator.
 * 5:     STR_SHARED_ROOT
 *          Other strings may point to the contents of this string.  When this
 *          flag is set, STR_SHARED must not be set.
 * 6:     STR_BORROWED
 *          When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
 *          to be unshared by rb_str_tmp_frozen_release.
 * 7:     STR_TMPLOCK
 *          The pointer to the buffer is passed to a system call such as
 *          read(2).  Any modification and realloc is prohibited.
 * 8-9:   ENC_CODERANGE
 *          Stores the coderange of the string.
 * 10-16: ENCODING
 *          Stores the encoding of the string.
 * 17:    RSTRING_FSTR
 *          The string is a fstring.  The string is deduplicated in the fstring
 *          table.
 * 18:    STR_NOFREE
 *          Do not free this string's buffer when the string is reclaimed
 *          by the garbage collector.  Used for when the string buffer is a C
 *          string literal.
 * 19:    STR_FAKESTR
 *          The string is not allocated or managed by the garbage collector.
 *          Typically, the string object header (struct RString) is temporarily
 *          allocated on C stack.
 */
128
#define RUBY_MAX_CHAR_LEN 16

/* File-private RString flag bits; the number after FL_USER is the bit index
 * used in the "Flags of RString" table. */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT      FL_USER5
#define STR_BORROWED         FL_USER6
#define STR_TMPLOCK          FL_USER7
#define STR_NOFREE           FL_USER18
#define STR_FAKESTR          FL_USER19
136
/* Flag `str` as heap-backed: set STR_NOEMBED and clear the
 * STR_SHARED / STR_SHARED_ROOT / STR_BORROWED bits. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Flag `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` in the length field of `str`; nothing else is touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
146
147static inline bool
148str_encindex_fastpath(int encindex)
149{
150 // The overwhelming majority of strings are in one of these 3 encodings.
151 switch (encindex) {
152 case ENCINDEX_ASCII_8BIT:
153 case ENCINDEX_UTF_8:
154 case ENCINDEX_US_ASCII:
155 return true;
156 default:
157 return false;
158 }
159}
160
161static inline bool
162str_enc_fastpath(VALUE str)
163{
164 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
165}
166
/* Terminator width in bytes for `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero `termlen` bytes starting at `ptr`.  The first byte is always stored
 * directly; memset() is only reached for terminators wider than one byte. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_dst = (ptr);\
    const int term_fill_width = (termlen);\
    *term_fill_dst = '\0';\
    if (UNLIKELY(term_fill_width > 1))\
        memset(term_fill_dst, 0, term_fill_width);\
} while (0)
175
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * implied by its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise the contents are copied into a fresh heap buffer and
 *   the string is switched to the non-embedded representation.
 * - Heap string: the buffer is reallocated in place (it must not be shared).
 *
 * Every macro argument is parenthesized at each use so that expression
 * arguments keep their intended grouping.  Note that `str` and `capacity`
 * are still evaluated more than once. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Does nothing for fake strings (STR_FAKESTR).  Otherwise it asserts that
 * the pointer of `str` lies inside the buffer of `shared_str`, records the
 * root through a write barrier, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on the root, and additionally marks a class-less root as
 * STR_BORROWED. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST((str), STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
211
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: recorded capacity plus terminator width.
 * TODO: include the terminator size in capa. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

#define STR_ENC_GET(str) get_encoding(str)

/* With SHARABLE_MIDDLE_SUBSTRING disabled (the default), a substring is
 * only considered sharable when it ends exactly where the source ends. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
232}
233
234bool
235rb_str_reembeddable_p(VALUE str)
236{
237 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
238}
239
240static inline size_t
241rb_str_embed_size(long capa)
242{
243 return offsetof(struct RString, as.embed.ary) + capa;
244}
245
246size_t
247rb_str_size_as_embedded(VALUE str)
248{
249 size_t real_size;
250 if (STR_EMBED_P(str)) {
251 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
252 }
253 /* if the string is not currently embedded, but it can be embedded, how
254 * much space would it require */
255 else if (rb_str_reembeddable_p(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
257 }
258 else {
259 real_size = sizeof(struct RString);
260 }
261
262 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
263 real_size += sizeof(st_index_t);
264 }
265
266 return real_size;
267}
268
/* Whether the GC can allocate a slot big enough to embed `len` content
 * bytes followed by `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
274
275static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
276static VALUE str_new_frozen(VALUE klass, VALUE orig);
277static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
278static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
279static VALUE str_new(VALUE klass, const char *ptr, long len);
280static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
281static inline void str_modifiable(VALUE str);
282static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
283static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
284
285static inline void
286str_make_independent(VALUE str)
287{
288 long len = RSTRING_LEN(str);
289 int termlen = TERM_LEN(str);
290 str_make_independent_expand((str), len, 0L, termlen);
291}
292
293static inline int str_dependent_p(VALUE str);
294
295void
296rb_str_make_independent(VALUE str)
297{
298 if (str_dependent_p(str)) {
299 str_make_independent(str);
300 }
301}
302
303void
304rb_str_make_embedded(VALUE str)
305{
306 RUBY_ASSERT(rb_str_reembeddable_p(str));
307 RUBY_ASSERT(!STR_EMBED_P(str));
308
309 char *buf = RSTRING(str)->as.heap.ptr;
310 long len = RSTRING(str)->len;
311
312 STR_SET_EMBED(str);
313 STR_SET_LEN(str, len);
314
315 if (len > 0) {
316 memcpy(RSTRING_PTR(str), buf, len);
317 ruby_xfree(buf);
318 }
319
320 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
321}
322
/* Prints a diagnostic to stderr naming `func` as the function that is
 * about to return NULL.  Does nothing else. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
332
333/* symbols for [up|down|swap]case/capitalize options */
334static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
335
336static rb_encoding *
337get_encoding(VALUE str)
338{
339 return rb_enc_from_index(ENCODING_GET(str));
340}
341
342static void
343mustnot_broken(VALUE str)
344{
345 if (is_broken_string(str)) {
346 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
347 }
348}
349
350static void
351mustnot_wchar(VALUE str)
352{
353 rb_encoding *enc = STR_ENC_GET(str);
354 if (rb_enc_mbminlen(enc) > 1) {
355 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
356 }
357}
358
359static int fstring_cmp(VALUE a, VALUE b);
360
361static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
362
363#if SIZEOF_LONG == SIZEOF_VOIDP
364#define PRECOMPUTED_FAKESTR_HASH 1
365#else
366#endif
367
#ifdef PRECOMPUTED_FAKESTR_HASH
/* Hash used as the fstring table key.
 *
 * For fake strings the base hash is read back from the capa field, where
 * register_fstring stored it; for every other string it is rb_str_hash().
 * The encoding index is then always mixed in. */
static st_index_t
fstring_hash(VALUE str)
{
    const st_index_t base = FL_TEST_RAW(str, STR_FAKESTR)
        ? (st_index_t)RSTRING(str)->as.heap.aux.capa
        : rb_str_hash(str);

    // rb_str_hash doesn't include the encoding for ascii only strings, so
    // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
    return rb_hash_end(rb_hash_uint32(base, (uint32_t)ENCODING_GET_INLINED(str)));
}
#else
#define fstring_hash rb_str_hash
#endif
387
388static inline bool
389BARE_STRING_P(VALUE str)
390{
391 if (RBASIC_CLASS(str) != rb_cString) return false;
392
393 if (FL_TEST_RAW(str, FL_EXIVAR)) {
394 return rb_ivar_count(str) == 0;
395 }
396 return true;
397}
398
399static inline st_index_t
400str_do_hash(VALUE str)
401{
402 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
403 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
404 if (e && !is_ascii_string(str)) {
405 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
406 }
407 return h;
408}
409
410static VALUE
411str_store_precomputed_hash(VALUE str, st_index_t hash)
412{
413 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
414 RUBY_ASSERT(STR_EMBED_P(str));
415
416#if RUBY_DEBUG
417 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
418 size_t free_bytes = str_embed_capa(str) - used_bytes;
419 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
420#endif
421
422 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
423
424 FL_SET(str, STR_PRECOMPUTED_HASH);
425
426 return str;
427}
428
/* Options passed from register_fstring() down to build_fstring(). */
struct fstr_update_arg {
    /* For a fake string: copy the bytes into a new string (true) or wrap
     * the original pointer with str_new_static() (false). */
    bool copy;
    /* When copying, prefer an embedded allocation with room for a
     * precomputed hash if such an allocation is possible. */
    bool force_precompute_hash;
};
433
434static VALUE
435build_fstring(VALUE str, struct fstr_update_arg *arg)
436{
437 // Unless the string is empty or binary, its coderange has been precomputed.
438 int coderange = ENC_CODERANGE(str);
439
440 if (FL_TEST_RAW(str, STR_FAKESTR)) {
441 if (arg->copy) {
442 VALUE new_str;
443 long len = RSTRING_LEN(str);
444 long capa = len + sizeof(st_index_t);
445 int term_len = TERM_LEN(str);
446
447 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
448 new_str = str_alloc_embed(rb_cString, capa + term_len);
449 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
450 STR_SET_LEN(new_str, RSTRING_LEN(str));
451 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
452 rb_enc_copy(new_str, str);
453 str_store_precomputed_hash(new_str, str_do_hash(str));
454 }
455 else {
456 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
457 rb_enc_copy(new_str, str);
458#ifdef PRECOMPUTED_FAKESTR_HASH
459 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
460 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
461 }
462#endif
463 }
464 str = new_str;
465 }
466 else {
467 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
468 RSTRING(str)->len,
469 ENCODING_GET(str));
470 }
471 OBJ_FREEZE(str);
472 }
473 else {
474 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
475 str = str_new_frozen(rb_cString, str);
476 }
477 if (STR_SHARED_P(str)) { /* str should not be shared */
478 /* shared substring */
479 str_make_independent(str);
481 }
482 if (!BARE_STRING_P(str)) {
483 str = str_new_frozen(rb_cString, str);
484 }
485 }
486
487 ENC_CODERANGE_SET(str, coderange);
488 RBASIC(str)->flags |= RSTRING_FSTR;
489
492 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
495 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
496
497 return str;
498}
499
500VALUE
501rb_fstring(VALUE str)
502{
503 VALUE fstr;
504 int bare;
505
506 Check_Type(str, T_STRING);
507
508 if (FL_TEST(str, RSTRING_FSTR))
509 return str;
510
511 bare = BARE_STRING_P(str);
512 if (!bare) {
513 if (STR_EMBED_P(str)) {
514 OBJ_FREEZE(str);
515 return str;
516 }
517
518 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
520 return str;
521 }
522 }
523
524 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
525 rb_str_resize(str, RSTRING_LEN(str));
526
527 fstr = register_fstring(str, false, false);
528
529 if (!bare) {
530 str_replace_shared_without_enc(str, fstr);
531 OBJ_FREEZE(str);
532 return str;
533 }
534 return fstr;
535}
536
/* Sentinel values stored in fstring_table_entry.str in place of a string. */
#define FSTRING_TABLE_EMPTY     Qfalse /* slot never used */
#define FSTRING_TABLE_TOMBSTONE Qtrue  /* slot held a string that was removed */
#define FSTRING_TABLE_MOVED     Qundef /* slot emptied by a table rebuild */
540
542 VALUE str;
543 VALUE hash;
544};
545
547 struct fstring_table_entry *entries;
548 unsigned int capacity;
549 unsigned int deleted_entries;
550 rb_atomic_t count; // TODO: pad to own cache line?
551};
552
553static void
554fstring_table_free(void *ptr)
555{
556 struct fstring_table_struct *table = ptr;
557 xfree(table->entries);
558}
559
560static size_t
561fstring_table_size(const void *ptr)
562{
563 const struct fstring_table_struct *table = ptr;
564 return sizeof(struct fstring_table_struct) + sizeof(struct fstring_table_entry) * table->capacity;
565}
566
567// We declare a type for the table so that we can lean on Ruby's GC for deferred reclamation
568static const rb_data_type_t fstring_table_type = {
569 .wrap_struct_name = "VM/fstring_table",
570 .function = {
571 .dmark = NULL,
572 .dfree = fstring_table_free,
573 .dsize = fstring_table_size,
574 },
575 .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE
576};
577
578
579static VALUE fstring_table_obj;
580
581static VALUE
582new_fstring_table(int capacity)
583{
584 VALUE obj;
585 struct fstring_table_struct *table;
586 obj = TypedData_Make_Struct(0, struct fstring_table_struct, &fstring_table_type, table);
587 table->capacity = capacity;
588 table->count = 0;
589 table->entries = ZALLOC_N(struct fstring_table_entry, capacity);
590 return obj;
591}
592
593void
594Init_fstring_table(void)
595{
596 fstring_table_obj = new_fstring_table(8192);
597 rb_gc_register_address(&fstring_table_obj);
598}
599
600#if 0
601
602// Linear probe
603struct fstring_table_probe {
604 int idx;
605 int mask;
606};
607
608static int
609fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
610{
611 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
612 probe->mask = table->capacity - 1;
613 probe->idx = hash_code & probe->mask;
614 return probe->idx;
615}
616
617static int
618fstring_table_probe_next(struct fstring_table_probe *probe)
619{
620 probe->idx = (probe->idx + 1) & probe->mask;
621 return probe->idx;
622}
623
624#else
625
626// Struct containing probe information. Intended that the compiler should always inline this
627// Quadratic probing
629 int idx;
630 int d;
631 int mask;
632};
633
634static int
635fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
636{
637 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
638 probe->d = 0;
639 probe->mask = table->capacity - 1;
640 probe->idx = hash_code & probe->mask;
641 return probe->idx;
642}
643
644static int
645fstring_table_probe_next(struct fstring_table_probe *probe)
646{
647 probe->d++;
648 probe->idx = (probe->idx + probe->d) & probe->mask;
649 return probe->idx;
650}
651#endif
652
/* Atomic pointer load of `x`, converted to VALUE. */
#define RUBY_ATOMIC_VALUE_LOAD(x) ((VALUE)(RUBY_ATOMIC_PTR_LOAD(x)))
654
655static void
656fstring_insert_on_resize(struct fstring_table_struct *table, VALUE hash_code, VALUE value)
657{
658 struct fstring_table_probe probe;
659 int idx = fstring_table_probe_start(&probe, table, hash_code);
660
661 for (;;) {
662 struct fstring_table_entry *entry = &table->entries[idx];
663 VALUE candidate = entry->str;
664
665 RUBY_ASSERT(candidate != FSTRING_TABLE_TOMBSTONE);
666 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
667
668 if (candidate == FSTRING_TABLE_EMPTY) {
669 table->count++;
670
671 RUBY_ASSERT(table->count < table->capacity / 2);
672 RUBY_ASSERT(entry->hash == 0);
673
674 entry->str = value;
675 entry->hash = hash_code;
676 return;
677 }
678
679 idx = fstring_table_probe_next(&probe);
680 }
681}
682
683// Rebuilds the table
684static void
685fstring_try_resize_without_locking(VALUE old_table_obj)
686{
687 // Check if another thread has already resized
688 if (RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj) != old_table_obj) {
689 goto end;
690 }
691
692 struct fstring_table_struct *old_table = RTYPEDDATA_GET_DATA(old_table_obj);
693
694 // This may overcount by up to the number of threads concurrently attempting to insert
695 // GC may also happen between now and the table being rebuilt
696 int expected_count = RUBY_ATOMIC_LOAD(old_table->count) - old_table->deleted_entries;
697
698 struct fstring_table_entry *old_entries = old_table->entries;
699 int old_capacity = old_table->capacity;
700 int new_capacity = old_capacity * 2;
701 if (new_capacity > expected_count * 8) {
702 new_capacity = old_capacity / 2;
703 }
704 else if (new_capacity > expected_count * 4) {
705 new_capacity = old_capacity;
706 }
707
708 // May cause GC and therefore deletes, so must hapen first
709 VALUE new_table_obj = new_fstring_table(new_capacity);
710 struct fstring_table_struct *new_table = RTYPEDDATA_GET_DATA(new_table_obj);
711
712 for (int i = 0; i < old_capacity; i++) {
713 struct fstring_table_entry *entry = &old_entries[i];
714 VALUE val = RUBY_ATOMIC_VALUE_EXCHANGE(entry->str, FSTRING_TABLE_MOVED);
715 RUBY_ASSERT(val != FSTRING_TABLE_MOVED);
716 if (val == FSTRING_TABLE_EMPTY) continue;
717 if (val == FSTRING_TABLE_TOMBSTONE) continue;
718 if (rb_objspace_garbage_object_p(val)) continue;
719
720 VALUE hash_code = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
721 if (hash_code == 0) {
722 // Either in-progress insert or extremely unlikely 0 hash
723 // Re-calculate the hash ourselves
724 hash_code = fstring_hash(val);
725 }
726 RUBY_ASSERT(hash_code == fstring_hash(val));
727 fstring_insert_on_resize(new_table, hash_code, val);
728 }
729
730#if 0
731 fprintf(stderr, "resized: %p(%i) -> %p(%i) (count: %i->%i)\n", old_table, old_table->capacity, new_table, new_table->capacity, old_table->count, new_table->count);
732#endif
733
734 RUBY_ATOMIC_VALUE_SET(fstring_table_obj, new_table_obj);
735
736end:
737 RB_GC_GUARD(old_table_obj);
738}
739
740static void
741fstring_try_resize(VALUE old_table_obj)
742{
743 RB_VM_LOCKING() {
744 fstring_try_resize_without_locking(old_table_obj);
745 }
746}
747
748static VALUE
749fstring_find_or_insert(VALUE hash_code, VALUE value, struct fstr_update_arg *arg)
750{
751 struct fstring_table_probe probe;
752 bool inserting = false;
753 int idx;
754 VALUE table_obj;
755 struct fstring_table_struct *table;
756
757 retry:
758 table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
759 RUBY_ASSERT(table_obj);
760 table = RTYPEDDATA_GET_DATA(table_obj);
761 idx = fstring_table_probe_start(&probe, table, hash_code);
762
763 for (;;) {
764 struct fstring_table_entry *entry = &table->entries[idx];
765 VALUE candidate = RUBY_ATOMIC_VALUE_LOAD(entry->str);
766
767 if (candidate == FSTRING_TABLE_EMPTY) {
768 // Not in table
769 if (!inserting) {
770 // Prepare a string suitable for inserting into the table
771 value = build_fstring(value, arg);
772 RUBY_ASSERT(hash_code == fstring_hash(value));
773 inserting = true;
774 }
775
776 unsigned int prev_count = RUBY_ATOMIC_FETCH_ADD(table->count, 1);
777
778 if (UNLIKELY(prev_count > table->capacity / 2)) {
779 fstring_try_resize(table_obj);
780 goto retry;
781 }
782
783 VALUE found = RUBY_ATOMIC_VALUE_CAS(entry->str, FSTRING_TABLE_EMPTY, value);
784 if (found == FSTRING_TABLE_EMPTY) {
785 // Success! Our value was inserted
786
787 // Also set the hash code
788 RUBY_ATOMIC_VALUE_SET(entry->hash, hash_code);
789
790 RB_GC_GUARD(table_obj);
791 return value;
792 }
793 else {
794 // Nothing was inserted
795 RUBY_ATOMIC_DEC(table->count); // we didn't end up inserting
796
797 // Another thread won the race, try again at the same location
798 continue;
799 }
800 }
801 else if (candidate == FSTRING_TABLE_TOMBSTONE) {
802 // Deleted entry, continue searching
803 }
804 else if (candidate == FSTRING_TABLE_MOVED) {
805 // Wait
806 RB_VM_LOCKING();
807
808 goto retry;
809 }
810 else {
811 VALUE candidate_hash = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
812 if ((candidate_hash == hash_code || candidate_hash == 0) && !fstring_cmp(candidate, value)) {
813 // We've found a match
814 if (UNLIKELY(rb_objspace_garbage_object_p(candidate))) {
815 // This is a weakref table, so after marking but before sweeping is complete we may find a matching garbage object.
816 // Skip it and mark it as a tombstone to help other threads out
817 RUBY_ATOMIC_VALUE_CAS(entry->str, candidate, FSTRING_TABLE_TOMBSTONE);
818
819 // Fall through and continue our search
820 }
821 else {
822 RB_GC_GUARD(table_obj);
823 return candidate;
824 }
825 }
826 }
827
828 idx = fstring_table_probe_next(&probe);
829 }
830}
831
832
833// Removes an fstring from the table. Compares by identity
834static void
835fstring_delete(VALUE hash_code, VALUE value)
836{
837 // Delete is never called concurrently, so atomic operations are unnecessary
838 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
839 RUBY_ASSERT_ALWAYS(table_obj);
840 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
841
842 struct fstring_table_probe probe;
843 int idx = fstring_table_probe_start(&probe, table, hash_code);
844
845 for (;;) {
846 struct fstring_table_entry *entry = &table->entries[idx];
847 VALUE candidate = entry->str;
848
849 // Allocations should only occur at the beginning of the resize
850 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
851
852 if (candidate == FSTRING_TABLE_EMPTY) {
853 // We didn't find our string to delete
854 return;
855 }
856 else if (candidate == value) {
857 // We found our string, replace it with a tombstone and increment the count
858 entry->str = FSTRING_TABLE_TOMBSTONE;
859 table->deleted_entries++;
860 return;
861 }
862
863 idx = fstring_table_probe_next(&probe);
864 }
865}
866
867static VALUE
868register_fstring(VALUE str, bool copy, bool force_precompute_hash)
869{
870 struct fstr_update_arg args = {
871 .copy = copy,
872 .force_precompute_hash = force_precompute_hash
873 };
874
875#if SIZEOF_VOIDP == SIZEOF_LONG
876 if (FL_TEST_RAW(str, STR_FAKESTR)) {
877 // if the string hasn't been interned, we'll need the hash twice, so we
878 // compute it once and store it in capa
879 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
880 }
881#endif
882
883 VALUE hash_code = fstring_hash(str);
884 VALUE result = fstring_find_or_insert(hash_code, str, &args);
885
886 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
888 RUBY_ASSERT(OBJ_FROZEN(result));
889 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
891
892 return result;
893}
894
895void
896rb_fstring_foreach_with_replace(st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg)
897{
898 // Assume locking and barrier (which there is no assert for)
899 ASSERT_vm_locking();
900
901 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
902 if (!table_obj) {
903 // Table not yet initialized. Nothing to iterate over
904 return;
905 }
906 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
907
908 for (unsigned int i = 0; i < table->capacity; i++) {
909 VALUE key = table->entries[i].str;
910 if(key == FSTRING_TABLE_EMPTY) continue;
911 if(key == FSTRING_TABLE_TOMBSTONE) continue;
912
913 enum st_retval retval;
914 retval = (*func)(key, key, arg, 0);
915
916 if (retval == ST_REPLACE && replace) {
917 st_data_t value = key;
918 retval = (*replace)(&key, &value, arg, TRUE);
919 table->entries[i].str = key;
920 }
921 switch (retval) {
922 case ST_REPLACE:
923 case ST_CONTINUE:
924 break;
925 case ST_CHECK:
926 rb_bug("unsupported");
927 case ST_STOP:
928 return;
929 case ST_DELETE:
930 table->entries[i].str = FSTRING_TABLE_TOMBSTONE;
931 break;
932 }
933 }
934}
935
936bool
937rb_obj_is_fstring_table(VALUE obj)
938{
939 ASSERT_vm_locking();
940
941 return obj == fstring_table_obj;
942}
943
944void
945rb_gc_free_fstring(VALUE obj)
946{
947 // Assume locking and barrier (which there is no assert for)
948 ASSERT_vm_locking();
949
950 VALUE str_hash = fstring_hash(obj);
951 fstring_delete(str_hash, obj);
952
953 RB_DEBUG_COUNTER_INC(obj_str_fstr);
954
955 FL_UNSET(obj, RSTRING_FSTR);
956}
957
958static VALUE
959setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
960{
961 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
962 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
963
964 if (!name) {
966 name = "";
967 }
968
969 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
970
971 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
972 fake_str->len = len;
973 fake_str->as.heap.ptr = (char *)name;
974 fake_str->as.heap.aux.capa = len;
975 return (VALUE)fake_str;
976}
977
978/*
979 * set up a fake string which refers a static string literal.
980 */
981VALUE
982rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
983{
984 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
985}
986
987/*
988 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
989 * shared string which refers a static string literal. `ptr` must
990 * point a constant string.
991 */
992VALUE
993rb_fstring_new(const char *ptr, long len)
994{
995 struct RString fake_str;
996 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
997}
998
999VALUE
1000rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
1001{
1002 struct RString fake_str;
1003 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
1004}
1005
1006VALUE
1007rb_fstring_cstr(const char *ptr)
1008{
1009 return rb_fstring_new(ptr, strlen(ptr));
1010}
1011
1012static int
1013fstring_cmp(VALUE a, VALUE b)
1014{
1015 long alen, blen;
1016 const char *aptr, *bptr;
1017
1020
1021 RSTRING_GETMEM(a, aptr, alen);
1022 RSTRING_GETMEM(b, bptr, blen);
1023 return (alen != blen ||
1024 ENCODING_GET(a) != ENCODING_GET(b) ||
1025 memcmp(aptr, bptr, alen) != 0);
1026}
1027
1028static inline bool
1029single_byte_optimizable(VALUE str)
1030{
1031 int encindex = ENCODING_GET(str);
1032 switch (encindex) {
1033 case ENCINDEX_ASCII_8BIT:
1034 case ENCINDEX_US_ASCII:
1035 return true;
1036 case ENCINDEX_UTF_8:
1037 // For UTF-8 it's worth scanning the string coderange when unknown.
1039 }
1040 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
1041 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
1042 return true;
1043 }
1044
1045 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
1046 return true;
1047 }
1048
1049 /* Conservative. Possibly single byte.
1050 * "\xa1" in Shift_JIS for example. */
1051 return false;
1052}
1053
1055
/*
 * Scan the byte range [p, e) and return a pointer to the first byte whose
 * high bit (0x80) is set, or NULL when no such byte exists.
 *
 * The bulk of the range is tested one uintptr_t word at a time against
 * NONASCII_MASK.  When unaligned word access is not allowed, the bytes in
 * front of the first word boundary are tested one by one; the tail that is
 * shorter than a word is always tested one by one.  Both byte-wise parts
 * use switch statements whose cases fall through on purpose.
 */
1056static inline const char *
1057search_nonascii(const char *p, const char *e)
1058{
1059 const uintptr_t *s, *t;
1060
/* NONASCII_MASK carries 0x80 in every byte of a uintptr_t. */
1061#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
1062# if SIZEOF_UINTPTR_T == 8
1063# define NONASCII_MASK UINT64_C(0x8080808080808080)
1064# elif SIZEOF_UINTPTR_T == 4
1065# define NONASCII_MASK UINT32_C(0x80808080)
1066# else
1067# error "don't know what to do."
1068# endif
1069#else
1070# if SIZEOF_UINTPTR_T == 8
1071# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
1072# elif SIZEOF_UINTPTR_T == 4
1073# define NONASCII_MASK 0x80808080UL /* or...? */
1074# else
1075# error "don't know what to do."
1076# endif
1077#endif
1078
/* The word loop is entered only if unaligned reads are allowed or at
 * least one word's worth of bytes is available. */
1079 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
1080#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary; l is the number of bytes skipped,
 * and the switch tests exactly those bytes (p[-l] .. p[-1]). */
1081 if ((uintptr_t)p % SIZEOF_VOIDP) {
1082 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
1083 p += l;
1084 switch (l) {
1085 default: UNREACHABLE;
1086#if SIZEOF_VOIDP > 4
1087 case 7: if (p[-7]&0x80) return p-7;
1088 case 6: if (p[-6]&0x80) return p-6;
1089 case 5: if (p[-5]&0x80) return p-5;
1090 case 4: if (p[-4]&0x80) return p-4;
1091#endif
1092 case 3: if (p[-3]&0x80) return p-3;
1093 case 2: if (p[-2]&0x80) return p-2;
1094 case 1: if (p[-1]&0x80) return p-1;
1095 case 0: break;
1096 }
1097 }
1098#endif
1099#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
1100#define aligned_ptr(value) \
1101 __builtin_assume_aligned((value), sizeof(uintptr_t))
1102#else
1103#define aligned_ptr(value) (uintptr_t *)(value)
1104#endif
1105 s = aligned_ptr(p);
/* t bounds the word loop: s < t holds only while a whole word starting
 * at s still ends at or before e. */
1106 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
1107#undef aligned_ptr
1108 for (;s < t; s++) {
1109 if (*s & NONASCII_MASK) {
/* Turn the position of the first flagged bit into a byte offset (>>3).
 * nlz_intptr/ntz_intptr presumably count leading/trailing zero bits, so
 * the lowest-addressed flagged byte is chosen on either byte order. */
1110#ifdef WORDS_BIGENDIAN
1111 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
1112#else
1113 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
1114#endif
1115 }
1116 }
1117 p = (const char *)s;
1118 }
1119
/* Fewer than SIZEOF_VOIDP bytes remain; test them individually, indexed
 * backwards from e. */
1120 switch (e - p) {
1121 default: UNREACHABLE;
1122#if SIZEOF_VOIDP > 4
1123 case 7: if (e[-7]&0x80) return e-7;
1124 case 6: if (e[-6]&0x80) return e-6;
1125 case 5: if (e[-5]&0x80) return e-5;
1126 case 4: if (e[-4]&0x80) return e-4;
1127#endif
1128 case 3: if (e[-3]&0x80) return e-3;
1129 case 2: if (e[-2]&0x80) return e-2;
1130 case 1: if (e[-1]&0x80) return e-1;
1131 case 0: return NULL;
1132 }
1133}
1134
1135static int
1136coderange_scan(const char *p, long len, rb_encoding *enc)
1137{
1138 const char *e = p + len;
1139
1140 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1141 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
1142 p = search_nonascii(p, e);
1144 }
1145
1146 if (rb_enc_asciicompat(enc)) {
1147 p = search_nonascii(p, e);
1148 if (!p) return ENC_CODERANGE_7BIT;
1149 for (;;) {
1150 int ret = rb_enc_precise_mbclen(p, e, enc);
1151 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1152 p += MBCLEN_CHARFOUND_LEN(ret);
1153 if (p == e) break;
1154 p = search_nonascii(p, e);
1155 if (!p) break;
1156 }
1157 }
1158 else {
1159 while (p < e) {
1160 int ret = rb_enc_precise_mbclen(p, e, enc);
1161 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1162 p += MBCLEN_CHARFOUND_LEN(ret);
1163 }
1164 }
1165 return ENC_CODERANGE_VALID;
1166}
1167
1168long
1169rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
1170{
1171 const char *p = s;
1172
1173 if (*cr == ENC_CODERANGE_BROKEN)
1174 return e - s;
1175
1176 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1177 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
1178 if (*cr == ENC_CODERANGE_VALID) return e - s;
1179 p = search_nonascii(p, e);
1181 return e - s;
1182 }
1183 else if (rb_enc_asciicompat(enc)) {
1184 p = search_nonascii(p, e);
1185 if (!p) {
1186 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
1187 return e - s;
1188 }
1189 for (;;) {
1190 int ret = rb_enc_precise_mbclen(p, e, enc);
1191 if (!MBCLEN_CHARFOUND_P(ret)) {
1193 return p - s;
1194 }
1195 p += MBCLEN_CHARFOUND_LEN(ret);
1196 if (p == e) break;
1197 p = search_nonascii(p, e);
1198 if (!p) break;
1199 }
1200 }
1201 else {
1202 while (p < e) {
1203 int ret = rb_enc_precise_mbclen(p, e, enc);
1204 if (!MBCLEN_CHARFOUND_P(ret)) {
1206 return p - s;
1207 }
1208 p += MBCLEN_CHARFOUND_LEN(ret);
1209 }
1210 }
1211 *cr = ENC_CODERANGE_VALID;
1212 return e - s;
1213}
1214
/*
 * Give str1 the same encoding index as str2: the index is read with
 * ENCODING_GET(str2) and stored through rb_enc_set_index().
 */
1215static inline void
1216str_enc_copy(VALUE str1, VALUE str2)
1217{
1218 rb_enc_set_index(str1, ENCODING_GET(str2));
1219}
1220
1221/* Like str_enc_copy, but does not check frozen status of str1.
1222 * You should use this only if you're certain that str1 is not frozen. */
1223static inline void
1224str_enc_copy_direct(VALUE str1, VALUE str2)
1225{
1226 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
1227 if (inlined_encoding == ENCODING_INLINE_MAX) {
1228 rb_enc_set_index(str1, rb_enc_get_index(str2));
1229 }
1230 else {
1231 ENCODING_SET_INLINED(str1, inlined_encoding);
1232 }
1233}
1234
1235static void
1236rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
1237{
1238 /* this function is designed for copying encoding and coderange
1239 * from src to new string "dest" which is made from the part of src.
1240 */
1241 str_enc_copy(dest, src);
1242 if (RSTRING_LEN(dest) == 0) {
1243 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
1245 else
1247 return;
1248 }
1249 switch (ENC_CODERANGE(src)) {
1250 case ENC_CODERANGE_7BIT:
1252 break;
1254 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
1255 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
1257 else
1259 break;
1260 default:
1261 break;
1262 }
1263}
1264
/*
 * Copy both the encoding and the cached code range from src to dest
 * unchanged; no scan of dest's bytes takes place here.
 */
1265static void
1266rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
1267{
1268 str_enc_copy(dest, src);
1269 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
1270}
1271
/*
 * Run coderange_scan() over the whole content of str, interpreting it as
 * enc.  The result is returned, not stored on str.
 */
1272static int
1273enc_coderange_scan(VALUE str, rb_encoding *enc)
1274{
1275 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
1276}
1277
/*
 * Non-static entry point that forwards to enc_coderange_scan(): scans str
 * as if it were encoded in enc and returns the resulting code range.
 */
1278int
1279rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
1280{
1281 return enc_coderange_scan(str, enc);
1282}
1283
/*
 * Return the code range of str, computing and caching it first when the
 * cached value is ENC_CODERANGE_UNKNOWN.
 *
 * NOTE(review): the declarator line (function name and parameters) that
 * should sit between the return type and `{` is missing from this listing;
 * presumably `rb_enc_str_coderange(VALUE str)` -- confirm.
 */
1284int
1286{
1287 int cr = ENC_CODERANGE(str);
1288
1289 if (cr == ENC_CODERANGE_UNKNOWN) {
1290 cr = enc_coderange_scan(str, get_encoding(str));
1291 ENC_CODERANGE_SET(str, cr);
1292 }
1293 return cr;
1294}
1295
/*
 * Tell whether str's encoding is ASCII-compatible.  The inline encoding
 * index is tried against str_encindex_fastpath() first; only when that
 * fails is the encoding object looked up and asked via
 * rb_enc_asciicompat().
 */
1296static inline bool
1297rb_enc_str_asciicompat(VALUE str)
1298{
1299 int encindex = ENCODING_GET_INLINED(str);
1300 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
1301}
1302
/*
 * Report whether str consists only of ASCII characters, dispatching on its
 * cached code range: 7BIT answers true immediately, and the default arm
 * answers false.
 *
 * NOTE(review): two lines appear to be missing from this listing: the
 * declarator (presumably `rb_enc_str_asciionly_p(VALUE str)`) and the case
 * label in front of the first return (presumably
 * `case ENC_CODERANGE_UNKNOWN:`, i.e. the arm that actually has to inspect
 * the string) -- confirm both.
 */
1303int
1305{
1306 switch(ENC_CODERANGE(str)) {
1308 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
1309 case ENC_CODERANGE_7BIT:
1310 return true;
1311 default:
1312 return false;
1313 }
1314}
1315
1316static inline void
1317str_mod_check(VALUE s, const char *p, long len)
1318{
1319 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
1320 rb_raise(rb_eRuntimeError, "string modified");
1321 }
1322}
1323
1324static size_t
1325str_capacity(VALUE str, const int termlen)
1326{
1327 if (STR_EMBED_P(str)) {
1328 return str_embed_capa(str) - termlen;
1329 }
1330 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1331 return RSTRING(str)->len;
1332 }
1333 else {
1334 return RSTRING(str)->as.heap.aux.capa;
1335 }
1336}
1337
/*
 * Capacity of str computed by str_capacity() with the terminator length
 * taken from TERM_LEN(str).
 *
 * NOTE(review): the declarator line between the return type and `{` is
 * missing from this listing; presumably `rb_str_capacity(VALUE str)` (a
 * function of that name is called elsewhere in this file) -- confirm.
 */
1338size_t
1340{
1341 return str_capacity(str, TERM_LEN(str));
1342}
1343
1344static inline void
1345must_not_null(const char *ptr)
1346{
1347 if (!ptr) {
1348 rb_raise(rb_eArgError, "NULL pointer given");
1349 }
1350}
1351
/*
 * Allocate a String object of class klass whose slot is sized by
 * rb_str_embed_size(capa).  The size is asserted to be non-zero and
 * allocatable by the GC.
 *
 * NOTE(review): the NEWOBJ_OF(...) invocation below is cut off in this
 * listing (its continuation line with the remaining arguments and the
 * closing parenthesis is missing) -- restore it from the real file.
 */
1352static inline VALUE
1353str_alloc_embed(VALUE klass, size_t capa)
1354{
1355 size_t size = rb_str_embed_size(capa);
1356 RUBY_ASSERT(size > 0);
1357 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1358
1359 NEWOBJ_OF(str, struct RString, klass,
1361
1362 return (VALUE)str;
1363}
1364
/*
 * Allocate a String object of class klass flagged STR_NOEMBED, in a slot of
 * sizeof(struct RString).  FL_WB_PROTECTED is added when
 * RGENGC_WB_PROTECTED_STRING is enabled.  The heap pointer, length and
 * capacity fields are not set here; callers fill them in.
 */
1365static inline VALUE
1366str_alloc_heap(VALUE klass)
1367{
1368 NEWOBJ_OF(str, struct RString, klass,
1369 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1370
1371 return (VALUE)str;
1372}
1373
/*
 * Create an empty embedded String of class klass: fires the string-create
 * DTrace hook with length 0, allocates the object with zero capacity and
 * zero-fills the whole embedded buffer (str_embed_capa(str) bytes).
 */
1374static inline VALUE
1375empty_str_alloc(VALUE klass)
1376{
1377 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1378 VALUE str = str_alloc_embed(klass, 0);
1379 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1381 return str;
1382}
1383
/*
 * Create a String of class klass holding len bytes in encoding enc.
 *
 * klass: class of the new object.
 * ptr:   bytes to copy, or NULL to leave the len content bytes unwritten by
 *        this function.
 * len:   content length in bytes; a negative value raises ArgumentError.
 * enc:   encoding; NULL selects ASCII-8BIT.
 *
 * The terminator length is rb_enc_mbminlen(enc).  The string is embedded
 * when STR_EMBEDDABLE_P(len, termlen) holds, otherwise a heap buffer for
 * len + termlen bytes is allocated and the capacity is recorded as len.
 * The terminator is always written after the content.
 */
1384static VALUE
1385str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1386{
1387 VALUE str;
1388
1389 if (len < 0) {
1390 rb_raise(rb_eArgError, "negative string size (or size too big)");
1391 }
1392
1393 if (enc == NULL) {
1394 enc = rb_ascii8bit_encoding();
1395 }
1396
1397 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1398
1399 int termlen = rb_enc_mbminlen(enc);
1400
1401 if (STR_EMBEDDABLE_P(len, termlen)) {
1402 str = str_alloc_embed(klass, len + termlen);
/* An empty string's code range is known without scanning. */
1403 if (len == 0) {
1404 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1405 }
1406 }
1407 else {
1408 str = str_alloc_heap(klass);
1409 RSTRING(str)->as.heap.aux.capa = len;
1410 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1411 * integer overflow. If we can STATIC_ASSERT that, the following
1412 * mul_add_mul can be reverted to a simple ALLOC_N. */
1413 RSTRING(str)->as.heap.ptr =
1414 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1415 }
1416
1417 rb_enc_raw_set(str, enc);
1418
1419 if (ptr) {
1420 memcpy(RSTRING_PTR(str), ptr, len);
1421 }
1422
1423 STR_SET_LEN(str, len);
1424 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1425 return str;
1426}
1427
/*
 * str_enc_new() with the encoding fixed to ASCII-8BIT: a String of class
 * klass holding len bytes copied from ptr (nothing is copied when ptr is
 * NULL).
 */
1428static VALUE
1429str_new(VALUE klass, const char *ptr, long len)
1430{
1431 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1432}
1433
/*
 * Create an instance of rb_cString from len bytes at ptr via str_new(),
 * i.e. with ASCII-8BIT encoding.
 */
1434VALUE
1435rb_str_new(const char *ptr, long len)
1436{
1437 return str_new(rb_cString, ptr, len);
1438}
1439
/*
 * Create an instance of rb_cString from len bytes at ptr, tagged with the
 * US-ASCII encoding.
 */
1440VALUE
1441rb_usascii_str_new(const char *ptr, long len)
1442{
1443 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1444}
1445
/*
 * Create an instance of rb_cString from len bytes at ptr, tagged with the
 * UTF-8 encoding.
 */
1446VALUE
1447rb_utf8_str_new(const char *ptr, long len)
1448{
1449 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1450}
1451
/*
 * Create an instance of rb_cString from len bytes at ptr, tagged with enc.
 * A NULL enc is turned into ASCII-8BIT by str_enc_new().
 */
1452VALUE
1453rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1454{
1455 return str_enc_new(rb_cString, ptr, len, enc);
1456}
1457
/*
 * Create a String from the NUL-terminated C string ptr; the length comes
 * from strlen().  A NULL ptr raises ArgumentError through must_not_null().
 *
 * NOTE(review): the declarator line between the return type and `{` is
 * missing from this listing; presumably `rb_str_new_cstr(const char *ptr)`
 * (the name used in the comment below) -- confirm.
 */
1458VALUE
1460{
1461 must_not_null(ptr);
1462 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1463 * memory regions, and that cannot be detected by the MSAN. Just
1464 * trust the programmer that the argument passed here is a sane C
1465 * string. */
1466 __msan_unpoison_string(ptr);
1467 return rb_str_new(ptr, strlen(ptr));
1468}
1469
/*
 * Create a US-ASCII String from the C string ptr by delegating to
 * rb_enc_str_new_cstr().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_usascii_str_new_cstr(const char *ptr)` -- confirm.
 */
1470VALUE
1472{
1473 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1474}
1475
/*
 * Create a UTF-8 String from the C string ptr by delegating to
 * rb_enc_str_new_cstr().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_utf8_str_new_cstr(const char *ptr)` -- confirm.
 */
1476VALUE
1478{
1479 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1480}
1481
/*
 * Create a String in encoding enc from the NUL-terminated C string ptr.
 * Raises ArgumentError when ptr is NULL, and also when the encoding's
 * minimum character length is not 1 ("wchar encoding given"), since
 * strlen() is used to find the end of the data.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)` --
 * confirm.
 */
1482VALUE
1484{
1485 must_not_null(ptr);
1486 if (rb_enc_mbminlen(enc) != 1) {
1487 rb_raise(rb_eArgError, "wchar encoding given");
1488 }
1489 return rb_enc_str_new(ptr, strlen(ptr), enc);
1490}
1491
1492static VALUE
1493str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1494{
1495 VALUE str;
1496
1497 if (len < 0) {
1498 rb_raise(rb_eArgError, "negative string size (or size too big)");
1499 }
1500
1501 if (!ptr) {
1502 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1503 }
1504 else {
1505 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1506 str = str_alloc_heap(klass);
1507 RSTRING(str)->len = len;
1508 RSTRING(str)->as.heap.ptr = (char *)ptr;
1509 RSTRING(str)->as.heap.aux.capa = len;
1510 RBASIC(str)->flags |= STR_NOFREE;
1511 rb_enc_associate_index(str, encindex);
1512 }
1513 return str;
1514}
1515
/*
 * Wrap the len bytes at ptr in an instance of rb_cString without copying
 * (see str_new_static()), using encoding index 0.
 */
1516VALUE
1517rb_str_new_static(const char *ptr, long len)
1518{
1519 return str_new_static(rb_cString, ptr, len, 0);
1520}
1521
/*
 * Wrap the len bytes at ptr in an instance of rb_cString without copying,
 * tagged with ENCINDEX_US_ASCII.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_usascii_str_new_static(const char *ptr, long len)` --
 * confirm.
 */
1522VALUE
1524{
1525 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1526}
1527
/*
 * Wrap the len bytes at ptr in an instance of rb_cString without copying,
 * tagged with ENCINDEX_UTF_8.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_utf8_str_new_static(const char *ptr, long len)` --
 * confirm.
 */
1528VALUE
1530{
1531 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1532}
1533
/*
 * Wrap the len bytes at ptr in an instance of rb_cString without copying,
 * tagged with the index of enc.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably
 * `rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)` --
 * confirm.
 */
1534VALUE
1536{
1537 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1538}
1539
1540static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1541 rb_encoding *from, rb_encoding *to,
1542 int ecflags, VALUE ecopts);
1543
1544static inline bool
1545is_enc_ascii_string(VALUE str, rb_encoding *enc)
1546{
1547 int encidx = rb_enc_to_index(enc);
1548 if (rb_enc_get_index(str) == encidx)
1549 return is_ascii_string(str);
1550 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1551}
1552
/*
 * Return str converted from encoding `from` to encoding `to`.
 *
 * from:    source encoding; NULL means the encoding str currently has.
 * to:      target encoding; NULL returns str unchanged.
 * ecflags, ecopts: passed through to the converter.
 *
 * No transcoding happens when from == to (str is returned as-is), nor when
 * the target is ASCII-compatible and str is ASCII-only under `from`, nor
 * when the target is ASCII-8BIT; in the latter two cases the result is str
 * itself if it already carries `to`, otherwise a duplicate re-tagged with
 * `to`.  If the conversion itself fails, the original str is returned.
 */
1553VALUE
1554rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1555{
1556 long len;
1557 const char *ptr;
1558 VALUE newstr;
1559
1560 if (!to) return str;
1561 if (!from) from = rb_enc_get(str);
1562 if (from == to) return str;
1563 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1564 rb_is_ascii8bit_enc(to)) {
1565 if (STR_ENC_GET(str) != to) {
1566 str = rb_str_dup(str);
1567 rb_enc_associate(str, to);
1568 }
1569 return str;
1570 }
1571
1572 RSTRING_GETMEM(str, ptr, len);
/* Convert into a fresh buffer whose initial capacity is the input length. */
1573 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1574 from, to, ecflags, ecopts);
1575 if (NIL_P(newstr)) {
1576 /* some error, return original */
1577 return str;
1578 }
1579 return newstr;
1580}
1581
/*
 * Write the len bytes at ptr into newstr starting at byte offset ofs,
 * converting them from encoding `from` to newstr's own encoding.
 *
 * ofs:  must lie within [-length, length] of newstr, otherwise IndexError is
 *       raised; a negative value counts from the end.
 * from: source encoding; when NULL, newstr is cut to ofs bytes and the data
 *       is appended unconverted with rb_str_cat().
 *
 * Returns whatever str_cat_conv_enc_opts() returns on the converting path
 * (the callers in this file treat nil as failure).
 */
1582VALUE
1583rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1584 rb_encoding *from, int ecflags, VALUE ecopts)
1585{
1586 long olen;
1587
1588 olen = RSTRING_LEN(newstr);
1589 if (ofs < -olen || olen < ofs)
1590 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1591 if (ofs < 0) ofs += olen;
1592 if (!from) {
1593 STR_SET_LEN(newstr, ofs);
1594 return rb_str_cat(newstr, ptr, len);
1595 }
1596
1597 rb_str_modify(newstr);
1598 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1599 rb_enc_get(newstr),
1600 ecflags, ecopts);
1601}
1602
/*
 * Reset str to hold exactly the len bytes at ptr in encoding enc: the
 * length is set to 0, enc is associated, and the bytes are appended with
 * rb_str_cat().  Returns str.
 */
1603VALUE
1604rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1605{
1606 STR_SET_LEN(str, 0);
1607 rb_enc_associate(str, enc);
1608 rb_str_cat(str, ptr, len);
1609 return str;
1610}
1611
1612static VALUE
1613str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1614 rb_encoding *from, rb_encoding *to,
1615 int ecflags, VALUE ecopts)
1616{
1617 rb_econv_t *ec;
1619 long olen;
1620 VALUE econv_wrapper;
1621 const unsigned char *start, *sp;
1622 unsigned char *dest, *dp;
1623 size_t converted_output = (size_t)ofs;
1624
1625 olen = rb_str_capacity(newstr);
1626
1627 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1628 RBASIC_CLEAR_CLASS(econv_wrapper);
1629 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1630 if (!ec) return Qnil;
1631 DATA_PTR(econv_wrapper) = ec;
1632
1633 sp = (unsigned char*)ptr;
1634 start = sp;
1635 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1636 (dp = dest + converted_output),
1637 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1639 /* destination buffer short */
1640 size_t converted_input = sp - start;
1641 size_t rest = len - converted_input;
1642 converted_output = dp - dest;
1643 rb_str_set_len(newstr, converted_output);
1644 if (converted_input && converted_output &&
1645 rest < (LONG_MAX / converted_output)) {
1646 rest = (rest * converted_output) / converted_input;
1647 }
1648 else {
1649 rest = olen;
1650 }
1651 olen += rest < 2 ? 2 : rest;
1652 rb_str_resize(newstr, olen);
1653 }
1654 DATA_PTR(econv_wrapper) = 0;
1655 RB_GC_GUARD(econv_wrapper);
1656 rb_econv_close(ec);
1657 switch (ret) {
1658 case econv_finished:
1659 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1660 rb_str_set_len(newstr, len);
1661 rb_enc_associate(newstr, to);
1662 return newstr;
1663
1664 default:
1665 return Qnil;
1666 }
1667}
1668
/*
 * rb_str_conv_enc_opts() with no converter flags (0) and no options (Qnil).
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_conv_enc(VALUE str, rb_encoding *from,
 * rb_encoding *to)` -- confirm.
 */
1669VALUE
1671{
1672 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1673}
1674
/*
 * Create a String from len external bytes at ptr that are encoded in eenc,
 * converting to the default internal encoding when one is set.
 *
 * The steps, in order:
 *  - NULL ptr: a plain string of len bytes tagged eenc;
 *  - eenc is ASCII-8BIT, or eenc is US-ASCII but the data holds a non-ASCII
 *    byte: a binary string via rb_str_new();
 *  - no default internal encoding, or it equals eenc: tagged eenc;
 *  - data needs no conversion (see the condition below): tagged with the
 *    internal encoding;
 *  - otherwise the data is transcoded; if that fails the result falls back
 *    to the raw bytes tagged eenc.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_external_str_new_with_enc(const char *ptr, long len,
 * rb_encoding *eenc)` -- confirm.
 */
1675VALUE
1677{
1678 rb_encoding *ienc;
1679 VALUE str;
1680 const int eidx = rb_enc_to_index(eenc);
1681
1682 if (!ptr) {
1683 return rb_enc_str_new(ptr, len, eenc);
1684 }
1685
1686 /* ASCII-8BIT case, no conversion */
1687 if ((eidx == rb_ascii8bit_encindex()) ||
1688 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1689 return rb_str_new(ptr, len);
1690 }
1691 /* no default_internal or same encoding, no conversion */
1692 ienc = rb_default_internal_encoding();
1693 if (!ienc || eenc == ienc) {
1694 return rb_enc_str_new(ptr, len, eenc);
1695 }
1696 /* ASCII compatible, and ASCII only string, no conversion in
1697 * default_internal */
/* NOTE(review): the ASCII-8BIT test below looks unreachable, because that
 * case already returned above -- confirm before relying on it. */
1698 if ((eidx == rb_ascii8bit_encindex()) ||
1699 (eidx == rb_usascii_encindex()) ||
1700 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1701 return rb_enc_str_new(ptr, len, ienc);
1702 }
1703 /* convert from the given encoding to default_internal */
1704 str = rb_enc_str_new(NULL, 0, ienc);
1705 /* when the conversion failed for some reason, just ignore the
1706 * default_internal and result in the given encoding as-is. */
1707 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1708 rb_str_initialize(str, ptr, len, eenc);
1709 }
1710 return str;
1711}
1712
1713VALUE
1714rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1715{
1716 int eidx = rb_enc_to_index(eenc);
1717 if (eidx == rb_usascii_encindex() &&
1718 !is_ascii_string(str)) {
1719 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1720 return str;
1721 }
1722 rb_enc_associate_index(str, eidx);
1723 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1724}
1725
/*
 * rb_external_str_new_with_enc() with the default external encoding: the
 * len bytes at ptr are taken to be in that encoding.
 */
1726VALUE
1727rb_external_str_new(const char *ptr, long len)
1728{
1729 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1730}
1731
/*
 * Same as rb_external_str_new() for a NUL-terminated C string: the length
 * is taken from strlen(ptr), so ptr must not be NULL.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_external_str_new_cstr(const char *ptr)` -- confirm.
 */
1732VALUE
1734{
1735 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1736}
1737
/*
 * rb_external_str_new_with_enc() with the locale encoding: the len bytes at
 * ptr are taken to be in rb_locale_encoding().
 */
1738VALUE
1739rb_locale_str_new(const char *ptr, long len)
1740{
1741 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1742}
1743
/*
 * Same as rb_locale_str_new() for a NUL-terminated C string: the length is
 * taken from strlen(ptr), so ptr must not be NULL.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_locale_str_new_cstr(const char *ptr)` -- confirm.
 */
1744VALUE
1746{
1747 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1748}
1749
/*
 * rb_external_str_new_with_enc() with the filesystem encoding: the len
 * bytes at ptr are taken to be in rb_filesystem_encoding().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_filesystem_str_new(const char *ptr, long len)` -- confirm.
 */
1750VALUE
1752{
1753 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1754}
1755
/*
 * Filesystem-encoding variant for a NUL-terminated C string: the length is
 * taken from strlen(ptr), so ptr must not be NULL.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_filesystem_str_new_cstr(const char *ptr)` -- confirm.
 */
1756VALUE
1758{
1759 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1760}
1761
/*
 * Convert str to the default external encoding via rb_str_export_to_enc().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_export(VALUE str)` -- confirm.
 */
1762VALUE
1764{
1765 return rb_str_export_to_enc(str, rb_default_external_encoding());
1766}
1767
/*
 * Convert str to the locale encoding via rb_str_export_to_enc().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_export_locale(VALUE str)` -- confirm.
 */
1768VALUE
1770{
1771 return rb_str_export_to_enc(str, rb_locale_encoding());
1772}
1773
/*
 * Convert str from the encoding it currently carries to enc, using
 * rb_str_conv_enc().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_export_to_enc(VALUE str, rb_encoding *enc)` --
 * confirm.
 */
1774VALUE
1776{
1777 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1778}
1779
/*
 * Make str2 hold the same content as str, sharing str's buffer where
 * possible.  Encoding and code range are NOT copied.  Returns str2.
 *
 * If the content plus terminator fits into str2's embedded area it is
 * copied there.  Otherwise str2 becomes a shared string that points into
 * the buffer of a frozen root: str's existing root when str is already
 * shared, or a frozen string made from str by rb_str_new_frozen().
 */
1780static VALUE
1781str_replace_shared_without_enc(VALUE str2, VALUE str)
1782{
1783 const int termlen = TERM_LEN(str);
1784 char *ptr;
1785 long len;
1786
1787 RSTRING_GETMEM(str, ptr, len);
1788 if (str_embed_capa(str2) >= len + termlen) {
1789 char *ptr2 = RSTRING(str2)->as.embed.ary;
1790 STR_SET_EMBED(str2);
1791 memcpy(ptr2, RSTRING_PTR(str), len);
1792 TERM_FILL(ptr2+len, termlen);
1793 }
1794 else {
1795 VALUE root;
1796 if (STR_SHARED_P(str)) {
1797 root = RSTRING(str)->as.heap.aux.shared;
1798 RSTRING_GETMEM(str, ptr, len);
1799 }
1800 else {
1801 root = rb_str_new_frozen(str);
1802 RSTRING_GETMEM(root, ptr, len);
1803 }
1804 RUBY_ASSERT(OBJ_FROZEN(root));
1805
/* Release str2's own heap buffer (if it owns one) before it is
 * repointed, unless that buffer is the very one about to be shared. */
1806 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1807 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1808 rb_fatal("about to free a possible shared root");
1809 }
1810 char *ptr2 = STR_HEAP_PTR(str2);
1811 if (ptr2 != ptr) {
1812 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1813 }
1814 }
1815 FL_SET(str2, STR_NOEMBED);
1816 RSTRING(str2)->as.heap.ptr = ptr;
1817 STR_SET_SHARED(str2, root);
1818 }
1819
1820 STR_SET_LEN(str2, len);
1821
1822 return str2;
1823}
1824
/*
 * str_replace_shared_without_enc() followed by an exact copy of str's
 * encoding and code range onto str2.  Returns str2.
 */
1825static VALUE
1826str_replace_shared(VALUE str2, VALUE str)
1827{
1828 str_replace_shared_without_enc(str2, str);
1829 rb_enc_cr_str_exact_copy(str2, str);
1830 return str2;
1831}
1832
/*
 * Allocate a new heap String of class klass and make it hold str's content
 * (and encoding/code range) through str_replace_shared().
 */
1833static VALUE
1834str_new_shared(VALUE klass, VALUE str)
1835{
1836 return str_replace_shared(str_alloc_heap(klass), str);
1837}
1838
/*
 * str_new_shared() using the class of str itself (rb_obj_class(str)).
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_new_shared(VALUE str)` -- confirm.
 */
1839VALUE
1841{
1842 return str_new_shared(rb_obj_class(str), str);
1843}
1844
/*
 * Return a frozen string with orig's content and class.  orig itself is
 * returned when, of the two flags FL_FREEZE and STR_CHILLED, exactly
 * FL_FREEZE is set on it; otherwise a frozen string is built by
 * str_new_frozen().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_new_frozen(VALUE orig)` -- confirm.
 */
1845VALUE
1847{
1848 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1849 return str_new_frozen(rb_obj_class(orig), orig);
1850}
1851
/*
 * Return a frozen string of class rb_cString with orig's content.  orig
 * itself is returned when it is already frozen and its class is exactly
 * rb_cString.
 */
1852static VALUE
1853rb_str_new_frozen_String(VALUE orig)
1854{
1855 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1856 return str_new_frozen(rb_cString, orig);
1857}
1858
1859
/*
 * Return a frozen rb_cString string with orig's content.  The expected
 * (RB_LIKELY) case is that orig already satisfies BARE_STRING_P and is
 * frozen, in which case orig itself is returned.
 */
1860VALUE
1861rb_str_frozen_bare_string(VALUE orig)
1862{
1863 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1864 return str_new_frozen(rb_cString, orig);
1865}
1866
/*
 * Obtain a frozen view of orig for temporary use.  A frozen orig is
 * returned as-is; otherwise a class-less (klass 0) frozen string is built
 * by str_new_frozen_buffer() without copying the encoding.
 */
1867VALUE
1868rb_str_tmp_frozen_acquire(VALUE orig)
1869{
1870 if (OBJ_FROZEN_RAW(orig)) return orig;
1871 return str_new_frozen_buffer(0, orig, FALSE);
1872}
1873
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned string's content is
 * never stored in an embedded area.
 *
 * orig is returned directly when it is frozen, not embedded and not
 * re-embeddable.  A shared orig whose root is not embedded is delegated to
 * rb_str_tmp_frozen_acquire().  In all other cases a class-less frozen heap
 * string is created that either owns a copy of the bytes or takes over
 * orig's buffer (orig then becomes a shared child of it).
 */
1874VALUE
1875rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1876{
1877 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1878 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1879
1880 VALUE str = str_alloc_heap(0);
1881 OBJ_FREEZE(str);
1882 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1883 FL_SET(str, STR_SHARED_ROOT);
1884
1885 size_t capa = str_capacity(orig, TERM_LEN(orig));
1886
1887 /* If the string is embedded then we want to create a copy that is heap
1888 * allocated. If the string is shared then the shared root must be
1889 * embedded, so we want to create a copy. If the string is a shared root
1890 * then it must be embedded, so we want to create a copy. */
1891 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1892 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1893 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1894 }
1895 else {
1896 /* orig must be heap allocated and not shared, so we can safely transfer
1897 * the pointer to str. */
1898 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* The STR_NOFREE bit travels with the buffer: it is moved from orig to
 * str before orig is turned into a shared child of str. */
1899 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1900 RBASIC(orig)->flags &= ~STR_NOFREE;
1901 STR_SET_SHARED(orig, str);
1902 }
1903
1904 RSTRING(str)->len = RSTRING(orig)->len;
1905 RSTRING(str)->as.heap.aux.capa = capa;
1906
1907 return str;
1908}
1909
/*
 * Give back a temporary frozen string obtained for orig.
 *
 * Nothing happens when tmp has a class (it is then not one of the
 * class-less temporaries).  For a non-embedded tmp that is the shared root
 * of orig, is not flagged STR_BORROWED, and where orig is neither
 * STR_TMPLOCK nor frozen, orig takes its buffer back: it is unshared, gets
 * tmp's capacity and STR_NOFREE bit, and tmp is emptied.
 *
 * NOTE(review): the embedded branch has an empty body and there is a gap
 * in the listing's numbering inside the inner block; one line each may be
 * missing there -- confirm against the real file.
 */
1910void
1911rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1912{
1913 if (RBASIC_CLASS(tmp) != 0)
1914 return;
1915
1916 if (STR_EMBED_P(tmp)) {
1918 }
1919 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1920 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1921 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1922
1923 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1924 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1925 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1926
1927 /* Unshare orig since the root (tmp) only has this one child. */
1928 FL_UNSET_RAW(orig, STR_SHARED);
1929 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1930 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1932
1933 /* Make tmp embedded and empty so it is safe for sweeping. */
1934 STR_SET_EMBED(tmp);
1935 STR_SET_LEN(tmp, 0);
1936 }
1937 }
1938}
1939
/*
 * Build a frozen string of class klass from orig, copying orig's encoding
 * (str_new_frozen_buffer() with copy_encoding = TRUE).
 */
1940static VALUE
1941str_new_frozen(VALUE klass, VALUE orig)
1942{
1943 return str_new_frozen_buffer(klass, orig, TRUE);
1944}
1945
/*
 * Turn the heap string orig (asserted to be neither embedded nor shared)
 * into a shared child of a newly allocated string of class klass, which is
 * returned.  The new string takes over orig's buffer pointer, length,
 * capacity and STR_NOFREE bit; orig is then marked as sharing it.
 */
1946static VALUE
1947heap_str_make_shared(VALUE klass, VALUE orig)
1948{
1949 RUBY_ASSERT(!STR_EMBED_P(orig));
1950 RUBY_ASSERT(!STR_SHARED_P(orig));
1951
1952 VALUE str = str_alloc_heap(klass);
1953 STR_SET_LEN(str, RSTRING_LEN(orig));
1954 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1955 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1956 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1957 RBASIC(orig)->flags &= ~STR_NOFREE;
1958 STR_SET_SHARED(orig, str);
/* A class-less root starts out without the STR_BORROWED mark. */
1959 if (klass == 0)
1960 FL_UNSET_RAW(str, STR_BORROWED);
1961 return str;
1962}
1963
/*
 * Produce a frozen string of class klass with the content of orig.
 *
 * copy_encoding: when true the result gets orig's encoding and code range;
 *                when false the content is treated as ASCII-8BIT with a
 *                one-byte terminator.
 *
 * Embedded or embeddable content is copied into a new embedded string.
 * For a shared orig, the existing root is returned directly when it covers
 * exactly the same bytes with the same class and encoding; otherwise a new
 * shared string narrowed to orig's byte range is created.  A plain heap
 * orig is converted into a shared child of the new string
 * (heap_str_make_shared()).
 */
1964static VALUE
1965str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1966{
1967 VALUE str;
1968
1969 long len = RSTRING_LEN(orig);
1970 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1971 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1972
1973 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1974 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1975 RUBY_ASSERT(STR_EMBED_P(str));
1976 }
1977 else {
1978 if (FL_TEST_RAW(orig, STR_SHARED)) {
1979 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs/rest: bytes of the root's buffer in front of and behind the part
 * that orig refers to. */
1980 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1981 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1982 RUBY_ASSERT(ofs >= 0);
1983 RUBY_ASSERT(rest >= 0);
1984 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1986
1987 if ((ofs > 0) || (rest > 0) ||
1988 (klass != RBASIC(shared)->klass) ||
1989 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1990 str = str_new_shared(klass, shared);
1991 RUBY_ASSERT(!STR_EMBED_P(str));
1992 RSTRING(str)->as.heap.ptr += ofs;
1993 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1994 }
1995 else {
/* The root is handed out as-is; a class-less root is marked
 * STR_BORROWED (rb_str_tmp_frozen_release() checks that flag). */
1996 if (RBASIC_CLASS(shared) == 0)
1997 FL_SET_RAW(shared, STR_BORROWED);
1998 return shared;
1999 }
2000 }
/* NOTE(review): this branch looks unreachable, since an embeddable length
 * was already handled by the outer `if` -- confirm. */
2001 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
2002 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
2003 STR_SET_EMBED(str);
2004 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
2005 STR_SET_LEN(str, RSTRING_LEN(orig));
2006 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
2007 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
2008 }
2009 else {
2010 str = heap_str_make_shared(klass, orig);
2011 }
2012 }
2013
2014 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
2015 OBJ_FREEZE(str);
2016 return str;
2017}
2018
/*
 * Create a string from len bytes at ptr that has the same class
 * (rb_obj_class(obj)) and the same encoding as the string obj.
 */
2019VALUE
2020rb_str_new_with_class(VALUE obj, const char *ptr, long len)
2021{
2022 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
2023}
2024
/*
 * Create an empty rb_cString string and copy str's encoding onto it with
 * rb_enc_copy().
 */
2025static VALUE
2026str_new_empty_String(VALUE str)
2027{
2028 VALUE v = rb_str_new(0, 0);
2029 rb_enc_copy(v, str);
2030 return v;
2031}
2032
/* Lower bound that rb_str_init() applies to an explicitly requested
 * capacity. */
2033#define STR_BUF_MIN_SIZE 63
2034
/*
 * Create an rb_cString string with room for capa content bytes plus a
 * one-byte terminator.  An embedded string is used when
 * STR_EMBEDDABLE_P(capa, 1) holds; otherwise a heap buffer of capa + 1
 * bytes is allocated and its first byte set to NUL.  No length is set
 * here.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_buf_new(long capa)` (a function of that name is
 * called elsewhere in this file) -- confirm.
 */
2035VALUE
2037{
2038 if (STR_EMBEDDABLE_P(capa, 1)) {
2039 return str_alloc_embed(rb_cString, capa + 1);
2040 }
2041
2042 VALUE str = str_alloc_heap(rb_cString);
2043
2044 RSTRING(str)->as.heap.aux.capa = capa;
2045 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
2046 RSTRING(str)->as.heap.ptr[0] = '\0';
2047
2048 return str;
2049}
2050
/*
 * Create a buffer string sized for the NUL-terminated C string ptr and
 * append ptr's bytes to it.  The length comes from strlen(ptr), so ptr must
 * not be NULL.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_buf_new_cstr(const char *ptr)` -- confirm.
 */
2051VALUE
2053{
2054 VALUE str;
2055 long len = strlen(ptr);
2056
2057 str = rb_str_buf_new(len);
2058 rb_str_buf_cat(str, ptr, len);
2059
2060 return str;
2061}
2062
/*
 * Create a class-less (klass 0) string of len bytes via str_new(); no
 * source bytes are supplied (ptr is 0).
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_tmp_new(long len)` -- confirm.
 */
2063VALUE
2065{
2066 return str_new(0, 0, len);
2067}
2068
/*
 * Release the storage behind str and bump the matching debug counter.
 * Only a heap string that is neither STR_SHARED nor STR_NOFREE owns its
 * buffer, so only that case calls ruby_sized_xfree().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_free(VALUE str)` -- confirm.
 */
2069void
2071{
2072 if (STR_EMBED_P(str)) {
2073 RB_DEBUG_COUNTER_INC(obj_str_embed);
2074 }
2075 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
/* NOTE(review): both lines below increment obj_str_shared, although the
 * second one tests STR_NOFREE -- confirm whether a separate counter was
 * intended. */
2076 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
2077 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
2078 }
2079 else {
2080 RB_DEBUG_COUNTER_INC(obj_str_ptr);
2081 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2082 }
2083}
2084
2085size_t
2086rb_str_memsize(VALUE str)
2087{
2088 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
2089 return STR_HEAP_SIZE(str);
2090 }
2091 else {
2092 return 0;
2093 }
2094}
2095
/*
 * Convert str to a T_STRING object through rb_convert_type_with_id(), using
 * the method identified by idTo_str.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_to_str(VALUE str)` -- confirm.
 */
2096VALUE
2098{
2099 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
2100}
2101
2102static inline void str_discard(VALUE str);
2103static void str_shared_replace(VALUE str, VALUE str2);
2104
/*
 * Replace the content of str with that of str2 via str_shared_replace();
 * a no-op when both arguments are the same object.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_shared_replace(VALUE str, VALUE str2)` -- confirm.
 */
2105void
2107{
2108 if (str != str2) str_shared_replace(str, str2);
2109}
2110
/*
 * Move the content, encoding and code range of str2 into str.  str and
 * str2 must be different objects.
 *
 * str's previous content is discarded first.  If str2's bytes plus
 * terminator fit into str's embedded area they are copied and str2 is left
 * untouched.  Otherwise str takes over str2's heap buffer (an embedded str2
 * is first moved to a heap buffer) or joins str2's shared root, and str2 is
 * reset to an empty embedded string.
 */
2111static void
2112str_shared_replace(VALUE str, VALUE str2)
2113{
2114 rb_encoding *enc;
2115 int cr;
2116 int termlen;
2117
2118 RUBY_ASSERT(str2 != str);
2119 enc = STR_ENC_GET(str2);
2120 cr = ENC_CODERANGE(str2);
2121 str_discard(str);
2122 termlen = rb_enc_mbminlen(enc);
2123
2124 STR_SET_LEN(str, RSTRING_LEN(str2));
2125
2126 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
2127 STR_SET_EMBED(str);
2128 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
2129 rb_enc_associate(str, enc);
2130 ENC_CODERANGE_SET(str, cr);
2131 }
2132 else {
/* Give an embedded str2 a heap buffer first, so that the code below
 * always has a heap pointer to hand over. */
2133 if (STR_EMBED_P(str2)) {
2134 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
2135 long len = RSTRING_LEN(str2);
2136 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
2137
2138 char *new_ptr = ALLOC_N(char, len + termlen);
2139 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
2140 RSTRING(str2)->as.heap.ptr = new_ptr;
2141 STR_SET_LEN(str2, len);
2142 RSTRING(str2)->as.heap.aux.capa = len;
2143 STR_SET_NOEMBED(str2);
2144 }
2145
2146 STR_SET_NOEMBED(str);
2147 FL_UNSET(str, STR_SHARED);
2148 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2149
2150 if (FL_TEST(str2, STR_SHARED)) {
2151 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2152 STR_SET_SHARED(str, shared);
2153 }
2154 else {
2155 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
2156 }
2157
2158 /* abandon str2 */
2159 STR_SET_EMBED(str2);
2160 RSTRING_PTR(str2)[0] = 0;
2161 STR_SET_LEN(str2, 0);
2162 rb_enc_associate(str, enc);
2163 ENC_CODERANGE_SET(str, cr);
2164 }
2165}
2166
/*
 * Return obj as a String: obj itself when it already is a T_STRING,
 * otherwise the result of calling the method idTo_s on it, post-processed
 * by rb_obj_as_string_result().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_obj_as_string(VALUE obj)` -- confirm.
 */
2167VALUE
2169{
2170 VALUE str;
2171
2172 if (RB_TYPE_P(obj, T_STRING)) {
2173 return obj;
2174 }
2175 str = rb_funcall(obj, idTo_s, 0);
2176 return rb_obj_as_string_result(str, obj);
2177}
2178
2179VALUE
2180rb_obj_as_string_result(VALUE str, VALUE obj)
2181{
2182 if (!RB_TYPE_P(str, T_STRING))
2183 return rb_any_to_s(obj);
2184 return str;
2185}
2186
/*
 * Make str hold the content, encoding and code range of str2 and return
 * str.  When str2 is already a shared string, str is pointed at the same
 * buffer and attached to the same shared root; otherwise the work is done
 * by str_replace_shared().
 */
2187static VALUE
2188str_replace(VALUE str, VALUE str2)
2189{
2190 long len;
2191
2192 len = RSTRING_LEN(str2);
2193 if (STR_SHARED_P(str2)) {
2194 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2196 STR_SET_NOEMBED(str);
2197 STR_SET_LEN(str, len);
2198 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2199 STR_SET_SHARED(str, shared);
2200 rb_enc_cr_str_exact_copy(str, str2);
2201 }
2202 else {
2203 str_replace_shared(str, str2);
2204 }
2205
2206 return str;
2207}
2208
/*
 * Counterpart of str_alloc_embed() that also receives an execution context
 * ec.  The slot size is rb_str_embed_size(capa), asserted to be non-zero
 * and allocatable by the GC.
 *
 * NOTE(review): the NEWOBJ_OF(...) invocation below is cut off in this
 * listing (its continuation line is missing) -- restore it from the real
 * file.
 */
2209static inline VALUE
2210ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
2211{
2212 size_t size = rb_str_embed_size(capa);
2213 RUBY_ASSERT(size > 0);
2214 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
2215
2216 NEWOBJ_OF(str, struct RString, klass,
2218
2219 return (VALUE)str;
2220}
2221
/*
 * Same allocation as str_alloc_heap() (STR_NOEMBED string in a slot of
 * sizeof(struct RString)), except that the execution context ec is passed
 * as the last NEWOBJ_OF argument instead of 0.
 */
2222static inline VALUE
2223ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
2224{
2225 NEWOBJ_OF(str, struct RString, klass,
2226 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
2227
2228 return (VALUE)str;
2229}
2230
/*
 * Final step of duplicating str into dup: apply the collected flag bits to
 * dup (never FL_FREEZE) and return dup.
 *
 * When the encoding bits in flags hold ENCODING_INLINE_MAX, the index is
 * instead looked up with rb_enc_get_index(str), the encoding bits are
 * cleared from flags, and the index is associated with dup afterwards (if
 * it is non-zero).
 */
2231static inline VALUE
2232str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
2233{
2234 int encidx = 0;
2235 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
2236 encidx = rb_enc_get_index(str);
2237 flags &= ~ENCODING_MASK;
2238 }
2239 FL_SET_RAW(dup, flags & ~FL_FREEZE);
2240 if (encidx) rb_enc_associate_index(dup, encidx);
2241 return dup;
2242}
2243
/* Flag bits read from the source string when duplicating it: code range,
 * encoding index and frozen state. */
2244static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
2245
/*
 * Fill the already allocated embedded string dup with a copy of the
 * embedded string str: len + 1 bytes are copied, the length is set, and the
 * code range/encoding flags are transferred by
 * str_duplicate_setup_encoding().  klass is not used in this function.
 */
2246static inline VALUE
2247str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
2248{
2249 VALUE flags = FL_TEST_RAW(str, flag_mask);
2250 long len = RSTRING_LEN(str);
2251
2252 RUBY_ASSERT(STR_EMBED_P(dup));
2253 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
2254 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
2255 STR_SET_LEN(dup, RSTRING_LEN(str));
2256 return str_duplicate_setup_encoding(str, dup, flags);
2257}
2258
/*
 * Fill the already allocated heap string dup so that it shares str's
 * buffer instead of copying it.
 *
 * The shared root is str's existing root when str is shared, a frozen
 * string created from str when str is not frozen, and str itself
 * otherwise.  dup is pointed at the buffer, linked to the root (which is
 * flagged STR_SHARED_ROOT), and receives the noembed/shared flags plus
 * str's code range and encoding.
 */
2259static inline VALUE
2260str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
2261{
2262 VALUE flags = FL_TEST_RAW(str, flag_mask);
2263 VALUE root = str;
2264 if (FL_TEST_RAW(str, STR_SHARED)) {
2265 root = RSTRING(str)->as.heap.aux.shared;
2266 }
2267 else if (UNLIKELY(!(flags & FL_FREEZE))) {
/* From here on str refers to the frozen string, so the flags are
 * re-read from it. */
2268 root = str = str_new_frozen(klass, str);
2269 flags = FL_TEST_RAW(str, flag_mask);
2270 }
2271 RUBY_ASSERT(!STR_SHARED_P(root));
2273
2274 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
2275 FL_SET(root, STR_SHARED_ROOT);
2276 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
2277 flags |= RSTRING_NOEMBED | STR_SHARED;
2278
2279 STR_SET_LEN(dup, RSTRING_LEN(str));
2280 return str_duplicate_setup_encoding(str, dup, flags);
2281}
2282
2283static inline VALUE
2284str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
2285{
2286 if (STR_EMBED_P(str)) {
2287 return str_duplicate_setup_embed(klass, str, dup);
2288 }
2289 else {
2290 return str_duplicate_setup_heap(klass, str, dup);
2291 }
2292}
2293
2294static inline VALUE
2295str_duplicate(VALUE klass, VALUE str)
2296{
2297 VALUE dup;
2298 if (STR_EMBED_P(str)) {
2299 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
2300 }
2301 else {
2302 dup = str_alloc_heap(klass);
2303 }
2304
2305 return str_duplicate_setup(klass, str, dup);
2306}
2307
/*
 * Duplicate str, keeping its class (rb_obj_class(str)), through
 * str_duplicate().
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_dup(VALUE str)` (a function of that name is called
 * elsewhere in this file) -- confirm.
 */
2308VALUE
2310{
2311 return str_duplicate(rb_obj_class(str), str);
2312}
2313
2314/* :nodoc: */
2315VALUE
2316rb_str_dup_m(VALUE str)
2317{
2318 if (LIKELY(BARE_STRING_P(str))) {
2319 return str_duplicate(rb_obj_class(str), str);
2320 }
2321 else {
2322 return rb_obj_dup(str);
2323 }
2324}
2325
/*
 * Create a rb_cString duplicate of str, firing the string-create DTrace
 * hook with str's length first.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * presumably `rb_str_resurrect(VALUE str)` -- confirm.
 */
2326VALUE
2328{
2329 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2330 return str_duplicate(rb_cString, str);
2331}
2332
/*
 * Create a rb_cString duplicate of str, allocating through the execution
 * context ec.
 *
 * chilled: when true the result is flagged STR_CHILLED_LITERAL.
 *
 * An embedded str is duplicated into an embedded string, unless chilled is
 * set and str has the id_debug_created_info instance variable; in that
 * case, and for every non-embedded str, the heap set-up path is used.
 */
2333VALUE
2334rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2335{
2336 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2337 VALUE new_str, klass = rb_cString;
2338
2339 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2340 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2341 str_duplicate_setup_embed(klass, str, new_str);
2342 }
2343 else {
2344 new_str = ec_str_alloc_heap(ec, klass);
2345 str_duplicate_setup_heap(klass, str, new_str);
2346 }
2347 if (chilled) {
2348 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2349 }
2350 return new_str;
2351}
2352
/*
 * Attach creation-site information to a string literal and return it
 * frozen.
 *
 * A frozen two-element array [path, line] is stored in the
 * id_debug_created_info instance variable.  A frozen str is duplicated
 * first, so the variable is set on the copy.  The string is flagged
 * STR_CHILLED_LITERAL, and the function returns the result of
 * rb_str_freeze().
 */
2353VALUE
2354rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2355{
2356 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2357 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2358 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2359 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2360 return rb_str_freeze(str);
2361}
2362
/*
 * The documentation block below uses an include (instead of inline text)
 * because the included text has non-ASCII characters (which are not allowed in a C file).
 */
2367
/*
 *
 *  call-seq:
 *    String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
 *
 *  :include: doc/string/new.rdoc
 *
 */
2376
2377static VALUE
2378rb_str_init(int argc, VALUE *argv, VALUE str)
2379{
2380 static ID keyword_ids[2];
2381 VALUE orig, opt, venc, vcapa;
2382 VALUE kwargs[2];
2383 rb_encoding *enc = 0;
2384 int n;
2385
2386 if (!keyword_ids[0]) {
2387 keyword_ids[0] = rb_id_encoding();
2388 CONST_ID(keyword_ids[1], "capacity");
2389 }
2390
2391 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2392 if (!NIL_P(opt)) {
2393 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2394 venc = kwargs[0];
2395 vcapa = kwargs[1];
2396 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2397 enc = rb_to_encoding(venc);
2398 }
2399 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2400 long capa = NUM2LONG(vcapa);
2401 long len = 0;
2402 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2403
2404 if (capa < STR_BUF_MIN_SIZE) {
2405 capa = STR_BUF_MIN_SIZE;
2406 }
2407 if (n == 1) {
2408 StringValue(orig);
2409 len = RSTRING_LEN(orig);
2410 if (capa < len) {
2411 capa = len;
2412 }
2413 if (orig == str) n = 0;
2414 }
2415 str_modifiable(str);
2416 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2417 /* make noembed always */
2418 const size_t size = (size_t)capa + termlen;
2419 const char *const old_ptr = RSTRING_PTR(str);
2420 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2421 char *new_ptr = ALLOC_N(char, size);
2422 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2423 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2424 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2425 RSTRING(str)->as.heap.ptr = new_ptr;
2426 }
2427 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2428 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2429 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2430 }
2431 STR_SET_LEN(str, len);
2432 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2433 if (n == 1) {
2434 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2435 rb_enc_cr_str_exact_copy(str, orig);
2436 }
2437 FL_SET(str, STR_NOEMBED);
2438 RSTRING(str)->as.heap.aux.capa = capa;
2439 }
2440 else if (n == 1) {
2441 rb_str_replace(str, orig);
2442 }
2443 if (enc) {
2444 rb_enc_associate(str, enc);
2446 }
2447 }
2448 else if (n == 1) {
2449 rb_str_replace(str, orig);
2450 }
2451 return str;
2452}
2453
2454/* :nodoc: */
2455static VALUE
2456rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2457{
2458 if (klass != rb_cString) {
2459 return rb_class_new_instance_pass_kw(argc, argv, klass);
2460 }
2461
2462 static ID keyword_ids[2];
2463 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2464 VALUE kwargs[2];
2465 rb_encoding *enc = NULL;
2466
2467 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2468 if (NIL_P(opt)) {
2469 return rb_class_new_instance_pass_kw(argc, argv, klass);
2470 }
2471
2472 keyword_ids[0] = rb_id_encoding();
2473 CONST_ID(keyword_ids[1], "capacity");
2474 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2475 encoding = kwargs[0];
2476 capacity = kwargs[1];
2477
2478 if (n == 1) {
2479 orig = StringValue(orig);
2480 }
2481 else {
2482 orig = Qnil;
2483 }
2484
2485 if (UNDEF_P(encoding)) {
2486 if (!NIL_P(orig)) {
2487 encoding = rb_obj_encoding(orig);
2488 }
2489 }
2490
2491 if (!UNDEF_P(encoding)) {
2492 enc = rb_to_encoding(encoding);
2493 }
2494
2495 // If capacity is nil, we're basically just duping `orig`.
2496 if (UNDEF_P(capacity)) {
2497 if (NIL_P(orig)) {
2498 VALUE empty_str = str_new(klass, "", 0);
2499 if (enc) {
2500 rb_enc_associate(empty_str, enc);
2501 }
2502 return empty_str;
2503 }
2504 VALUE copy = str_duplicate(klass, orig);
2505 rb_enc_associate(copy, enc);
2506 ENC_CODERANGE_CLEAR(copy);
2507 return copy;
2508 }
2509
2510 long capa = 0;
2511 capa = NUM2LONG(capacity);
2512 if (capa < 0) {
2513 capa = 0;
2514 }
2515
2516 if (!NIL_P(orig)) {
2517 long orig_capa = rb_str_capacity(orig);
2518 if (orig_capa > capa) {
2519 capa = orig_capa;
2520 }
2521 }
2522
2523 VALUE str = str_enc_new(klass, NULL, capa, enc);
2524 STR_SET_LEN(str, 0);
2525 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2526
2527 if (!NIL_P(orig)) {
2528 rb_str_buf_append(str, orig);
2529 }
2530
2531 return str;
2532}
2533
2534#ifdef NONASCII_MASK
2535#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2536
2537/*
2538 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2539 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2540 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2541 *
2542 * if (!(byte & 0x80))
2543 * byte |= 0x40; // turn on bit6
2544 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2545 *
2546 * This function calculates whether a byte is leading or not for all bytes
2547 * in the argument word by concurrently using the above logic, and then
2548 * adds up the number of leading bytes in the word.
2549 */
2550static inline uintptr_t
2551count_utf8_lead_bytes_with_word(const uintptr_t *s)
2552{
2553 uintptr_t d = *s;
2554
2555 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2556 d = (d>>6) | (~d>>7);
2557 d &= NONASCII_MASK >> 7;
2558
2559 /* Gather all bytes. */
2560#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2561 /* use only if it can use POPCNT */
2562 return rb_popcount_intptr(d);
2563#else
2564 d += (d>>8);
2565 d += (d>>16);
2566# if SIZEOF_VOIDP == 8
2567 d += (d>>32);
2568# endif
2569 return (d&0xF);
2570#endif
2571}
2572#endif
2573
2574static inline long
2575enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2576{
2577 long c;
2578 const char *q;
2579
2580 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2581 long diff = (long)(e - p);
2582 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2583 }
2584#ifdef NONASCII_MASK
2585 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2586 uintptr_t len = 0;
2587 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2588 const uintptr_t *s, *t;
2589 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2590 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2591 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2592 while (p < (const char *)s) {
2593 if (is_utf8_lead_byte(*p)) len++;
2594 p++;
2595 }
2596 while (s < t) {
2597 len += count_utf8_lead_bytes_with_word(s);
2598 s++;
2599 }
2600 p = (const char *)s;
2601 }
2602 while (p < e) {
2603 if (is_utf8_lead_byte(*p)) len++;
2604 p++;
2605 }
2606 return (long)len;
2607 }
2608#endif
2609 else if (rb_enc_asciicompat(enc)) {
2610 c = 0;
2611 if (ENC_CODERANGE_CLEAN_P(cr)) {
2612 while (p < e) {
2613 if (ISASCII(*p)) {
2614 q = search_nonascii(p, e);
2615 if (!q)
2616 return c + (e - p);
2617 c += q - p;
2618 p = q;
2619 }
2620 p += rb_enc_fast_mbclen(p, e, enc);
2621 c++;
2622 }
2623 }
2624 else {
2625 while (p < e) {
2626 if (ISASCII(*p)) {
2627 q = search_nonascii(p, e);
2628 if (!q)
2629 return c + (e - p);
2630 c += q - p;
2631 p = q;
2632 }
2633 p += rb_enc_mbclen(p, e, enc);
2634 c++;
2635 }
2636 }
2637 return c;
2638 }
2639
2640 for (c=0; p<e; c++) {
2641 p += rb_enc_mbclen(p, e, enc);
2642 }
2643 return c;
2644}
2645
2646long
2647rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2648{
2649 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2650}
2651
2652/* To get strlen with cr
2653 * Note that given cr is not used.
2654 */
2655long
2656rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2657{
2658 long c;
2659 const char *q;
2660 int ret;
2661
2662 *cr = 0;
2663 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2664 long diff = (long)(e - p);
2665 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2666 }
2667 else if (rb_enc_asciicompat(enc)) {
2668 c = 0;
2669 while (p < e) {
2670 if (ISASCII(*p)) {
2671 q = search_nonascii(p, e);
2672 if (!q) {
2673 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2674 return c + (e - p);
2675 }
2676 c += q - p;
2677 p = q;
2678 }
2679 ret = rb_enc_precise_mbclen(p, e, enc);
2680 if (MBCLEN_CHARFOUND_P(ret)) {
2681 *cr |= ENC_CODERANGE_VALID;
2682 p += MBCLEN_CHARFOUND_LEN(ret);
2683 }
2684 else {
2686 p++;
2687 }
2688 c++;
2689 }
2690 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2691 return c;
2692 }
2693
2694 for (c=0; p<e; c++) {
2695 ret = rb_enc_precise_mbclen(p, e, enc);
2696 if (MBCLEN_CHARFOUND_P(ret)) {
2697 *cr |= ENC_CODERANGE_VALID;
2698 p += MBCLEN_CHARFOUND_LEN(ret);
2699 }
2700 else {
2702 if (p + rb_enc_mbminlen(enc) <= e)
2703 p += rb_enc_mbminlen(enc);
2704 else
2705 p = e;
2706 }
2707 }
2708 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2709 return c;
2710}
2711
2712/* enc must be str's enc or rb_enc_check(str, str2) */
2713static long
2714str_strlen(VALUE str, rb_encoding *enc)
2715{
2716 const char *p, *e;
2717 int cr;
2718
2719 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2720 if (!enc) enc = STR_ENC_GET(str);
2721 p = RSTRING_PTR(str);
2722 e = RSTRING_END(str);
2723 cr = ENC_CODERANGE(str);
2724
2725 if (cr == ENC_CODERANGE_UNKNOWN) {
2726 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2727 if (cr) ENC_CODERANGE_SET(str, cr);
2728 return n;
2729 }
2730 else {
2731 return enc_strlen(p, e, enc, cr);
2732 }
2733}
2734
2735long
2737{
2738 return str_strlen(str, NULL);
2739}
2740
/*
 *  call-seq:
 *    length -> integer
 *
 *  :include: doc/string/length.rdoc
 *
 */
2748
2749VALUE
2751{
2752 return LONG2NUM(str_strlen(str, NULL));
2753}
2754
/*
 *  call-seq:
 *    bytesize -> integer
 *
 *  :include: doc/string/bytesize.rdoc
 *
 */
2762
2763VALUE
2764rb_str_bytesize(VALUE str)
2765{
2766 return LONG2NUM(RSTRING_LEN(str));
2767}
2768
/*
 *  call-seq:
 *    empty? -> true or false
 *
 *  Returns +true+ if the length of +self+ is zero, +false+ otherwise:
 *
 *    "hello".empty? # => false
 *    " ".empty?     # => false
 *    "".empty?      # => true
 *
 */
2780
2781static VALUE
2782rb_str_empty(VALUE str)
2783{
2784 return RBOOL(RSTRING_LEN(str) == 0);
2785}
2786
/*
 *  call-seq:
 *    self + other_string -> new_string
 *
 *  Returns a new string containing +other_string+ concatenated to +self+:
 *
 *    'Hello from ' + self.to_s # => "Hello from main"
 *
 *  Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
 */
2797
2798VALUE
2800{
2801 VALUE str3;
2802 rb_encoding *enc;
2803 char *ptr1, *ptr2, *ptr3;
2804 long len1, len2;
2805 int termlen;
2806
2807 StringValue(str2);
2808 enc = rb_enc_check_str(str1, str2);
2809 RSTRING_GETMEM(str1, ptr1, len1);
2810 RSTRING_GETMEM(str2, ptr2, len2);
2811 termlen = rb_enc_mbminlen(enc);
2812 if (len1 > LONG_MAX - len2) {
2813 rb_raise(rb_eArgError, "string size too big");
2814 }
2815 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2816 ptr3 = RSTRING_PTR(str3);
2817 memcpy(ptr3, ptr1, len1);
2818 memcpy(ptr3+len1, ptr2, len2);
2819 TERM_FILL(&ptr3[len1+len2], termlen);
2820
2821 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2823 RB_GC_GUARD(str1);
2824 RB_GC_GUARD(str2);
2825 return str3;
2826}
2827
2828/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2829VALUE
2830rb_str_opt_plus(VALUE str1, VALUE str2)
2831{
2834 long len1, len2;
2835 MAYBE_UNUSED(char) *ptr1, *ptr2;
2836 RSTRING_GETMEM(str1, ptr1, len1);
2837 RSTRING_GETMEM(str2, ptr2, len2);
2838 int enc1 = rb_enc_get_index(str1);
2839 int enc2 = rb_enc_get_index(str2);
2840
2841 if (enc1 < 0) {
2842 return Qundef;
2843 }
2844 else if (enc2 < 0) {
2845 return Qundef;
2846 }
2847 else if (enc1 != enc2) {
2848 return Qundef;
2849 }
2850 else if (len1 > LONG_MAX - len2) {
2851 return Qundef;
2852 }
2853 else {
2854 return rb_str_plus(str1, str2);
2855 }
2856
2857}
2858
2859/*
2860 * call-seq:
2861 * self * n -> new_string
2862 *
2863 * Returns a new string containing +n+ copies of +self+:
2864 *
2865 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2866 * 'No!' * 0 # => ""
2867 *
2868 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2869 */
2870
2871VALUE
2873{
2874 VALUE str2;
2875 long n, len;
2876 char *ptr2;
2877 int termlen;
2878
2879 if (times == INT2FIX(1)) {
2880 return str_duplicate(rb_cString, str);
2881 }
2882 if (times == INT2FIX(0)) {
2883 str2 = str_alloc_embed(rb_cString, 0);
2884 rb_enc_copy(str2, str);
2885 return str2;
2886 }
2887 len = NUM2LONG(times);
2888 if (len < 0) {
2889 rb_raise(rb_eArgError, "negative argument");
2890 }
2891 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2892 if (STR_EMBEDDABLE_P(len, 1)) {
2893 str2 = str_alloc_embed(rb_cString, len + 1);
2894 memset(RSTRING_PTR(str2), 0, len + 1);
2895 }
2896 else {
2897 str2 = str_alloc_heap(rb_cString);
2898 RSTRING(str2)->as.heap.aux.capa = len;
2899 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2900 }
2901 STR_SET_LEN(str2, len);
2902 rb_enc_copy(str2, str);
2903 return str2;
2904 }
2905 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2906 rb_raise(rb_eArgError, "argument too big");
2907 }
2908
2909 len *= RSTRING_LEN(str);
2910 termlen = TERM_LEN(str);
2911 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2912 ptr2 = RSTRING_PTR(str2);
2913 if (len) {
2914 n = RSTRING_LEN(str);
2915 memcpy(ptr2, RSTRING_PTR(str), n);
2916 while (n <= len/2) {
2917 memcpy(ptr2 + n, ptr2, n);
2918 n *= 2;
2919 }
2920 memcpy(ptr2 + n, ptr2, len-n);
2921 }
2922 STR_SET_LEN(str2, len);
2923 TERM_FILL(&ptr2[len], termlen);
2924 rb_enc_cr_str_copy_for_substr(str2, str);
2925
2926 return str2;
2927}
2928
2929/*
2930 * call-seq:
2931 * self % object -> new_string
2932 *
2933 * Returns the result of formatting +object+ into the format specifications
2934 * contained in +self+
2935 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2936 *
2937 * '%05d' % 123 # => "00123"
2938 *
2939 * If +self+ contains multiple format specifications,
2940 * +object+ must be an array or hash containing the objects to be formatted:
2941 *
2942 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2943 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2944 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2945 *
2946 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2947 */
2948
2949static VALUE
2950rb_str_format_m(VALUE str, VALUE arg)
2951{
2952 VALUE tmp = rb_check_array_type(arg);
2953
2954 if (!NIL_P(tmp)) {
2955 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2956 }
2957 return rb_str_format(1, &arg, str);
2958}
2959
2960static inline void
2961rb_check_lockedtmp(VALUE str)
2962{
2963 if (FL_TEST(str, STR_TMPLOCK)) {
2964 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2965 }
2966}
2967
// If none of these flags are set, we know we have a modifiable string.
// If any is set, we need to do more detailed checks.
2970#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2971static inline void
2972str_modifiable(VALUE str)
2973{
2974 RUBY_ASSERT(ruby_thread_has_gvl_p());
2975
2976 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2977 if (CHILLED_STRING_P(str)) {
2978 CHILLED_STRING_MUTATED(str);
2979 }
2980 rb_check_lockedtmp(str);
2981 rb_check_frozen(str);
2982 }
2983}
2984
2985static inline int
2986str_dependent_p(VALUE str)
2987{
2988 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2989 return FALSE;
2990 }
2991 else {
2992 return TRUE;
2993 }
2994}
2995
2996// If none of these flags are set, we know we have an independent string.
2997// If any is set, we need to do more detailed checks.
2998#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2999static inline int
3000str_independent(VALUE str)
3001{
3002 RUBY_ASSERT(ruby_thread_has_gvl_p());
3003
3004 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
3005 str_modifiable(str);
3006 return !str_dependent_p(str);
3007 }
3008 return TRUE;
3009}
3010
3011static void
3012str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
3013{
3014 RUBY_ASSERT(ruby_thread_has_gvl_p());
3015
3016 char *ptr;
3017 char *oldptr;
3018 long capa = len + expand;
3019
3020 if (len > capa) len = capa;
3021
3022 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
3023 ptr = RSTRING(str)->as.heap.ptr;
3024 STR_SET_EMBED(str);
3025 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
3026 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3027 STR_SET_LEN(str, len);
3028 return;
3029 }
3030
3031 ptr = ALLOC_N(char, (size_t)capa + termlen);
3032 oldptr = RSTRING_PTR(str);
3033 if (oldptr) {
3034 memcpy(ptr, oldptr, len);
3035 }
3036 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
3037 xfree(oldptr);
3038 }
3039 STR_SET_NOEMBED(str);
3040 FL_UNSET(str, STR_SHARED|STR_NOFREE);
3041 TERM_FILL(ptr + len, termlen);
3042 RSTRING(str)->as.heap.ptr = ptr;
3043 STR_SET_LEN(str, len);
3044 RSTRING(str)->as.heap.aux.capa = capa;
3045}
3046
3047void
3048rb_str_modify(VALUE str)
3049{
3050 if (!str_independent(str))
3051 str_make_independent(str);
3053}
3054
3055void
3057{
3058 RUBY_ASSERT(ruby_thread_has_gvl_p());
3059
3060 int termlen = TERM_LEN(str);
3061 long len = RSTRING_LEN(str);
3062
3063 if (expand < 0) {
3064 rb_raise(rb_eArgError, "negative expanding string size");
3065 }
3066 if (expand >= LONG_MAX - len) {
3067 rb_raise(rb_eArgError, "string size too big");
3068 }
3069
3070 if (!str_independent(str)) {
3071 str_make_independent_expand(str, len, expand, termlen);
3072 }
3073 else if (expand > 0) {
3074 RESIZE_CAPA_TERM(str, len + expand, termlen);
3075 }
3077}
3078
3079/* As rb_str_modify(), but don't clear coderange */
3080static void
3081str_modify_keep_cr(VALUE str)
3082{
3083 if (!str_independent(str))
3084 str_make_independent(str);
3086 /* Force re-scan later */
3088}
3089
3090static inline void
3091str_discard(VALUE str)
3092{
3093 str_modifiable(str);
3094 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
3095 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
3096 RSTRING(str)->as.heap.ptr = 0;
3097 STR_SET_LEN(str, 0);
3098 }
3099}
3100
3101void
3103{
3104 int encindex = rb_enc_get_index(str);
3105
3106 if (RB_UNLIKELY(encindex == -1)) {
3107 rb_raise(rb_eTypeError, "not encoding capable object");
3108 }
3109
3110 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
3111 return;
3112 }
3113
3114 rb_encoding *enc = rb_enc_from_index(encindex);
3115 if (!rb_enc_asciicompat(enc)) {
3116 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
3117 }
3118}
3119
3120VALUE
3122{
3123 RUBY_ASSERT(ruby_thread_has_gvl_p());
3124
3125 VALUE s = *ptr;
3126 if (!RB_TYPE_P(s, T_STRING)) {
3127 s = rb_str_to_str(s);
3128 *ptr = s;
3129 }
3130 return s;
3131}
3132
3133char *
3135{
3136 VALUE str = rb_string_value(ptr);
3137 return RSTRING_PTR(str);
3138}
3139
/* Returns 1 when the first `n` bytes at `s` are all zero (trivially so for
 * n <= 0), 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s++) return 0;
        --n;
    }
    return 1;
}
3148
3149static const char *
3150str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
3151{
3152 const char *e = s + len;
3153
3154 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
3155 if (zero_filled(s, minlen)) return s;
3156 }
3157 return 0;
3158}
3159
3160static char *
3161str_fill_term(VALUE str, char *s, long len, int termlen)
3162{
3163 /* This function assumes that (capa + termlen) bytes of memory
3164 * is allocated, like many other functions in this file.
3165 */
3166 if (str_dependent_p(str)) {
3167 if (!zero_filled(s + len, termlen))
3168 str_make_independent_expand(str, len, 0L, termlen);
3169 }
3170 else {
3171 TERM_FILL(s + len, termlen);
3172 return s;
3173 }
3174 return RSTRING_PTR(str);
3175}
3176
3177void
3178rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
3179{
3180 long capa = str_capacity(str, oldtermlen) + oldtermlen;
3181 long len = RSTRING_LEN(str);
3182
3183 RUBY_ASSERT(capa >= len);
3184 if (capa - len < termlen) {
3185 rb_check_lockedtmp(str);
3186 str_make_independent_expand(str, len, 0L, termlen);
3187 }
3188 else if (str_dependent_p(str)) {
3189 if (termlen > oldtermlen)
3190 str_make_independent_expand(str, len, 0L, termlen);
3191 }
3192 else {
3193 if (!STR_EMBED_P(str)) {
3194 /* modify capa instead of realloc */
3195 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
3196 RSTRING(str)->as.heap.aux.capa = capa - termlen;
3197 }
3198 if (termlen > oldtermlen) {
3199 TERM_FILL(RSTRING_PTR(str) + len, termlen);
3200 }
3201 }
3202
3203 return;
3204}
3205
3206static char *
3207str_null_check(VALUE str, int *w)
3208{
3209 char *s = RSTRING_PTR(str);
3210 long len = RSTRING_LEN(str);
3211 rb_encoding *enc = rb_enc_get(str);
3212 const int minlen = rb_enc_mbminlen(enc);
3213
3214 if (minlen > 1) {
3215 *w = 1;
3216 if (str_null_char(s, len, minlen, enc)) {
3217 return NULL;
3218 }
3219 return str_fill_term(str, s, len, minlen);
3220 }
3221 *w = 0;
3222 if (!s || memchr(s, 0, len)) {
3223 return NULL;
3224 }
3225 if (s[len]) {
3226 s = str_fill_term(str, s, len, minlen);
3227 }
3228 return s;
3229}
3230
3231char *
3232rb_str_to_cstr(VALUE str)
3233{
3234 int w;
3235 return str_null_check(str, &w);
3236}
3237
3238char *
3240{
3241 VALUE str = rb_string_value(ptr);
3242 int w;
3243 char *s = str_null_check(str, &w);
3244 if (!s) {
3245 if (w) {
3246 rb_raise(rb_eArgError, "string contains null char");
3247 }
3248 rb_raise(rb_eArgError, "string contains null byte");
3249 }
3250 return s;
3251}
3252
3253char *
3254rb_str_fill_terminator(VALUE str, const int newminlen)
3255{
3256 char *s = RSTRING_PTR(str);
3257 long len = RSTRING_LEN(str);
3258 return str_fill_term(str, s, len, newminlen);
3259}
3260
3261VALUE
3263{
3264 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
3265 return str;
3266}
3267
3268/*
3269 * call-seq:
3270 * String.try_convert(object) -> object, new_string, or nil
3271 *
3272 * Attempts to convert the given +object+ to a string.
3273 *
3274 * If +object+ is already a string, returns +object+, unmodified.
3275 *
3276 * Otherwise if +object+ responds to <tt>:to_str</tt>,
3277 * calls <tt>object.to_str</tt> and returns the result.
3278 *
3279 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
3280 *
3281 * Raises an exception unless <tt>object.to_str</tt> returns a string.
3282 */
3283static VALUE
3284rb_str_s_try_convert(VALUE dummy, VALUE str)
3285{
3286 return rb_check_string_type(str);
3287}
3288
3289static char*
3290str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3291{
3292 long nth = *nthp;
3293 if (rb_enc_mbmaxlen(enc) == 1) {
3294 p += nth;
3295 }
3296 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3297 p += nth * rb_enc_mbmaxlen(enc);
3298 }
3299 else if (rb_enc_asciicompat(enc)) {
3300 const char *p2, *e2;
3301 int n;
3302
3303 while (p < e && 0 < nth) {
3304 e2 = p + nth;
3305 if (e < e2) {
3306 *nthp = nth;
3307 return (char *)e;
3308 }
3309 if (ISASCII(*p)) {
3310 p2 = search_nonascii(p, e2);
3311 if (!p2) {
3312 nth -= e2 - p;
3313 *nthp = nth;
3314 return (char *)e2;
3315 }
3316 nth -= p2 - p;
3317 p = p2;
3318 }
3319 n = rb_enc_mbclen(p, e, enc);
3320 p += n;
3321 nth--;
3322 }
3323 *nthp = nth;
3324 if (nth != 0) {
3325 return (char *)e;
3326 }
3327 return (char *)p;
3328 }
3329 else {
3330 while (p < e && nth--) {
3331 p += rb_enc_mbclen(p, e, enc);
3332 }
3333 }
3334 if (p > e) p = e;
3335 *nthp = nth;
3336 return (char*)p;
3337}
3338
3339char*
3340rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3341{
3342 return str_nth_len(p, e, &nth, enc);
3343}
3344
3345static char*
3346str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3347{
3348 if (singlebyte)
3349 p += nth;
3350 else {
3351 p = str_nth_len(p, e, &nth, enc);
3352 }
3353 if (!p) return 0;
3354 if (p > e) p = e;
3355 return (char *)p;
3356}
3357
3358/* char offset to byte offset */
3359static long
3360str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3361{
3362 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3363 if (!pp) return e - p;
3364 return pp - p;
3365}
3366
3367long
3368rb_str_offset(VALUE str, long pos)
3369{
3370 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3371 STR_ENC_GET(str), single_byte_optimizable(str));
3372}
3373
3374#ifdef NONASCII_MASK
3375static char *
3376str_utf8_nth(const char *p, const char *e, long *nthp)
3377{
3378 long nth = *nthp;
3379 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3380 const uintptr_t *s, *t;
3381 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3382 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3383 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3384 while (p < (const char *)s) {
3385 if (is_utf8_lead_byte(*p)) nth--;
3386 p++;
3387 }
3388 do {
3389 nth -= count_utf8_lead_bytes_with_word(s);
3390 s++;
3391 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3392 p = (char *)s;
3393 }
3394 while (p < e) {
3395 if (is_utf8_lead_byte(*p)) {
3396 if (nth == 0) break;
3397 nth--;
3398 }
3399 p++;
3400 }
3401 *nthp = nth;
3402 return (char *)p;
3403}
3404
3405static long
3406str_utf8_offset(const char *p, const char *e, long nth)
3407{
3408 const char *pp = str_utf8_nth(p, e, &nth);
3409 return pp - p;
3410}
3411#endif
3412
3413/* byte offset to char offset */
3414long
3415rb_str_sublen(VALUE str, long pos)
3416{
3417 if (single_byte_optimizable(str) || pos < 0)
3418 return pos;
3419 else {
3420 char *p = RSTRING_PTR(str);
3421 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3422 }
3423}
3424
3425static VALUE
3426str_subseq(VALUE str, long beg, long len)
3427{
3428 VALUE str2;
3429
3430 RUBY_ASSERT(beg >= 0);
3431 RUBY_ASSERT(len >= 0);
3432 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3433
3434 const int termlen = TERM_LEN(str);
3435 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3436 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3437 RB_GC_GUARD(str);
3438 return str2;
3439 }
3440
3441 str2 = str_alloc_heap(rb_cString);
3442 if (str_embed_capa(str2) >= len + termlen) {
3443 char *ptr2 = RSTRING(str2)->as.embed.ary;
3444 STR_SET_EMBED(str2);
3445 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3446 TERM_FILL(ptr2+len, termlen);
3447
3448 STR_SET_LEN(str2, len);
3449 RB_GC_GUARD(str);
3450 }
3451 else {
3452 str_replace_shared(str2, str);
3453 RUBY_ASSERT(!STR_EMBED_P(str2));
3454 ENC_CODERANGE_CLEAR(str2);
3455 RSTRING(str2)->as.heap.ptr += beg;
3456 if (RSTRING_LEN(str2) > len) {
3457 STR_SET_LEN(str2, len);
3458 }
3459 }
3460
3461 return str2;
3462}
3463
3464VALUE
3465rb_str_subseq(VALUE str, long beg, long len)
3466{
3467 VALUE str2 = str_subseq(str, beg, len);
3468 rb_enc_cr_str_copy_for_substr(str2, str);
3469 return str2;
3470}
3471
3472char *
3473rb_str_subpos(VALUE str, long beg, long *lenp)
3474{
3475 long len = *lenp;
3476 long slen = -1L;
3477 const long blen = RSTRING_LEN(str);
3478 rb_encoding *enc = STR_ENC_GET(str);
3479 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3480
3481 if (len < 0) return 0;
3482 if (beg < 0 && -beg < 0) return 0;
3483 if (!blen) {
3484 len = 0;
3485 }
3486 if (single_byte_optimizable(str)) {
3487 if (beg > blen) return 0;
3488 if (beg < 0) {
3489 beg += blen;
3490 if (beg < 0) return 0;
3491 }
3492 if (len > blen - beg)
3493 len = blen - beg;
3494 if (len < 0) return 0;
3495 p = s + beg;
3496 goto end;
3497 }
3498 if (beg < 0) {
3499 if (len > -beg) len = -beg;
3500 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3501 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3502 beg = -beg;
3503 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3504 p = e;
3505 if (!p) return 0;
3506 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3507 if (!p) return 0;
3508 len = e - p;
3509 goto end;
3510 }
3511 else {
3512 slen = str_strlen(str, enc);
3513 beg += slen;
3514 if (beg < 0) return 0;
3515 p = s + beg;
3516 if (len == 0) goto end;
3517 }
3518 }
3519 else if (beg > 0 && beg > blen) {
3520 return 0;
3521 }
3522 if (len == 0) {
3523 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3524 p = s + beg;
3525 }
3526#ifdef NONASCII_MASK
3527 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3528 enc == rb_utf8_encoding()) {
3529 p = str_utf8_nth(s, e, &beg);
3530 if (beg > 0) return 0;
3531 len = str_utf8_offset(p, e, len);
3532 }
3533#endif
3534 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3535 int char_sz = rb_enc_mbmaxlen(enc);
3536
3537 p = s + beg * char_sz;
3538 if (p > e) {
3539 return 0;
3540 }
3541 else if (len * char_sz > e - p)
3542 len = e - p;
3543 else
3544 len *= char_sz;
3545 }
3546 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3547 if (beg > 0) return 0;
3548 len = 0;
3549 }
3550 else {
3551 len = str_offset(p, e, len, enc, 0);
3552 }
3553 end:
3554 *lenp = len;
3555 RB_GC_GUARD(str);
3556 return p;
3557}
3558
3559static VALUE str_substr(VALUE str, long beg, long len, int empty);
3560
3561VALUE
3562rb_str_substr(VALUE str, long beg, long len)
3563{
3564 return str_substr(str, beg, len, TRUE);
3565}
3566
3567VALUE
3568rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3569{
3570 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3571}
3572
3573static VALUE
3574str_substr(VALUE str, long beg, long len, int empty)
3575{
3576 char *p = rb_str_subpos(str, beg, &len);
3577
3578 if (!p) return Qnil;
3579 if (!len && !empty) return Qnil;
3580
3581 beg = p - RSTRING_PTR(str);
3582
3583 VALUE str2 = str_subseq(str, beg, len);
3584 rb_enc_cr_str_copy_for_substr(str2, str);
3585 return str2;
3586}
3587
3588/* :nodoc: */
3589VALUE
3591{
3592 if (CHILLED_STRING_P(str)) {
3593 FL_UNSET_RAW(str, STR_CHILLED);
3594 }
3595
3596 if (OBJ_FROZEN(str)) return str;
3597 rb_str_resize(str, RSTRING_LEN(str));
3598 return rb_obj_freeze(str);
3599}
3600
3601/*
3602 * call-seq:
3603 * +string -> new_string or self
3604 *
3605 * Returns +self+ if +self+ is not frozen and can be mutated
3606 * without warning issuance.
3607 *
3608 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3609 *
3610 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3611 */
3612static VALUE
3613str_uplus(VALUE str)
3614{
3615 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3616 return rb_str_dup(str);
3617 }
3618 else {
3619 return str;
3620 }
3621}
3622
3623/*
3624 * call-seq:
3625 * -self -> frozen_string
3626 *
3627 * Returns a frozen string equal to +self+.
3628 *
3629 * The returned string is +self+ if and only if all of the following are true:
3630 *
3631 * - +self+ is already frozen.
3632 * - +self+ is an instance of \String (rather than of a subclass of \String)
3633 * - +self+ has no instance variables set on it.
3634 *
3635 * Otherwise, the returned string is a frozen copy of +self+.
3636 *
3637 * Returning +self+, when possible, saves duplicating +self+;
3638 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3639 *
3640 * It may also save duplicating other, already-existing, strings:
3641 *
3642 * s0 = 'foo'
3643 * s1 = 'foo'
3644 * s0.object_id == s1.object_id # => false
3645 * (-s0).object_id == (-s1).object_id # => true
3646 *
3647 * Note that method #-@ is convenient for defining a constant:
3648 *
3649 * FileName = -'config/database.yml'
3650 *
3651 * While its alias #dedup is better suited for chaining:
3652 *
3653 * 'foo'.dedup.gsub!('o')
3654 *
3655 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3656 */
3657static VALUE
3658str_uminus(VALUE str)
3659{
3660 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3661 str = rb_str_dup(str);
3662 }
3663 return rb_fstring(str);
3664}
3665
/* rb_str_dup_frozen is an alias of rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION presumably emits a real function
 * symbol forwarding to rb_str_new_frozen -- confirm against its definition.
 * The #define redirects uses of the name after this point. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3668
3669VALUE
3671{
3672 if (FL_TEST(str, STR_TMPLOCK)) {
3673 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3674 }
3675 FL_SET(str, STR_TMPLOCK);
3676 return str;
3677}
3678
3679VALUE
3681{
3682 if (!FL_TEST(str, STR_TMPLOCK)) {
3683 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3684 }
3685 FL_UNSET(str, STR_TMPLOCK);
3686 return str;
3687}
3688
3689VALUE
3690rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3691{
3692 rb_str_locktmp(str);
3693 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3694}
3695
3696void
3698{
3699 RUBY_ASSERT(ruby_thread_has_gvl_p());
3700
3701 long capa;
3702 const int termlen = TERM_LEN(str);
3703
3704 str_modifiable(str);
3705 if (STR_SHARED_P(str)) {
3706 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3707 }
3708 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3709 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3710 }
3711
3712 int cr = ENC_CODERANGE(str);
3713 if (len == 0) {
3714 /* Empty string does not contain non-ASCII */
3716 }
3717 else if (cr == ENC_CODERANGE_UNKNOWN) {
3718 /* Leave unknown. */
3719 }
3720 else if (len > RSTRING_LEN(str)) {
3721 if (ENC_CODERANGE_CLEAN_P(cr)) {
3722 /* Update the coderange regarding the extended part. */
3723 const char *const prev_end = RSTRING_END(str);
3724 const char *const new_end = RSTRING_PTR(str) + len;
3725 rb_encoding *enc = rb_enc_get(str);
3726 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3727 ENC_CODERANGE_SET(str, cr);
3728 }
3729 else if (cr == ENC_CODERANGE_BROKEN) {
3730 /* May be valid now, by appended part. */
3732 }
3733 }
3734 else if (len < RSTRING_LEN(str)) {
3735 if (cr != ENC_CODERANGE_7BIT) {
3736 /* ASCII-only string is keeping after truncated. Valid
3737 * and broken may be invalid or valid, leave unknown. */
3739 }
3740 }
3741
3742 STR_SET_LEN(str, len);
3743 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3744}
3745
3746VALUE
3747rb_str_resize(VALUE str, long len)
3748{
3749 if (len < 0) {
3750 rb_raise(rb_eArgError, "negative string size (or size too big)");
3751 }
3752
3753 int independent = str_independent(str);
3754 long slen = RSTRING_LEN(str);
3755 const int termlen = TERM_LEN(str);
3756
3757 if (slen > len || (termlen != 1 && slen < len)) {
3759 }
3760
3761 {
3762 long capa;
3763 if (STR_EMBED_P(str)) {
3764 if (len == slen) return str;
3765 if (str_embed_capa(str) >= len + termlen) {
3766 STR_SET_LEN(str, len);
3767 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3768 return str;
3769 }
3770 str_make_independent_expand(str, slen, len - slen, termlen);
3771 }
3772 else if (str_embed_capa(str) >= len + termlen) {
3773 char *ptr = STR_HEAP_PTR(str);
3774 STR_SET_EMBED(str);
3775 if (slen > len) slen = len;
3776 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3777 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3778 STR_SET_LEN(str, len);
3779 if (independent) ruby_xfree(ptr);
3780 return str;
3781 }
3782 else if (!independent) {
3783 if (len == slen) return str;
3784 str_make_independent_expand(str, slen, len - slen, termlen);
3785 }
3786 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3787 (capa - len) > (len < 1024 ? len : 1024)) {
3788 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3789 (size_t)len + termlen, STR_HEAP_SIZE(str));
3790 RSTRING(str)->as.heap.aux.capa = len;
3791 }
3792 else if (len == slen) return str;
3793 STR_SET_LEN(str, len);
3794 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3795 }
3796 return str;
3797}
3798
3799static void
3800str_ensure_available_capa(VALUE str, long len)
3801{
3802 str_modify_keep_cr(str);
3803
3804 const int termlen = TERM_LEN(str);
3805 long olen = RSTRING_LEN(str);
3806
3807 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3808 rb_raise(rb_eArgError, "string sizes too big");
3809 }
3810
3811 long total = olen + len;
3812 long capa = str_capacity(str, termlen);
3813
3814 if (capa < total) {
3815 if (total >= LONG_MAX / 2) {
3816 capa = total;
3817 }
3818 while (total > capa) {
3819 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3820 }
3821 RESIZE_CAPA_TERM(str, capa, termlen);
3822 }
3823}
3824
3825static VALUE
3826str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3827{
3828 if (keep_cr) {
3829 str_modify_keep_cr(str);
3830 }
3831 else {
3832 rb_str_modify(str);
3833 }
3834 if (len == 0) return 0;
3835
3836 long total, olen, off = -1;
3837 char *sptr;
3838 const int termlen = TERM_LEN(str);
3839
3840 RSTRING_GETMEM(str, sptr, olen);
3841 if (ptr >= sptr && ptr <= sptr + olen) {
3842 off = ptr - sptr;
3843 }
3844
3845 long capa = str_capacity(str, termlen);
3846
3847 if (olen > LONG_MAX - len) {
3848 rb_raise(rb_eArgError, "string sizes too big");
3849 }
3850 total = olen + len;
3851 if (capa < total) {
3852 if (total >= LONG_MAX / 2) {
3853 capa = total;
3854 }
3855 while (total > capa) {
3856 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3857 }
3858 RESIZE_CAPA_TERM(str, capa, termlen);
3859 sptr = RSTRING_PTR(str);
3860 }
3861 if (off != -1) {
3862 ptr = sptr + off;
3863 }
3864 memcpy(sptr + olen, ptr, len);
3865 STR_SET_LEN(str, total);
3866 TERM_FILL(sptr + total, termlen); /* sentinel */
3867
3868 return str;
3869}
3870
/* Shorthands for str_buf_cat4() with keep_cr == false.
 * str_buf_cat2 takes its length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3873
3874VALUE
3875rb_str_cat(VALUE str, const char *ptr, long len)
3876{
3877 if (len == 0) return str;
3878 if (len < 0) {
3879 rb_raise(rb_eArgError, "negative string size (or size too big)");
3880 }
3881 return str_buf_cat(str, ptr, len);
3882}
3883
3884VALUE
3885rb_str_cat_cstr(VALUE str, const char *ptr)
3886{
3887 must_not_null(ptr);
3888 return rb_str_buf_cat(str, ptr, strlen(ptr));
3889}
3890
3891static void
3892rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3893{
3894 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3895
3896 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3897 if (UNLIKELY(!str_independent(str))) {
3898 str_make_independent(str);
3899 }
3900
3901 long string_length = -1;
3902 const int null_terminator_length = 1;
3903 char *sptr;
3904 RSTRING_GETMEM(str, sptr, string_length);
3905
3906 // Ensure the resulting string wouldn't be too long.
3907 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3908 rb_raise(rb_eArgError, "string sizes too big");
3909 }
3910
3911 long string_capacity = str_capacity(str, null_terminator_length);
3912
3913 // Get the code range before any modifications since those might clear the code range.
3914 int cr = ENC_CODERANGE(str);
3915
3916 // Check if the string has spare string_capacity to write the new byte.
3917 if (LIKELY(string_capacity >= string_length + 1)) {
3918 // In fast path we can write the new byte and note the string's new length.
3919 sptr[string_length] = byte;
3920 STR_SET_LEN(str, string_length + 1);
3921 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3922 }
3923 else {
3924 // If there's not enough string_capacity, make a call into the general string concatenation function.
3925 str_buf_cat(str, (char *)&byte, 1);
3926 }
3927
3928 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3929 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3930 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3931 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3932 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3933 if (ISASCII(byte)) {
3935 }
3936 else {
3938
3939 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3940 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3941 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3942 }
3943 }
3944 }
3945}
3946
/* Alternative names for the append functions: rb_str_buf_cat is an alias
 * of rb_str_cat, and rb_str_buf_cat2 / rb_str_cat2 are aliases of
 * rb_str_cat_cstr.
 * NOTE(review): presumably kept for API compatibility -- confirm. */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3950
3951static VALUE
3952rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3953 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3954{
3955 int str_encindex = ENCODING_GET(str);
3956 int res_encindex;
3957 int str_cr, res_cr;
3958 rb_encoding *str_enc, *ptr_enc;
3959
3960 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3961
3962 if (str_encindex == ptr_encindex) {
3963 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3964 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3965 }
3966 }
3967 else {
3968 str_enc = rb_enc_from_index(str_encindex);
3969 ptr_enc = rb_enc_from_index(ptr_encindex);
3970 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3971 if (len == 0)
3972 return str;
3973 if (RSTRING_LEN(str) == 0) {
3974 rb_str_buf_cat(str, ptr, len);
3975 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3976 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3977 return str;
3978 }
3979 goto incompatible;
3980 }
3981 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3982 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3983 }
3984 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3985 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3986 str_cr = rb_enc_str_coderange(str);
3987 }
3988 }
3989 }
3990 if (ptr_cr_ret)
3991 *ptr_cr_ret = ptr_cr;
3992
3993 if (str_encindex != ptr_encindex &&
3994 str_cr != ENC_CODERANGE_7BIT &&
3995 ptr_cr != ENC_CODERANGE_7BIT) {
3996 str_enc = rb_enc_from_index(str_encindex);
3997 ptr_enc = rb_enc_from_index(ptr_encindex);
3998 goto incompatible;
3999 }
4000
4001 if (str_cr == ENC_CODERANGE_UNKNOWN) {
4002 res_encindex = str_encindex;
4003 res_cr = ENC_CODERANGE_UNKNOWN;
4004 }
4005 else if (str_cr == ENC_CODERANGE_7BIT) {
4006 if (ptr_cr == ENC_CODERANGE_7BIT) {
4007 res_encindex = str_encindex;
4008 res_cr = ENC_CODERANGE_7BIT;
4009 }
4010 else {
4011 res_encindex = ptr_encindex;
4012 res_cr = ptr_cr;
4013 }
4014 }
4015 else if (str_cr == ENC_CODERANGE_VALID) {
4016 res_encindex = str_encindex;
4017 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
4018 res_cr = str_cr;
4019 else
4020 res_cr = ptr_cr;
4021 }
4022 else { /* str_cr == ENC_CODERANGE_BROKEN */
4023 res_encindex = str_encindex;
4024 res_cr = str_cr;
4025 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
4026 }
4027
4028 if (len < 0) {
4029 rb_raise(rb_eArgError, "negative string size (or size too big)");
4030 }
4031 str_buf_cat(str, ptr, len);
4032 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
4033 return str;
4034
4035 incompatible:
4036 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
4037 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
4039}
4040
4041VALUE
4042rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
4043{
4044 return rb_enc_cr_str_buf_cat(str, ptr, len,
4045 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
4046}
4047
4048VALUE
4050{
4051 /* ptr must reference NUL terminated ASCII string. */
4052 int encindex = ENCODING_GET(str);
4053 rb_encoding *enc = rb_enc_from_index(encindex);
4054 if (rb_enc_asciicompat(enc)) {
4055 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
4056 encindex, ENC_CODERANGE_7BIT, 0);
4057 }
4058 else {
4059 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
4060 while (*ptr) {
4061 unsigned int c = (unsigned char)*ptr;
4062 int len = rb_enc_codelen(c, enc);
4063 rb_enc_mbcput(c, buf, enc);
4064 rb_enc_cr_str_buf_cat(str, buf, len,
4065 encindex, ENC_CODERANGE_VALID, 0);
4066 ptr++;
4067 }
4068 return str;
4069 }
4070}
4071
4072VALUE
4074{
4075 int str2_cr = rb_enc_str_coderange(str2);
4076
4077 if (str_enc_fastpath(str)) {
4078 switch (str2_cr) {
4079 case ENC_CODERANGE_7BIT:
4080 // If RHS is 7bit we can do simple concatenation
4081 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
4082 RB_GC_GUARD(str2);
4083 return str;
4085 // If RHS is valid, we can do simple concatenation if encodings are the same
4086 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
4087 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
4088 int str_cr = ENC_CODERANGE(str);
4089 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
4090 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
4091 }
4092 RB_GC_GUARD(str2);
4093 return str;
4094 }
4095 }
4096 }
4097
4098 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
4099 ENCODING_GET(str2), str2_cr, &str2_cr);
4100
4101 ENC_CODERANGE_SET(str2, str2_cr);
4102
4103 return str;
4104}
4105
4106VALUE
4108{
4109 StringValue(str2);
4110 return rb_str_buf_append(str, str2);
4111}
4112
4113VALUE
4114rb_str_concat_literals(size_t num, const VALUE *strary)
4115{
4116 VALUE str;
4117 size_t i, s = 0;
4118 unsigned long len = 1;
4119
4120 if (UNLIKELY(!num)) return rb_str_new(0, 0);
4121 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
4122
4123 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
4124 str = rb_str_buf_new(len);
4125 str_enc_copy_direct(str, strary[0]);
4126
4127 for (i = s; i < num; ++i) {
4128 const VALUE v = strary[i];
4129 int encidx = ENCODING_GET(v);
4130
4131 rb_str_buf_append(str, v);
4132 if (encidx != ENCINDEX_US_ASCII) {
4133 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
4134 rb_enc_set_index(str, encidx);
4135 }
4136 }
4137 return str;
4138}
4139
4140/*
4141 * call-seq:
4142 * concat(*objects) -> string
4143 *
4144 * Concatenates each object in +objects+ to +self+ and returns +self+:
4145 *
4146 * s = 'foo'
4147 * s.concat('bar', 'baz') # => "foobarbaz"
4148 * s # => "foobarbaz"
4149 *
4150 * For each given object +object+ that is an Integer,
4151 * the value is considered a codepoint and converted to a character before concatenation:
4152 *
4153 * s = 'foo'
4154 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
4155 *
4156 * Related: String#<<, which takes a single argument.
4157 */
4158static VALUE
4159rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
4160{
4161 str_modifiable(str);
4162
4163 if (argc == 1) {
4164 return rb_str_concat(str, argv[0]);
4165 }
4166 else if (argc > 1) {
4167 int i;
4168 VALUE arg_str = rb_str_tmp_new(0);
4169 rb_enc_copy(arg_str, str);
4170 for (i = 0; i < argc; i++) {
4171 rb_str_concat(arg_str, argv[i]);
4172 }
4173 rb_str_buf_append(str, arg_str);
4174 }
4175
4176 return str;
4177}
4178
4179/*
4180 * call-seq:
4181 * append_as_bytes(*objects) -> self
4182 *
4183 * Concatenates each object in +objects+ into +self+; returns +self+;
4184 * performs no encoding validation or conversion:
4185 *
4186 * s = 'foo'
4187 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
4188 * s.valid_encoding? # => false
4189 * s.append_as_bytes("\xAC 12")
4190 * s.valid_encoding? # => true
4191 *
4192 * When a given object is an integer,
4193 * the value is considered an 8-bit byte;
 * if the integer occupies more than one byte (i.e., is greater than 255),
4195 * appends only the low-order byte (similar to String#setbyte):
4196 *
4197 * s = ""
4198 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
4199 * s.bytesize # => 2
4200 *
4201 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4202 */
4203
4204VALUE
4205rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
4206{
4207 long needed_capacity = 0;
4208 volatile VALUE t0;
4209 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
4210
4211 for (int index = 0; index < argc; index++) {
4212 VALUE obj = argv[index];
4213 enum ruby_value_type type = types[index] = rb_type(obj);
4214 switch (type) {
4215 case T_FIXNUM:
4216 case T_BIGNUM:
4217 needed_capacity++;
4218 break;
4219 case T_STRING:
4220 needed_capacity += RSTRING_LEN(obj);
4221 break;
4222 default:
4223 rb_raise(
4225 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
4226 rb_obj_class(obj)
4227 );
4228 break;
4229 }
4230 }
4231
4232 str_ensure_available_capa(str, needed_capacity);
4233 char *sptr = RSTRING_END(str);
4234
4235 for (int index = 0; index < argc; index++) {
4236 VALUE obj = argv[index];
4237 enum ruby_value_type type = types[index];
4238 switch (type) {
4239 case T_FIXNUM:
4240 case T_BIGNUM: {
4241 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
4242 char byte = (char)(NUM2INT(obj) & 0xFF);
4243 *sptr = byte;
4244 sptr++;
4245 break;
4246 }
4247 case T_STRING: {
4248 const char *ptr;
4249 long len;
4250 RSTRING_GETMEM(obj, ptr, len);
4251 memcpy(sptr, ptr, len);
4252 sptr += len;
4253 break;
4254 }
4255 default:
4256 rb_bug("append_as_bytes arguments should have been validated");
4257 }
4258 }
4259
4260 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
4261 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
4262
4263 int cr = ENC_CODERANGE(str);
4264 switch (cr) {
4265 case ENC_CODERANGE_7BIT: {
4266 for (int index = 0; index < argc; index++) {
4267 VALUE obj = argv[index];
4268 enum ruby_value_type type = types[index];
4269 switch (type) {
4270 case T_FIXNUM:
4271 case T_BIGNUM: {
4272 if (!ISASCII(NUM2INT(obj))) {
4273 goto clear_cr;
4274 }
4275 break;
4276 }
4277 case T_STRING: {
4278 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
4279 goto clear_cr;
4280 }
4281 break;
4282 }
4283 default:
4284 rb_bug("append_as_bytes arguments should have been validated");
4285 }
4286 }
4287 break;
4288 }
4290 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4291 goto keep_cr;
4292 }
4293 else {
4294 goto clear_cr;
4295 }
4296 break;
4297 default:
4298 goto clear_cr;
4299 break;
4300 }
4301
4302 RB_GC_GUARD(t0);
4303
4304 clear_cr:
4305 // If no fast path was hit, we clear the coderange.
4306 // append_as_bytes is predominently meant to be used in
4307 // buffering situation, hence it's likely the coderange
4308 // will never be scanned, so it's not worth spending time
4309 // precomputing the coderange except for simple and common
4310 // situations.
4312 keep_cr:
4313 return str;
4314}
4315
4316/*
4317 * call-seq:
4318 * self << object -> self
4319 *
4320 * Appends a string representation of +object+ to +self+;
4321 * returns +self+.
4322 *
4323 * If +object+ is a string, appends it to +self+:
4324 *
4325 * s = 'foo'
4326 * s << 'bar' # => "foobar"
4327 * s # => "foobar"
4328 *
4329 * If +object+ is an integer,
4330 * its value is considered a codepoint;
4331 * converts the value to a character before concatenating:
4332 *
4333 * s = 'foo'
4334 * s << 33 # => "foo!"
4335 *
4336 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4337 * and the encoding of +self+ is Encoding::US_ASCII,
4338 * changes the encoding to Encoding::ASCII_8BIT:
4339 *
4340 * s = 'foo'.encode(Encoding::US_ASCII)
4341 * s.encoding # => #<Encoding:US-ASCII>
4342 * s << 0xff # => "foo\xFF"
4343 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4344 *
4345 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4346 *
4347 * s = 'foo'
4348 * s.encoding # => <Encoding:UTF-8>
4349 * s << 0x00110000 # 1114112 out of char range (RangeError)
4350 * s = 'foo'.encode(Encoding::EUC_JP)
4351 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4352 *
4353 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4354 */
4355VALUE
4357{
4358 unsigned int code;
4359 rb_encoding *enc = STR_ENC_GET(str1);
4360 int encidx;
4361
4362 if (RB_INTEGER_TYPE_P(str2)) {
4363 if (rb_num_to_uint(str2, &code) == 0) {
4364 }
4365 else if (FIXNUM_P(str2)) {
4366 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4367 }
4368 else {
4369 rb_raise(rb_eRangeError, "bignum out of char range");
4370 }
4371 }
4372 else {
4373 return rb_str_append(str1, str2);
4374 }
4375
4376 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4377
4378 if (encidx >= 0) {
4379 rb_str_buf_cat_byte(str1, (unsigned char)code);
4380 }
4381 else {
4382 long pos = RSTRING_LEN(str1);
4383 int cr = ENC_CODERANGE(str1);
4384 int len;
4385 char *buf;
4386
4387 switch (len = rb_enc_codelen(code, enc)) {
4388 case ONIGERR_INVALID_CODE_POINT_VALUE:
4389 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4390 break;
4391 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4392 case 0:
4393 rb_raise(rb_eRangeError, "%u out of char range", code);
4394 break;
4395 }
4396 buf = ALLOCA_N(char, len + 1);
4397 rb_enc_mbcput(code, buf, enc);
4398 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4399 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4400 }
4401 rb_str_resize(str1, pos+len);
4402 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4403 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4405 }
4406 else if (cr == ENC_CODERANGE_BROKEN) {
4408 }
4409 ENC_CODERANGE_SET(str1, cr);
4410 }
4411 return str1;
4412}
4413
4414int
4415rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4416{
4417 int encidx = rb_enc_to_index(enc);
4418
4419 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4420 /* US-ASCII automatically extended to ASCII-8BIT */
4421 if (code > 0xFF) {
4422 rb_raise(rb_eRangeError, "%u out of char range", code);
4423 }
4424 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4425 return ENCINDEX_ASCII_8BIT;
4426 }
4427 return encidx;
4428 }
4429 else {
4430 return -1;
4431 }
4432}
4433
4434/*
4435 * call-seq:
4436 * prepend(*other_strings) -> string
4437 *
4438 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4439 *
4440 * s = 'foo'
4441 * s.prepend('bar', 'baz') # => "barbazfoo"
4442 * s # => "barbazfoo"
4443 *
4444 * Related: String#concat.
4445 */
4446
4447static VALUE
4448rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4449{
4450 str_modifiable(str);
4451
4452 if (argc == 1) {
4453 rb_str_update(str, 0L, 0L, argv[0]);
4454 }
4455 else if (argc > 1) {
4456 int i;
4457 VALUE arg_str = rb_str_tmp_new(0);
4458 rb_enc_copy(arg_str, str);
4459 for (i = 0; i < argc; i++) {
4460 rb_str_append(arg_str, argv[i]);
4461 }
4462 rb_str_update(str, 0L, 0L, arg_str);
4463 }
4464
4465 return str;
4466}
4467
4468st_index_t
4470{
4471 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4472 st_index_t precomputed_hash;
4473 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4474
4475 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4476 return precomputed_hash;
4477 }
4478
4479 return str_do_hash(str);
4480}
4481
4482int
4484{
4485 long len1, len2;
4486 const char *ptr1, *ptr2;
4487 RSTRING_GETMEM(str1, ptr1, len1);
4488 RSTRING_GETMEM(str2, ptr2, len2);
4489 return (len1 != len2 ||
4490 !rb_str_comparable(str1, str2) ||
4491 memcmp(ptr1, ptr2, len1) != 0);
4492}
4493
4494/*
4495 * call-seq:
4496 * hash -> integer
4497 *
4498 * Returns the integer hash value for +self+.
4499 * The value is based on the length, content and encoding of +self+.
4500 *
4501 * Related: Object#hash.
4502 */
4503
4504static VALUE
4505rb_str_hash_m(VALUE str)
4506{
4507 st_index_t hval = rb_str_hash(str);
4508 return ST2FIX(hval);
4509}
4510
/* Smaller of two values.  Each argument may be evaluated twice, so pass
 * only side-effect-free expressions. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4512
4513int
4515{
4516 int idx1, idx2;
4517 int rc1, rc2;
4518
4519 if (RSTRING_LEN(str1) == 0) return TRUE;
4520 if (RSTRING_LEN(str2) == 0) return TRUE;
4521 idx1 = ENCODING_GET(str1);
4522 idx2 = ENCODING_GET(str2);
4523 if (idx1 == idx2) return TRUE;
4524 rc1 = rb_enc_str_coderange(str1);
4525 rc2 = rb_enc_str_coderange(str2);
4526 if (rc1 == ENC_CODERANGE_7BIT) {
4527 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4528 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4529 return TRUE;
4530 }
4531 if (rc2 == ENC_CODERANGE_7BIT) {
4532 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4533 return TRUE;
4534 }
4535 return FALSE;
4536}
4537
4538int
4540{
4541 long len1, len2;
4542 const char *ptr1, *ptr2;
4543 int retval;
4544
4545 if (str1 == str2) return 0;
4546 RSTRING_GETMEM(str1, ptr1, len1);
4547 RSTRING_GETMEM(str2, ptr2, len2);
4548 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4549 if (len1 == len2) {
4550 if (!rb_str_comparable(str1, str2)) {
4551 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4552 return 1;
4553 return -1;
4554 }
4555 return 0;
4556 }
4557 if (len1 > len2) return 1;
4558 return -1;
4559 }
4560 if (retval > 0) return 1;
4561 return -1;
4562}
4563
4564/*
4565 * call-seq:
4566 * self == object -> true or false
4567 *
4568 * Returns whether +object+ is equal to +self+.
4569 *
4570 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4571 *
4572 * s = 'foo'
4573 * s == 'foo' # => true
4574 * s == 'food' # => false
4575 * s == 'FOO' # => false
4576 *
4577 * Returns +false+ if the two strings' encodings are not compatible:
4578 *
4579 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4580 *
4581 * When +object+ is not a string:
4582 *
4583 * - If +object+ responds to method <tt>to_str</tt>,
4584 * <tt>object == self</tt> is called and its return value is returned.
4585 * - If +object+ does not respond to <tt>to_str</tt>,
4586 * +false+ is returned.
4587 *
4588 * Related: {Comparing}[rdoc-ref:String@Comparing].
4589 */
4590
4591VALUE
4593{
4594 if (str1 == str2) return Qtrue;
4595 if (!RB_TYPE_P(str2, T_STRING)) {
4596 if (!rb_respond_to(str2, idTo_str)) {
4597 return Qfalse;
4598 }
4599 return rb_equal(str2, str1);
4600 }
4601 return rb_str_eql_internal(str1, str2);
4602}
4603
4604/*
4605 * call-seq:
4606 * eql?(object) -> true or false
4607 *
 * Returns +true+ if +object+ has the same length and content
 * as +self+; +false+ otherwise:
4610 *
4611 * s = 'foo'
4612 * s.eql?('foo') # => true
4613 * s.eql?('food') # => false
4614 * s.eql?('FOO') # => false
4615 *
4616 * Returns +false+ if the two strings' encodings are not compatible:
4617 *
4618 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4619 *
4620 */
4621
4622VALUE
4623rb_str_eql(VALUE str1, VALUE str2)
4624{
4625 if (str1 == str2) return Qtrue;
4626 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4627 return rb_str_eql_internal(str1, str2);
4628}
4629
4630/*
4631 * call-seq:
4632 * self <=> other_string -> -1, 0, 1, or nil
4633 *
4634 * Compares +self+ and +other_string+, returning:
4635 *
4636 * - -1 if +other_string+ is larger.
4637 * - 0 if the two are equal.
4638 * - 1 if +other_string+ is smaller.
4639 * - +nil+ if the two are incomparable.
4640 *
4641 * Examples:
4642 *
4643 * 'foo' <=> 'foo' # => 0
4644 * 'foo' <=> 'food' # => -1
4645 * 'food' <=> 'foo' # => 1
4646 * 'FOO' <=> 'foo' # => -1
4647 * 'foo' <=> 'FOO' # => 1
4648 * 'foo' <=> 1 # => nil
4649 *
4650 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4651 */
4652
4653static VALUE
4654rb_str_cmp_m(VALUE str1, VALUE str2)
4655{
4656 int result;
4657 VALUE s = rb_check_string_type(str2);
4658 if (NIL_P(s)) {
4659 return rb_invcmp(str1, str2);
4660 }
4661 result = rb_str_cmp(str1, s);
4662 return INT2FIX(result);
4663}
4664
/* Forward declarations: the method entry points below call these helpers
 * before their definitions appear. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4667
4668/*
4669 * call-seq:
4670 * casecmp(other_string) -> -1, 0, 1, or nil
4671 *
4672 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4673 *
4674 * - -1 if <tt>other_string.downcase</tt> is larger.
4675 * - 0 if the two are equal.
4676 * - 1 if <tt>other_string.downcase</tt> is smaller.
4677 * - +nil+ if the two are incomparable.
4678 *
4679 * Examples:
4680 *
4681 * 'foo'.casecmp('foo') # => 0
4682 * 'foo'.casecmp('food') # => -1
4683 * 'food'.casecmp('foo') # => 1
4684 * 'FOO'.casecmp('foo') # => 0
4685 * 'foo'.casecmp('FOO') # => 0
4686 * 'foo'.casecmp(1) # => nil
4687 *
4688 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4689 *
4690 * Related: String#casecmp?.
4691 *
4692 */
4693
4694static VALUE
4695rb_str_casecmp(VALUE str1, VALUE str2)
4696{
4697 VALUE s = rb_check_string_type(str2);
4698 if (NIL_P(s)) {
4699 return Qnil;
4700 }
4701 return str_casecmp(str1, s);
4702}
4703
4704static VALUE
4705str_casecmp(VALUE str1, VALUE str2)
4706{
4707 long len;
4708 rb_encoding *enc;
4709 const char *p1, *p1end, *p2, *p2end;
4710
4711 enc = rb_enc_compatible(str1, str2);
4712 if (!enc) {
4713 return Qnil;
4714 }
4715
4716 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4717 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4718 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4719 while (p1 < p1end && p2 < p2end) {
4720 if (*p1 != *p2) {
4721 unsigned int c1 = TOLOWER(*p1 & 0xff);
4722 unsigned int c2 = TOLOWER(*p2 & 0xff);
4723 if (c1 != c2)
4724 return INT2FIX(c1 < c2 ? -1 : 1);
4725 }
4726 p1++;
4727 p2++;
4728 }
4729 }
4730 else {
4731 while (p1 < p1end && p2 < p2end) {
4732 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4733 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4734
4735 if (0 <= c1 && 0 <= c2) {
4736 c1 = TOLOWER(c1);
4737 c2 = TOLOWER(c2);
4738 if (c1 != c2)
4739 return INT2FIX(c1 < c2 ? -1 : 1);
4740 }
4741 else {
4742 int r;
4743 l1 = rb_enc_mbclen(p1, p1end, enc);
4744 l2 = rb_enc_mbclen(p2, p2end, enc);
4745 len = l1 < l2 ? l1 : l2;
4746 r = memcmp(p1, p2, len);
4747 if (r != 0)
4748 return INT2FIX(r < 0 ? -1 : 1);
4749 if (l1 != l2)
4750 return INT2FIX(l1 < l2 ? -1 : 1);
4751 }
4752 p1 += l1;
4753 p2 += l2;
4754 }
4755 }
4756 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4757 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4758 return INT2FIX(-1);
4759}
4760
4761/*
4762 * call-seq:
4763 * casecmp?(other_string) -> true, false, or nil
4764 *
4765 * Returns +true+ if +self+ and +other_string+ are equal after
4766 * Unicode case folding, otherwise +false+:
4767 *
4768 * 'foo'.casecmp?('foo') # => true
4769 * 'foo'.casecmp?('food') # => false
4770 * 'food'.casecmp?('foo') # => false
4771 * 'FOO'.casecmp?('foo') # => true
4772 * 'foo'.casecmp?('FOO') # => true
4773 *
4774 * Returns +nil+ if the two values are incomparable:
4775 *
4776 * 'foo'.casecmp?(1) # => nil
4777 *
4778 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4779 *
4780 * Related: String#casecmp.
4781 *
4782 */
4783
4784static VALUE
4785rb_str_casecmp_p(VALUE str1, VALUE str2)
4786{
4787 VALUE s = rb_check_string_type(str2);
4788 if (NIL_P(s)) {
4789 return Qnil;
4790 }
4791 return str_casecmp_p(str1, s);
4792}
4793
4794static VALUE
4795str_casecmp_p(VALUE str1, VALUE str2)
4796{
4797 rb_encoding *enc;
4798 VALUE folded_str1, folded_str2;
4799 VALUE fold_opt = sym_fold;
4800
4801 enc = rb_enc_compatible(str1, str2);
4802 if (!enc) {
4803 return Qnil;
4804 }
4805
4806 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4807 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4808
4809 return rb_str_eql(folded_str1, folded_str2);
4810}
4811
4812static long
4813strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4814 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4815{
4816 const char *search_start = str_ptr;
4817 long pos, search_len = str_len - offset;
4818
4819 for (;;) {
4820 const char *t;
4821 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4822 if (pos < 0) return pos;
4823 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4824 if (t == search_start + pos) break;
4825 search_len -= t - search_start;
4826 if (search_len <= 0) return -1;
4827 offset += t - search_start;
4828 search_start = t;
4829 }
4830 return pos + offset;
4831}
4832
4833/* found index in byte */
4834#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4835#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4836
4837static long
4838rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4839{
4840 const char *str_ptr, *str_ptr_end, *sub_ptr;
4841 long str_len, sub_len;
4842 rb_encoding *enc;
4843
4844 enc = rb_enc_check(str, sub);
4845 if (is_broken_string(sub)) return -1;
4846
4847 str_ptr = RSTRING_PTR(str);
4848 str_ptr_end = RSTRING_END(str);
4849 str_len = RSTRING_LEN(str);
4850 sub_ptr = RSTRING_PTR(sub);
4851 sub_len = RSTRING_LEN(sub);
4852
4853 if (str_len < sub_len) return -1;
4854
4855 if (offset != 0) {
4856 long str_len_char, sub_len_char;
4857 int single_byte = single_byte_optimizable(str);
4858 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4859 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4860 if (offset < 0) {
4861 offset += str_len_char;
4862 if (offset < 0) return -1;
4863 }
4864 if (str_len_char - offset < sub_len_char) return -1;
4865 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4866 str_ptr += offset;
4867 }
4868 if (sub_len == 0) return offset;
4869
4870 /* need proceed one character at a time */
4871 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4872}
4873
4874
4875/*
4876 * call-seq:
4877 * index(substring, offset = 0) -> integer or nil
4878 * index(regexp, offset = 0) -> integer or nil
4879 *
4880 * :include: doc/string/index.rdoc
4881 *
4882 */
4883
4884static VALUE
4885rb_str_index_m(int argc, VALUE *argv, VALUE str)
4886{
4887 VALUE sub;
4888 VALUE initpos;
4889 rb_encoding *enc = STR_ENC_GET(str);
4890 long pos;
4891
4892 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4893 long slen = str_strlen(str, enc); /* str's enc */
4894 pos = NUM2LONG(initpos);
4895 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4896 if (RB_TYPE_P(sub, T_REGEXP)) {
4898 }
4899 return Qnil;
4900 }
4901 }
4902 else {
4903 pos = 0;
4904 }
4905
4906 if (RB_TYPE_P(sub, T_REGEXP)) {
4907 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4908 enc, single_byte_optimizable(str));
4909
4910 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4911 VALUE match = rb_backref_get();
4912 struct re_registers *regs = RMATCH_REGS(match);
4913 pos = rb_str_sublen(str, BEG(0));
4914 return LONG2NUM(pos);
4915 }
4916 }
4917 else {
4918 StringValue(sub);
4919 pos = rb_str_index(str, sub, pos);
4920 if (pos >= 0) {
4921 pos = rb_str_sublen(str, pos);
4922 return LONG2NUM(pos);
4923 }
4924 }
4925 return Qnil;
4926}
4927
4928/* Ensure that the given pos is a valid character boundary.
4929 * Note that in this function, "character" means a code point
4930 * (Unicode scalar value), not a grapheme cluster.
4931 */
4932static void
4933str_ensure_byte_pos(VALUE str, long pos)
4934{
4935 if (!single_byte_optimizable(str)) {
4936 const char *s = RSTRING_PTR(str);
4937 const char *e = RSTRING_END(str);
4938 const char *p = s + pos;
4939 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4940 rb_raise(rb_eIndexError,
4941 "offset %ld does not land on character boundary", pos);
4942 }
4943 }
4944}
4945
4946/*
4947 * call-seq:
4948 * byteindex(object, offset = 0) -> integer or nil
4949 *
4950 * Returns the 0-based integer index of a substring of +self+
4951 * specified by +object+ (a string or Regexp) and +offset+,
4952 * or +nil+ if there is no such substring;
4953 * the returned index is the count of _bytes_ (not characters).
4954 *
4955 * When +object+ is a string,
4956 * returns the index of the first found substring equal to +object+:
4957 *
4958 * s = 'foo' # => "foo"
4959 * s.size # => 3 # Three 1-byte characters.
4960 * s.bytesize # => 3 # Three bytes.
4961 * s.byteindex('f') # => 0
4962 * s.byteindex('o') # => 1
4963 * s.byteindex('oo') # => 1
4964 * s.byteindex('ooo') # => nil
4965 *
4966 * When +object+ is a Regexp,
4967 * returns the index of the first found substring matching +object+;
4968 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4969 *
4970 * s = 'foo'
4971 * s.byteindex(/f/) # => 0
4972 * $~ # => #<MatchData "f">
4973 * s.byteindex(/o/) # => 1
4974 * s.byteindex(/oo/) # => 1
4975 * s.byteindex(/ooo/) # => nil
4976 * $~ # => nil
4977 *
4978 * \Integer argument +offset+, if given, specifies the 0-based index
4979 * of the byte where searching is to begin.
4980 *
4981 * When +offset+ is non-negative,
4982 * searching begins at byte position +offset+:
4983 *
4984 * s = 'foo'
4985 * s.byteindex('o', 1) # => 1
4986 * s.byteindex('o', 2) # => 2
4987 * s.byteindex('o', 3) # => nil
4988 *
4989 * When +offset+ is negative, counts backward from the end of +self+:
4990 *
4991 * s = 'foo'
4992 * s.byteindex('o', -1) # => 2
4993 * s.byteindex('o', -2) # => 1
4994 * s.byteindex('o', -3) # => 1
4995 * s.byteindex('o', -4) # => nil
4996 *
4997 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4998 *
4999 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
5000 * s.size # => 2 # Two 3-byte characters.
5001 * s.bytesize # => 6 # Six bytes.
5002 * s.byteindex("\uFFFF") # => 0
5003 * s.byteindex("\uFFFF", 1) # Raises IndexError
5004 * s.byteindex("\uFFFF", 2) # Raises IndexError
5005 * s.byteindex("\uFFFF", 3) # => 3
5006 * s.byteindex("\uFFFF", 4) # Raises IndexError
5007 * s.byteindex("\uFFFF", 5) # Raises IndexError
5008 * s.byteindex("\uFFFF", 6) # => nil
5009 *
5010 * Related: see {Querying}[rdoc-ref:String@Querying].
5011 */
5012
5013static VALUE
5014rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
5015{
5016 VALUE sub;
5017 VALUE initpos;
5018 long pos;
5019
5020 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5021 long slen = RSTRING_LEN(str);
5022 pos = NUM2LONG(initpos);
5023 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
5024 if (RB_TYPE_P(sub, T_REGEXP)) {
5026 }
5027 return Qnil;
5028 }
5029 }
5030 else {
5031 pos = 0;
5032 }
5033
5034 str_ensure_byte_pos(str, pos);
5035
5036 if (RB_TYPE_P(sub, T_REGEXP)) {
5037 if (rb_reg_search(sub, str, pos, 0) >= 0) {
5038 VALUE match = rb_backref_get();
5039 struct re_registers *regs = RMATCH_REGS(match);
5040 pos = BEG(0);
5041 return LONG2NUM(pos);
5042 }
5043 }
5044 else {
5045 StringValue(sub);
5046 pos = rb_str_byteindex(str, sub, pos);
5047 if (pos >= 0) return LONG2NUM(pos);
5048 }
5049 return Qnil;
5050}
5051
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3).
 *
 * Scans the +search_len+ bytes at +search_str+ from the end towards the
 * start and returns a pointer to the last byte equal to +chr+, or NULL when
 * no byte matches.  As with the libc function, +chr+ is converted to
 * unsigned char before comparing, so a sign-extended char still matches.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
5064
5065static long
5066str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
5067{
5068 char *hit, *adjusted;
5069 int c;
5070 long slen, searchlen;
5071 char *sbeg, *e, *t;
5072
5073 sbeg = RSTRING_PTR(str);
5074 slen = RSTRING_LEN(sub);
5075 if (slen == 0) return s - sbeg;
5076 e = RSTRING_END(str);
5077 t = RSTRING_PTR(sub);
5078 c = *t & 0xff;
5079 searchlen = s - sbeg + 1;
5080
5081 if (memcmp(s, t, slen) == 0) {
5082 return s - sbeg;
5083 }
5084
5085 do {
5086 hit = memrchr(sbeg, c, searchlen);
5087 if (!hit) break;
5088 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
5089 if (hit != adjusted) {
5090 searchlen = adjusted - sbeg;
5091 continue;
5092 }
5093 if (memcmp(hit, t, slen) == 0)
5094 return hit - sbeg;
5095 searchlen = adjusted - sbeg;
5096 } while (searchlen > 0);
5097
5098 return -1;
5099}
5100
5101/* found index in byte */
5102static long
5103rb_str_rindex(VALUE str, VALUE sub, long pos)
5104{
5105 long len, slen;
5106 char *sbeg, *s;
5107 rb_encoding *enc;
5108 int singlebyte;
5109
5110 enc = rb_enc_check(str, sub);
5111 if (is_broken_string(sub)) return -1;
5112 singlebyte = single_byte_optimizable(str);
5113 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
5114 slen = str_strlen(sub, enc); /* rb_enc_check */
5115
5116 /* substring longer than string */
5117 if (len < slen) return -1;
5118 if (len - pos < slen) pos = len - slen;
5119 if (len == 0) return pos;
5120
5121 sbeg = RSTRING_PTR(str);
5122
5123 if (pos == 0) {
5124 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5125 return 0;
5126 else
5127 return -1;
5128 }
5129
5130 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
5131 return str_rindex(str, sub, s, enc);
5132}
5133
5134/*
5135 * call-seq:
5136 * rindex(substring, offset = self.length) -> integer or nil
5137 * rindex(regexp, offset = self.length) -> integer or nil
5138 *
5139 * Returns the Integer index of the _last_ occurrence of the given +substring+,
5140 * or +nil+ if none found:
5141 *
5142 * 'foo'.rindex('f') # => 0
5143 * 'foo'.rindex('o') # => 2
5144 * 'foo'.rindex('oo') # => 1
5145 * 'foo'.rindex('ooo') # => nil
5146 *
5147 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
5148 * or +nil+ if none found:
5149 *
5150 * 'foo'.rindex(/f/) # => 0
5151 * 'foo'.rindex(/o/) # => 2
5152 * 'foo'.rindex(/oo/) # => 1
5153 * 'foo'.rindex(/ooo/) # => nil
5154 *
5155 * The _last_ match means starting at the possible last position, not
5156 * the last of longest matches.
5157 *
5158 * 'foo'.rindex(/o+/) # => 2
5159 * $~ #=> #<MatchData "o">
5160 *
5161 * To get the last longest match, needs to combine with negative
5162 * lookbehind.
5163 *
5164 * 'foo'.rindex(/(?<!o)o+/) # => 1
5165 * $~ #=> #<MatchData "oo">
5166 *
5167 * Or String#index with negative lookahead.
5168 *
5169 * 'foo'.index(/o+(?!.*o)/) # => 1
5170 * $~ #=> #<MatchData "oo">
5171 *
5172 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
5173 * string to _end_ the search:
5174 *
5175 * 'foo'.rindex('o', 0) # => nil
5176 * 'foo'.rindex('o', 1) # => 1
5177 * 'foo'.rindex('o', 2) # => 2
5178 * 'foo'.rindex('o', 3) # => 2
5179 *
5180 * If +offset+ is a negative Integer, the maximum starting position in the
5181 * string to _end_ the search is the sum of the string's length and +offset+:
5182 *
5183 * 'foo'.rindex('o', -1) # => 2
5184 * 'foo'.rindex('o', -2) # => 1
5185 * 'foo'.rindex('o', -3) # => nil
5186 * 'foo'.rindex('o', -4) # => nil
5187 *
5188 * Related: String#index.
5189 */
5190
5191static VALUE
5192rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
5193{
5194 VALUE sub;
5195 VALUE initpos;
5196 rb_encoding *enc = STR_ENC_GET(str);
5197 long pos, len = str_strlen(str, enc); /* str's enc */
5198
5199 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5200 pos = NUM2LONG(initpos);
5201 if (pos < 0 && (pos += len) < 0) {
5202 if (RB_TYPE_P(sub, T_REGEXP)) {
5204 }
5205 return Qnil;
5206 }
5207 if (pos > len) pos = len;
5208 }
5209 else {
5210 pos = len;
5211 }
5212
5213 if (RB_TYPE_P(sub, T_REGEXP)) {
5214 /* enc = rb_enc_check(str, sub); */
5215 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
5216 enc, single_byte_optimizable(str));
5217
5218 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5219 VALUE match = rb_backref_get();
5220 struct re_registers *regs = RMATCH_REGS(match);
5221 pos = rb_str_sublen(str, BEG(0));
5222 return LONG2NUM(pos);
5223 }
5224 }
5225 else {
5226 StringValue(sub);
5227 pos = rb_str_rindex(str, sub, pos);
5228 if (pos >= 0) {
5229 pos = rb_str_sublen(str, pos);
5230 return LONG2NUM(pos);
5231 }
5232 }
5233 return Qnil;
5234}
5235
5236static long
5237rb_str_byterindex(VALUE str, VALUE sub, long pos)
5238{
5239 long len, slen;
5240 char *sbeg, *s;
5241 rb_encoding *enc;
5242
5243 enc = rb_enc_check(str, sub);
5244 if (is_broken_string(sub)) return -1;
5245 len = RSTRING_LEN(str);
5246 slen = RSTRING_LEN(sub);
5247
5248 /* substring longer than string */
5249 if (len < slen) return -1;
5250 if (len - pos < slen) pos = len - slen;
5251 if (len == 0) return pos;
5252
5253 sbeg = RSTRING_PTR(str);
5254
5255 if (pos == 0) {
5256 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5257 return 0;
5258 else
5259 return -1;
5260 }
5261
5262 s = sbeg + pos;
5263 return str_rindex(str, sub, s, enc);
5264}
5265
5266
5267/*
5268 * call-seq:
5269 * byterindex(substring, offset = self.bytesize) -> integer or nil
5270 * byterindex(regexp, offset = self.bytesize) -> integer or nil
5271 *
5272 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
5273 * or +nil+ if none found:
5274 *
5275 * 'foo'.byterindex('f') # => 0
5276 * 'foo'.byterindex('o') # => 2
5277 * 'foo'.byterindex('oo') # => 1
5278 * 'foo'.byterindex('ooo') # => nil
5279 *
5280 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
5281 * or +nil+ if none found:
5282 *
5283 * 'foo'.byterindex(/f/) # => 0
5284 * 'foo'.byterindex(/o/) # => 2
5285 * 'foo'.byterindex(/oo/) # => 1
5286 * 'foo'.byterindex(/ooo/) # => nil
5287 *
5288 * The _last_ match means starting at the possible last position, not
5289 * the last of longest matches.
5290 *
5291 * 'foo'.byterindex(/o+/) # => 2
5292 * $~ #=> #<MatchData "o">
5293 *
5294 * To get the last longest match, needs to combine with negative
5295 * lookbehind.
5296 *
5297 * 'foo'.byterindex(/(?<!o)o+/) # => 1
5298 * $~ #=> #<MatchData "oo">
5299 *
5300 * Or String#byteindex with negative lookahead.
5301 *
5302 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
5303 * $~ #=> #<MatchData "oo">
5304 *
5305 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
5306 * string to _end_ the search:
5307 *
5308 * 'foo'.byterindex('o', 0) # => nil
5309 * 'foo'.byterindex('o', 1) # => 1
5310 * 'foo'.byterindex('o', 2) # => 2
5311 * 'foo'.byterindex('o', 3) # => 2
5312 *
5313 * If +offset+ is a negative Integer, the maximum starting position in the
5314 * string to _end_ the search is the sum of the string's length and +offset+:
5315 *
5316 * 'foo'.byterindex('o', -1) # => 2
5317 * 'foo'.byterindex('o', -2) # => 1
5318 * 'foo'.byterindex('o', -3) # => nil
5319 * 'foo'.byterindex('o', -4) # => nil
5320 *
5321 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
5322 * raised.
5323 *
5324 * Related: String#byteindex.
5325 */
5326
5327static VALUE
5328rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5329{
5330 VALUE sub;
5331 VALUE initpos;
5332 long pos, len = RSTRING_LEN(str);
5333
5334 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5335 pos = NUM2LONG(initpos);
5336 if (pos < 0 && (pos += len) < 0) {
5337 if (RB_TYPE_P(sub, T_REGEXP)) {
5339 }
5340 return Qnil;
5341 }
5342 if (pos > len) pos = len;
5343 }
5344 else {
5345 pos = len;
5346 }
5347
5348 str_ensure_byte_pos(str, pos);
5349
5350 if (RB_TYPE_P(sub, T_REGEXP)) {
5351 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5352 VALUE match = rb_backref_get();
5353 struct re_registers *regs = RMATCH_REGS(match);
5354 pos = BEG(0);
5355 return LONG2NUM(pos);
5356 }
5357 }
5358 else {
5359 StringValue(sub);
5360 pos = rb_str_byterindex(str, sub, pos);
5361 if (pos >= 0) return LONG2NUM(pos);
5362 }
5363 return Qnil;
5364}
5365
5366/*
5367 * call-seq:
5368 * self =~ object -> integer or nil
5369 *
5370 * When +object+ is a Regexp, returns the index of the first substring in +self+
5371 * matched by +object+,
5372 * or +nil+ if no match is found;
5373 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5374 *
5375 * 'foo' =~ /f/ # => 0
5376 * $~ # => #<MatchData "f">
5377 * 'foo' =~ /o/ # => 1
5378 * $~ # => #<MatchData "o">
5379 * 'foo' =~ /x/ # => nil
5380 * $~ # => nil
5381 *
5382 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5383 * (see Regexp#=~):
5384 *
5385 * number = nil
5386 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5387 * number # => nil # Not assigned.
5388 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5389 * number # => "9" # Assigned.
5390 *
5391 * If +object+ is not a Regexp, returns the value
5392 * returned by <tt>object =~ self</tt>.
5393 *
5394 * Related: see {Querying}[rdoc-ref:String@Querying].
5395 */
5396
5397static VALUE
5398rb_str_match(VALUE x, VALUE y)
5399{
5400 switch (OBJ_BUILTIN_TYPE(y)) {
5401 case T_STRING:
5402 rb_raise(rb_eTypeError, "type mismatch: String given");
5403
5404 case T_REGEXP:
5405 return rb_reg_match(y, x);
5406
5407 default:
5408 return rb_funcall(y, idEqTilde, 1, x);
5409 }
5410}
5411
5412
5413static VALUE get_pat(VALUE);
5414
5415
5416/*
5417 * call-seq:
5418 * match(pattern, offset = 0) -> matchdata or nil
5419 * match(pattern, offset = 0) {|matchdata| ... } -> object
5420 *
5421 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5422 *
5423 * Note: also updates Regexp@Global+Variables.
5424 *
5425 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5426 * regexp = Regexp.new(pattern)
5427 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5428 * (see Regexp#match):
5429 * matchdata = regexp.match(self)
5430 *
5431 * With no block given, returns the computed +matchdata+:
5432 *
5433 * 'foo'.match('f') # => #<MatchData "f">
5434 * 'foo'.match('o') # => #<MatchData "o">
5435 * 'foo'.match('x') # => nil
5436 *
5437 * If Integer argument +offset+ is given, the search begins at index +offset+:
5438 *
5439 * 'foo'.match('f', 1) # => nil
5440 * 'foo'.match('o', 1) # => #<MatchData "o">
5441 *
5442 * With a block given, calls the block with the computed +matchdata+
5443 * and returns the block's return value:
5444 *
5445 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5446 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5447 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5448 *
5449 */
5450
5451static VALUE
5452rb_str_match_m(int argc, VALUE *argv, VALUE str)
5453{
5454 VALUE re, result;
5455 if (argc < 1)
5456 rb_check_arity(argc, 1, 2);
5457 re = argv[0];
5458 argv[0] = str;
5459 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5460 if (!NIL_P(result) && rb_block_given_p()) {
5461 return rb_yield(result);
5462 }
5463 return result;
5464}
5465
5466/*
5467 * call-seq:
5468 * match?(pattern, offset = 0) -> true or false
5469 *
5470 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5471 *
5472 * Note: does not update Regexp@Global+Variables.
5473 *
5474 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5475 * regexp = Regexp.new(pattern)
5476 *
5477 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5478 * +false+ otherwise:
5479 *
5480 * 'foo'.match?(/o/) # => true
5481 * 'foo'.match?('o') # => true
5482 * 'foo'.match?(/x/) # => false
5483 *
5484 * If Integer argument +offset+ is given, the search begins at index +offset+:
5485 * 'foo'.match?('f', 1) # => false
5486 * 'foo'.match?('o', 1) # => true
5487 *
5488 */
5489
5490static VALUE
5491rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5492{
5493 VALUE re;
5494 rb_check_arity(argc, 1, 2);
5495 re = get_pat(argv[0]);
5496 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5497}
5498
/* Outcome of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor (e.g. input is not a valid character) */
    NEIGHBOR_FOUND = 1,     /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED = 2    /* stepping ran off the end of the range */
};
5504
5505static enum neighbor_char
5506enc_succ_char(char *p, long len, rb_encoding *enc)
5507{
5508 long i;
5509 int l;
5510
5511 if (rb_enc_mbminlen(enc) > 1) {
5512 /* wchar, trivial case */
5513 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5514 if (!MBCLEN_CHARFOUND_P(r)) {
5515 return NEIGHBOR_NOT_CHAR;
5516 }
5517 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5518 l = rb_enc_code_to_mbclen(c, enc);
5519 if (!l) return NEIGHBOR_NOT_CHAR;
5520 if (l != len) return NEIGHBOR_WRAPPED;
5521 rb_enc_mbcput(c, p, enc);
5522 r = rb_enc_precise_mbclen(p, p + len, enc);
5523 if (!MBCLEN_CHARFOUND_P(r)) {
5524 return NEIGHBOR_NOT_CHAR;
5525 }
5526 return NEIGHBOR_FOUND;
5527 }
5528 while (1) {
5529 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5530 p[i] = '\0';
5531 if (i < 0)
5532 return NEIGHBOR_WRAPPED;
5533 ++((unsigned char*)p)[i];
5534 l = rb_enc_precise_mbclen(p, p+len, enc);
5535 if (MBCLEN_CHARFOUND_P(l)) {
5536 l = MBCLEN_CHARFOUND_LEN(l);
5537 if (l == len) {
5538 return NEIGHBOR_FOUND;
5539 }
5540 else {
5541 memset(p+l, 0xff, len-l);
5542 }
5543 }
5544 if (MBCLEN_INVALID_P(l) && i < len-1) {
5545 long len2;
5546 int l2;
5547 for (len2 = len-1; 0 < len2; len2--) {
5548 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5549 if (!MBCLEN_INVALID_P(l2))
5550 break;
5551 }
5552 memset(p+len2+1, 0xff, len-(len2+1));
5553 }
5554 }
5555}
5556
5557static enum neighbor_char
5558enc_pred_char(char *p, long len, rb_encoding *enc)
5559{
5560 long i;
5561 int l;
5562 if (rb_enc_mbminlen(enc) > 1) {
5563 /* wchar, trivial case */
5564 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5565 if (!MBCLEN_CHARFOUND_P(r)) {
5566 return NEIGHBOR_NOT_CHAR;
5567 }
5568 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5569 if (!c) return NEIGHBOR_NOT_CHAR;
5570 --c;
5571 l = rb_enc_code_to_mbclen(c, enc);
5572 if (!l) return NEIGHBOR_NOT_CHAR;
5573 if (l != len) return NEIGHBOR_WRAPPED;
5574 rb_enc_mbcput(c, p, enc);
5575 r = rb_enc_precise_mbclen(p, p + len, enc);
5576 if (!MBCLEN_CHARFOUND_P(r)) {
5577 return NEIGHBOR_NOT_CHAR;
5578 }
5579 return NEIGHBOR_FOUND;
5580 }
5581 while (1) {
5582 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5583 p[i] = '\xff';
5584 if (i < 0)
5585 return NEIGHBOR_WRAPPED;
5586 --((unsigned char*)p)[i];
5587 l = rb_enc_precise_mbclen(p, p+len, enc);
5588 if (MBCLEN_CHARFOUND_P(l)) {
5589 l = MBCLEN_CHARFOUND_LEN(l);
5590 if (l == len) {
5591 return NEIGHBOR_FOUND;
5592 }
5593 else {
5594 memset(p+l, 0, len-l);
5595 }
5596 }
5597 if (MBCLEN_INVALID_P(l) && i < len-1) {
5598 long len2;
5599 int l2;
5600 for (len2 = len-1; 0 < len2; len2--) {
5601 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5602 if (!MBCLEN_INVALID_P(l2))
5603 break;
5604 }
5605 memset(p+len2+1, 0, len-(len2+1));
5606 }
5607 }
5608}
5609
5610/*
5611 overwrite +p+ by succeeding letter in +enc+ and returns
5612 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5613 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5614 assuming each ranges are successive, and mbclen
5615 never change in each ranges.
5616 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5617 character.
5618 */
5619static enum neighbor_char
5620enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5621{
5622 enum neighbor_char ret;
5623 unsigned int c;
5624 int ctype;
5625 int range;
5626 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5627
5628 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5629 int try;
5630 const int max_gaps = 1;
5631
5632 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5633 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5634 ctype = ONIGENC_CTYPE_DIGIT;
5635 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5636 ctype = ONIGENC_CTYPE_ALPHA;
5637 else
5638 return NEIGHBOR_NOT_CHAR;
5639
5640 MEMCPY(save, p, char, len);
5641 for (try = 0; try <= max_gaps; ++try) {
5642 ret = enc_succ_char(p, len, enc);
5643 if (ret == NEIGHBOR_FOUND) {
5644 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5645 if (rb_enc_isctype(c, ctype, enc))
5646 return NEIGHBOR_FOUND;
5647 }
5648 }
5649 MEMCPY(p, save, char, len);
5650 range = 1;
5651 while (1) {
5652 MEMCPY(save, p, char, len);
5653 ret = enc_pred_char(p, len, enc);
5654 if (ret == NEIGHBOR_FOUND) {
5655 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5656 if (!rb_enc_isctype(c, ctype, enc)) {
5657 MEMCPY(p, save, char, len);
5658 break;
5659 }
5660 }
5661 else {
5662 MEMCPY(p, save, char, len);
5663 break;
5664 }
5665 range++;
5666 }
5667 if (range == 1) {
5668 return NEIGHBOR_NOT_CHAR;
5669 }
5670
5671 if (ctype != ONIGENC_CTYPE_DIGIT) {
5672 MEMCPY(carry, p, char, len);
5673 return NEIGHBOR_WRAPPED;
5674 }
5675
5676 MEMCPY(carry, p, char, len);
5677 enc_succ_char(carry, len, enc);
5678 return NEIGHBOR_WRAPPED;
5679}
5680
5681
5682static VALUE str_succ(VALUE str);
5683
5684/*
5685 * call-seq:
5686 * succ -> new_str
5687 *
5688 * Returns the successor to +self+. The successor is calculated by
5689 * incrementing characters.
5690 *
5691 * The first character to be incremented is the rightmost alphanumeric:
5692 * or, if no alphanumerics, the rightmost character:
5693 *
5694 * 'THX1138'.succ # => "THX1139"
5695 * '<<koala>>'.succ # => "<<koalb>>"
5696 * '***'.succ # => '**+'
5697 *
5698 * The successor to a digit is another digit, "carrying" to the next-left
5699 * character for a "rollover" from 9 to 0, and prepending another digit
5700 * if necessary:
5701 *
5702 * '00'.succ # => "01"
5703 * '09'.succ # => "10"
5704 * '99'.succ # => "100"
5705 *
5706 * The successor to a letter is another letter of the same case,
5707 * carrying to the next-left character for a rollover,
5708 * and prepending another same-case letter if necessary:
5709 *
5710 * 'aa'.succ # => "ab"
5711 * 'az'.succ # => "ba"
5712 * 'zz'.succ # => "aaa"
5713 * 'AA'.succ # => "AB"
5714 * 'AZ'.succ # => "BA"
5715 * 'ZZ'.succ # => "AAA"
5716 *
5717 * The successor to a non-alphanumeric character is the next character
5718 * in the underlying character set's collating sequence,
5719 * carrying to the next-left character for a rollover,
5720 * and prepending another character if necessary:
5721 *
5722 * s = 0.chr * 3
5723 * s # => "\x00\x00\x00"
5724 * s.succ # => "\x00\x00\x01"
5725 * s = 255.chr * 3
5726 * s # => "\xFF\xFF\xFF"
5727 * s.succ # => "\x01\x00\x00\x00"
5728 *
5729 * Carrying can occur between and among mixtures of alphanumeric characters:
5730 *
5731 * s = 'zz99zz99'
5732 * s.succ # => "aaa00aa00"
5733 * s = '99zz99zz'
5734 * s.succ # => "100aa00aa"
5735 *
5736 * The successor to an empty +String+ is a new empty +String+:
5737 *
5738 * ''.succ # => ""
5739 *
5740 */
5741
5742VALUE
5744{
5745 VALUE str;
5746 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5747 rb_enc_cr_str_copy_for_substr(str, orig);
5748 return str_succ(str);
5749}
5750
5751static VALUE
5752str_succ(VALUE str)
5753{
5754 rb_encoding *enc;
5755 char *sbeg, *s, *e, *last_alnum = 0;
5756 int found_alnum = 0;
5757 long l, slen;
5758 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5759 long carry_pos = 0, carry_len = 1;
5760 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5761
5762 slen = RSTRING_LEN(str);
5763 if (slen == 0) return str;
5764
5765 enc = STR_ENC_GET(str);
5766 sbeg = RSTRING_PTR(str);
5767 s = e = sbeg + slen;
5768
5769 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5770 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5771 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5772 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5773 break;
5774 }
5775 }
5776 l = rb_enc_precise_mbclen(s, e, enc);
5777 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5778 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5779 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5780 switch (neighbor) {
5781 case NEIGHBOR_NOT_CHAR:
5782 continue;
5783 case NEIGHBOR_FOUND:
5784 return str;
5785 case NEIGHBOR_WRAPPED:
5786 last_alnum = s;
5787 break;
5788 }
5789 found_alnum = 1;
5790 carry_pos = s - sbeg;
5791 carry_len = l;
5792 }
5793 if (!found_alnum) { /* str contains no alnum */
5794 s = e;
5795 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5796 enum neighbor_char neighbor;
5797 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5798 l = rb_enc_precise_mbclen(s, e, enc);
5799 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5800 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5801 MEMCPY(tmp, s, char, l);
5802 neighbor = enc_succ_char(tmp, l, enc);
5803 switch (neighbor) {
5804 case NEIGHBOR_FOUND:
5805 MEMCPY(s, tmp, char, l);
5806 return str;
5807 break;
5808 case NEIGHBOR_WRAPPED:
5809 MEMCPY(s, tmp, char, l);
5810 break;
5811 case NEIGHBOR_NOT_CHAR:
5812 break;
5813 }
5814 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5815 /* wrapped to \0...\0. search next valid char. */
5816 enc_succ_char(s, l, enc);
5817 }
5818 if (!rb_enc_asciicompat(enc)) {
5819 MEMCPY(carry, s, char, l);
5820 carry_len = l;
5821 }
5822 carry_pos = s - sbeg;
5823 }
5825 }
5826 RESIZE_CAPA(str, slen + carry_len);
5827 sbeg = RSTRING_PTR(str);
5828 s = sbeg + carry_pos;
5829 memmove(s + carry_len, s, slen - carry_pos);
5830 memmove(s, carry, carry_len);
5831 slen += carry_len;
5832 STR_SET_LEN(str, slen);
5833 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5835 return str;
5836}
5837
5838
5839/*
5840 * call-seq:
5841 * succ! -> self
5842 *
5843 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5844 */
5845
5846static VALUE
5847rb_str_succ_bang(VALUE str)
5848{
5849 rb_str_modify(str);
5850 str_succ(str);
5851 return str;
5852}
5853
/*
 * Returns 1 when each of the +len+ bytes at +s+ satisfies ISDIGIT (also
 * when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5863
5864static int
5865str_upto_i(VALUE str, VALUE arg)
5866{
5867 rb_yield(str);
5868 return 0;
5869}
5870
5871/*
5872 * call-seq:
5873 * upto(other_string, exclusive = false) {|string| ... } -> self
5874 * upto(other_string, exclusive = false) -> new_enumerator
5875 *
5876 * With a block given, calls the block with each +String+ value
5877 * returned by successive calls to String#succ;
5878 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5879 * the sequence terminates when value +other_string+ is reached;
5880 * returns +self+:
5881 *
5882 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5883 * Output:
5884 *
5885 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5886 *
5887 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5888 *
5889 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5890 *
5891 * Output:
5892 *
5893 * a8 a9 b0 b1 b2 b3 b4 b5
5894 *
5895 * If +other_string+ would not be reached, does not call the block:
5896 *
5897 * '25'.upto('5') {|s| fail s }
5898 * 'aa'.upto('a') {|s| fail s }
5899 *
5900 * With no block given, returns a new Enumerator:
5901 *
5902 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5903 *
5904 */
5905
5906static VALUE
5907rb_str_upto(int argc, VALUE *argv, VALUE beg)
5908{
5909 VALUE end, exclusive;
5910
5911 rb_scan_args(argc, argv, "11", &end, &exclusive);
5912 RETURN_ENUMERATOR(beg, argc, argv);
5913 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5914}
5915
5916VALUE
5917rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5918{
5919 VALUE current, after_end;
5920 ID succ;
5921 int n, ascii;
5922 rb_encoding *enc;
5923
5924 CONST_ID(succ, "succ");
5925 StringValue(end);
5926 enc = rb_enc_check(beg, end);
5927 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5928 /* single character */
5929 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5930 char c = RSTRING_PTR(beg)[0];
5931 char e = RSTRING_PTR(end)[0];
5932
5933 if (c > e || (excl && c == e)) return beg;
5934 for (;;) {
5935 VALUE str = rb_enc_str_new(&c, 1, enc);
5937 if ((*each)(str, arg)) break;
5938 if (!excl && c == e) break;
5939 c++;
5940 if (excl && c == e) break;
5941 }
5942 return beg;
5943 }
5944 /* both edges are all digits */
5945 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5946 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5947 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5948 VALUE b, e;
5949 int width;
5950
5951 width = RSTRING_LENINT(beg);
5952 b = rb_str_to_inum(beg, 10, FALSE);
5953 e = rb_str_to_inum(end, 10, FALSE);
5954 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5955 long bi = FIX2LONG(b);
5956 long ei = FIX2LONG(e);
5957 rb_encoding *usascii = rb_usascii_encoding();
5958
5959 while (bi <= ei) {
5960 if (excl && bi == ei) break;
5961 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5962 bi++;
5963 }
5964 }
5965 else {
5966 ID op = excl ? '<' : idLE;
5967 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5968
5969 args[0] = INT2FIX(width);
5970 while (rb_funcall(b, op, 1, e)) {
5971 args[1] = b;
5972 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5973 b = rb_funcallv(b, succ, 0, 0);
5974 }
5975 }
5976 return beg;
5977 }
5978 /* normal case */
5979 n = rb_str_cmp(beg, end);
5980 if (n > 0 || (excl && n == 0)) return beg;
5981
5982 after_end = rb_funcallv(end, succ, 0, 0);
5983 current = str_duplicate(rb_cString, beg);
5984 while (!rb_str_equal(current, after_end)) {
5985 VALUE next = Qnil;
5986 if (excl || !rb_str_equal(current, end))
5987 next = rb_funcallv(current, succ, 0, 0);
5988 if ((*each)(current, arg)) break;
5989 if (NIL_P(next)) break;
5990 current = next;
5991 StringValue(current);
5992 if (excl && rb_str_equal(current, end)) break;
5993 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5994 break;
5995 }
5996
5997 return beg;
5998}
5999
6000VALUE
6001rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
6002{
6003 VALUE current;
6004 ID succ;
6005
6006 CONST_ID(succ, "succ");
6007 /* both edges are all digits */
6008 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
6009 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
6010 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
6011 int width = RSTRING_LENINT(beg);
6012 b = rb_str_to_inum(beg, 10, FALSE);
6013 if (FIXNUM_P(b)) {
6014 long bi = FIX2LONG(b);
6015 rb_encoding *usascii = rb_usascii_encoding();
6016
6017 while (FIXABLE(bi)) {
6018 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
6019 bi++;
6020 }
6021 b = LONG2NUM(bi);
6022 }
6023 args[0] = INT2FIX(width);
6024 while (1) {
6025 args[1] = b;
6026 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
6027 b = rb_funcallv(b, succ, 0, 0);
6028 }
6029 }
6030 /* normal case */
6031 current = str_duplicate(rb_cString, beg);
6032 while (1) {
6033 VALUE next = rb_funcallv(current, succ, 0, 0);
6034 if ((*each)(current, arg)) break;
6035 current = next;
6036 StringValue(current);
6037 if (RSTRING_LEN(current) == 0)
6038 break;
6039 }
6040
6041 return beg;
6042}
6043
6044static int
6045include_range_i(VALUE str, VALUE arg)
6046{
6047 VALUE *argp = (VALUE *)arg;
6048 if (!rb_equal(str, *argp)) return 0;
6049 *argp = Qnil;
6050 return 1;
6051}
6052
6053VALUE
6054rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
6055{
6056 beg = rb_str_new_frozen(beg);
6057 StringValue(end);
6058 end = rb_str_new_frozen(end);
6059 if (NIL_P(val)) return Qfalse;
6060 val = rb_check_string_type(val);
6061 if (NIL_P(val)) return Qfalse;
6062 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
6063 rb_enc_asciicompat(STR_ENC_GET(end)) &&
6064 rb_enc_asciicompat(STR_ENC_GET(val))) {
6065 const char *bp = RSTRING_PTR(beg);
6066 const char *ep = RSTRING_PTR(end);
6067 const char *vp = RSTRING_PTR(val);
6068 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
6069 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
6070 return Qfalse;
6071 else {
6072 char b = *bp;
6073 char e = *ep;
6074 char v = *vp;
6075
6076 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
6077 if (b <= v && v < e) return Qtrue;
6078 return RBOOL(!RTEST(exclusive) && v == e);
6079 }
6080 }
6081 }
6082#if 0
6083 /* both edges are all digits */
6084 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
6085 all_digits_p(bp, RSTRING_LEN(beg)) &&
6086 all_digits_p(ep, RSTRING_LEN(end))) {
6087 /* TODO */
6088 }
6089#endif
6090 }
6091 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
6092
6093 return RBOOL(NIL_P(val));
6094}
6095
6096static VALUE
6097rb_str_subpat(VALUE str, VALUE re, VALUE backref)
6098{
6099 if (rb_reg_search(re, str, 0, 0) >= 0) {
6100 VALUE match = rb_backref_get();
6101 int nth = rb_reg_backref_number(match, backref);
6102 return rb_reg_nth_match(nth, match);
6103 }
6104 return Qnil;
6105}
6106
6107static VALUE
6108rb_str_aref(VALUE str, VALUE indx)
6109{
6110 long idx;
6111
6112 if (FIXNUM_P(indx)) {
6113 idx = FIX2LONG(indx);
6114 }
6115 else if (RB_TYPE_P(indx, T_REGEXP)) {
6116 return rb_str_subpat(str, indx, INT2FIX(0));
6117 }
6118 else if (RB_TYPE_P(indx, T_STRING)) {
6119 if (rb_str_index(str, indx, 0) != -1)
6120 return str_duplicate(rb_cString, indx);
6121 return Qnil;
6122 }
6123 else {
6124 /* check if indx is Range */
6125 long beg, len = str_strlen(str, NULL);
6126 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6127 case Qfalse:
6128 break;
6129 case Qnil:
6130 return Qnil;
6131 default:
6132 return rb_str_substr(str, beg, len);
6133 }
6134 idx = NUM2LONG(indx);
6135 }
6136
6137 return str_substr(str, idx, 1, FALSE);
6138}
6139
6140
6141/*
6142 * call-seq:
6143 * self[index] -> new_string or nil
6144 * self[start, length] -> new_string or nil
6145 * self[range] -> new_string or nil
6146 * self[regexp, capture = 0] -> new_string or nil
6147 * self[substring] -> new_string or nil
6148 *
6149 * Returns the substring of +self+ specified by the arguments.
6150 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
6151 *
6152 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6153 */
6154
6155static VALUE
6156rb_str_aref_m(int argc, VALUE *argv, VALUE str)
6157{
6158 if (argc == 2) {
6159 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6160 return rb_str_subpat(str, argv[0], argv[1]);
6161 }
6162 else {
6163 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
6164 }
6165 }
6166 rb_check_arity(argc, 1, 2);
6167 return rb_str_aref(str, argv[0]);
6168}
6169
6170VALUE
6172{
6173 char *ptr = RSTRING_PTR(str);
6174 long olen = RSTRING_LEN(str), nlen;
6175
6176 str_modifiable(str);
6177 if (len > olen) len = olen;
6178 nlen = olen - len;
6179 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
6180 char *oldptr = ptr;
6181 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
6182 STR_SET_EMBED(str);
6183 ptr = RSTRING(str)->as.embed.ary;
6184 memmove(ptr, oldptr + len, nlen);
6185 if (fl == STR_NOEMBED) xfree(oldptr);
6186 }
6187 else {
6188 if (!STR_SHARED_P(str)) {
6189 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
6190 rb_enc_cr_str_exact_copy(shared, str);
6191 OBJ_FREEZE(shared);
6192 }
6193 ptr = RSTRING(str)->as.heap.ptr += len;
6194 }
6195 STR_SET_LEN(str, nlen);
6196
6197 if (!SHARABLE_MIDDLE_SUBSTRING) {
6198 TERM_FILL(ptr + nlen, TERM_LEN(str));
6199 }
6201 return str;
6202}
6203
6204static void
6205rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
6206{
6207 char *sptr;
6208 long slen;
6209 int cr;
6210
6211 if (beg == 0 && vlen == 0) {
6212 rb_str_drop_bytes(str, len);
6213 return;
6214 }
6215
6216 str_modify_keep_cr(str);
6217 RSTRING_GETMEM(str, sptr, slen);
6218 if (len < vlen) {
6219 /* expand string */
6220 RESIZE_CAPA(str, slen + vlen - len);
6221 sptr = RSTRING_PTR(str);
6222 }
6223
6225 cr = rb_enc_str_coderange(val);
6226 else
6228
6229 if (vlen != len) {
6230 memmove(sptr + beg + vlen,
6231 sptr + beg + len,
6232 slen - (beg + len));
6233 }
6234 if (vlen < beg && len < 0) {
6235 MEMZERO(sptr + slen, char, -len);
6236 }
6237 if (vlen > 0) {
6238 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
6239 }
6240 slen += vlen - len;
6241 STR_SET_LEN(str, slen);
6242 TERM_FILL(&sptr[slen], TERM_LEN(str));
6243 ENC_CODERANGE_SET(str, cr);
6244}
6245
6246static inline void
6247rb_str_update_0(VALUE str, long beg, long len, VALUE val)
6248{
6249 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
6250}
6251
6252void
6253rb_str_update(VALUE str, long beg, long len, VALUE val)
6254{
6255 long slen;
6256 char *p, *e;
6257 rb_encoding *enc;
6258 int singlebyte = single_byte_optimizable(str);
6259 int cr;
6260
6261 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6262
6263 StringValue(val);
6264 enc = rb_enc_check(str, val);
6265 slen = str_strlen(str, enc); /* rb_enc_check */
6266
6267 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6268 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6269 }
6270 if (beg < 0) {
6271 beg += slen;
6272 }
6273 RUBY_ASSERT(beg >= 0);
6274 RUBY_ASSERT(beg <= slen);
6275
6276 if (len > slen - beg) {
6277 len = slen - beg;
6278 }
6279 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
6280 if (!p) p = RSTRING_END(str);
6281 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
6282 if (!e) e = RSTRING_END(str);
6283 /* error check */
6284 beg = p - RSTRING_PTR(str); /* physical position */
6285 len = e - p; /* physical length */
6286 rb_str_update_0(str, beg, len, val);
6287 rb_enc_associate(str, enc);
6289 if (cr != ENC_CODERANGE_BROKEN)
6290 ENC_CODERANGE_SET(str, cr);
6291}
6292
6293static void
6294rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
6295{
6296 int nth;
6297 VALUE match;
6298 long start, end, len;
6299 rb_encoding *enc;
6300 struct re_registers *regs;
6301
6302 if (rb_reg_search(re, str, 0, 0) < 0) {
6303 rb_raise(rb_eIndexError, "regexp not matched");
6304 }
6305 match = rb_backref_get();
6306 nth = rb_reg_backref_number(match, backref);
6307 regs = RMATCH_REGS(match);
6308 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
6309 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
6310 }
6311 if (nth < 0) {
6312 nth += regs->num_regs;
6313 }
6314
6315 start = BEG(nth);
6316 if (start == -1) {
6317 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
6318 }
6319 end = END(nth);
6320 len = end - start;
6321 StringValue(val);
6322 enc = rb_enc_check_str(str, val);
6323 rb_str_update_0(str, start, len, val);
6324 rb_enc_associate(str, enc);
6325}
6326
6327static VALUE
6328rb_str_aset(VALUE str, VALUE indx, VALUE val)
6329{
6330 long idx, beg;
6331
6332 switch (TYPE(indx)) {
6333 case T_REGEXP:
6334 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6335 return val;
6336
6337 case T_STRING:
6338 beg = rb_str_index(str, indx, 0);
6339 if (beg < 0) {
6340 rb_raise(rb_eIndexError, "string not matched");
6341 }
6342 beg = rb_str_sublen(str, beg);
6343 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6344 return val;
6345
6346 default:
6347 /* check if indx is Range */
6348 {
6349 long beg, len;
6350 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6351 rb_str_update(str, beg, len, val);
6352 return val;
6353 }
6354 }
6355 /* FALLTHROUGH */
6356
6357 case T_FIXNUM:
6358 idx = NUM2LONG(indx);
6359 rb_str_update(str, idx, 1, val);
6360 return val;
6361 }
6362}
6363
6364/*
6365 * call-seq:
6366 * self[index] = new_string
6367 * self[start, length] = new_string
6368 * self[range] = new_string
6369 * self[regexp, capture = 0] = new_string
6370 * self[substring] = new_string
6371 *
6372 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6373 * See {String Slices}[rdoc-ref:String@String+Slices].
6374 *
6375 * A few examples:
6376 *
6377 * s = 'foo'
6378 * s[2] = 'rtune' # => "rtune"
6379 * s # => "fortune"
6380 * s[1, 5] = 'init' # => "init"
6381 * s # => "finite"
6382 * s[3..4] = 'al' # => "al"
6383 * s # => "finale"
6384 * s[/e$/] = 'ly' # => "ly"
6385 * s # => "finally"
6386 * s['lly'] = 'ncial' # => "ncial"
6387 * s # => "financial"
6388 *
6389 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6390 */
6391
6392static VALUE
6393rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6394{
6395 if (argc == 3) {
6396 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6397 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6398 }
6399 else {
6400 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6401 }
6402 return argv[2];
6403 }
6404 rb_check_arity(argc, 2, 3);
6405 return rb_str_aset(str, argv[0], argv[1]);
6406}
6407
6408/*
6409 * call-seq:
6410 * insert(index, other_string) -> self
6411 *
6412 * Inserts the given +other_string+ into +self+; returns +self+.
6413 *
6414 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
6415 *
6416 * 'foo'.insert(1, 'bar') # => "fbaroo"
6417 *
6418 * If the Integer +index+ is negative, counts backward from the end of +self+
6419 * and inserts +other_string+ at offset <tt>index+1</tt>
6420 * (that is, _after_ <tt>self[index]</tt>):
6421 *
6422 * 'foo'.insert(-2, 'bar') # => "fobaro"
6423 *
6424 */
6425
6426static VALUE
6427rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6428{
6429 long pos = NUM2LONG(idx);
6430
6431 if (pos == -1) {
6432 return rb_str_append(str, str2);
6433 }
6434 else if (pos < 0) {
6435 pos++;
6436 }
6437 rb_str_update(str, pos, 0, str2);
6438 return str;
6439}
6440
6441
6442/*
6443 * call-seq:
6444 * slice!(index) -> new_string or nil
6445 * slice!(start, length) -> new_string or nil
6446 * slice!(range) -> new_string or nil
6447 * slice!(regexp, capture = 0) -> new_string or nil
6448 * slice!(substring) -> new_string or nil
6449 *
6450 * Removes and returns the substring of +self+ specified by the arguments.
6451 * See {String Slices}[rdoc-ref:String@String+Slices].
6452 *
6453 * A few examples:
6454 *
6455 * string = "This is a string"
6456 * string.slice!(2) #=> "i"
6457 * string.slice!(3..6) #=> " is "
6458 * string.slice!(/s.*t/) #=> "sa st"
6459 * string.slice!("r") #=> "r"
6460 * string #=> "Thing"
6461 *
6462 */
6463
6464static VALUE
6465rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6466{
6467 VALUE result = Qnil;
6468 VALUE indx;
6469 long beg, len = 1;
6470 char *p;
6471
6472 rb_check_arity(argc, 1, 2);
6473 str_modify_keep_cr(str);
6474 indx = argv[0];
6475 if (RB_TYPE_P(indx, T_REGEXP)) {
6476 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6477 VALUE match = rb_backref_get();
6478 struct re_registers *regs = RMATCH_REGS(match);
6479 int nth = 0;
6480 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6481 if ((nth += regs->num_regs) <= 0) return Qnil;
6482 }
6483 else if (nth >= regs->num_regs) return Qnil;
6484 beg = BEG(nth);
6485 len = END(nth) - beg;
6486 goto subseq;
6487 }
6488 else if (argc == 2) {
6489 beg = NUM2LONG(indx);
6490 len = NUM2LONG(argv[1]);
6491 goto num_index;
6492 }
6493 else if (FIXNUM_P(indx)) {
6494 beg = FIX2LONG(indx);
6495 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6496 if (!len) return Qnil;
6497 beg = p - RSTRING_PTR(str);
6498 goto subseq;
6499 }
6500 else if (RB_TYPE_P(indx, T_STRING)) {
6501 beg = rb_str_index(str, indx, 0);
6502 if (beg == -1) return Qnil;
6503 len = RSTRING_LEN(indx);
6504 result = str_duplicate(rb_cString, indx);
6505 goto squash;
6506 }
6507 else {
6508 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6509 case Qnil:
6510 return Qnil;
6511 case Qfalse:
6512 beg = NUM2LONG(indx);
6513 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6514 if (!len) return Qnil;
6515 beg = p - RSTRING_PTR(str);
6516 goto subseq;
6517 default:
6518 goto num_index;
6519 }
6520 }
6521
6522 num_index:
6523 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6524 beg = p - RSTRING_PTR(str);
6525
6526 subseq:
6527 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6528 rb_enc_cr_str_copy_for_substr(result, str);
6529
6530 squash:
6531 if (len > 0) {
6532 if (beg == 0) {
6533 rb_str_drop_bytes(str, len);
6534 }
6535 else {
6536 char *sptr = RSTRING_PTR(str);
6537 long slen = RSTRING_LEN(str);
6538 if (beg + len > slen) /* pathological check */
6539 len = slen - beg;
6540 memmove(sptr + beg,
6541 sptr + beg + len,
6542 slen - (beg + len));
6543 slen -= len;
6544 STR_SET_LEN(str, slen);
6545 TERM_FILL(&sptr[slen], TERM_LEN(str));
6546 }
6547 }
6548 return result;
6549}
6550
6551static VALUE
6552get_pat(VALUE pat)
6553{
6554 VALUE val;
6555
6556 switch (OBJ_BUILTIN_TYPE(pat)) {
6557 case T_REGEXP:
6558 return pat;
6559
6560 case T_STRING:
6561 break;
6562
6563 default:
6564 val = rb_check_string_type(pat);
6565 if (NIL_P(val)) {
6566 Check_Type(pat, T_REGEXP);
6567 }
6568 pat = val;
6569 }
6570
6571 return rb_reg_regcomp(pat);
6572}
6573
6574static VALUE
6575get_pat_quoted(VALUE pat, int check)
6576{
6577 VALUE val;
6578
6579 switch (OBJ_BUILTIN_TYPE(pat)) {
6580 case T_REGEXP:
6581 return pat;
6582
6583 case T_STRING:
6584 break;
6585
6586 default:
6587 val = rb_check_string_type(pat);
6588 if (NIL_P(val)) {
6589 Check_Type(pat, T_REGEXP);
6590 }
6591 pat = val;
6592 }
6593 if (check && is_broken_string(pat)) {
6594 rb_exc_raise(rb_reg_check_preprocess(pat));
6595 }
6596 return pat;
6597}
6598
6599static long
6600rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6601{
6602 if (BUILTIN_TYPE(pat) == T_STRING) {
6603 pos = rb_str_byteindex(str, pat, pos);
6604 if (set_backref_str) {
6605 if (pos >= 0) {
6606 str = rb_str_new_frozen_String(str);
6607 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6608 if (match) {
6609 *match = match_data;
6610 }
6611 }
6612 else {
6614 }
6615 }
6616 return pos;
6617 }
6618 else {
6619 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6620 }
6621}
6622
6623static long
6624rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6625{
6626 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6627}
6628
6629
6630/*
6631 * call-seq:
6632 * sub!(pattern, replacement) -> self or nil
6633 * sub!(pattern) {|match| ... } -> self or nil
6634 *
6635 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6636 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6637 *
6638 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6639 *
6640 * Related: String#sub, String#gsub, String#gsub!.
6641 *
6642 */
6643
6644static VALUE
6645rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6646{
6647 VALUE pat, repl, hash = Qnil;
6648 int iter = 0;
6649 long plen;
6650 int min_arity = rb_block_given_p() ? 1 : 2;
6651 long beg;
6652
6653 rb_check_arity(argc, min_arity, 2);
6654 if (argc == 1) {
6655 iter = 1;
6656 }
6657 else {
6658 repl = argv[1];
6659 hash = rb_check_hash_type(argv[1]);
6660 if (NIL_P(hash)) {
6661 StringValue(repl);
6662 }
6663 }
6664
6665 pat = get_pat_quoted(argv[0], 1);
6666
6667 str_modifiable(str);
6668 beg = rb_pat_search(pat, str, 0, 1);
6669 if (beg >= 0) {
6670 rb_encoding *enc;
6671 int cr = ENC_CODERANGE(str);
6672 long beg0, end0;
6673 VALUE match, match0 = Qnil;
6674 struct re_registers *regs;
6675 char *p, *rp;
6676 long len, rlen;
6677
6678 match = rb_backref_get();
6679 regs = RMATCH_REGS(match);
6680 if (RB_TYPE_P(pat, T_STRING)) {
6681 beg0 = beg;
6682 end0 = beg0 + RSTRING_LEN(pat);
6683 match0 = pat;
6684 }
6685 else {
6686 beg0 = BEG(0);
6687 end0 = END(0);
6688 if (iter) match0 = rb_reg_nth_match(0, match);
6689 }
6690
6691 if (iter || !NIL_P(hash)) {
6692 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6693
6694 if (iter) {
6695 repl = rb_obj_as_string(rb_yield(match0));
6696 }
6697 else {
6698 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6699 repl = rb_obj_as_string(repl);
6700 }
6701 str_mod_check(str, p, len);
6702 rb_check_frozen(str);
6703 }
6704 else {
6705 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6706 }
6707
6708 enc = rb_enc_compatible(str, repl);
6709 if (!enc) {
6710 rb_encoding *str_enc = STR_ENC_GET(str);
6711 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6712 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6713 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6714 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6715 rb_enc_inspect_name(str_enc),
6716 rb_enc_inspect_name(STR_ENC_GET(repl)));
6717 }
6718 enc = STR_ENC_GET(repl);
6719 }
6720 rb_str_modify(str);
6721 rb_enc_associate(str, enc);
6723 int cr2 = ENC_CODERANGE(repl);
6724 if (cr2 == ENC_CODERANGE_BROKEN ||
6725 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6727 else
6728 cr = cr2;
6729 }
6730 plen = end0 - beg0;
6731 rlen = RSTRING_LEN(repl);
6732 len = RSTRING_LEN(str);
6733 if (rlen > plen) {
6734 RESIZE_CAPA(str, len + rlen - plen);
6735 }
6736 p = RSTRING_PTR(str);
6737 if (rlen != plen) {
6738 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6739 }
6740 rp = RSTRING_PTR(repl);
6741 memmove(p + beg0, rp, rlen);
6742 len += rlen - plen;
6743 STR_SET_LEN(str, len);
6744 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6745 ENC_CODERANGE_SET(str, cr);
6746
6747 RB_GC_GUARD(match);
6748
6749 return str;
6750 }
6751 return Qnil;
6752}
6753
6754
6755/*
6756 * call-seq:
6757 * sub(pattern, replacement) -> new_string
6758 * sub(pattern) {|match| ... } -> new_string
6759 *
6760 * Returns a copy of +self+ with only the first occurrence
6761 * (not all occurrences) of the given +pattern+ replaced.
6762 *
6763 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6764 *
6765 * Related: String#sub!, String#gsub, String#gsub!.
6766 *
6767 */
6768
6769static VALUE
6770rb_str_sub(int argc, VALUE *argv, VALUE str)
6771{
6772 str = str_duplicate(rb_cString, str);
6773 rb_str_sub_bang(argc, argv, str);
6774 return str;
6775}
6776
6777static VALUE
6778str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6779{
6780 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6781 long beg, beg0, end0;
6782 long offset, blen, slen, len, last;
6783 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6784 char *sp, *cp;
6785 int need_backref_str = -1;
6786 rb_encoding *str_enc;
6787
6788 switch (argc) {
6789 case 1:
6790 RETURN_ENUMERATOR(str, argc, argv);
6791 mode = ITER;
6792 break;
6793 case 2:
6794 repl = argv[1];
6795 hash = rb_check_hash_type(argv[1]);
6796 if (NIL_P(hash)) {
6797 StringValue(repl);
6798 }
6799 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6800 mode = FAST_MAP;
6801 }
6802 else {
6803 mode = MAP;
6804 }
6805 break;
6806 default:
6807 rb_error_arity(argc, 1, 2);
6808 }
6809
6810 pat = get_pat_quoted(argv[0], 1);
6811 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6812
6813 if (beg < 0) {
6814 if (bang) return Qnil; /* no match, no substitution */
6815 return str_duplicate(rb_cString, str);
6816 }
6817
6818 offset = 0;
6819 blen = RSTRING_LEN(str) + 30; /* len + margin */
6820 dest = rb_str_buf_new(blen);
6821 sp = RSTRING_PTR(str);
6822 slen = RSTRING_LEN(str);
6823 cp = sp;
6824 str_enc = STR_ENC_GET(str);
6825 rb_enc_associate(dest, str_enc);
6826 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6827
6828 do {
6829 struct re_registers *regs = RMATCH_REGS(match);
6830 if (RB_TYPE_P(pat, T_STRING)) {
6831 beg0 = beg;
6832 end0 = beg0 + RSTRING_LEN(pat);
6833 match0 = pat;
6834 }
6835 else {
6836 beg0 = BEG(0);
6837 end0 = END(0);
6838 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6839 }
6840
6841 if (mode != STR) {
6842 if (mode == ITER) {
6843 val = rb_obj_as_string(rb_yield(match0));
6844 }
6845 else {
6846 struct RString fake_str;
6847 VALUE key;
6848 if (mode == FAST_MAP) {
6849 // It is safe to use a fake_str here because we established that it won't escape,
6850 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6851 // default proc.
6852 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6853 }
6854 else {
6855 key = rb_str_subseq(str, beg0, end0 - beg0);
6856 }
6857 val = rb_hash_aref(hash, key);
6858 val = rb_obj_as_string(val);
6859 }
6860 str_mod_check(str, sp, slen);
6861 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6862 rb_raise(rb_eRuntimeError, "block should not cheat");
6863 }
6864 }
6865 else if (need_backref_str) {
6866 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6867 if (need_backref_str < 0) {
6868 need_backref_str = val != repl;
6869 }
6870 }
6871 else {
6872 val = repl;
6873 }
6874
6875 len = beg0 - offset; /* copy pre-match substr */
6876 if (len) {
6877 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6878 }
6879
6880 rb_str_buf_append(dest, val);
6881
6882 last = offset;
6883 offset = end0;
6884 if (beg0 == end0) {
6885 /*
6886 * Always consume at least one character of the input string
6887 * in order to prevent infinite loops.
6888 */
6889 if (RSTRING_LEN(str) <= end0) break;
6890 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6891 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6892 offset = end0 + len;
6893 }
6894 cp = RSTRING_PTR(str) + offset;
6895 if (offset > RSTRING_LEN(str)) break;
6896
6897 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6898 if (mode != FAST_MAP && mode != STR) {
6899 match = Qnil;
6900 }
6901 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6902
6903 RB_GC_GUARD(match);
6904 } while (beg >= 0);
6905
6906 if (RSTRING_LEN(str) > offset) {
6907 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6908 }
6909 rb_pat_search0(pat, str, last, 1, &match);
6910 if (bang) {
6911 str_shared_replace(str, dest);
6912 }
6913 else {
6914 str = dest;
6915 }
6916
6917 return str;
6918}
6919
6920
6921/*
6922 * call-seq:
6923 * gsub!(pattern, replacement) -> self or nil
6924 * gsub!(pattern) {|match| ... } -> self or nil
6925 * gsub!(pattern) -> an_enumerator
6926 *
6927 * Performs the specified substring replacement(s) on +self+;
6928 * returns +self+ if any replacement occurred, +nil+ otherwise.
6929 *
6930 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6931 *
6932 * Returns an Enumerator if no +replacement+ and no block given.
6933 *
6934 * Related: String#sub, String#gsub, String#sub!.
6935 *
6936 */
6937
6938static VALUE
6939rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6940{
6941 str_modify_keep_cr(str);
6942 return str_gsub(argc, argv, str, 1);
6943}
6944
6945
6946/*
6947 * call-seq:
6948 * gsub(pattern, replacement) -> new_string
6949 * gsub(pattern) {|match| ... } -> new_string
6950 * gsub(pattern) -> enumerator
6951 *
6952 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6953 *
6954 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6955 *
6956 * Returns an Enumerator if no +replacement+ and no block given.
6957 *
6958 * Related: String#sub, String#sub!, String#gsub!.
6959 *
6960 */
6961
6962static VALUE
6963rb_str_gsub(int argc, VALUE *argv, VALUE str)
6964{
6965 return str_gsub(argc, argv, str, 0);
6966}
6967
6968
6969/*
6970 * call-seq:
6971 * replace(other_string) -> self
6972 *
6973 * Replaces the contents of +self+ with the contents of +other_string+:
6974 *
6975 * s = 'foo' # => "foo"
6976 * s.replace('bar') # => "bar"
6977 *
6978 */
6979
6980VALUE
6982{
6983 str_modifiable(str);
6984 if (str == str2) return str;
6985
6986 StringValue(str2);
6987 str_discard(str);
6988 return str_replace(str, str2);
6989}
6990
6991/*
6992 * call-seq:
6993 * clear -> self
6994 *
6995 * Removes the contents of +self+:
6996 *
6997 * s = 'foo' # => "foo"
6998 * s.clear # => ""
6999 *
7000 */
7001
7002static VALUE
7003rb_str_clear(VALUE str)
7004{
7005 str_discard(str);
7006 STR_SET_EMBED(str);
7007 STR_SET_LEN(str, 0);
7008 RSTRING_PTR(str)[0] = 0;
7009 if (rb_enc_asciicompat(STR_ENC_GET(str)))
7011 else
7013 return str;
7014}
7015
7016/*
7017 * call-seq:
7018 * chr -> string
7019 *
7020 * Returns a string containing the first character of +self+:
7021 *
7022 * s = 'foo' # => "foo"
7023 * s.chr # => "f"
7024 *
7025 */
7026
7027static VALUE
7028rb_str_chr(VALUE str)
7029{
7030 return rb_str_substr(str, 0, 1);
7031}
7032
7033/*
7034 * call-seq:
7035 * getbyte(index) -> integer or nil
7036 *
7037 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
7038 *
7039 * s = 'abcde' # => "abcde"
7040 * s.getbyte(0) # => 97
7041 * s.getbyte(-1) # => 101
7042 * s.getbyte(5) # => nil
7043 *
7044 * Related: String#setbyte.
7045 */
7046VALUE
7047rb_str_getbyte(VALUE str, VALUE index)
7048{
7049 long pos = NUM2LONG(index);
7050
7051 if (pos < 0)
7052 pos += RSTRING_LEN(str);
7053 if (pos < 0 || RSTRING_LEN(str) <= pos)
7054 return Qnil;
7055
7056 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
7057}
7058
7059/*
7060 * call-seq:
7061 * setbyte(index, integer) -> integer
7062 *
7063 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
7064 *
7065 * s = 'abcde' # => "abcde"
7066 * s.setbyte(0, 98) # => 98
7067 * s # => "bbcde"
7068 *
7069 * Related: String#getbyte.
7070 */
7071VALUE
7072rb_str_setbyte(VALUE str, VALUE index, VALUE value)
7073{
7074 long pos = NUM2LONG(index);
7075 long len = RSTRING_LEN(str);
7076 char *ptr, *head, *left = 0;
7077 rb_encoding *enc;
7078 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
7079
7080 if (pos < -len || len <= pos)
7081 rb_raise(rb_eIndexError, "index %ld out of string", pos);
7082 if (pos < 0)
7083 pos += len;
7084
7085 VALUE v = rb_to_int(value);
7086 VALUE w = rb_int_and(v, INT2FIX(0xff));
7087 char byte = (char)(NUM2INT(w) & 0xFF);
7088
7089 if (!str_independent(str))
7090 str_make_independent(str);
7091 enc = STR_ENC_GET(str);
7092 head = RSTRING_PTR(str);
7093 ptr = &head[pos];
7094 if (!STR_EMBED_P(str)) {
7095 cr = ENC_CODERANGE(str);
7096 switch (cr) {
7097 case ENC_CODERANGE_7BIT:
7098 left = ptr;
7099 *ptr = byte;
7100 if (ISASCII(byte)) goto end;
7101 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7102 if (!MBCLEN_CHARFOUND_P(nlen))
7104 else
7106 goto end;
7108 left = rb_enc_left_char_head(head, ptr, head+len, enc);
7109 width = rb_enc_precise_mbclen(left, head+len, enc);
7110 *ptr = byte;
7111 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7112 if (!MBCLEN_CHARFOUND_P(nlen))
7114 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
7116 goto end;
7117 }
7118 }
7120 *ptr = byte;
7121
7122 end:
7123 return value;
7124}
7125
7126static VALUE
7127str_byte_substr(VALUE str, long beg, long len, int empty)
7128{
7129 long n = RSTRING_LEN(str);
7130
7131 if (beg > n || len < 0) return Qnil;
7132 if (beg < 0) {
7133 beg += n;
7134 if (beg < 0) return Qnil;
7135 }
7136 if (len > n - beg)
7137 len = n - beg;
7138 if (len <= 0) {
7139 if (!empty) return Qnil;
7140 len = 0;
7141 }
7142
7143 VALUE str2 = str_subseq(str, beg, len);
7144
7145 str_enc_copy_direct(str2, str);
7146
7147 if (RSTRING_LEN(str2) == 0) {
7148 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
7150 else
7152 }
7153 else {
7154 switch (ENC_CODERANGE(str)) {
7155 case ENC_CODERANGE_7BIT:
7157 break;
7158 default:
7160 break;
7161 }
7162 }
7163
7164 return str2;
7165}
7166
7167VALUE
7168rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
7169{
7170 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
7171}
7172
7173static VALUE
7174str_byte_aref(VALUE str, VALUE indx)
7175{
7176 long idx;
7177 if (FIXNUM_P(indx)) {
7178 idx = FIX2LONG(indx);
7179 }
7180 else {
7181 /* check if indx is Range */
7182 long beg, len = RSTRING_LEN(str);
7183
7184 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
7185 case Qfalse:
7186 break;
7187 case Qnil:
7188 return Qnil;
7189 default:
7190 return str_byte_substr(str, beg, len, TRUE);
7191 }
7192
7193 idx = NUM2LONG(indx);
7194 }
7195 return str_byte_substr(str, idx, 1, FALSE);
7196}
7197
7198/*
7199 * call-seq:
7200 * byteslice(index, length = 1) -> string or nil
7201 * byteslice(range) -> string or nil
7202 *
7203 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
7204 *
7205 * With integer arguments +index+ and +length+ given,
7206 * returns the substring beginning at the given +index+
7207 * of the given +length+ (if possible),
7208 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
7209 *
7210 * s = '0123456789' # => "0123456789"
7211 * s.byteslice(2) # => "2"
7212 * s.byteslice(200) # => nil
7213 * s.byteslice(4, 3) # => "456"
7214 * s.byteslice(4, 30) # => "456789"
7215 * s.byteslice(4, -1) # => nil
7216 * s.byteslice(40, 2) # => nil
7217 *
7218 * In either case above, counts backwards from the end of +self+
7219 * if +index+ is negative:
7220 *
7221 * s = '0123456789' # => "0123456789"
7222 * s.byteslice(-4) # => "6"
7223 * s.byteslice(-4, 3) # => "678"
7224 *
7225 * With Range argument +range+ given, returns
7226 * <tt>byteslice(range.begin, range.size)</tt>:
7227 *
7228 * s = '0123456789' # => "0123456789"
7229 * s.byteslice(4..6) # => "456"
7230 * s.byteslice(-6..-4) # => "456"
7231 * s.byteslice(5..2) # => "" # range.size is zero.
7232 * s.byteslice(40..42) # => nil
7233 *
7234 * In all cases, a returned string has the same encoding as +self+:
7235 *
7236 * s.encoding # => #<Encoding:UTF-8>
7237 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
7238 *
7239 */
7240
7241static VALUE
7242rb_str_byteslice(int argc, VALUE *argv, VALUE str)
7243{
7244 if (argc == 2) {
7245 long beg = NUM2LONG(argv[0]);
7246 long len = NUM2LONG(argv[1]);
7247 return str_byte_substr(str, beg, len, TRUE);
7248 }
7249 rb_check_arity(argc, 1, 2);
7250 return str_byte_aref(str, argv[0]);
7251}
7252
7253static void
7254str_check_beg_len(VALUE str, long *beg, long *len)
7255{
7256 long end, slen = RSTRING_LEN(str);
7257
7258 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
7259 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
7260 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
7261 }
7262 if (*beg < 0) {
7263 *beg += slen;
7264 }
7265 RUBY_ASSERT(*beg >= 0);
7266 RUBY_ASSERT(*beg <= slen);
7267
7268 if (*len > slen - *beg) {
7269 *len = slen - *beg;
7270 }
7271 end = *beg + *len;
7272 str_ensure_byte_pos(str, *beg);
7273 str_ensure_byte_pos(str, end);
7274}
7275
7276/*
7277 * call-seq:
7278 * bytesplice(index, length, str) -> string
7279 * bytesplice(index, length, str, str_index, str_length) -> string
7280 * bytesplice(range, str) -> string
7281 * bytesplice(range, str, str_range) -> string
7282 *
7283 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
7284 * The portion of the string affected is determined using
7285 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
7286 * If the replacement string is not the same length as the text it is replacing,
7287 * the string will be adjusted accordingly.
7288 *
7289 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
7290 *
7291 * The forms that take an Integer raise an IndexError if the value is out
7292 * of range; the Range forms raise a RangeError.
7293 * If the beginning or ending offset does not land on character (codepoint)
7294 * boundary, an IndexError will be raised.
7295 */
7296
7297static VALUE
7298rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
7299{
7300 long beg, len, vbeg, vlen;
7301 VALUE val;
7302 int cr;
7303
7304 rb_check_arity(argc, 2, 5);
7305 if (!(argc == 2 || argc == 3 || argc == 5)) {
7306 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
7307 }
7308 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
7309 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
7310 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7311 rb_builtin_class_name(argv[0]));
7312 }
7313 val = argv[1];
7314 StringValue(val);
7315 if (argc == 2) {
7316 /* bytesplice(range, str) */
7317 vbeg = 0;
7318 vlen = RSTRING_LEN(val);
7319 }
7320 else {
7321 /* bytesplice(range, str, str_range) */
7322 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
7323 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7324 rb_builtin_class_name(argv[2]));
7325 }
7326 }
7327 }
7328 else {
7329 beg = NUM2LONG(argv[0]);
7330 len = NUM2LONG(argv[1]);
7331 val = argv[2];
7332 StringValue(val);
7333 if (argc == 3) {
7334 /* bytesplice(index, length, str) */
7335 vbeg = 0;
7336 vlen = RSTRING_LEN(val);
7337 }
7338 else {
7339 /* bytesplice(index, length, str, str_index, str_length) */
7340 vbeg = NUM2LONG(argv[3]);
7341 vlen = NUM2LONG(argv[4]);
7342 }
7343 }
7344 str_check_beg_len(str, &beg, &len);
7345 str_check_beg_len(val, &vbeg, &vlen);
7346 str_modify_keep_cr(str);
7347
7348 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
7349 rb_enc_associate(str, rb_enc_check(str, val));
7350 }
7351
7352 rb_str_update_1(str, beg, len, val, vbeg, vlen);
7354 if (cr != ENC_CODERANGE_BROKEN)
7355 ENC_CODERANGE_SET(str, cr);
7356 return str;
7357}
7358
7359/*
7360 * call-seq:
7361 * reverse -> string
7362 *
7363 * Returns a new string with the characters from +self+ in reverse order.
7364 *
7365 * 'stressed'.reverse # => "desserts"
7366 *
7367 */
7368
7369static VALUE
7370rb_str_reverse(VALUE str)
7371{
7372 rb_encoding *enc;
7373 VALUE rev;
7374 char *s, *e, *p;
7375 int cr;
7376
7377 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7378 enc = STR_ENC_GET(str);
7379 rev = rb_str_new(0, RSTRING_LEN(str));
7380 s = RSTRING_PTR(str); e = RSTRING_END(str);
7381 p = RSTRING_END(rev);
7382 cr = ENC_CODERANGE(str);
7383
7384 if (RSTRING_LEN(str) > 1) {
7385 if (single_byte_optimizable(str)) {
7386 while (s < e) {
7387 *--p = *s++;
7388 }
7389 }
7390 else if (cr == ENC_CODERANGE_VALID) {
7391 while (s < e) {
7392 int clen = rb_enc_fast_mbclen(s, e, enc);
7393
7394 p -= clen;
7395 memcpy(p, s, clen);
7396 s += clen;
7397 }
7398 }
7399 else {
7400 cr = rb_enc_asciicompat(enc) ?
7402 while (s < e) {
7403 int clen = rb_enc_mbclen(s, e, enc);
7404
7405 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7406 p -= clen;
7407 memcpy(p, s, clen);
7408 s += clen;
7409 }
7410 }
7411 }
7412 STR_SET_LEN(rev, RSTRING_LEN(str));
7413 str_enc_copy_direct(rev, str);
7414 ENC_CODERANGE_SET(rev, cr);
7415
7416 return rev;
7417}
7418
7419
7420/*
7421 * call-seq:
7422 * reverse! -> self
7423 *
7424 * Returns +self+ with its characters reversed:
7425 *
7426 * s = 'stressed'
7427 * s.reverse! # => "desserts"
7428 * s # => "desserts"
7429 *
7430 */
7431
7432static VALUE
7433rb_str_reverse_bang(VALUE str)
7434{
7435 if (RSTRING_LEN(str) > 1) {
7436 if (single_byte_optimizable(str)) {
7437 char *s, *e, c;
7438
7439 str_modify_keep_cr(str);
7440 s = RSTRING_PTR(str);
7441 e = RSTRING_END(str) - 1;
7442 while (s < e) {
7443 c = *s;
7444 *s++ = *e;
7445 *e-- = c;
7446 }
7447 }
7448 else {
7449 str_shared_replace(str, rb_str_reverse(str));
7450 }
7451 }
7452 else {
7453 str_modify_keep_cr(str);
7454 }
7455 return str;
7456}
7457
7458
7459/*
7460 * call-seq:
7461 * include?(other_string) -> true or false
7462 *
7463 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7464 *
7465 * s = 'foo'
7466 * s.include?('f') # => true
7467 * s.include?('fo') # => true
7468 * s.include?('food') # => false
7469 *
7470 */
7471
7472VALUE
7473rb_str_include(VALUE str, VALUE arg)
7474{
7475 long i;
7476
7477 StringValue(arg);
7478 i = rb_str_index(str, arg, 0);
7479
7480 return RBOOL(i != -1);
7481}
7482
7483
7484/*
7485 * call-seq:
7486 * to_i(base = 10) -> integer
7487 *
7488 * Returns the result of interpreting leading characters in +self+
7489 * as an integer in the given +base+ (which must be in (0, 2..36)):
7490 *
7491 * '123456'.to_i # => 123456
7492 * '123def'.to_i(16) # => 1195503
7493 *
7494 * With +base+ zero, +self+ may contain leading characters
7495 * to specify the actual base:
7496 *
7497 * '123def'.to_i(0) # => 123
7498 * '0123def'.to_i(0) # => 83
7499 * '0b123def'.to_i(0) # => 1
7500 * '0o123def'.to_i(0) # => 83
7501 * '0d123def'.to_i(0) # => 123
7502 * '0x123def'.to_i(0) # => 1195503
7503 *
7504 * Characters past a leading valid number (in the given +base+) are ignored:
7505 *
7506 * '12.345'.to_i # => 12
7507 * '12345'.to_i(2) # => 1
7508 *
7509 * Returns zero if there is no leading valid number:
7510 *
7511 * 'abcdef'.to_i # => 0
7512 * '2'.to_i(2) # => 0
7513 *
7514 */
7515
7516static VALUE
7517rb_str_to_i(int argc, VALUE *argv, VALUE str)
7518{
7519 int base = 10;
7520
7521 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7522 rb_raise(rb_eArgError, "invalid radix %d", base);
7523 }
7524 return rb_str_to_inum(str, base, FALSE);
7525}
7526
7527
7528/*
7529 * call-seq:
7530 * to_f -> float
7531 *
7532 * Returns the result of interpreting leading characters in +self+ as a Float:
7533 *
7534 * '3.14159'.to_f # => 3.14159
7535 * '1.234e-2'.to_f # => 0.01234
7536 *
7537 * Characters past a leading valid number are ignored:
7538 *
7539 * '3.14 (pi to two places)'.to_f # => 3.14
7540 *
7541 * Returns zero if there is no leading valid number:
7542 *
7543 * 'abcdef'.to_f # => 0.0
7544 *
7545 */
7546
7547static VALUE
7548rb_str_to_f(VALUE str)
7549{
7550 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7551}
7552
7553
7554/*
7555 * call-seq:
7556 * to_s -> self or string
7557 *
7558 * Returns +self+ if +self+ is a +String+,
7559 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7560 */
7561
7562static VALUE
7563rb_str_to_s(VALUE str)
7564{
7565 if (rb_obj_class(str) != rb_cString) {
7566 return str_duplicate(rb_cString, str);
7567 }
7568 return str;
7569}
7570
#if 0
/*
 * Disabled helper (compiled out): encodes codepoint +c+ in +enc+ and appends
 * the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7582
7583#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7584
7585int
7586rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7587{
7588 char buf[CHAR_ESC_LEN + 1];
7589 int l;
7590
7591#if SIZEOF_INT > 4
7592 c &= 0xffffffff;
7593#endif
7594 if (unicode_p) {
7595 if (c < 0x7F && ISPRINT(c)) {
7596 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7597 }
7598 else if (c < 0x10000) {
7599 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7600 }
7601 else {
7602 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7603 }
7604 }
7605 else {
7606 if (c < 0x100) {
7607 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7608 }
7609 else {
7610 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7611 }
7612 }
7613 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7614 rb_str_buf_cat(result, buf, l);
7615 return l;
7616}
7617
/*
 * Returns the backslash notation for the control character +c+ as a static
 * string (for example "\\n" for a newline), or NULL when +c+ has no entry in
 * the table below.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    unsigned int i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].notation;
    }
    return NULL;
}
7635
7636VALUE
7637rb_str_escape(VALUE str)
7638{
7639 int encidx = ENCODING_GET(str);
7640 rb_encoding *enc = rb_enc_from_index(encidx);
7641 const char *p = RSTRING_PTR(str);
7642 const char *pend = RSTRING_END(str);
7643 const char *prev = p;
7644 char buf[CHAR_ESC_LEN + 1];
7645 VALUE result = rb_str_buf_new(0);
7646 int unicode_p = rb_enc_unicode_p(enc);
7647 int asciicompat = rb_enc_asciicompat(enc);
7648
7649 while (p < pend) {
7650 unsigned int c;
7651 const char *cc;
7652 int n = rb_enc_precise_mbclen(p, pend, enc);
7653 if (!MBCLEN_CHARFOUND_P(n)) {
7654 if (p > prev) str_buf_cat(result, prev, p - prev);
7655 n = rb_enc_mbminlen(enc);
7656 if (pend < p + n)
7657 n = (int)(pend - p);
7658 while (n--) {
7659 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7660 str_buf_cat(result, buf, strlen(buf));
7661 prev = ++p;
7662 }
7663 continue;
7664 }
7665 n = MBCLEN_CHARFOUND_LEN(n);
7666 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7667 p += n;
7668 cc = ruby_escaped_char(c);
7669 if (cc) {
7670 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7671 str_buf_cat(result, cc, strlen(cc));
7672 prev = p;
7673 }
7674 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7675 }
7676 else {
7677 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7678 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7679 prev = p;
7680 }
7681 }
7682 if (p > prev) str_buf_cat(result, prev, p - prev);
7683 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7684
7685 return result;
7686}
7687
7688/*
7689 * call-seq:
7690 * inspect -> string
7691 *
7692 * Returns a printable version of +self+, enclosed in double-quotes,
7693 * and with special characters escaped:
7694 *
7695 * s = "foo\tbar\tbaz\n"
7696 * s.inspect
7697 * # => "\"foo\\tbar\\tbaz\\n\""
7698 *
7699 */
7700
7701VALUE
7703{
7704 int encidx = ENCODING_GET(str);
7705 rb_encoding *enc = rb_enc_from_index(encidx);
7706 const char *p, *pend, *prev;
7707 char buf[CHAR_ESC_LEN + 1];
7708 VALUE result = rb_str_buf_new(0);
7709 rb_encoding *resenc = rb_default_internal_encoding();
7710 int unicode_p = rb_enc_unicode_p(enc);
7711 int asciicompat = rb_enc_asciicompat(enc);
7712
7713 if (resenc == NULL) resenc = rb_default_external_encoding();
7714 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7715 rb_enc_associate(result, resenc);
7716 str_buf_cat2(result, "\"");
7717
7718 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7719 prev = p;
7720 while (p < pend) {
7721 unsigned int c, cc;
7722 int n;
7723
7724 n = rb_enc_precise_mbclen(p, pend, enc);
7725 if (!MBCLEN_CHARFOUND_P(n)) {
7726 if (p > prev) str_buf_cat(result, prev, p - prev);
7727 n = rb_enc_mbminlen(enc);
7728 if (pend < p + n)
7729 n = (int)(pend - p);
7730 while (n--) {
7731 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7732 str_buf_cat(result, buf, strlen(buf));
7733 prev = ++p;
7734 }
7735 continue;
7736 }
7737 n = MBCLEN_CHARFOUND_LEN(n);
7738 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7739 p += n;
7740 if ((asciicompat || unicode_p) &&
7741 (c == '"'|| c == '\\' ||
7742 (c == '#' &&
7743 p < pend &&
7744 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7745 (cc = rb_enc_codepoint(p,pend,enc),
7746 (cc == '$' || cc == '@' || cc == '{'))))) {
7747 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7748 str_buf_cat2(result, "\\");
7749 if (asciicompat || enc == resenc) {
7750 prev = p - n;
7751 continue;
7752 }
7753 }
7754 switch (c) {
7755 case '\n': cc = 'n'; break;
7756 case '\r': cc = 'r'; break;
7757 case '\t': cc = 't'; break;
7758 case '\f': cc = 'f'; break;
7759 case '\013': cc = 'v'; break;
7760 case '\010': cc = 'b'; break;
7761 case '\007': cc = 'a'; break;
7762 case 033: cc = 'e'; break;
7763 default: cc = 0; break;
7764 }
7765 if (cc) {
7766 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7767 buf[0] = '\\';
7768 buf[1] = (char)cc;
7769 str_buf_cat(result, buf, 2);
7770 prev = p;
7771 continue;
7772 }
7773 /* The special casing of 0x85 (NEXT_LINE) here is because
7774 * Oniguruma historically treats it as printable, but it
7775 * doesn't match the print POSIX bracket class or character
7776 * property in regexps.
7777 *
7778 * See Ruby Bug #16842 for details:
7779 * https://bugs.ruby-lang.org/issues/16842
7780 */
7781 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7782 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7783 continue;
7784 }
7785 else {
7786 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7787 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7788 prev = p;
7789 continue;
7790 }
7791 }
7792 if (p > prev) str_buf_cat(result, prev, p - prev);
7793 str_buf_cat2(result, "\"");
7794
7795 return result;
7796}
7797
7798#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7799
7800/*
7801 * call-seq:
7802 * dump -> string
7803 *
7804 * Returns a printable version of +self+, enclosed in double-quotes,
7805 * with special characters escaped, and with non-printing characters
7806 * replaced by hexadecimal notation:
7807 *
7808 * "hello \n ''".dump # => "\"hello \\n ''\""
7809 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7810 *
7811 * Related: String#undump (inverse of String#dump).
7812 *
7813 */
7814
7815VALUE
7817{
7818 int encidx = rb_enc_get_index(str);
7819 rb_encoding *enc = rb_enc_from_index(encidx);
7820 long len;
7821 const char *p, *pend;
7822 char *q, *qend;
7823 VALUE result;
7824 int u8 = (encidx == rb_utf8_encindex());
7825 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7826
7827 len = 2; /* "" */
7828 if (!rb_enc_asciicompat(enc)) {
7829 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7830 len += strlen(enc->name);
7831 }
7832
7833 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7834 while (p < pend) {
7835 int clen;
7836 unsigned char c = *p++;
7837
7838 switch (c) {
7839 case '"': case '\\':
7840 case '\n': case '\r':
7841 case '\t': case '\f':
7842 case '\013': case '\010': case '\007': case '\033':
7843 clen = 2;
7844 break;
7845
7846 case '#':
7847 clen = IS_EVSTR(p, pend) ? 2 : 1;
7848 break;
7849
7850 default:
7851 if (ISPRINT(c)) {
7852 clen = 1;
7853 }
7854 else {
7855 if (u8 && c > 0x7F) { /* \u notation */
7856 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7857 if (MBCLEN_CHARFOUND_P(n)) {
7858 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7859 if (cc <= 0xFFFF)
7860 clen = 6; /* \uXXXX */
7861 else if (cc <= 0xFFFFF)
7862 clen = 9; /* \u{XXXXX} */
7863 else
7864 clen = 10; /* \u{XXXXXX} */
7865 p += MBCLEN_CHARFOUND_LEN(n)-1;
7866 break;
7867 }
7868 }
7869 clen = 4; /* \xNN */
7870 }
7871 break;
7872 }
7873
7874 if (clen > LONG_MAX - len) {
7875 rb_raise(rb_eRuntimeError, "string size too big");
7876 }
7877 len += clen;
7878 }
7879
7880 result = rb_str_new(0, len);
7881 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7882 q = RSTRING_PTR(result); qend = q + len + 1;
7883
7884 *q++ = '"';
7885 while (p < pend) {
7886 unsigned char c = *p++;
7887
7888 if (c == '"' || c == '\\') {
7889 *q++ = '\\';
7890 *q++ = c;
7891 }
7892 else if (c == '#') {
7893 if (IS_EVSTR(p, pend)) *q++ = '\\';
7894 *q++ = '#';
7895 }
7896 else if (c == '\n') {
7897 *q++ = '\\';
7898 *q++ = 'n';
7899 }
7900 else if (c == '\r') {
7901 *q++ = '\\';
7902 *q++ = 'r';
7903 }
7904 else if (c == '\t') {
7905 *q++ = '\\';
7906 *q++ = 't';
7907 }
7908 else if (c == '\f') {
7909 *q++ = '\\';
7910 *q++ = 'f';
7911 }
7912 else if (c == '\013') {
7913 *q++ = '\\';
7914 *q++ = 'v';
7915 }
7916 else if (c == '\010') {
7917 *q++ = '\\';
7918 *q++ = 'b';
7919 }
7920 else if (c == '\007') {
7921 *q++ = '\\';
7922 *q++ = 'a';
7923 }
7924 else if (c == '\033') {
7925 *q++ = '\\';
7926 *q++ = 'e';
7927 }
7928 else if (ISPRINT(c)) {
7929 *q++ = c;
7930 }
7931 else {
7932 *q++ = '\\';
7933 if (u8) {
7934 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7935 if (MBCLEN_CHARFOUND_P(n)) {
7936 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7937 p += n;
7938 if (cc <= 0xFFFF)
7939 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7940 else
7941 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7942 q += strlen(q);
7943 continue;
7944 }
7945 }
7946 snprintf(q, qend-q, "x%02X", c);
7947 q += 3;
7948 }
7949 }
7950 *q++ = '"';
7951 *q = '\0';
7952 if (!rb_enc_asciicompat(enc)) {
7953 snprintf(q, qend-q, nonascii_suffix, enc->name);
7954 encidx = rb_ascii8bit_encindex();
7955 }
7956 /* result from dump is ASCII */
7957 rb_enc_associate_index(result, encidx);
7959 return result;
7960}
7961
/*
 * Maps the letter following a backslash in a dumped string to the control
 * character it denotes ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns -1 for any other value.  The caller in this file only passes the
 * eight letters handled here; the explicit return keeps the function from
 * falling off the end of a non-void body.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7985
7986static void
7987undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7988{
7989 const char *s = *ss;
7990 unsigned int c;
7991 int codelen;
7992 size_t hexlen;
7993 unsigned char buf[6];
7994 static rb_encoding *enc_utf8 = NULL;
7995
7996 switch (*s) {
7997 case '\\':
7998 case '"':
7999 case '#':
8000 rb_str_cat(undumped, s, 1); /* cat itself */
8001 s++;
8002 break;
8003 case 'n':
8004 case 'r':
8005 case 't':
8006 case 'f':
8007 case 'v':
8008 case 'b':
8009 case 'a':
8010 case 'e':
8011 *buf = unescape_ascii(*s);
8012 rb_str_cat(undumped, (char *)buf, 1);
8013 s++;
8014 break;
8015 case 'u':
8016 if (*binary) {
8017 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
8018 }
8019 *utf8 = true;
8020 if (++s >= s_end) {
8021 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
8022 }
8023 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
8024 if (*penc != enc_utf8) {
8025 *penc = enc_utf8;
8026 rb_enc_associate(undumped, enc_utf8);
8027 }
8028 if (*s == '{') { /* handle \u{...} form */
8029 s++;
8030 for (;;) {
8031 if (s >= s_end) {
8032 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
8033 }
8034 if (*s == '}') {
8035 s++;
8036 break;
8037 }
8038 if (ISSPACE(*s)) {
8039 s++;
8040 continue;
8041 }
8042 c = scan_hex(s, s_end-s, &hexlen);
8043 if (hexlen == 0 || hexlen > 6) {
8044 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
8045 }
8046 if (c > 0x10ffff) {
8047 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
8048 }
8049 if (0xd800 <= c && c <= 0xdfff) {
8050 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
8051 }
8052 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
8053 rb_str_cat(undumped, (char *)buf, codelen);
8054 s += hexlen;
8055 }
8056 }
8057 else { /* handle \uXXXX form */
8058 c = scan_hex(s, 4, &hexlen);
8059 if (hexlen != 4) {
8060 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
8061 }
8062 if (0xd800 <= c && c <= 0xdfff) {
8063 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
8064 }
8065 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
8066 rb_str_cat(undumped, (char *)buf, codelen);
8067 s += hexlen;
8068 }
8069 break;
8070 case 'x':
8071 if (*utf8) {
8072 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
8073 }
8074 *binary = true;
8075 if (++s >= s_end) {
8076 rb_raise(rb_eRuntimeError, "invalid hex escape");
8077 }
8078 *buf = scan_hex(s, 2, &hexlen);
8079 if (hexlen != 2) {
8080 rb_raise(rb_eRuntimeError, "invalid hex escape");
8081 }
8082 rb_str_cat(undumped, (char *)buf, 1);
8083 s += hexlen;
8084 break;
8085 default:
8086 rb_str_cat(undumped, s-1, 2);
8087 s++;
8088 }
8089
8090 *ss = s;
8091}
8092
8093static VALUE rb_str_is_ascii_only_p(VALUE str);
8094
8095/*
8096 * call-seq:
8097 * undump -> string
8098 *
8099 * Returns an unescaped version of +self+:
8100 *
8101 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
8102 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
8103 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
8104 * s_undumped == s_orig # => true
8105 *
8106 * Related: String#dump (inverse of String#undump).
8107 *
8108 */
8109
8110static VALUE
8111str_undump(VALUE str)
8112{
8113 const char *s = RSTRING_PTR(str);
8114 const char *s_end = RSTRING_END(str);
8115 rb_encoding *enc = rb_enc_get(str);
8116 VALUE undumped = rb_enc_str_new(s, 0L, enc);
8117 bool utf8 = false;
8118 bool binary = false;
8119 int w;
8120
8122 if (rb_str_is_ascii_only_p(str) == Qfalse) {
8123 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
8124 }
8125 if (!str_null_check(str, &w)) {
8126 rb_raise(rb_eRuntimeError, "string contains null byte");
8127 }
8128 if (RSTRING_LEN(str) < 2) goto invalid_format;
8129 if (*s != '"') goto invalid_format;
8130
8131 /* strip '"' at the start */
8132 s++;
8133
8134 for (;;) {
8135 if (s >= s_end) {
8136 rb_raise(rb_eRuntimeError, "unterminated dumped string");
8137 }
8138
8139 if (*s == '"') {
8140 /* epilogue */
8141 s++;
8142 if (s == s_end) {
8143 /* ascii compatible dumped string */
8144 break;
8145 }
8146 else {
8147 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
8148 static const char dup_suffix[] = ".dup";
8149 const char *encname;
8150 int encidx;
8151 ptrdiff_t size;
8152
8153 /* check separately for strings dumped by older versions */
8154 size = sizeof(dup_suffix) - 1;
8155 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
8156
8157 size = sizeof(force_encoding_suffix) - 1;
8158 if (s_end - s <= size) goto invalid_format;
8159 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
8160 s += size;
8161
8162 if (utf8) {
8163 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
8164 }
8165
8166 encname = s;
8167 s = memchr(s, '"', s_end-s);
8168 size = s - encname;
8169 if (!s) goto invalid_format;
8170 if (s_end - s != 2) goto invalid_format;
8171 if (s[0] != '"' || s[1] != ')') goto invalid_format;
8172
8173 encidx = rb_enc_find_index2(encname, (long)size);
8174 if (encidx < 0) {
8175 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
8176 }
8177 rb_enc_associate_index(undumped, encidx);
8178 }
8179 break;
8180 }
8181
8182 if (*s == '\\') {
8183 s++;
8184 if (s >= s_end) {
8185 rb_raise(rb_eRuntimeError, "invalid escape");
8186 }
8187 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
8188 }
8189 else {
8190 rb_str_cat(undumped, s++, 1);
8191 }
8192 }
8193
8194 RB_GC_GUARD(str);
8195
8196 return undumped;
8197invalid_format:
8198 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
8199}
8200
8201static void
8202rb_str_check_dummy_enc(rb_encoding *enc)
8203{
8204 if (rb_enc_dummy_p(enc)) {
8205 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
8206 rb_enc_name(enc));
8207 }
8208}
8209
8210static rb_encoding *
8211str_true_enc(VALUE str)
8212{
8213 rb_encoding *enc = STR_ENC_GET(str);
8214 rb_str_check_dummy_enc(enc);
8215 return enc;
8216}
8217
8218static OnigCaseFoldType
8219check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
8220{
8221 if (argc==0)
8222 return flags;
8223 if (argc>2)
8224 rb_raise(rb_eArgError, "too many options");
8225 if (argv[0]==sym_turkic) {
8226 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8227 if (argc==2) {
8228 if (argv[1]==sym_lithuanian)
8229 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8230 else
8231 rb_raise(rb_eArgError, "invalid second option");
8232 }
8233 }
8234 else if (argv[0]==sym_lithuanian) {
8235 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8236 if (argc==2) {
8237 if (argv[1]==sym_turkic)
8238 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8239 else
8240 rb_raise(rb_eArgError, "invalid second option");
8241 }
8242 }
8243 else if (argc>1)
8244 rb_raise(rb_eArgError, "too many options");
8245 else if (argv[0]==sym_ascii)
8246 flags |= ONIGENC_CASE_ASCII_ONLY;
8247 else if (argv[0]==sym_fold) {
8248 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
8249 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
8250 else
8251 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
8252 }
8253 else
8254 rb_raise(rb_eArgError, "invalid option");
8255 return flags;
8256}
8257
8258static inline bool
8259case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
8260{
8261 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
8262 return true;
8263 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
8264}
8265
8266/* 20 extra bytes should be long enough to absorb any kind of single character length increase */
8267#define CASE_MAPPING_ADDITIONAL_LENGTH 20
8268#ifndef CASEMAP_DEBUG
8269# define CASEMAP_DEBUG 0
8270#endif
8271
8272struct mapping_buffer;
8273typedef struct mapping_buffer {
8274 size_t capa;
8275 size_t used;
8276 struct mapping_buffer *next;
8277 OnigUChar space[FLEX_ARY_LEN];
8279
8280static void
8281mapping_buffer_free(void *p)
8282{
8283 mapping_buffer *previous_buffer;
8284 mapping_buffer *current_buffer = p;
8285 while (current_buffer) {
8286 previous_buffer = current_buffer;
8287 current_buffer = current_buffer->next;
8288 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
8289 }
8290}
8291
/*
 * Typed-data descriptor named "mapping_buffer", used by rb_str_casemap() for
 * the object that anchors its buffer chain.  mapping_buffer_free sits in the
 * second slot of the function table (presumably the free callback -- confirm
 * against the rb_data_type_t definition); the first slot is left as 0.
 */
8292static const rb_data_type_t mapping_buffer_type = {
8293 "mapping_buffer",
8294 {0, mapping_buffer_free,},
8295 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
8296};
8297
8298static VALUE
8299rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
8300{
8301 VALUE target;
8302
8303 const OnigUChar *source_current, *source_end;
8304 int target_length = 0;
8305 VALUE buffer_anchor;
8306 mapping_buffer *current_buffer = 0;
8307 mapping_buffer **pre_buffer;
8308 size_t buffer_count = 0;
8309 int buffer_length_or_invalid;
8310
8311 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
8312
8313 source_current = (OnigUChar*)RSTRING_PTR(source);
8314 source_end = (OnigUChar*)RSTRING_END(source);
8315
8316 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
8317 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
8318 while (source_current < source_end) {
8319 /* increase multiplier using buffer count to converge quickly */
8320 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
8321 if (CASEMAP_DEBUG) {
8322 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
8323 }
8324 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
8325 *pre_buffer = current_buffer;
8326 pre_buffer = &current_buffer->next;
8327 current_buffer->next = NULL;
8328 current_buffer->capa = capa;
8329 buffer_length_or_invalid = enc->case_map(flags,
8330 &source_current, source_end,
8331 current_buffer->space,
8332 current_buffer->space+current_buffer->capa,
8333 enc);
8334 if (buffer_length_or_invalid < 0) {
8335 current_buffer = DATA_PTR(buffer_anchor);
8336 DATA_PTR(buffer_anchor) = 0;
8337 mapping_buffer_free(current_buffer);
8338 rb_raise(rb_eArgError, "input string invalid");
8339 }
8340 target_length += current_buffer->used = buffer_length_or_invalid;
8341 }
8342 if (CASEMAP_DEBUG) {
8343 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
8344 }
8345
8346 if (buffer_count==1) {
8347 target = rb_str_new((const char*)current_buffer->space, target_length);
8348 }
8349 else {
8350 char *target_current;
8351
8352 target = rb_str_new(0, target_length);
8353 target_current = RSTRING_PTR(target);
8354 current_buffer = DATA_PTR(buffer_anchor);
8355 while (current_buffer) {
8356 memcpy(target_current, current_buffer->space, current_buffer->used);
8357 target_current += current_buffer->used;
8358 current_buffer = current_buffer->next;
8359 }
8360 }
8361 current_buffer = DATA_PTR(buffer_anchor);
8362 DATA_PTR(buffer_anchor) = 0;
8363 mapping_buffer_free(current_buffer);
8364
8365 RB_GC_GUARD(buffer_anchor);
8366
8367 /* TODO: check about string terminator character */
8368 str_enc_copy_direct(target, source);
8369 /*ENC_CODERANGE_SET(mapped, cr);*/
8370
8371 return target;
8372}
8373
8374static VALUE
8375rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8376{
8377 const OnigUChar *source_current, *source_end;
8378 OnigUChar *target_current, *target_end;
8379 long old_length = RSTRING_LEN(source);
8380 int length_or_invalid;
8381
8382 if (old_length == 0) return Qnil;
8383
8384 source_current = (OnigUChar*)RSTRING_PTR(source);
8385 source_end = (OnigUChar*)RSTRING_END(source);
8386 if (source == target) {
8387 target_current = (OnigUChar*)source_current;
8388 target_end = (OnigUChar*)source_end;
8389 }
8390 else {
8391 target_current = (OnigUChar*)RSTRING_PTR(target);
8392 target_end = (OnigUChar*)RSTRING_END(target);
8393 }
8394
8395 length_or_invalid = onigenc_ascii_only_case_map(flags,
8396 &source_current, source_end,
8397 target_current, target_end, enc);
8398 if (length_or_invalid < 0)
8399 rb_raise(rb_eArgError, "input string invalid");
8400 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8401 fprintf(stderr, "problem with rb_str_ascii_casemap"
8402 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8403 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8404 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8405 }
8406
8407 str_enc_copy(target, source);
8408
8409 return target;
8410}
8411
8412static bool
8413upcase_single(VALUE str)
8414{
8415 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8416 bool modified = false;
8417
8418 while (s < send) {
8419 unsigned int c = *(unsigned char*)s;
8420
8421 if ('a' <= c && c <= 'z') {
8422 *s = 'A' + (c - 'a');
8423 modified = true;
8424 }
8425 s++;
8426 }
8427 return modified;
8428}
8429
8430/*
8431 * call-seq:
8432 * upcase!(*options) -> self or nil
8433 *
8434 * Upcases the characters in +self+;
8435 * returns +self+ if any changes were made, +nil+ otherwise:
8436 *
8437 * s = 'Hello World!' # => "Hello World!"
8438 * s.upcase! # => "HELLO WORLD!"
8439 * s # => "HELLO WORLD!"
8440 * s.upcase! # => nil
8441 *
8442 * The casing may be affected by the given +options+;
8443 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8444 *
8445 * Related: String#upcase, String#downcase, String#downcase!.
8446 *
8447 */
8448
8449static VALUE
8450rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8451{
8452 rb_encoding *enc;
8453 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8454
8455 flags = check_case_options(argc, argv, flags);
8456 str_modify_keep_cr(str);
8457 enc = str_true_enc(str);
8458 if (case_option_single_p(flags, enc, str)) {
8459 if (upcase_single(str))
8460 flags |= ONIGENC_CASE_MODIFIED;
8461 }
8462 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8463 rb_str_ascii_casemap(str, str, &flags, enc);
8464 else
8465 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8466
8467 if (ONIGENC_CASE_MODIFIED&flags) return str;
8468 return Qnil;
8469}
8470
8471
8472/*
8473 * call-seq:
8474 * upcase(*options) -> string
8475 *
8476 * Returns a string containing the upcased characters in +self+:
8477 *
8478 * s = 'Hello World!' # => "Hello World!"
8479 * s.upcase # => "HELLO WORLD!"
8480 *
8481 * The casing may be affected by the given +options+;
8482 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8483 *
8484 * Related: String#upcase!, String#downcase, String#downcase!.
8485 *
8486 */
8487
8488static VALUE
8489rb_str_upcase(int argc, VALUE *argv, VALUE str)
8490{
8491 rb_encoding *enc;
8492 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8493 VALUE ret;
8494
8495 flags = check_case_options(argc, argv, flags);
8496 enc = str_true_enc(str);
8497 if (case_option_single_p(flags, enc, str)) {
8498 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8499 str_enc_copy_direct(ret, str);
8500 upcase_single(ret);
8501 }
8502 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8503 ret = rb_str_new(0, RSTRING_LEN(str));
8504 rb_str_ascii_casemap(str, ret, &flags, enc);
8505 }
8506 else {
8507 ret = rb_str_casemap(str, &flags, enc);
8508 }
8509
8510 return ret;
8511}
8512
8513static bool
8514downcase_single(VALUE str)
8515{
8516 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8517 bool modified = false;
8518
8519 while (s < send) {
8520 unsigned int c = *(unsigned char*)s;
8521
8522 if ('A' <= c && c <= 'Z') {
8523 *s = 'a' + (c - 'A');
8524 modified = true;
8525 }
8526 s++;
8527 }
8528
8529 return modified;
8530}
8531
8532/*
8533 * call-seq:
8534 * downcase!(*options) -> self or nil
8535 *
8536 * Downcases the characters in +self+;
8537 * returns +self+ if any changes were made, +nil+ otherwise:
8538 *
8539 * s = 'Hello World!' # => "Hello World!"
8540 * s.downcase! # => "hello world!"
8541 * s # => "hello world!"
8542 * s.downcase! # => nil
8543 *
8544 * The casing may be affected by the given +options+;
8545 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8546 *
8547 * Related: String#downcase, String#upcase, String#upcase!.
8548 *
8549 */
8550
8551static VALUE
8552rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8553{
8554 rb_encoding *enc;
8555 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8556
8557 flags = check_case_options(argc, argv, flags);
8558 str_modify_keep_cr(str);
8559 enc = str_true_enc(str);
8560 if (case_option_single_p(flags, enc, str)) {
8561 if (downcase_single(str))
8562 flags |= ONIGENC_CASE_MODIFIED;
8563 }
8564 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8565 rb_str_ascii_casemap(str, str, &flags, enc);
8566 else
8567 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8568
8569 if (ONIGENC_CASE_MODIFIED&flags) return str;
8570 return Qnil;
8571}
8572
8573
8574/*
8575 * call-seq:
8576 * downcase(*options) -> string
8577 *
8578 * Returns a string containing the downcased characters in +self+:
8579 *
8580 * s = 'Hello World!' # => "Hello World!"
8581 * s.downcase # => "hello world!"
8582 *
8583 * The casing may be affected by the given +options+;
8584 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8585 *
8586 * Related: String#downcase!, String#upcase, String#upcase!.
8587 *
8588 */
8589
8590static VALUE
8591rb_str_downcase(int argc, VALUE *argv, VALUE str)
8592{
8593 rb_encoding *enc;
8594 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8595 VALUE ret;
8596
8597 flags = check_case_options(argc, argv, flags);
8598 enc = str_true_enc(str);
8599 if (case_option_single_p(flags, enc, str)) {
8600 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8601 str_enc_copy_direct(ret, str);
8602 downcase_single(ret);
8603 }
8604 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8605 ret = rb_str_new(0, RSTRING_LEN(str));
8606 rb_str_ascii_casemap(str, ret, &flags, enc);
8607 }
8608 else {
8609 ret = rb_str_casemap(str, &flags, enc);
8610 }
8611
8612 return ret;
8613}
8614
8615
8616/*
8617 * call-seq:
8618 * capitalize!(*options) -> self or nil
8619 *
8620 * Upcases the first character in +self+;
8621 * downcases the remaining characters;
8622 * returns +self+ if any changes were made, +nil+ otherwise:
8623 *
8624 * s = 'hello World!' # => "hello World!"
8625 * s.capitalize! # => "Hello world!"
8626 * s # => "Hello world!"
8627 * s.capitalize! # => nil
8628 *
8629 * The casing may be affected by the given +options+;
8630 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8631 *
8632 * Related: String#capitalize.
8633 *
8634 */
8635
8636static VALUE
8637rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8638{
8639 rb_encoding *enc;
8640 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8641
8642 flags = check_case_options(argc, argv, flags);
8643 str_modify_keep_cr(str);
8644 enc = str_true_enc(str);
8645 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8646 if (flags&ONIGENC_CASE_ASCII_ONLY)
8647 rb_str_ascii_casemap(str, str, &flags, enc);
8648 else
8649 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8650
8651 if (ONIGENC_CASE_MODIFIED&flags) return str;
8652 return Qnil;
8653}
8654
8655
8656/*
8657 * call-seq:
8658 * capitalize(*options) -> string
8659 *
8660 * Returns a string containing the characters in +self+;
8661 * the first character is upcased;
8662 * the remaining characters are downcased:
8663 *
8664 * s = 'hello World!' # => "hello World!"
8665 * s.capitalize # => "Hello world!"
8666 *
8667 * The casing may be affected by the given +options+;
8668 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8669 *
8670 * Related: String#capitalize!.
8671 *
8672 */
8673
8674static VALUE
8675rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8676{
8677 rb_encoding *enc;
8678 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8679 VALUE ret;
8680
8681 flags = check_case_options(argc, argv, flags);
8682 enc = str_true_enc(str);
8683 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8684 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8685 ret = rb_str_new(0, RSTRING_LEN(str));
8686 rb_str_ascii_casemap(str, ret, &flags, enc);
8687 }
8688 else {
8689 ret = rb_str_casemap(str, &flags, enc);
8690 }
8691 return ret;
8692}
8693
8694
8695/*
8696 * call-seq:
8697 * swapcase!(*options) -> self or nil
8698 *
8699 * Upcases each lowercase character in +self+;
8700 * downcases each uppercase character;
8701 * returns +self+ if any changes were made, +nil+ otherwise:
8702 *
8703 * s = 'Hello World!' # => "Hello World!"
8704 * s.swapcase! # => "hELLO wORLD!"
8705 * s # => "hELLO wORLD!"
8706 * ''.swapcase! # => nil
8707 *
8708 * The casing may be affected by the given +options+;
8709 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8710 *
8711 * Related: String#swapcase.
8712 *
8713 */
8714
8715static VALUE
8716rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8717{
8718 rb_encoding *enc;
8719 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8720
8721 flags = check_case_options(argc, argv, flags);
8722 str_modify_keep_cr(str);
8723 enc = str_true_enc(str);
8724 if (flags&ONIGENC_CASE_ASCII_ONLY)
8725 rb_str_ascii_casemap(str, str, &flags, enc);
8726 else
8727 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8728
8729 if (ONIGENC_CASE_MODIFIED&flags) return str;
8730 return Qnil;
8731}
8732
8733
8734/*
8735 * call-seq:
8736 * swapcase(*options) -> string
8737 *
8738 * Returns a string containing the characters in +self+, with cases reversed;
8739 * each uppercase character is downcased;
8740 * each lowercase character is upcased:
8741 *
8742 * s = 'Hello World!' # => "Hello World!"
8743 * s.swapcase # => "hELLO wORLD!"
8744 *
8745 * The casing may be affected by the given +options+;
8746 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8747 *
8748 * Related: String#swapcase!.
8749 *
8750 */
8751
8752static VALUE
8753rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8754{
8755 rb_encoding *enc;
8756 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8757 VALUE ret;
8758
8759 flags = check_case_options(argc, argv, flags);
8760 enc = str_true_enc(str);
8761 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8762 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8763 ret = rb_str_new(0, RSTRING_LEN(str));
8764 rb_str_ascii_casemap(str, ret, &flags, enc);
8765 }
8766 else {
8767 ret = rb_str_casemap(str, &flags, enc);
8768 }
8769 return ret;
8770}
8771
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a "lo-hi" range is being expanded */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* one past the last byte of the selector */
};
8779
8780static unsigned int
8781trnext(struct tr *t, rb_encoding *enc)
8782{
8783 int n;
8784
8785 for (;;) {
8786 nextpart:
8787 if (!t->gen) {
8788 if (t->p == t->pend) return -1;
8789 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8790 t->p += n;
8791 }
8792 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8793 t->p += n;
8794 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8795 t->p += n;
8796 if (t->p < t->pend) {
8797 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8798 t->p += n;
8799 if (t->now > c) {
8800 if (t->now < 0x80 && c < 0x80) {
8801 rb_raise(rb_eArgError,
8802 "invalid range \"%c-%c\" in string transliteration",
8803 t->now, c);
8804 }
8805 else {
8806 rb_raise(rb_eArgError, "invalid range in string transliteration");
8807 }
8808 continue; /* not reached */
8809 }
8810 else if (t->now < c) {
8811 t->gen = 1;
8812 t->max = c;
8813 }
8814 }
8815 }
8816 return t->now;
8817 }
8818 else {
8819 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8820 if (t->now == t->max) {
8821 t->gen = 0;
8822 goto nextpart;
8823 }
8824 }
8825 if (t->now < t->max) {
8826 return t->now;
8827 }
8828 else {
8829 t->gen = 0;
8830 return t->max;
8831 }
8832 }
8833 }
8834}
8835
8836static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8837
8838static VALUE
8839tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8840{
8841 const unsigned int errc = -1;
8842 unsigned int trans[256];
8843 rb_encoding *enc, *e1, *e2;
8844 struct tr trsrc, trrepl;
8845 int cflag = 0;
8846 unsigned int c, c0, last = 0;
8847 int modify = 0, i, l;
8848 unsigned char *s, *send;
8849 VALUE hash = 0;
8850 int singlebyte = single_byte_optimizable(str);
8851 int termlen;
8852 int cr;
8853
8854#define CHECK_IF_ASCII(c) \
8855 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8856 (cr = ENC_CODERANGE_VALID) : 0)
8857
8858 StringValue(src);
8859 StringValue(repl);
8860 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8861 if (RSTRING_LEN(repl) == 0) {
8862 return rb_str_delete_bang(1, &src, str);
8863 }
8864
8865 cr = ENC_CODERANGE(str);
8866 e1 = rb_enc_check(str, src);
8867 e2 = rb_enc_check(str, repl);
8868 if (e1 == e2) {
8869 enc = e1;
8870 }
8871 else {
8872 enc = rb_enc_check(src, repl);
8873 }
8874 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8875 if (RSTRING_LEN(src) > 1 &&
8876 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8877 trsrc.p + l < trsrc.pend) {
8878 cflag = 1;
8879 trsrc.p += l;
8880 }
8881 trrepl.p = RSTRING_PTR(repl);
8882 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8883 trsrc.gen = trrepl.gen = 0;
8884 trsrc.now = trrepl.now = 0;
8885 trsrc.max = trrepl.max = 0;
8886
8887 if (cflag) {
8888 for (i=0; i<256; i++) {
8889 trans[i] = 1;
8890 }
8891 while ((c = trnext(&trsrc, enc)) != errc) {
8892 if (c < 256) {
8893 trans[c] = errc;
8894 }
8895 else {
8896 if (!hash) hash = rb_hash_new();
8897 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8898 }
8899 }
8900 while ((c = trnext(&trrepl, enc)) != errc)
8901 /* retrieve last replacer */;
8902 last = trrepl.now;
8903 for (i=0; i<256; i++) {
8904 if (trans[i] != errc) {
8905 trans[i] = last;
8906 }
8907 }
8908 }
8909 else {
8910 unsigned int r;
8911
8912 for (i=0; i<256; i++) {
8913 trans[i] = errc;
8914 }
8915 while ((c = trnext(&trsrc, enc)) != errc) {
8916 r = trnext(&trrepl, enc);
8917 if (r == errc) r = trrepl.now;
8918 if (c < 256) {
8919 trans[c] = r;
8920 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8921 }
8922 else {
8923 if (!hash) hash = rb_hash_new();
8924 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8925 }
8926 }
8927 }
8928
8929 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8930 cr = ENC_CODERANGE_7BIT;
8931 str_modify_keep_cr(str);
8932 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8933 termlen = rb_enc_mbminlen(enc);
8934 if (sflag) {
8935 int clen, tlen;
8936 long offset, max = RSTRING_LEN(str);
8937 unsigned int save = -1;
8938 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8939
8940 while (s < send) {
8941 int may_modify = 0;
8942
8943 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8944 if (!MBCLEN_CHARFOUND_P(r)) {
8945 xfree(buf);
8946 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8947 }
8948 clen = MBCLEN_CHARFOUND_LEN(r);
8949 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8950
8951 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8952
8953 s += clen;
8954 if (c < 256) {
8955 c = trans[c];
8956 }
8957 else if (hash) {
8958 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8959 if (NIL_P(tmp)) {
8960 if (cflag) c = last;
8961 else c = errc;
8962 }
8963 else if (cflag) c = errc;
8964 else c = NUM2INT(tmp);
8965 }
8966 else {
8967 c = errc;
8968 }
8969 if (c != (unsigned int)-1) {
8970 if (save == c) {
8971 CHECK_IF_ASCII(c);
8972 continue;
8973 }
8974 save = c;
8975 tlen = rb_enc_codelen(c, enc);
8976 modify = 1;
8977 }
8978 else {
8979 save = -1;
8980 c = c0;
8981 if (enc != e1) may_modify = 1;
8982 }
8983 if ((offset = t - buf) + tlen > max) {
8984 size_t MAYBE_UNUSED(old) = max + termlen;
8985 max = offset + tlen + (send - s);
8986 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8987 t = buf + offset;
8988 }
8989 rb_enc_mbcput(c, t, enc);
8990 if (may_modify && memcmp(s, t, tlen) != 0) {
8991 modify = 1;
8992 }
8993 CHECK_IF_ASCII(c);
8994 t += tlen;
8995 }
8996 if (!STR_EMBED_P(str)) {
8997 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8998 }
8999 TERM_FILL((char *)t, termlen);
9000 RSTRING(str)->as.heap.ptr = (char *)buf;
9001 STR_SET_LEN(str, t - buf);
9002 STR_SET_NOEMBED(str);
9003 RSTRING(str)->as.heap.aux.capa = max;
9004 }
9005 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
9006 while (s < send) {
9007 c = (unsigned char)*s;
9008 if (trans[c] != errc) {
9009 if (!cflag) {
9010 c = trans[c];
9011 *s = c;
9012 modify = 1;
9013 }
9014 else {
9015 *s = last;
9016 modify = 1;
9017 }
9018 }
9019 CHECK_IF_ASCII(c);
9020 s++;
9021 }
9022 }
9023 else {
9024 int clen, tlen;
9025 long offset, max = (long)((send - s) * 1.2);
9026 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
9027
9028 while (s < send) {
9029 int may_modify = 0;
9030
9031 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
9032 if (!MBCLEN_CHARFOUND_P(r)) {
9033 xfree(buf);
9034 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
9035 }
9036 clen = MBCLEN_CHARFOUND_LEN(r);
9037 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
9038
9039 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
9040
9041 if (c < 256) {
9042 c = trans[c];
9043 }
9044 else if (hash) {
9045 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
9046 if (NIL_P(tmp)) {
9047 if (cflag) c = last;
9048 else c = errc;
9049 }
9050 else if (cflag) c = errc;
9051 else c = NUM2INT(tmp);
9052 }
9053 else {
9054 c = cflag ? last : errc;
9055 }
9056 if (c != errc) {
9057 tlen = rb_enc_codelen(c, enc);
9058 modify = 1;
9059 }
9060 else {
9061 c = c0;
9062 if (enc != e1) may_modify = 1;
9063 }
9064 if ((offset = t - buf) + tlen > max) {
9065 size_t MAYBE_UNUSED(old) = max + termlen;
9066 max = offset + tlen + (long)((send - s) * 1.2);
9067 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
9068 t = buf + offset;
9069 }
9070 if (s != t) {
9071 rb_enc_mbcput(c, t, enc);
9072 if (may_modify && memcmp(s, t, tlen) != 0) {
9073 modify = 1;
9074 }
9075 }
9076 CHECK_IF_ASCII(c);
9077 s += clen;
9078 t += tlen;
9079 }
9080 if (!STR_EMBED_P(str)) {
9081 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
9082 }
9083 TERM_FILL((char *)t, termlen);
9084 RSTRING(str)->as.heap.ptr = (char *)buf;
9085 STR_SET_LEN(str, t - buf);
9086 STR_SET_NOEMBED(str);
9087 RSTRING(str)->as.heap.aux.capa = max;
9088 }
9089
9090 if (modify) {
9091 if (cr != ENC_CODERANGE_BROKEN)
9092 ENC_CODERANGE_SET(str, cr);
9093 rb_enc_associate(str, enc);
9094 return str;
9095 }
9096 return Qnil;
9097}
9098
9099
9100/*
9101 * call-seq:
9102 * tr!(selector, replacements) -> self or nil
9103 *
9104 * Like String#tr, but modifies +self+ in place.
9105 * Returns +self+ if any changes were made, +nil+ otherwise.
9106 *
9107 */
9108
9109static VALUE
9110rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
9111{
9112 return tr_trans(str, src, repl, 0);
9113}
9114
9115
9116/*
9117 * call-seq:
9118 * tr(selector, replacements) -> new_string
9119 *
9120 * Returns a copy of +self+ with each character specified by string +selector+
9121 * translated to the corresponding character in string +replacements+.
9122 * The correspondence is _positional_:
9123 *
9124 * - Each occurrence of the first character specified by +selector+
9125 * is translated to the first character in +replacements+.
9126 * - Each occurrence of the second character specified by +selector+
9127 * is translated to the second character in +replacements+.
9128 * - And so on.
9129 *
9130 * Example:
9131 *
9132 * 'hello'.tr('el', 'ip') #=> "hippo"
9133 *
9134 * If +replacements+ is shorter than +selector+,
9135 * it is implicitly padded with its own last character:
9136 *
9137 * 'hello'.tr('aeiou', '-') # => "h-ll-"
9138 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
9139 *
9140 * Arguments +selector+ and +replacements+ must be valid character selectors
9141 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
9142 * and may use any of its valid forms, including negation, ranges, and escaping:
9143 *
9144 * # Negation.
9145 * 'hello'.tr('^aeiou', '-') # => "-e--o"
9146 * # Ranges.
9147 * 'ibm'.tr('b-z', 'a-z') # => "hal"
9148 * # Escapes.
9149 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
9150 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
9151 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
9152 *
9153 */
9154
9155static VALUE
9156rb_str_tr(VALUE str, VALUE src, VALUE repl)
9157{
9158 str = str_duplicate(rb_cString, str);
9159 tr_trans(str, src, repl, 0);
9160 return str;
9161}
9162
9163#define TR_TABLE_MAX (UCHAR_MAX+1)
9164#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
9165static void
9166tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
9167 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
9168{
9169 const unsigned int errc = -1;
9170 char buf[TR_TABLE_MAX];
9171 struct tr tr;
9172 unsigned int c;
9173 VALUE table = 0, ptable = 0;
9174 int i, l, cflag = 0;
9175
9176 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
9177 tr.gen = tr.now = tr.max = 0;
9178
9179 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
9180 cflag = 1;
9181 tr.p += l;
9182 }
9183 if (first) {
9184 for (i=0; i<TR_TABLE_MAX; i++) {
9185 stable[i] = 1;
9186 }
9187 stable[TR_TABLE_MAX] = cflag;
9188 }
9189 else if (stable[TR_TABLE_MAX] && !cflag) {
9190 stable[TR_TABLE_MAX] = 0;
9191 }
9192 for (i=0; i<TR_TABLE_MAX; i++) {
9193 buf[i] = cflag;
9194 }
9195
9196 while ((c = trnext(&tr, enc)) != errc) {
9197 if (c < TR_TABLE_MAX) {
9198 buf[(unsigned char)c] = !cflag;
9199 }
9200 else {
9201 VALUE key = UINT2NUM(c);
9202
9203 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
9204 if (cflag) {
9205 ptable = *ctablep;
9206 table = ptable ? ptable : rb_hash_new();
9207 *ctablep = table;
9208 }
9209 else {
9210 table = rb_hash_new();
9211 ptable = *tablep;
9212 *tablep = table;
9213 }
9214 }
9215 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
9216 rb_hash_aset(table, key, Qtrue);
9217 }
9218 }
9219 }
9220 for (i=0; i<TR_TABLE_MAX; i++) {
9221 stable[i] = stable[i] && buf[i];
9222 }
9223 if (!table && !cflag) {
9224 *tablep = 0;
9225 }
9226}
9227
9228
9229static int
9230tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
9231{
9232 if (c < TR_TABLE_MAX) {
9233 return table[c] != 0;
9234 }
9235 else {
9236 VALUE v = UINT2NUM(c);
9237
9238 if (del) {
9239 if (!NIL_P(rb_hash_lookup(del, v)) &&
9240 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
9241 return TRUE;
9242 }
9243 }
9244 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
9245 return FALSE;
9246 }
9247 return table[TR_TABLE_MAX] ? TRUE : FALSE;
9248 }
9249}
9250
9251/*
9252 * call-seq:
9253 * delete!(*selectors) -> self or nil
9254 *
9255 * Like String#delete, but modifies +self+ in place.
9256 * Returns +self+ if any changes were made, +nil+ otherwise.
9257 *
9258 */
9259
9260static VALUE
9261rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
9262{
9263 char squeez[TR_TABLE_SIZE];
9264 rb_encoding *enc = 0;
9265 char *s, *send, *t;
9266 VALUE del = 0, nodel = 0;
9267 int modify = 0;
9268 int i, ascompat, cr;
9269
9270 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
9272 for (i=0; i<argc; i++) {
9273 VALUE s = argv[i];
9274
9275 StringValue(s);
9276 enc = rb_enc_check(str, s);
9277 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9278 }
9279
9280 str_modify_keep_cr(str);
9281 ascompat = rb_enc_asciicompat(enc);
9282 s = t = RSTRING_PTR(str);
9283 send = RSTRING_END(str);
9284 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
9285 while (s < send) {
9286 unsigned int c;
9287 int clen;
9288
9289 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9290 if (squeez[c]) {
9291 modify = 1;
9292 }
9293 else {
9294 if (t != s) *t = c;
9295 t++;
9296 }
9297 s++;
9298 }
9299 else {
9300 c = rb_enc_codepoint_len(s, send, &clen, enc);
9301
9302 if (tr_find(c, squeez, del, nodel)) {
9303 modify = 1;
9304 }
9305 else {
9306 if (t != s) rb_enc_mbcput(c, t, enc);
9307 t += clen;
9309 }
9310 s += clen;
9311 }
9312 }
9313 TERM_FILL(t, TERM_LEN(str));
9314 STR_SET_LEN(str, t - RSTRING_PTR(str));
9315 ENC_CODERANGE_SET(str, cr);
9316
9317 if (modify) return str;
9318 return Qnil;
9319}
9320
9321
9322/*
9323 * call-seq:
9324 * delete(*selectors) -> new_string
9325 *
9326 * Returns a copy of +self+ with characters specified by +selectors+ removed
9327 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9328 *
9329 * "hello".delete "l","lo" #=> "heo"
9330 * "hello".delete "lo" #=> "he"
9331 * "hello".delete "aeiou", "^e" #=> "hell"
9332 * "hello".delete "ej-m" #=> "ho"
9333 *
9334 */
9335
9336static VALUE
9337rb_str_delete(int argc, VALUE *argv, VALUE str)
9338{
9339 str = str_duplicate(rb_cString, str);
9340 rb_str_delete_bang(argc, argv, str);
9341 return str;
9342}
9343
9344
9345/*
9346 * call-seq:
9347 * squeeze!(*selectors) -> self or nil
9348 *
9349 * Like String#squeeze, but modifies +self+ in place.
9350 * Returns +self+ if any changes were made, +nil+ otherwise.
9351 */
9352
9353static VALUE
9354rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
9355{
9356 char squeez[TR_TABLE_SIZE];
9357 rb_encoding *enc = 0;
9358 VALUE del = 0, nodel = 0;
9359 unsigned char *s, *send, *t;
9360 int i, modify = 0;
9361 int ascompat, singlebyte = single_byte_optimizable(str);
9362 unsigned int save;
9363
9364 if (argc == 0) {
9365 enc = STR_ENC_GET(str);
9366 }
9367 else {
9368 for (i=0; i<argc; i++) {
9369 VALUE s = argv[i];
9370
9371 StringValue(s);
9372 enc = rb_enc_check(str, s);
9373 if (singlebyte && !single_byte_optimizable(s))
9374 singlebyte = 0;
9375 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9376 }
9377 }
9378
9379 str_modify_keep_cr(str);
9380 s = t = (unsigned char *)RSTRING_PTR(str);
9381 if (!s || RSTRING_LEN(str) == 0) return Qnil;
9382 send = (unsigned char *)RSTRING_END(str);
9383 save = -1;
9384 ascompat = rb_enc_asciicompat(enc);
9385
9386 if (singlebyte) {
9387 while (s < send) {
9388 unsigned int c = *s++;
9389 if (c != save || (argc > 0 && !squeez[c])) {
9390 *t++ = save = c;
9391 }
9392 }
9393 }
9394 else {
9395 while (s < send) {
9396 unsigned int c;
9397 int clen;
9398
9399 if (ascompat && (c = *s) < 0x80) {
9400 if (c != save || (argc > 0 && !squeez[c])) {
9401 *t++ = save = c;
9402 }
9403 s++;
9404 }
9405 else {
9406 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9407
9408 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9409 if (t != s) rb_enc_mbcput(c, t, enc);
9410 save = c;
9411 t += clen;
9412 }
9413 s += clen;
9414 }
9415 }
9416 }
9417
9418 TERM_FILL((char *)t, TERM_LEN(str));
9419 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9420 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9421 modify = 1;
9422 }
9423
9424 if (modify) return str;
9425 return Qnil;
9426}
9427
9428
9429/*
9430 * call-seq:
9431 * squeeze(*selectors) -> new_string
9432 *
9433 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9434 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9435 *
9436 * "Squeezed" means that each multiple-character run of a selected character
9437 * is squeezed down to a single character;
9438 * with no arguments given, squeezes all characters:
9439 *
9440 * "yellow moon".squeeze #=> "yelow mon"
9441 * "  now   is  the".squeeze(" ") #=> " now is the"
9442 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9443 *
9444 */
9445
9446static VALUE
9447rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9448{
9449 str = str_duplicate(rb_cString, str);
9450 rb_str_squeeze_bang(argc, argv, str);
9451 return str;
9452}
9453
9454
9455/*
9456 * call-seq:
9457 * tr_s!(selector, replacements) -> self or nil
9458 *
9459 * Like String#tr_s, but modifies +self+ in place.
9460 * Returns +self+ if any changes were made, +nil+ otherwise.
9461 *
9462 * Related: String#squeeze!.
9463 */
9464
9465static VALUE
9466rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9467{
9468 return tr_trans(str, src, repl, 1);
9469}
9470
9471
9472/*
9473 * call-seq:
9474 * tr_s(selector, replacements) -> string
9475 *
9476 * Like String#tr, but also squeezes the modified portions of the translated string;
9477 * returns a new string (translated and squeezed).
9478 *
9479 * 'hello'.tr_s('l', 'r') #=> "hero"
9480 * 'hello'.tr_s('el', '-') #=> "h-o"
9481 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9482 *
9483 * Related: String#squeeze.
9484 *
9485 */
9486
9487static VALUE
9488rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9489{
9490 str = str_duplicate(rb_cString, str);
9491 tr_trans(str, src, repl, 1);
9492 return str;
9493}
9494
9495
9496/*
9497 * call-seq:
9498 * count(*selectors) -> integer
9499 *
9500 * Returns the total number of characters in +self+
9501 * that are specified by the given +selectors+
9502 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9503 *
9504 * a = "hello world"
9505 * a.count "lo" #=> 5
9506 * a.count "lo", "o" #=> 2
9507 * a.count "hello", "^l" #=> 4
9508 * a.count "ej-m" #=> 4
9509 *
9510 * "hello^world".count "\\^aeiou" #=> 4
9511 * "hello-world".count "a\\-eo" #=> 4
9512 *
9513 * c = "hello world\\r\\n"
9514 * c.count "\\" #=> 2
9515 * c.count "\\A" #=> 0
9516 * c.count "X-\\w" #=> 3
9517 */
9518
9519static VALUE
9520rb_str_count(int argc, VALUE *argv, VALUE str)
9521{
9522 char table[TR_TABLE_SIZE];
9523 rb_encoding *enc = 0;
9524 VALUE del = 0, nodel = 0, tstr;
9525 char *s, *send;
9526 int i;
9527 int ascompat;
9528 size_t n = 0;
9529
9531
9532 tstr = argv[0];
9533 StringValue(tstr);
9534 enc = rb_enc_check(str, tstr);
9535 if (argc == 1) {
9536 const char *ptstr;
9537 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9538 (ptstr = RSTRING_PTR(tstr),
9539 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9540 !is_broken_string(str)) {
9541 int clen;
9542 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9543
9544 s = RSTRING_PTR(str);
9545 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9546 send = RSTRING_END(str);
9547 while (s < send) {
9548 if (*(unsigned char*)s++ == c) n++;
9549 }
9550 return SIZET2NUM(n);
9551 }
9552 }
9553
9554 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9555 for (i=1; i<argc; i++) {
9556 tstr = argv[i];
9557 StringValue(tstr);
9558 enc = rb_enc_check(str, tstr);
9559 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9560 }
9561
9562 s = RSTRING_PTR(str);
9563 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9564 send = RSTRING_END(str);
9565 ascompat = rb_enc_asciicompat(enc);
9566 while (s < send) {
9567 unsigned int c;
9568
9569 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9570 if (table[c]) {
9571 n++;
9572 }
9573 s++;
9574 }
9575 else {
9576 int clen;
9577 c = rb_enc_codepoint_len(s, send, &clen, enc);
9578 if (tr_find(c, table, del, nodel)) {
9579 n++;
9580 }
9581 s += clen;
9582 }
9583 }
9584
9585 return SIZET2NUM(n);
9586}
9587
9588static VALUE
9589rb_fs_check(VALUE val)
9590{
9591 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9592 val = rb_check_string_type(val);
9593 if (NIL_P(val)) return 0;
9594 }
9595 return val;
9596}
9597
/*
 * Byte-indexed whitespace flags: 1 for the six bytes 0x09..0x0D and 0x20,
 * 0 for every other byte value (entries not listed are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9618
9619static long
9620split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9621{
9622 if (empty_count >= 0 && len == 0) {
9623 return empty_count + 1;
9624 }
9625 if (empty_count > 0) {
9626 /* make different substrings */
9627 if (result) {
9628 do {
9629 rb_ary_push(result, str_new_empty_String(str));
9630 } while (--empty_count > 0);
9631 }
9632 else {
9633 do {
9634 rb_yield(str_new_empty_String(str));
9635 } while (--empty_count > 0);
9636 }
9637 }
9638 str = rb_str_subseq(str, beg, len);
9639 if (result) {
9640 rb_ary_push(result, str);
9641 }
9642 else {
9643 rb_yield(str);
9644 }
9645 return empty_count;
9646}
9647
/* Strategies String#split chooses between, based on its separator. */
typedef enum {
    SPLIT_TYPE_AWK,    /* split on runs of whitespace */
    SPLIT_TYPE_STRING, /* split on a literal substring */
    SPLIT_TYPE_REGEXP, /* split on regexp matches */
    SPLIT_TYPE_CHARS   /* split into individual characters */
} split_type_t;
9651
9652static split_type_t
9653literal_split_pattern(VALUE spat, split_type_t default_type)
9654{
9655 rb_encoding *enc = STR_ENC_GET(spat);
9656 const char *ptr;
9657 long len;
9658 RSTRING_GETMEM(spat, ptr, len);
9659 if (len == 0) {
9660 /* Special case - split into chars */
9661 return SPLIT_TYPE_CHARS;
9662 }
9663 else if (rb_enc_asciicompat(enc)) {
9664 if (len == 1 && ptr[0] == ' ') {
9665 return SPLIT_TYPE_AWK;
9666 }
9667 }
9668 else {
9669 int l;
9670 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9671 return SPLIT_TYPE_AWK;
9672 }
9673 }
9674 return default_type;
9675}
9676
9677/*
9678 * call-seq:
9679 * split(field_sep = $;, limit = 0) -> array
9680 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9681 *
9682 * :include: doc/string/split.rdoc
9683 *
9684 */
9685
9686static VALUE
9687rb_str_split_m(int argc, VALUE *argv, VALUE str)
9688{
9689 rb_encoding *enc;
9690 VALUE spat;
9691 VALUE limit;
9692 split_type_t split_type;
9693 long beg, end, i = 0, empty_count = -1;
9694 int lim = 0;
9695 VALUE result, tmp;
9696
9697 result = rb_block_given_p() ? Qfalse : Qnil;
9698 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9699 lim = NUM2INT(limit);
9700 if (lim <= 0) limit = Qnil;
9701 else if (lim == 1) {
9702 if (RSTRING_LEN(str) == 0)
9703 return result ? rb_ary_new2(0) : str;
9704 tmp = str_duplicate(rb_cString, str);
9705 if (!result) {
9706 rb_yield(tmp);
9707 return str;
9708 }
9709 return rb_ary_new3(1, tmp);
9710 }
9711 i = 1;
9712 }
9713 if (NIL_P(limit) && !lim) empty_count = 0;
9714
9715 enc = STR_ENC_GET(str);
9716 split_type = SPLIT_TYPE_REGEXP;
9717 if (!NIL_P(spat)) {
9718 spat = get_pat_quoted(spat, 0);
9719 }
9720 else if (NIL_P(spat = rb_fs)) {
9721 split_type = SPLIT_TYPE_AWK;
9722 }
9723 else if (!(spat = rb_fs_check(spat))) {
9724 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9725 }
9726 else {
9727 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9728 }
9729 if (split_type != SPLIT_TYPE_AWK) {
9730 switch (BUILTIN_TYPE(spat)) {
9731 case T_REGEXP:
9732 rb_reg_options(spat); /* check if uninitialized */
9733 tmp = RREGEXP_SRC(spat);
9734 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9735 if (split_type == SPLIT_TYPE_AWK) {
9736 spat = tmp;
9737 split_type = SPLIT_TYPE_STRING;
9738 }
9739 break;
9740
9741 case T_STRING:
9742 mustnot_broken(spat);
9743 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9744 break;
9745
9746 default:
9748 }
9749 }
9750
9751#define SPLIT_STR(beg, len) ( \
9752 empty_count = split_string(result, str, beg, len, empty_count), \
9753 str_mod_check(str, str_start, str_len))
9754
9755 beg = 0;
9756 char *ptr = RSTRING_PTR(str);
9757 char *const str_start = ptr;
9758 const long str_len = RSTRING_LEN(str);
9759 char *const eptr = str_start + str_len;
9760 if (split_type == SPLIT_TYPE_AWK) {
9761 char *bptr = ptr;
9762 int skip = 1;
9763 unsigned int c;
9764
9765 if (result) result = rb_ary_new();
9766 end = beg;
9767 if (is_ascii_string(str)) {
9768 while (ptr < eptr) {
9769 c = (unsigned char)*ptr++;
9770 if (skip) {
9771 if (ascii_isspace(c)) {
9772 beg = ptr - bptr;
9773 }
9774 else {
9775 end = ptr - bptr;
9776 skip = 0;
9777 if (!NIL_P(limit) && lim <= i) break;
9778 }
9779 }
9780 else if (ascii_isspace(c)) {
9781 SPLIT_STR(beg, end-beg);
9782 skip = 1;
9783 beg = ptr - bptr;
9784 if (!NIL_P(limit)) ++i;
9785 }
9786 else {
9787 end = ptr - bptr;
9788 }
9789 }
9790 }
9791 else {
9792 while (ptr < eptr) {
9793 int n;
9794
9795 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9796 ptr += n;
9797 if (skip) {
9798 if (rb_isspace(c)) {
9799 beg = ptr - bptr;
9800 }
9801 else {
9802 end = ptr - bptr;
9803 skip = 0;
9804 if (!NIL_P(limit) && lim <= i) break;
9805 }
9806 }
9807 else if (rb_isspace(c)) {
9808 SPLIT_STR(beg, end-beg);
9809 skip = 1;
9810 beg = ptr - bptr;
9811 if (!NIL_P(limit)) ++i;
9812 }
9813 else {
9814 end = ptr - bptr;
9815 }
9816 }
9817 }
9818 }
9819 else if (split_type == SPLIT_TYPE_STRING) {
9820 char *substr_start = ptr;
9821 char *sptr = RSTRING_PTR(spat);
9822 long slen = RSTRING_LEN(spat);
9823
9824 if (result) result = rb_ary_new();
9825 mustnot_broken(str);
9826 enc = rb_enc_check(str, spat);
9827 while (ptr < eptr &&
9828 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9829 /* Check we are at the start of a char */
9830 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9831 if (t != ptr + end) {
9832 ptr = t;
9833 continue;
9834 }
9835 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9836 str_mod_check(spat, sptr, slen);
9837 ptr += end + slen;
9838 substr_start = ptr;
9839 if (!NIL_P(limit) && lim <= ++i) break;
9840 }
9841 beg = ptr - str_start;
9842 }
9843 else if (split_type == SPLIT_TYPE_CHARS) {
9844 int n;
9845
9846 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9847 mustnot_broken(str);
9848 enc = rb_enc_get(str);
9849 while (ptr < eptr &&
9850 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9851 SPLIT_STR(ptr - str_start, n);
9852 ptr += n;
9853 if (!NIL_P(limit) && lim <= ++i) break;
9854 }
9855 beg = ptr - str_start;
9856 }
9857 else {
9858 if (result) result = rb_ary_new();
9859 long len = RSTRING_LEN(str);
9860 long start = beg;
9861 long idx;
9862 int last_null = 0;
9863 struct re_registers *regs;
9864 VALUE match = 0;
9865
9866 for (; rb_reg_search(spat, str, start, 0) >= 0;
9867 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9868 match = rb_backref_get();
9869 if (!result) rb_match_busy(match);
9870 regs = RMATCH_REGS(match);
9871 end = BEG(0);
9872 if (start == end && BEG(0) == END(0)) {
9873 if (!ptr) {
9874 SPLIT_STR(0, 0);
9875 break;
9876 }
9877 else if (last_null == 1) {
9878 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9879 beg = start;
9880 }
9881 else {
9882 if (start == len)
9883 start++;
9884 else
9885 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9886 last_null = 1;
9887 continue;
9888 }
9889 }
9890 else {
9891 SPLIT_STR(beg, end-beg);
9892 beg = start = END(0);
9893 }
9894 last_null = 0;
9895
9896 for (idx=1; idx < regs->num_regs; idx++) {
9897 if (BEG(idx) == -1) continue;
9898 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9899 }
9900 if (!NIL_P(limit) && lim <= ++i) break;
9901 }
9902 if (match) rb_match_unbusy(match);
9903 }
9904 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9905 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9906 }
9907
9908 return result ? result : str;
9909}
9910
9911VALUE
9912rb_str_split(VALUE str, const char *sep0)
9913{
9914 VALUE sep;
9915
9916 StringValue(str);
9917 sep = rb_str_new_cstr(sep0);
9918 return rb_str_split_m(1, &sep, str);
9919}
9920
/*
 * Evaluates to a new Array with capacity +size+ when the current method was
 * called without a block, and to 0 when a block was given.  +m+ is unused.
 */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
9922
9923static inline int
9924enumerator_element(VALUE ary, VALUE e)
9925{
9926 if (ary) {
9927 rb_ary_push(ary, e);
9928 return 0;
9929 }
9930 else {
9931 rb_yield(e);
9932 return 1;
9933 }
9934}
9935
9936#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9937
9938static const char *
9939chomp_newline(const char *p, const char *e, rb_encoding *enc)
9940{
9941 const char *prev = rb_enc_prev_char(p, e, e, enc);
9942 if (rb_enc_is_newline(prev, e, enc)) {
9943 e = prev;
9944 prev = rb_enc_prev_char(p, e, e, enc);
9945 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9946 e = prev;
9947 }
9948 return e;
9949}
9950
9951static VALUE
9952get_rs(void)
9953{
9954 VALUE rs = rb_rs;
9955 if (!NIL_P(rs) &&
9956 (!RB_TYPE_P(rs, T_STRING) ||
9957 RSTRING_LEN(rs) != 1 ||
9958 RSTRING_PTR(rs)[0] != '\n')) {
9959 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9960 }
9961 return rs;
9962}
9963
9964#define rb_rs get_rs()
9965
9966static VALUE
9967rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9968{
9969 rb_encoding *enc;
9970 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9971 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9972 long pos, len, rslen;
9973 int rsnewline = 0;
9974
9975 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9976 rs = rb_rs;
9977 if (!NIL_P(opts)) {
9978 static ID keywords[1];
9979 if (!keywords[0]) {
9980 keywords[0] = rb_intern_const("chomp");
9981 }
9982 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9983 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9984 }
9985
9986 if (NIL_P(rs)) {
9987 if (!ENUM_ELEM(ary, str)) {
9988 return ary;
9989 }
9990 else {
9991 return orig;
9992 }
9993 }
9994
9995 if (!RSTRING_LEN(str)) goto end;
9996 str = rb_str_new_frozen(str);
9997 ptr = subptr = RSTRING_PTR(str);
9998 pend = RSTRING_END(str);
9999 len = RSTRING_LEN(str);
10000 StringValue(rs);
10001 rslen = RSTRING_LEN(rs);
10002
10003 if (rs == rb_default_rs)
10004 enc = rb_enc_get(str);
10005 else
10006 enc = rb_enc_check(str, rs);
10007
10008 if (rslen == 0) {
10009 /* paragraph mode */
10010 int n;
10011 const char *eol = NULL;
10012 subend = subptr;
10013 while (subend < pend) {
10014 long chomp_rslen = 0;
10015 do {
10016 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
10017 n = 0;
10018 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
10019 if (rb_enc_is_newline(subend + n, pend, enc)) {
10020 if (eol == subend) break;
10021 subend += rslen;
10022 if (subptr) {
10023 eol = subend;
10024 chomp_rslen = -rslen;
10025 }
10026 }
10027 else {
10028 if (!subptr) subptr = subend;
10029 subend += rslen;
10030 }
10031 rslen = 0;
10032 } while (subend < pend);
10033 if (!subptr) break;
10034 if (rslen == 0) chomp_rslen = 0;
10035 line = rb_str_subseq(str, subptr - ptr,
10036 subend - subptr + (chomp ? chomp_rslen : rslen));
10037 if (ENUM_ELEM(ary, line)) {
10038 str_mod_check(str, ptr, len);
10039 }
10040 subptr = eol = NULL;
10041 }
10042 goto end;
10043 }
10044 else {
10045 rsptr = RSTRING_PTR(rs);
10046 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
10047 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
10048 rsnewline = 1;
10049 }
10050 }
10051
10052 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
10053 rs = rb_str_new(rsptr, rslen);
10054 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
10055 rsptr = RSTRING_PTR(rs);
10056 rslen = RSTRING_LEN(rs);
10057 }
10058
10059 while (subptr < pend) {
10060 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
10061 if (pos < 0) break;
10062 hit = subptr + pos;
10063 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
10064 if (hit != adjusted) {
10065 subptr = adjusted;
10066 continue;
10067 }
10068 subend = hit += rslen;
10069 if (chomp) {
10070 if (rsnewline) {
10071 subend = chomp_newline(subptr, subend, enc);
10072 }
10073 else {
10074 subend -= rslen;
10075 }
10076 }
10077 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
10078 if (ENUM_ELEM(ary, line)) {
10079 str_mod_check(str, ptr, len);
10080 }
10081 subptr = hit;
10082 }
10083
10084 if (subptr != pend) {
10085 if (chomp) {
10086 if (rsnewline) {
10087 pend = chomp_newline(subptr, pend, enc);
10088 }
10089 else if (pend - subptr >= rslen &&
10090 memcmp(pend - rslen, rsptr, rslen) == 0) {
10091 pend -= rslen;
10092 }
10093 }
10094 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
10095 ENUM_ELEM(ary, line);
10096 RB_GC_GUARD(str);
10097 }
10098
10099 end:
10100 if (ary)
10101 return ary;
10102 else
10103 return orig;
10104}
10105
10106/*
10107 * call-seq:
10108 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
10109 * each_line(line_sep = $/, chomp: false) -> enumerator
10110 *
10111 * :include: doc/string/each_line.rdoc
10112 *
10113 */
10114
10115static VALUE
10116rb_str_each_line(int argc, VALUE *argv, VALUE str)
10117{
10118 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
10119 return rb_str_enumerate_lines(argc, argv, str, 0);
10120}
10121
10122/*
10123 * call-seq:
10124 * lines(line_sep = $/, chomp: false) -> array_of_strings
10125 *
10126 * Forms substrings ("lines") of +self+ according to the given arguments
10127 * (see String#each_line for details); returns the lines in an array.
10128 *
10129 */
10130
10131static VALUE
10132rb_str_lines(int argc, VALUE *argv, VALUE str)
10133{
10134 VALUE ary = WANTARRAY("lines", 0);
10135 return rb_str_enumerate_lines(argc, argv, str, ary);
10136}
10137
10138static VALUE
10139rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
10140{
10141 return LONG2FIX(RSTRING_LEN(str));
10142}
10143
10144static VALUE
10145rb_str_enumerate_bytes(VALUE str, VALUE ary)
10146{
10147 long i;
10148
10149 for (i=0; i<RSTRING_LEN(str); i++) {
10150 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
10151 }
10152 if (ary)
10153 return ary;
10154 else
10155 return str;
10156}
10157
10158/*
10159 * call-seq:
10160 * each_byte {|byte| ... } -> self
10161 * each_byte -> enumerator
10162 *
10163 * :include: doc/string/each_byte.rdoc
10164 *
10165 */
10166
10167static VALUE
10168rb_str_each_byte(VALUE str)
10169{
10170 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
10171 return rb_str_enumerate_bytes(str, 0);
10172}
10173
10174/*
10175 * call-seq:
10176 * bytes -> array_of_bytes
10177 *
10178 * :include: doc/string/bytes.rdoc
10179 *
10180 */
10181
10182static VALUE
10183rb_str_bytes(VALUE str)
10184{
10185 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
10186 return rb_str_enumerate_bytes(str, ary);
10187}
10188
10189static VALUE
10190rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
10191{
10192 return rb_str_length(str);
10193}
10194
10195static VALUE
10196rb_str_enumerate_chars(VALUE str, VALUE ary)
10197{
10198 VALUE orig = str;
10199 long i, len, n;
10200 const char *ptr;
10201 rb_encoding *enc;
10202
10203 str = rb_str_new_frozen(str);
10204 ptr = RSTRING_PTR(str);
10205 len = RSTRING_LEN(str);
10206 enc = rb_enc_get(str);
10207
10209 for (i = 0; i < len; i += n) {
10210 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
10211 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10212 }
10213 }
10214 else {
10215 for (i = 0; i < len; i += n) {
10216 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
10217 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10218 }
10219 }
10220 RB_GC_GUARD(str);
10221 if (ary)
10222 return ary;
10223 else
10224 return orig;
10225}
10226
10227/*
10228 * call-seq:
10229 * each_char {|c| ... } -> self
10230 * each_char -> enumerator
10231 *
10232 * :include: doc/string/each_char.rdoc
10233 *
10234 */
10235
10236static VALUE
10237rb_str_each_char(VALUE str)
10238{
10239 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10240 return rb_str_enumerate_chars(str, 0);
10241}
10242
10243/*
10244 * call-seq:
10245 * chars -> array_of_characters
10246 *
10247 * :include: doc/string/chars.rdoc
10248 *
10249 */
10250
10251static VALUE
10252rb_str_chars(VALUE str)
10253{
10254 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
10255 return rb_str_enumerate_chars(str, ary);
10256}
10257
10258static VALUE
10259rb_str_enumerate_codepoints(VALUE str, VALUE ary)
10260{
10261 VALUE orig = str;
10262 int n;
10263 unsigned int c;
10264 const char *ptr, *end;
10265 rb_encoding *enc;
10266
10267 if (single_byte_optimizable(str))
10268 return rb_str_enumerate_bytes(str, ary);
10269
10270 str = rb_str_new_frozen(str);
10271 ptr = RSTRING_PTR(str);
10272 end = RSTRING_END(str);
10273 enc = STR_ENC_GET(str);
10274
10275 while (ptr < end) {
10276 c = rb_enc_codepoint_len(ptr, end, &n, enc);
10277 ENUM_ELEM(ary, UINT2NUM(c));
10278 ptr += n;
10279 }
10280 RB_GC_GUARD(str);
10281 if (ary)
10282 return ary;
10283 else
10284 return orig;
10285}
10286
10287/*
10288 * call-seq:
10289 * each_codepoint {|integer| ... } -> self
10290 * each_codepoint -> enumerator
10291 *
10292 * :include: doc/string/each_codepoint.rdoc
10293 *
10294 */
10295
10296static VALUE
10297rb_str_each_codepoint(VALUE str)
10298{
10299 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10300 return rb_str_enumerate_codepoints(str, 0);
10301}
10302
10303/*
10304 * call-seq:
10305 * codepoints -> array_of_integers
10306 *
10307 * :include: doc/string/codepoints.rdoc
10308 *
10309 */
10310
10311static VALUE
10312rb_str_codepoints(VALUE str)
10313{
10314 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
10315 return rb_str_enumerate_codepoints(str, ary);
10316}
10317
10318static regex_t *
10319get_reg_grapheme_cluster(rb_encoding *enc)
10320{
10321 int encidx = rb_enc_to_index(enc);
10322
10323 const OnigUChar source_ascii[] = "\\X";
10324 const OnigUChar *source = source_ascii;
10325 size_t source_len = sizeof(source_ascii) - 1;
10326
10327 switch (encidx) {
10328#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
10329#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
10330#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
10331#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
10332#define CASE_UTF(e) \
10333 case ENCINDEX_UTF_##e: { \
10334 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
10335 source = source_UTF_##e; \
10336 source_len = sizeof(source_UTF_##e); \
10337 break; \
10338 }
10339 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
10340#undef CASE_UTF
10341#undef CHARS_16BE
10342#undef CHARS_16LE
10343#undef CHARS_32BE
10344#undef CHARS_32LE
10345 }
10346
10347 regex_t *reg_grapheme_cluster;
10348 OnigErrorInfo einfo;
10349 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
10350 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
10351 if (r) {
10352 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
10353 onig_error_code_to_str(message, r, &einfo);
10354 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
10355 }
10356
10357 return reg_grapheme_cluster;
10358}
10359
10360static regex_t *
10361get_cached_reg_grapheme_cluster(rb_encoding *enc)
10362{
10363 int encidx = rb_enc_to_index(enc);
10364 static regex_t *reg_grapheme_cluster_utf8 = NULL;
10365
10366 if (encidx == rb_utf8_encindex()) {
10367 if (!reg_grapheme_cluster_utf8) {
10368 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
10369 }
10370
10371 return reg_grapheme_cluster_utf8;
10372 }
10373
10374 return NULL;
10375}
10376
10377static VALUE
10378rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10379{
10380 size_t grapheme_cluster_count = 0;
10381 rb_encoding *enc = get_encoding(str);
10382 const char *ptr, *end;
10383
10384 if (!rb_enc_unicode_p(enc)) {
10385 return rb_str_length(str);
10386 }
10387
10388 bool cached_reg_grapheme_cluster = true;
10389 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10390 if (!reg_grapheme_cluster) {
10391 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10392 cached_reg_grapheme_cluster = false;
10393 }
10394
10395 ptr = RSTRING_PTR(str);
10396 end = RSTRING_END(str);
10397
10398 while (ptr < end) {
10399 OnigPosition len = onig_match(reg_grapheme_cluster,
10400 (const OnigUChar *)ptr, (const OnigUChar *)end,
10401 (const OnigUChar *)ptr, NULL, 0);
10402 if (len <= 0) break;
10403 grapheme_cluster_count++;
10404 ptr += len;
10405 }
10406
10407 if (!cached_reg_grapheme_cluster) {
10408 onig_free(reg_grapheme_cluster);
10409 }
10410
10411 return SIZET2NUM(grapheme_cluster_count);
10412}
10413
10414static VALUE
10415rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10416{
10417 VALUE orig = str;
10418 rb_encoding *enc = get_encoding(str);
10419 const char *ptr0, *ptr, *end;
10420
10421 if (!rb_enc_unicode_p(enc)) {
10422 return rb_str_enumerate_chars(str, ary);
10423 }
10424
10425 if (!ary) str = rb_str_new_frozen(str);
10426
10427 bool cached_reg_grapheme_cluster = true;
10428 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10429 if (!reg_grapheme_cluster) {
10430 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10431 cached_reg_grapheme_cluster = false;
10432 }
10433
10434 ptr0 = ptr = RSTRING_PTR(str);
10435 end = RSTRING_END(str);
10436
10437 while (ptr < end) {
10438 OnigPosition len = onig_match(reg_grapheme_cluster,
10439 (const OnigUChar *)ptr, (const OnigUChar *)end,
10440 (const OnigUChar *)ptr, NULL, 0);
10441 if (len <= 0) break;
10442 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10443 ptr += len;
10444 }
10445
10446 if (!cached_reg_grapheme_cluster) {
10447 onig_free(reg_grapheme_cluster);
10448 }
10449
10450 RB_GC_GUARD(str);
10451 if (ary)
10452 return ary;
10453 else
10454 return orig;
10455}
10456
10457/*
10458 * call-seq:
10459 * each_grapheme_cluster {|gc| ... } -> self
10460 * each_grapheme_cluster -> enumerator
10461 *
10462 * :include: doc/string/each_grapheme_cluster.rdoc
10463 *
10464 */
10465
10466static VALUE
10467rb_str_each_grapheme_cluster(VALUE str)
10468{
10469 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10470 return rb_str_enumerate_grapheme_clusters(str, 0);
10471}
10472
10473/*
10474 * call-seq:
10475 * grapheme_clusters -> array_of_grapheme_clusters
10476 *
10477 * :include: doc/string/grapheme_clusters.rdoc
10478 *
10479 */
10480
10481static VALUE
10482rb_str_grapheme_clusters(VALUE str)
10483{
10484 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10485 return rb_str_enumerate_grapheme_clusters(str, ary);
10486}
10487
10488static long
10489chopped_length(VALUE str)
10490{
10491 rb_encoding *enc = STR_ENC_GET(str);
10492 const char *p, *p2, *beg, *end;
10493
10494 beg = RSTRING_PTR(str);
10495 end = beg + RSTRING_LEN(str);
10496 if (beg >= end) return 0;
10497 p = rb_enc_prev_char(beg, end, end, enc);
10498 if (!p) return 0;
10499 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10500 p2 = rb_enc_prev_char(beg, p, end, enc);
10501 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10502 }
10503 return p - beg;
10504}
10505
10506/*
10507 * call-seq:
10508 * chop! -> self or nil
10509 *
10510 * Like String#chop, but modifies +self+ in place;
10511 * returns +nil+ if +self+ is empty, +self+ otherwise.
10512 *
10513 * Related: String#chomp!.
10514 */
10515
10516static VALUE
10517rb_str_chop_bang(VALUE str)
10518{
10519 str_modify_keep_cr(str);
10520 if (RSTRING_LEN(str) > 0) {
10521 long len;
10522 len = chopped_length(str);
10523 STR_SET_LEN(str, len);
10524 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10525 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10527 }
10528 return str;
10529 }
10530 return Qnil;
10531}
10532
10533
10534/*
10535 * call-seq:
10536 * chop -> new_string
10537 *
10538 * :include: doc/string/chop.rdoc
10539 *
10540 */
10541
10542static VALUE
10543rb_str_chop(VALUE str)
10544{
10545 return rb_str_subseq(str, 0, chopped_length(str));
10546}
10547
10548static long
10549smart_chomp(VALUE str, const char *e, const char *p)
10550{
10551 rb_encoding *enc = rb_enc_get(str);
10552 if (rb_enc_mbminlen(enc) > 1) {
10553 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10554 if (rb_enc_is_newline(pp, e, enc)) {
10555 e = pp;
10556 }
10557 pp = e - rb_enc_mbminlen(enc);
10558 if (pp >= p) {
10559 pp = rb_enc_left_char_head(p, pp, e, enc);
10560 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10561 e = pp;
10562 }
10563 }
10564 }
10565 else {
10566 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10567 case '\n':
10568 if (--e > p && *(e-1) == '\r') {
10569 --e;
10570 }
10571 break;
10572 case '\r':
10573 --e;
10574 break;
10575 }
10576 }
10577 return e - p;
10578}
10579
10580static long
10581chompped_length(VALUE str, VALUE rs)
10582{
10583 rb_encoding *enc;
10584 int newline;
10585 char *pp, *e, *rsptr;
10586 long rslen;
10587 char *const p = RSTRING_PTR(str);
10588 long len = RSTRING_LEN(str);
10589
10590 if (len == 0) return 0;
10591 e = p + len;
10592 if (rs == rb_default_rs) {
10593 return smart_chomp(str, e, p);
10594 }
10595
10596 enc = rb_enc_get(str);
10597 RSTRING_GETMEM(rs, rsptr, rslen);
10598 if (rslen == 0) {
10599 if (rb_enc_mbminlen(enc) > 1) {
10600 while (e > p) {
10601 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10602 if (!rb_enc_is_newline(pp, e, enc)) break;
10603 e = pp;
10604 pp -= rb_enc_mbminlen(enc);
10605 if (pp >= p) {
10606 pp = rb_enc_left_char_head(p, pp, e, enc);
10607 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10608 e = pp;
10609 }
10610 }
10611 }
10612 }
10613 else {
10614 while (e > p && *(e-1) == '\n') {
10615 --e;
10616 if (e > p && *(e-1) == '\r')
10617 --e;
10618 }
10619 }
10620 return e - p;
10621 }
10622 if (rslen > len) return len;
10623
10624 enc = rb_enc_get(rs);
10625 newline = rsptr[rslen-1];
10626 if (rslen == rb_enc_mbminlen(enc)) {
10627 if (rslen == 1) {
10628 if (newline == '\n')
10629 return smart_chomp(str, e, p);
10630 }
10631 else {
10632 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10633 return smart_chomp(str, e, p);
10634 }
10635 }
10636
10637 enc = rb_enc_check(str, rs);
10638 if (is_broken_string(rs)) {
10639 return len;
10640 }
10641 pp = e - rslen;
10642 if (p[len-1] == newline &&
10643 (rslen <= 1 ||
10644 memcmp(rsptr, pp, rslen) == 0)) {
10645 if (at_char_boundary(p, pp, e, enc))
10646 return len - rslen;
10647 RB_GC_GUARD(rs);
10648 }
10649 return len;
10650}
10651
10657static VALUE
10658chomp_rs(int argc, const VALUE *argv)
10659{
10660 rb_check_arity(argc, 0, 1);
10661 if (argc > 0) {
10662 VALUE rs = argv[0];
10663 if (!NIL_P(rs)) StringValue(rs);
10664 return rs;
10665 }
10666 else {
10667 return rb_rs;
10668 }
10669}
10670
10671VALUE
10672rb_str_chomp_string(VALUE str, VALUE rs)
10673{
10674 long olen = RSTRING_LEN(str);
10675 long len = chompped_length(str, rs);
10676 if (len >= olen) return Qnil;
10677 str_modify_keep_cr(str);
10678 STR_SET_LEN(str, len);
10679 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10680 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10682 }
10683 return str;
10684}
10685
10686/*
10687 * call-seq:
10688 * chomp!(line_sep = $/) -> self or nil
10689 *
10690 * Like String#chomp, but modifies +self+ in place;
10691 * returns +nil+ if no modification made, +self+ otherwise.
10692 *
10693 */
10694
10695static VALUE
10696rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10697{
10698 VALUE rs;
10699 str_modifiable(str);
10700 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10701 rs = chomp_rs(argc, argv);
10702 if (NIL_P(rs)) return Qnil;
10703 return rb_str_chomp_string(str, rs);
10704}
10705
10706
10707/*
10708 * call-seq:
10709 * chomp(line_sep = $/) -> new_string
10710 *
10711 * :include: doc/string/chomp.rdoc
10712 *
10713 */
10714
10715static VALUE
10716rb_str_chomp(int argc, VALUE *argv, VALUE str)
10717{
10718 VALUE rs = chomp_rs(argc, argv);
10719 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10720 return rb_str_subseq(str, 0, chompped_length(str, rs));
10721}
10722
10723static long
10724lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10725{
10726 const char *const start = s;
10727
10728 if (!s || s >= e) return 0;
10729
10730 /* remove spaces at head */
10731 if (single_byte_optimizable(str)) {
10732 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10733 }
10734 else {
10735 while (s < e) {
10736 int n;
10737 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10738
10739 if (cc && !rb_isspace(cc)) break;
10740 s += n;
10741 }
10742 }
10743 return s - start;
10744}
10745
10746/*
10747 * call-seq:
10748 * lstrip! -> self or nil
10749 *
10750 * Like String#lstrip, except that any modifications are made in +self+;
10751 * returns +self+ if any modifications are made, +nil+ otherwise.
10752 *
10753 * Related: String#rstrip!, String#strip!.
10754 */
10755
10756static VALUE
10757rb_str_lstrip_bang(VALUE str)
10758{
10759 rb_encoding *enc;
10760 char *start, *s;
10761 long olen, loffset;
10762
10763 str_modify_keep_cr(str);
10764 enc = STR_ENC_GET(str);
10765 RSTRING_GETMEM(str, start, olen);
10766 loffset = lstrip_offset(str, start, start+olen, enc);
10767 if (loffset > 0) {
10768 long len = olen-loffset;
10769 s = start + loffset;
10770 memmove(start, s, len);
10771 STR_SET_LEN(str, len);
10772 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10773 return str;
10774 }
10775 return Qnil;
10776}
10777
10778
10779/*
10780 * call-seq:
10781 * lstrip -> new_string
10782 *
10783 * Returns a copy of +self+ with leading whitespace removed;
10784 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10785 *
10786 * whitespace = "\x00\t\n\v\f\r "
10787 * s = whitespace + 'abc' + whitespace
10788 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10789 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10790 *
10791 * Related: String#rstrip, String#strip.
10792 */
10793
10794static VALUE
10795rb_str_lstrip(VALUE str)
10796{
10797 char *start;
10798 long len, loffset;
10799 RSTRING_GETMEM(str, start, len);
10800 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10801 if (loffset <= 0) return str_duplicate(rb_cString, str);
10802 return rb_str_subseq(str, loffset, len - loffset);
10803}
10804
10805static long
10806rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10807{
10808 const char *t;
10809
10810 rb_str_check_dummy_enc(enc);
10812 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10813 }
10814 if (!s || s >= e) return 0;
10815 t = e;
10816
10817 /* remove trailing spaces or '\0's */
10818 if (single_byte_optimizable(str)) {
10819 unsigned char c;
10820 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10821 }
10822 else {
10823 char *tp;
10824
10825 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10826 unsigned int c = rb_enc_codepoint(tp, e, enc);
10827 if (c && !rb_isspace(c)) break;
10828 t = tp;
10829 }
10830 }
10831 return e - t;
10832}
10833
10834/*
10835 * call-seq:
10836 * rstrip! -> self or nil
10837 *
10838 * Like String#rstrip, except that any modifications are made in +self+;
10839 * returns +self+ if any modifications are made, +nil+ otherwise.
10840 *
10841 * Related: String#lstrip!, String#strip!.
10842 */
10843
10844static VALUE
10845rb_str_rstrip_bang(VALUE str)
10846{
10847 rb_encoding *enc;
10848 char *start;
10849 long olen, roffset;
10850
10851 str_modify_keep_cr(str);
10852 enc = STR_ENC_GET(str);
10853 RSTRING_GETMEM(str, start, olen);
10854 roffset = rstrip_offset(str, start, start+olen, enc);
10855 if (roffset > 0) {
10856 long len = olen - roffset;
10857
10858 STR_SET_LEN(str, len);
10859 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10860 return str;
10861 }
10862 return Qnil;
10863}
10864
10865
10866/*
10867 * call-seq:
10868 * rstrip -> new_string
10869 *
10870 * Returns a copy of the receiver with trailing whitespace removed;
10871 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10872 *
10873 * whitespace = "\x00\t\n\v\f\r "
10874 * s = whitespace + 'abc' + whitespace
10875 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10876 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10877 *
10878 * Related: String#lstrip, String#strip.
10879 */
10880
10881static VALUE
10882rb_str_rstrip(VALUE str)
10883{
10884 rb_encoding *enc;
10885 char *start;
10886 long olen, roffset;
10887
10888 enc = STR_ENC_GET(str);
10889 RSTRING_GETMEM(str, start, olen);
10890 roffset = rstrip_offset(str, start, start+olen, enc);
10891
10892 if (roffset <= 0) return str_duplicate(rb_cString, str);
10893 return rb_str_subseq(str, 0, olen-roffset);
10894}
10895
10896
10897/*
10898 * call-seq:
10899 * strip! -> self or nil
10900 *
10901 * Like String#strip, except that any modifications are made in +self+;
10902 * returns +self+ if any modifications are made, +nil+ otherwise.
10903 *
10904 * Related: String#lstrip!, String#rstrip!.
10905 */
10906
10907static VALUE
10908rb_str_strip_bang(VALUE str)
10909{
10910 char *start;
10911 long olen, loffset, roffset;
10912 rb_encoding *enc;
10913
10914 str_modify_keep_cr(str);
10915 enc = STR_ENC_GET(str);
10916 RSTRING_GETMEM(str, start, olen);
10917 loffset = lstrip_offset(str, start, start+olen, enc);
10918 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10919
10920 if (loffset > 0 || roffset > 0) {
10921 long len = olen-roffset;
10922 if (loffset > 0) {
10923 len -= loffset;
10924 memmove(start, start + loffset, len);
10925 }
10926 STR_SET_LEN(str, len);
10927 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10928 return str;
10929 }
10930 return Qnil;
10931}
10932
10933
10934/*
10935 * call-seq:
10936 * strip -> new_string
10937 *
10938 * Returns a copy of the receiver with leading and trailing whitespace removed;
10939 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10940 *
10941 * whitespace = "\x00\t\n\v\f\r "
10942 * s = whitespace + 'abc' + whitespace
10943 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10944 * s.strip # => "abc"
10945 *
10946 * Related: String#lstrip, String#rstrip.
10947 */
10948
10949static VALUE
10950rb_str_strip(VALUE str)
10951{
10952 char *start;
10953 long olen, loffset, roffset;
10954 rb_encoding *enc = STR_ENC_GET(str);
10955
10956 RSTRING_GETMEM(str, start, olen);
10957 loffset = lstrip_offset(str, start, start+olen, enc);
10958 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10959
10960 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10961 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10962}
10963
10964static VALUE
10965scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10966{
10967 VALUE result = Qnil;
10968 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10969 if (pos >= 0) {
10970 VALUE match;
10971 struct re_registers *regs;
10972 if (BUILTIN_TYPE(pat) == T_STRING) {
10973 regs = NULL;
10974 end = pos + RSTRING_LEN(pat);
10975 }
10976 else {
10977 match = rb_backref_get();
10978 regs = RMATCH_REGS(match);
10979 pos = BEG(0);
10980 end = END(0);
10981 }
10982
10983 if (pos == end) {
10984 rb_encoding *enc = STR_ENC_GET(str);
10985 /*
10986 * Always consume at least one character of the input string
10987 */
10988 if (RSTRING_LEN(str) > end)
10989 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10990 RSTRING_END(str), enc);
10991 else
10992 *start = end + 1;
10993 }
10994 else {
10995 *start = end;
10996 }
10997
10998 if (!regs || regs->num_regs == 1) {
10999 result = rb_str_subseq(str, pos, end - pos);
11000 return result;
11001 }
11002 else {
11003 result = rb_ary_new2(regs->num_regs);
11004 for (int i = 1; i < regs->num_regs; i++) {
11005 VALUE s = Qnil;
11006 if (BEG(i) >= 0) {
11007 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
11008 }
11009
11010 rb_ary_push(result, s);
11011 }
11012 }
11013
11014 RB_GC_GUARD(match);
11015 }
11016
11017 return result;
11018}
11019
11020
11021/*
11022 * call-seq:
11023 * scan(string_or_regexp) -> array
11024 * scan(string_or_regexp) {|matches| ... } -> self
11025 *
11026 * Matches a pattern against +self+; the pattern is:
11027 *
11028 * - +string_or_regexp+ itself, if it is a Regexp.
11029 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
11030 *
11031 * Iterates through +self+, generating a collection of matching results:
11032 *
11033 * - If the pattern contains no groups, each result is the
11034 * matched string, <code>$&</code>.
11035 * - If the pattern contains groups, each result is an array
11036 * containing one entry per group.
11037 *
11038 * With no block given, returns an array of the results:
11039 *
11040 * s = 'cruel world'
11041 * s.scan(/\w+/) # => ["cruel", "world"]
11042 * s.scan(/.../) # => ["cru", "el ", "wor"]
11043 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
11044 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
11045 *
11046 * With a block given, calls the block with each result; returns +self+:
11047 *
11048 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
11049 * print "\n"
11050 * s.scan(/(.)(.)/) {|x,y| print y, x }
11051 * print "\n"
11052 *
11053 * Output:
11054 *
11055 * <<cruel>> <<world>>
11056 * rceu lowlr
11057 *
11058 */
11059
11060static VALUE
11061rb_str_scan(VALUE str, VALUE pat)
11062{
11063 VALUE result;
11064 long start = 0;
11065 long last = -1, prev = 0;
11066 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
11067
11068 pat = get_pat_quoted(pat, 1);
11069 mustnot_broken(str);
11070 if (!rb_block_given_p()) {
11071 VALUE ary = rb_ary_new();
11072
11073 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
11074 last = prev;
11075 prev = start;
11076 rb_ary_push(ary, result);
11077 }
11078 if (last >= 0) rb_pat_search(pat, str, last, 1);
11079 else rb_backref_set(Qnil);
11080 return ary;
11081 }
11082
11083 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
11084 last = prev;
11085 prev = start;
11086 rb_yield(result);
11087 str_mod_check(str, p, len);
11088 }
11089 if (last >= 0) rb_pat_search(pat, str, last, 1);
11090 return str;
11091}
11092
11093
11094/*
11095 * call-seq:
11096 * hex -> integer
11097 *
11098 * Interprets the leading substring of +self+ as a string of hexadecimal digits
11099 * (with an optional sign and an optional <code>0x</code>) and returns the
11100 * corresponding number;
11101 * returns zero if there is no such leading substring:
11102 *
11103 * '0x0a'.hex # => 10
11104 * '-1234'.hex # => -4660
11105 * '0'.hex # => 0
11106 * 'non-numeric'.hex # => 0
11107 *
11108 * Related: String#oct.
11109 *
11110 */
11111
11112static VALUE
11113rb_str_hex(VALUE str)
11114{
11115 return rb_str_to_inum(str, 16, FALSE);
11116}
11117
11118
11119/*
11120 * call-seq:
11121 * oct -> integer
11122 *
11123 * Interprets the leading substring of +self+ as a string of octal digits
11124 * (with an optional sign) and returns the corresponding number;
11125 * returns zero if there is no such leading substring:
11126 *
11127 * '123'.oct # => 83
11128 * '-377'.oct # => -255
11129 * '0377non-numeric'.oct # => 255
11130 * 'non-numeric'.oct # => 0
11131 *
11132 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
11133 * see Kernel#Integer.
11134 *
11135 * Related: String#hex.
11136 *
11137 */
11138
11139static VALUE
11140rb_str_oct(VALUE str)
11141{
11142 return rb_str_to_inum(str, -8, FALSE);
11143}
11144
11145#ifndef HAVE_CRYPT_R
11146# include "ruby/thread_native.h"
11147# include "ruby/atomic.h"
11148
11149static struct {
11150 rb_nativethread_lock_t lock;
11151} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
11152
11153static void
11154crypt_mutex_initialize(void)
11155{
11156}
11157#endif
11158
11159/*
11160 * call-seq:
11161 * crypt(salt_str) -> new_string
11162 *
11163 * Returns the string generated by calling <code>crypt(3)</code>
11164 * standard library function with <code>str</code> and
11165 * <code>salt_str</code>, in this order, as its arguments. Please do
11166 * not use this method any longer. It is legacy; provided only for
11167 * backward compatibility with ruby scripts in earlier days. It is
11168 * bad to use in contemporary programs for several reasons:
11169 *
11170 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
11171 * run. The generated string lacks data portability.
11172 *
11173 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
11174 * (i.e. silently ends up in unexpected results).
11175 *
11176 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
11177 * thread safe.
11178 *
11179 * * So-called "traditional" usage of <code>crypt(3)</code> is very
11180 * very very weak. According to its manpage, Linux's traditional
11181 * <code>crypt(3)</code> output has only 2**56 variations; too
11182 * easy to brute force today. And this is the default behaviour.
11183 *
11184 * * In order to make things robust some OSes implement so-called
11185 * "modular" usage. To go through, you have to do a complex
11186 * build-up of the <code>salt_str</code> parameter, by hand.
11187 * Failure in generation of a proper salt string tends not to
11188 * yield any errors; typos in parameters are normally not
11189 * detectable.
11190 *
11191 * * For instance, in the following example, the second invocation
11192 * of String#crypt is wrong; it has a typo in "round=" (lacks
11193 * "s"). However the call does not fail and something unexpected
11194 * is generated.
11195 *
11196 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
11197 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
11198 *
11199 * * Even in the "modular" mode, some hash functions are considered
11200 * archaic and no longer recommended at all; for instance module
11201 * <code>$1$</code> is officially abandoned by its author: see
11202 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
11203 * instance module <code>$3$</code> is considered completely
11204 * broken: see the manpage of FreeBSD.
11205 *
11206 * * On some OS such as Mac OS, there is no modular mode. Yet, as
11207 * written above, <code>crypt(3)</code> on Mac OS never fails.
11208 * This means even if you build up a proper salt string it
11209 * generates a traditional DES hash anyways, and there is no way
11210 * for you to be aware of.
11211 *
11212 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
11213 *
11214 * If for some reason you cannot migrate to other secure contemporary
11215 * password hashing algorithms, install the string-crypt gem and
11216 * <code>require 'string/crypt'</code> to continue using it.
11217 */
11218
11219static VALUE
11220rb_str_crypt(VALUE str, VALUE salt)
11221{
11222#ifdef HAVE_CRYPT_R
11223 VALUE databuf;
11224 struct crypt_data *data;
11225# define CRYPT_END() ALLOCV_END(databuf)
11226#else
11227 extern char *crypt(const char *, const char *);
11228# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
11229#endif
11230 VALUE result;
11231 const char *s, *saltp;
11232 char *res;
11233#ifdef BROKEN_CRYPT
11234 char salt_8bit_clean[3];
11235#endif
11236
11237 StringValue(salt);
11238 mustnot_wchar(str);
11239 mustnot_wchar(salt);
11240 s = StringValueCStr(str);
11241 saltp = RSTRING_PTR(salt);
11242 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
11243 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
11244 }
11245
11246#ifdef BROKEN_CRYPT
11247 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
11248 salt_8bit_clean[0] = saltp[0] & 0x7f;
11249 salt_8bit_clean[1] = saltp[1] & 0x7f;
11250 salt_8bit_clean[2] = '\0';
11251 saltp = salt_8bit_clean;
11252 }
11253#endif
11254#ifdef HAVE_CRYPT_R
11255 data = ALLOCV(databuf, sizeof(struct crypt_data));
11256# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
11257 data->initialized = 0;
11258# endif
11259 res = crypt_r(s, saltp, data);
11260#else
11261 crypt_mutex_initialize();
11262 rb_nativethread_lock_lock(&crypt_mutex.lock);
11263 res = crypt(s, saltp);
11264#endif
11265 if (!res) {
11266 int err = errno;
11267 CRYPT_END();
11268 rb_syserr_fail(err, "crypt");
11269 }
11270 result = rb_str_new_cstr(res);
11271 CRYPT_END();
11272 return result;
11273}
11274
11275
11276/*
11277 * call-seq:
11278 * ord -> integer
11279 *
11280 * :include: doc/string/ord.rdoc
11281 *
11282 */
11283
11284static VALUE
11285rb_str_ord(VALUE s)
11286{
11287 unsigned int c;
11288
11289 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11290 return UINT2NUM(c);
11291}
11292/*
11293 * call-seq:
11294 * sum(n = 16) -> integer
11295 *
11296 * :include: doc/string/sum.rdoc
11297 *
11298 */
11299
11300static VALUE
11301rb_str_sum(int argc, VALUE *argv, VALUE str)
11302{
11303 int bits = 16;
11304 char *ptr, *p, *pend;
11305 long len;
11306 VALUE sum = INT2FIX(0);
11307 unsigned long sum0 = 0;
11308
11309 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11310 bits = 0;
11311 }
11312 ptr = p = RSTRING_PTR(str);
11313 len = RSTRING_LEN(str);
11314 pend = p + len;
11315
11316 while (p < pend) {
11317 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11318 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11319 str_mod_check(str, ptr, len);
11320 sum0 = 0;
11321 }
11322 sum0 += (unsigned char)*p;
11323 p++;
11324 }
11325
11326 if (bits == 0) {
11327 if (sum0) {
11328 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11329 }
11330 }
11331 else {
11332 if (sum == INT2FIX(0)) {
11333 if (bits < (int)sizeof(long)*CHAR_BIT) {
11334 sum0 &= (((unsigned long)1)<<bits)-1;
11335 }
11336 sum = LONG2FIX(sum0);
11337 }
11338 else {
11339 VALUE mod;
11340
11341 if (sum0) {
11342 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11343 }
11344
11345 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11346 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11347 sum = rb_funcall(sum, '&', 1, mod);
11348 }
11349 }
11350 return sum;
11351}
11352
11353static VALUE
11354rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11355{
11356 rb_encoding *enc;
11357 VALUE w;
11358 long width, len, flen = 1, fclen = 1;
11359 VALUE res;
11360 char *p;
11361 const char *f = " ";
11362 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11363 VALUE pad;
11364 int singlebyte = 1, cr;
11365 int termlen;
11366
11367 rb_scan_args(argc, argv, "11", &w, &pad);
11368 enc = STR_ENC_GET(str);
11369 termlen = rb_enc_mbminlen(enc);
11370 width = NUM2LONG(w);
11371 if (argc == 2) {
11372 StringValue(pad);
11373 enc = rb_enc_check(str, pad);
11374 f = RSTRING_PTR(pad);
11375 flen = RSTRING_LEN(pad);
11376 fclen = str_strlen(pad, enc); /* rb_enc_check */
11377 singlebyte = single_byte_optimizable(pad);
11378 if (flen == 0 || fclen == 0) {
11379 rb_raise(rb_eArgError, "zero width padding");
11380 }
11381 }
11382 len = str_strlen(str, enc); /* rb_enc_check */
11383 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11384 n = width - len;
11385 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11386 rlen = n - llen;
11387 cr = ENC_CODERANGE(str);
11388 if (flen > 1) {
11389 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11390 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11391 }
11392 size = RSTRING_LEN(str);
11393 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11394 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11395 (len += llen2 + rlen2) >= LONG_MAX - size) {
11396 rb_raise(rb_eArgError, "argument too big");
11397 }
11398 len += size;
11399 res = str_enc_new(rb_cString, 0, len, enc);
11400 p = RSTRING_PTR(res);
11401 if (flen <= 1) {
11402 memset(p, *f, llen);
11403 p += llen;
11404 }
11405 else {
11406 while (llen >= fclen) {
11407 memcpy(p,f,flen);
11408 p += flen;
11409 llen -= fclen;
11410 }
11411 if (llen > 0) {
11412 memcpy(p, f, llen2);
11413 p += llen2;
11414 }
11415 }
11416 memcpy(p, RSTRING_PTR(str), size);
11417 p += size;
11418 if (flen <= 1) {
11419 memset(p, *f, rlen);
11420 p += rlen;
11421 }
11422 else {
11423 while (rlen >= fclen) {
11424 memcpy(p,f,flen);
11425 p += flen;
11426 rlen -= fclen;
11427 }
11428 if (rlen > 0) {
11429 memcpy(p, f, rlen2);
11430 p += rlen2;
11431 }
11432 }
11433 TERM_FILL(p, termlen);
11434 STR_SET_LEN(res, p-RSTRING_PTR(res));
11435
11436 if (argc == 2)
11437 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11438 if (cr != ENC_CODERANGE_BROKEN)
11439 ENC_CODERANGE_SET(res, cr);
11440
11441 RB_GC_GUARD(pad);
11442 return res;
11443}
11444
11445
11446/*
11447 * call-seq:
11448 * ljust(size, pad_string = ' ') -> new_string
11449 *
11450 * :include: doc/string/ljust.rdoc
11451 *
11452 * Related: String#rjust, String#center.
11453 *
11454 */
11455
11456static VALUE
11457rb_str_ljust(int argc, VALUE *argv, VALUE str)
11458{
11459 return rb_str_justify(argc, argv, str, 'l');
11460}
11461
11462/*
11463 * call-seq:
11464 * rjust(size, pad_string = ' ') -> new_string
11465 *
11466 * :include: doc/string/rjust.rdoc
11467 *
11468 * Related: String#ljust, String#center.
11469 *
11470 */
11471
11472static VALUE
11473rb_str_rjust(int argc, VALUE *argv, VALUE str)
11474{
11475 return rb_str_justify(argc, argv, str, 'r');
11476}
11477
11478
11479/*
11480 * call-seq:
11481 * center(size, pad_string = ' ') -> new_string
11482 *
11483 * :include: doc/string/center.rdoc
11484 *
11485 * Related: String#ljust, String#rjust.
11486 *
11487 */
11488
11489static VALUE
11490rb_str_center(int argc, VALUE *argv, VALUE str)
11491{
11492 return rb_str_justify(argc, argv, str, 'c');
11493}
11494
11495/*
11496 * call-seq:
11497 * partition(string_or_regexp) -> [head, match, tail]
11498 *
11499 * :include: doc/string/partition.rdoc
11500 *
11501 */
11502
11503static VALUE
11504rb_str_partition(VALUE str, VALUE sep)
11505{
11506 long pos;
11507
11508 sep = get_pat_quoted(sep, 0);
11509 if (RB_TYPE_P(sep, T_REGEXP)) {
11510 if (rb_reg_search(sep, str, 0, 0) < 0) {
11511 goto failed;
11512 }
11513 VALUE match = rb_backref_get();
11514 struct re_registers *regs = RMATCH_REGS(match);
11515
11516 pos = BEG(0);
11517 sep = rb_str_subseq(str, pos, END(0) - pos);
11518 }
11519 else {
11520 pos = rb_str_index(str, sep, 0);
11521 if (pos < 0) goto failed;
11522 }
11523 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11524 sep,
11525 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11526 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11527
11528 failed:
11529 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11530}
11531
11532/*
11533 * call-seq:
11534 * rpartition(sep) -> [head, match, tail]
11535 *
11536 * :include: doc/string/rpartition.rdoc
11537 *
11538 */
11539
11540static VALUE
11541rb_str_rpartition(VALUE str, VALUE sep)
11542{
11543 long pos = RSTRING_LEN(str);
11544
11545 sep = get_pat_quoted(sep, 0);
11546 if (RB_TYPE_P(sep, T_REGEXP)) {
11547 if (rb_reg_search(sep, str, pos, 1) < 0) {
11548 goto failed;
11549 }
11550 VALUE match = rb_backref_get();
11551 struct re_registers *regs = RMATCH_REGS(match);
11552
11553 pos = BEG(0);
11554 sep = rb_str_subseq(str, pos, END(0) - pos);
11555 }
11556 else {
11557 pos = rb_str_sublen(str, pos);
11558 pos = rb_str_rindex(str, sep, pos);
11559 if (pos < 0) {
11560 goto failed;
11561 }
11562 }
11563
11564 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11565 sep,
11566 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11567 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11568 failed:
11569 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11570}
11571
11572/*
11573 * call-seq:
11574 * start_with?(*string_or_regexp) -> true or false
11575 *
11576 * :include: doc/string/start_with_p.rdoc
11577 *
11578 */
11579
11580static VALUE
11581rb_str_start_with(int argc, VALUE *argv, VALUE str)
11582{
11583 int i;
11584
11585 for (i=0; i<argc; i++) {
11586 VALUE tmp = argv[i];
11587 if (RB_TYPE_P(tmp, T_REGEXP)) {
11588 if (rb_reg_start_with_p(tmp, str))
11589 return Qtrue;
11590 }
11591 else {
11592 const char *p, *s, *e;
11593 long slen, tlen;
11594 rb_encoding *enc;
11595
11596 StringValue(tmp);
11597 enc = rb_enc_check(str, tmp);
11598 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11599 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11600 p = RSTRING_PTR(str);
11601 e = p + slen;
11602 s = p + tlen;
11603 if (!at_char_right_boundary(p, s, e, enc))
11604 continue;
11605 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11606 return Qtrue;
11607 }
11608 }
11609 return Qfalse;
11610}
11611
11612/*
11613 * call-seq:
11614 * end_with?(*strings) -> true or false
11615 *
11616 * :include: doc/string/end_with_p.rdoc
11617 *
11618 */
11619
11620static VALUE
11621rb_str_end_with(int argc, VALUE *argv, VALUE str)
11622{
11623 int i;
11624
11625 for (i=0; i<argc; i++) {
11626 VALUE tmp = argv[i];
11627 const char *p, *s, *e;
11628 long slen, tlen;
11629 rb_encoding *enc;
11630
11631 StringValue(tmp);
11632 enc = rb_enc_check(str, tmp);
11633 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11634 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11635 p = RSTRING_PTR(str);
11636 e = p + slen;
11637 s = e - tlen;
11638 if (!at_char_boundary(p, s, e, enc))
11639 continue;
11640 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11641 return Qtrue;
11642 }
11643 return Qfalse;
11644}
11645
11655static long
11656deleted_prefix_length(VALUE str, VALUE prefix)
11657{
11658 const char *strptr, *prefixptr;
11659 long olen, prefixlen;
11660 rb_encoding *enc = rb_enc_get(str);
11661
11662 StringValue(prefix);
11663
11664 if (!is_broken_string(prefix) ||
11665 !rb_enc_asciicompat(enc) ||
11666 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11667 enc = rb_enc_check(str, prefix);
11668 }
11669
11670 /* return 0 if not start with prefix */
11671 prefixlen = RSTRING_LEN(prefix);
11672 if (prefixlen <= 0) return 0;
11673 olen = RSTRING_LEN(str);
11674 if (olen < prefixlen) return 0;
11675 strptr = RSTRING_PTR(str);
11676 prefixptr = RSTRING_PTR(prefix);
11677 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11678 if (is_broken_string(prefix)) {
11679 if (!is_broken_string(str)) {
11680 /* prefix in a valid string cannot be broken */
11681 return 0;
11682 }
11683 const char *strend = strptr + olen;
11684 const char *after_prefix = strptr + prefixlen;
11685 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11686 /* prefix does not end at char-boundary */
11687 return 0;
11688 }
11689 }
11690 /* prefix part in `str` also should be valid. */
11691
11692 return prefixlen;
11693}
11694
11695/*
11696 * call-seq:
11697 * delete_prefix!(prefix) -> self or nil
11698 *
11699 * Like String#delete_prefix, except that +self+ is modified in place.
11700 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11701 *
11702 */
11703
11704static VALUE
11705rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11706{
11707 long prefixlen;
11708 str_modify_keep_cr(str);
11709
11710 prefixlen = deleted_prefix_length(str, prefix);
11711 if (prefixlen <= 0) return Qnil;
11712
11713 return rb_str_drop_bytes(str, prefixlen);
11714}
11715
11716/*
11717 * call-seq:
11718 * delete_prefix(prefix) -> new_string
11719 *
11720 * :include: doc/string/delete_prefix.rdoc
11721 *
11722 */
11723
11724static VALUE
11725rb_str_delete_prefix(VALUE str, VALUE prefix)
11726{
11727 long prefixlen;
11728
11729 prefixlen = deleted_prefix_length(str, prefix);
11730 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11731
11732 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11733}
11734
11744static long
11745deleted_suffix_length(VALUE str, VALUE suffix)
11746{
11747 const char *strptr, *suffixptr;
11748 long olen, suffixlen;
11749 rb_encoding *enc;
11750
11751 StringValue(suffix);
11752 if (is_broken_string(suffix)) return 0;
11753 enc = rb_enc_check(str, suffix);
11754
11755 /* return 0 if not start with suffix */
11756 suffixlen = RSTRING_LEN(suffix);
11757 if (suffixlen <= 0) return 0;
11758 olen = RSTRING_LEN(str);
11759 if (olen < suffixlen) return 0;
11760 strptr = RSTRING_PTR(str);
11761 suffixptr = RSTRING_PTR(suffix);
11762 const char *strend = strptr + olen;
11763 const char *before_suffix = strend - suffixlen;
11764 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11765 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11766
11767 return suffixlen;
11768}
11769
11770/*
11771 * call-seq:
11772 * delete_suffix!(suffix) -> self or nil
11773 *
11774 * Like String#delete_suffix, except that +self+ is modified in place.
11775 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11776 *
11777 */
11778
11779static VALUE
11780rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11781{
11782 long olen, suffixlen, len;
11783 str_modifiable(str);
11784
11785 suffixlen = deleted_suffix_length(str, suffix);
11786 if (suffixlen <= 0) return Qnil;
11787
11788 olen = RSTRING_LEN(str);
11789 str_modify_keep_cr(str);
11790 len = olen - suffixlen;
11791 STR_SET_LEN(str, len);
11792 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11793 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11795 }
11796 return str;
11797}
11798
11799/*
11800 * call-seq:
11801 * delete_suffix(suffix) -> new_string
11802 *
11803 * :include: doc/string/delete_suffix.rdoc
11804 *
11805 */
11806
11807static VALUE
11808rb_str_delete_suffix(VALUE str, VALUE suffix)
11809{
11810 long suffixlen;
11811
11812 suffixlen = deleted_suffix_length(str, suffix);
11813 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11814
11815 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11816}
11817
11818void
11819rb_str_setter(VALUE val, ID id, VALUE *var)
11820{
11821 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11822 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11823 }
11824 *var = val;
11825}
11826
11827static void
11828rb_fs_setter(VALUE val, ID id, VALUE *var)
11829{
11830 val = rb_fs_check(val);
11831 if (!val) {
11832 rb_raise(rb_eTypeError,
11833 "value of %"PRIsVALUE" must be String or Regexp",
11834 rb_id2str(id));
11835 }
11836 if (!NIL_P(val)) {
11837 rb_warn_deprecated("'$;'", NULL);
11838 }
11839 *var = val;
11840}
11841
11842
11843/*
11844 * call-seq:
11845 * force_encoding(encoding) -> self
11846 *
11847 * :include: doc/string/force_encoding.rdoc
11848 *
11849 */
11850
11851static VALUE
11852rb_str_force_encoding(VALUE str, VALUE enc)
11853{
11854 str_modifiable(str);
11855
11856 rb_encoding *encoding = rb_to_encoding(enc);
11857 int idx = rb_enc_to_index(encoding);
11858
11859 // If the encoding is unchanged, we do nothing.
11860 if (ENCODING_GET(str) == idx) {
11861 return str;
11862 }
11863
11864 rb_enc_associate_index(str, idx);
11865
11866 // If the coderange was 7bit and the new encoding is ASCII-compatible
11867 // we can keep the coderange.
11868 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11869 return str;
11870 }
11871
11873 return str;
11874}
11875
11876/*
11877 * call-seq:
11878 * b -> new_string
11879 *
11880 * :include: doc/string/b.rdoc
11881 *
11882 */
11883
11884static VALUE
11885rb_str_b(VALUE str)
11886{
11887 VALUE str2;
11888 if (STR_EMBED_P(str)) {
11889 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11890 }
11891 else {
11892 str2 = str_alloc_heap(rb_cString);
11893 }
11894 str_replace_shared_without_enc(str2, str);
11895
11896 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11897 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11898 // If we know the receiver's code range then we know the result's code range.
11899 int cr = ENC_CODERANGE(str);
11900 switch (cr) {
11901 case ENC_CODERANGE_7BIT:
11903 break;
11907 break;
11908 default:
11909 ENC_CODERANGE_CLEAR(str2);
11910 break;
11911 }
11912 }
11913
11914 return str2;
11915}
11916
11917/*
11918 * call-seq:
11919 * valid_encoding? -> true or false
11920 *
11921 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11922 *
11923 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11924 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11925 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11926 */
11927
11928static VALUE
11929rb_str_valid_encoding_p(VALUE str)
11930{
11931 int cr = rb_enc_str_coderange(str);
11932
11933 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11934}
11935
11936/*
11937 * call-seq:
11938 * ascii_only? -> true or false
11939 *
11940 * Returns whether +self+ contains only ASCII characters:
11941 *
11942 * 'abc'.ascii_only? # => true
11943 * "abc\u{6666}".ascii_only? # => false
11944 *
11945 * Related: see {Querying}[rdoc-ref:String@Querying].
11946 */
11947
11948static VALUE
11949rb_str_is_ascii_only_p(VALUE str)
11950{
11951 int cr = rb_enc_str_coderange(str);
11952
11953 return RBOOL(cr == ENC_CODERANGE_7BIT);
11954}
11955
11956VALUE
11958{
11959 static const char ellipsis[] = "...";
11960 const long ellipsislen = sizeof(ellipsis) - 1;
11961 rb_encoding *const enc = rb_enc_get(str);
11962 const long blen = RSTRING_LEN(str);
11963 const char *const p = RSTRING_PTR(str), *e = p + blen;
11964 VALUE estr, ret = 0;
11965
11966 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11967 if (len * rb_enc_mbminlen(enc) >= blen ||
11968 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11969 ret = str;
11970 }
11971 else if (len <= ellipsislen ||
11972 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11973 if (rb_enc_asciicompat(enc)) {
11974 ret = rb_str_new(ellipsis, len);
11975 rb_enc_associate(ret, enc);
11976 }
11977 else {
11978 estr = rb_usascii_str_new(ellipsis, len);
11979 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11980 }
11981 }
11982 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11983 rb_str_cat(ret, ellipsis, ellipsislen);
11984 }
11985 else {
11986 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11987 rb_enc_from_encoding(enc), 0, Qnil);
11988 rb_str_append(ret, estr);
11989 }
11990 return ret;
11991}
11992
11993static VALUE
11994str_compat_and_valid(VALUE str, rb_encoding *enc)
11995{
11996 int cr;
11997 str = StringValue(str);
11998 cr = rb_enc_str_coderange(str);
11999 if (cr == ENC_CODERANGE_BROKEN) {
12000 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
12001 }
12002 else {
12003 rb_encoding *e = STR_ENC_GET(str);
12004 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
12005 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
12006 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
12007 }
12008 }
12009 return str;
12010}
12011
12012static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
12013
12014VALUE
12016{
12017 rb_encoding *enc = STR_ENC_GET(str);
12018 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
12019}
12020
12021VALUE
12022rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
12023{
12024 int cr = ENC_CODERANGE_UNKNOWN;
12025 if (enc == STR_ENC_GET(str)) {
12026 /* cached coderange makes sense only when enc equals the
12027 * actual encoding of str */
12028 cr = ENC_CODERANGE(str);
12029 }
12030 return enc_str_scrub(enc, str, repl, cr);
12031}
12032
12033static VALUE
12034enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
12035{
12036 int encidx;
12037 VALUE buf = Qnil;
12038 const char *rep, *p, *e, *p1, *sp;
12039 long replen = -1;
12040 long slen;
12041
12042 if (rb_block_given_p()) {
12043 if (!NIL_P(repl))
12044 rb_raise(rb_eArgError, "both of block and replacement given");
12045 replen = 0;
12046 }
12047
12048 if (ENC_CODERANGE_CLEAN_P(cr))
12049 return Qnil;
12050
12051 if (!NIL_P(repl)) {
12052 repl = str_compat_and_valid(repl, enc);
12053 }
12054
12055 if (rb_enc_dummy_p(enc)) {
12056 return Qnil;
12057 }
12058 encidx = rb_enc_to_index(enc);
12059
12060#define DEFAULT_REPLACE_CHAR(str) do { \
12061 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
12062 rep = replace; replen = (int)sizeof(replace); \
12063 } while (0)
12064
12065 slen = RSTRING_LEN(str);
12066 p = RSTRING_PTR(str);
12067 e = RSTRING_END(str);
12068 p1 = p;
12069 sp = p;
12070
12071 if (rb_enc_asciicompat(enc)) {
12072 int rep7bit_p;
12073 if (!replen) {
12074 rep = NULL;
12075 rep7bit_p = FALSE;
12076 }
12077 else if (!NIL_P(repl)) {
12078 rep = RSTRING_PTR(repl);
12079 replen = RSTRING_LEN(repl);
12080 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
12081 }
12082 else if (encidx == rb_utf8_encindex()) {
12083 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
12084 rep7bit_p = FALSE;
12085 }
12086 else {
12087 DEFAULT_REPLACE_CHAR("?");
12088 rep7bit_p = TRUE;
12089 }
12090 cr = ENC_CODERANGE_7BIT;
12091
12092 p = search_nonascii(p, e);
12093 if (!p) {
12094 p = e;
12095 }
12096 while (p < e) {
12097 int ret = rb_enc_precise_mbclen(p, e, enc);
12098 if (MBCLEN_NEEDMORE_P(ret)) {
12099 break;
12100 }
12101 else if (MBCLEN_CHARFOUND_P(ret)) {
12103 p += MBCLEN_CHARFOUND_LEN(ret);
12104 }
12105 else if (MBCLEN_INVALID_P(ret)) {
12106 /*
12107 * p1~p: valid ascii/multibyte chars
12108 * p ~e: invalid bytes + unknown bytes
12109 */
12110 long clen = rb_enc_mbmaxlen(enc);
12111 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12112 if (p > p1) {
12113 rb_str_buf_cat(buf, p1, p - p1);
12114 }
12115
12116 if (e - p < clen) clen = e - p;
12117 if (clen <= 2) {
12118 clen = 1;
12119 }
12120 else {
12121 const char *q = p;
12122 clen--;
12123 for (; clen > 1; clen--) {
12124 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12125 if (MBCLEN_NEEDMORE_P(ret)) break;
12126 if (MBCLEN_INVALID_P(ret)) continue;
12128 }
12129 }
12130 if (rep) {
12131 rb_str_buf_cat(buf, rep, replen);
12132 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12133 }
12134 else {
12135 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12136 str_mod_check(str, sp, slen);
12137 repl = str_compat_and_valid(repl, enc);
12138 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12141 }
12142 p += clen;
12143 p1 = p;
12144 p = search_nonascii(p, e);
12145 if (!p) {
12146 p = e;
12147 break;
12148 }
12149 }
12150 else {
12152 }
12153 }
12154 if (NIL_P(buf)) {
12155 if (p == e) {
12156 ENC_CODERANGE_SET(str, cr);
12157 return Qnil;
12158 }
12159 buf = rb_str_buf_new(RSTRING_LEN(str));
12160 }
12161 if (p1 < p) {
12162 rb_str_buf_cat(buf, p1, p - p1);
12163 }
12164 if (p < e) {
12165 if (rep) {
12166 rb_str_buf_cat(buf, rep, replen);
12167 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12168 }
12169 else {
12170 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12171 str_mod_check(str, sp, slen);
12172 repl = str_compat_and_valid(repl, enc);
12173 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12176 }
12177 }
12178 }
12179 else {
12180 /* ASCII incompatible */
12181 long mbminlen = rb_enc_mbminlen(enc);
12182 if (!replen) {
12183 rep = NULL;
12184 }
12185 else if (!NIL_P(repl)) {
12186 rep = RSTRING_PTR(repl);
12187 replen = RSTRING_LEN(repl);
12188 }
12189 else if (encidx == ENCINDEX_UTF_16BE) {
12190 DEFAULT_REPLACE_CHAR("\xFF\xFD");
12191 }
12192 else if (encidx == ENCINDEX_UTF_16LE) {
12193 DEFAULT_REPLACE_CHAR("\xFD\xFF");
12194 }
12195 else if (encidx == ENCINDEX_UTF_32BE) {
12196 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
12197 }
12198 else if (encidx == ENCINDEX_UTF_32LE) {
12199 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
12200 }
12201 else {
12202 DEFAULT_REPLACE_CHAR("?");
12203 }
12204
12205 while (p < e) {
12206 int ret = rb_enc_precise_mbclen(p, e, enc);
12207 if (MBCLEN_NEEDMORE_P(ret)) {
12208 break;
12209 }
12210 else if (MBCLEN_CHARFOUND_P(ret)) {
12211 p += MBCLEN_CHARFOUND_LEN(ret);
12212 }
12213 else if (MBCLEN_INVALID_P(ret)) {
12214 const char *q = p;
12215 long clen = rb_enc_mbmaxlen(enc);
12216 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12217 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
12218
12219 if (e - p < clen) clen = e - p;
12220 if (clen <= mbminlen * 2) {
12221 clen = mbminlen;
12222 }
12223 else {
12224 clen -= mbminlen;
12225 for (; clen > mbminlen; clen-=mbminlen) {
12226 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12227 if (MBCLEN_NEEDMORE_P(ret)) break;
12228 if (MBCLEN_INVALID_P(ret)) continue;
12230 }
12231 }
12232 if (rep) {
12233 rb_str_buf_cat(buf, rep, replen);
12234 }
12235 else {
12236 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12237 str_mod_check(str, sp, slen);
12238 repl = str_compat_and_valid(repl, enc);
12239 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12240 }
12241 p += clen;
12242 p1 = p;
12243 }
12244 else {
12246 }
12247 }
12248 if (NIL_P(buf)) {
12249 if (p == e) {
12251 return Qnil;
12252 }
12253 buf = rb_str_buf_new(RSTRING_LEN(str));
12254 }
12255 if (p1 < p) {
12256 rb_str_buf_cat(buf, p1, p - p1);
12257 }
12258 if (p < e) {
12259 if (rep) {
12260 rb_str_buf_cat(buf, rep, replen);
12261 }
12262 else {
12263 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12264 str_mod_check(str, sp, slen);
12265 repl = str_compat_and_valid(repl, enc);
12266 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12267 }
12268 }
12270 }
12271 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12272 return buf;
12273}
12274
12275/*
12276 * call-seq:
12277 * scrub(replacement_string = default_replacement) -> new_string
12278 * scrub{|bytes| ... } -> new_string
12279 *
12280 * :include: doc/string/scrub.rdoc
12281 *
12282 */
12283static VALUE
12284str_scrub(int argc, VALUE *argv, VALUE str)
12285{
12286 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12287 VALUE new = rb_str_scrub(str, repl);
12288 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12289}
12290
12291/*
12292 * call-seq:
12293 * scrub! -> self
12294 * scrub!(replacement_string = default_replacement) -> self
12295 * scrub!{|bytes| ... } -> self
12296 *
12297 * Like String#scrub, except that any replacements are made in +self+.
12298 *
12299 */
12300static VALUE
12301str_scrub_bang(int argc, VALUE *argv, VALUE str)
12302{
12303 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12304 VALUE new = rb_str_scrub(str, repl);
12305 if (!NIL_P(new)) rb_str_replace(str, new);
12306 return str;
12307}
12308
12309static ID id_normalize;
12310static ID id_normalized_p;
12311static VALUE mUnicodeNormalize;
12312
12313static VALUE
12314unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12315{
12316 static int UnicodeNormalizeRequired = 0;
12317 VALUE argv2[2];
12318
12319 if (!UnicodeNormalizeRequired) {
12320 rb_require("unicode_normalize/normalize.rb");
12321 UnicodeNormalizeRequired = 1;
12322 }
12323 argv2[0] = str;
12324 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12325 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12326}
12327
12328/*
12329 * call-seq:
12330 * unicode_normalize(form = :nfc) -> string
12331 *
12332 * Returns a copy of +self+ with
12333 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
12334 *
12335 * Argument +form+ must be one of the following symbols
12336 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
12337 *
12338 * - +:nfc+: Canonical decomposition, followed by canonical composition.
12339 * - +:nfd+: Canonical decomposition.
12340 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
12341 * - +:nfkd+: Compatibility decomposition.
12342 *
12343 * The encoding of +self+ must be one of:
12344 *
12345 * - Encoding::UTF_8
12346 * - Encoding::UTF_16BE
12347 * - Encoding::UTF_16LE
12348 * - Encoding::UTF_32BE
12349 * - Encoding::UTF_32LE
12350 * - Encoding::GB18030
12351 * - Encoding::UCS_2BE
12352 * - Encoding::UCS_4BE
12353 *
12354 * Examples:
12355 *
12356 * "a\u0300".unicode_normalize # => "à" (the single code point U+00E0)
12357 * "\u00E0".unicode_normalize(:nfd) # => "à" ("a" followed by combining U+0300)
12358 *
12359 * Related: String#unicode_normalize!, String#unicode_normalized?.
12360 */
12361static VALUE
12362rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12363{
12364 return unicode_normalize_common(argc, argv, str, id_normalize);
12365}
12366
12367/*
12368 * call-seq:
12369 * unicode_normalize!(form = :nfc) -> self
12370 *
12371 * Like String#unicode_normalize, except that the normalization
12372 * is performed on +self+.
12373 *
12374 * Related: String#unicode_normalized?.
12375 *
12376 */
12377static VALUE
12378rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12379{
12380 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12381}
12382
12383/* call-seq:
12384 * unicode_normalized?(form = :nfc) -> true or false
12385 *
12386 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12387 * +false+ otherwise.
12388 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12389 *
12390 * Examples:
12391 *
12392 * "a\u0300".unicode_normalized? # => false
12393 * "a\u0300".unicode_normalized?(:nfd) # => true
12394 * "\u00E0".unicode_normalized? # => true
12395 * "\u00E0".unicode_normalized?(:nfd) # => false
12396 *
12397 *
12398 * Raises an exception if +self+ is not in a Unicode encoding:
12399 *
12400 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12401 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12402 *
12403 * Related: String#unicode_normalize, String#unicode_normalize!.
12404 *
12405 */
12406static VALUE
12407rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12408{
12409 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12410}
12411
12412/**********************************************************************
12413 * Document-class: Symbol
12414 *
12415 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12416 *
12417 * You can create a +Symbol+ object explicitly with:
12418 *
12419 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12420 *
12421 * The same +Symbol+ object will be
12422 * created for a given name or string for the duration of a program's
12423 * execution, regardless of the context or meaning of that name. Thus
12424 * if <code>Fred</code> is a constant in one context, a method in
12425 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12426 * will be the same object in all three contexts.
12427 *
12428 * module One
12429 * class Fred
12430 * end
12431 * $f1 = :Fred
12432 * end
12433 * module Two
12434 * Fred = 1
12435 * $f2 = :Fred
12436 * end
12437 * def Fred()
12438 * end
12439 * $f3 = :Fred
12440 * $f1.object_id #=> 2514190
12441 * $f2.object_id #=> 2514190
12442 * $f3.object_id #=> 2514190
12443 *
12444 * Constant, method, and variable names are returned as symbols:
12445 *
12446 * module One
12447 * Two = 2
12448 * def three; 3 end
12449 * @four = 4
12450 * @@five = 5
12451 * $six = 6
12452 * end
12453 * seven = 7
12454 *
12455 * One.constants
12456 * # => [:Two]
12457 * One.instance_methods(true)
12458 * # => [:three]
12459 * One.instance_variables
12460 * # => [:@four]
12461 * One.class_variables
12462 * # => [:@@five]
12463 * global_variables.grep(/six/)
12464 * # => [:$six]
12465 * local_variables
12466 * # => [:seven]
12467 *
12468 * A +Symbol+ object differs from a String object in that
12469 * a +Symbol+ object represents an identifier, while a String object
12470 * represents text or data.
12471 *
12472 * == What's Here
12473 *
12474 * First, what's elsewhere. Class +Symbol+:
12475 *
12476 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12477 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12478 *
12479 * Here, class +Symbol+ provides methods that are useful for:
12480 *
12481 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12482 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12483 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12484 *
12485 * === Methods for Querying
12486 *
12487 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12488 * - #=~: Returns the index of the first substring in symbol that matches a
12489 * given Regexp or other object; returns +nil+ if no match is found.
12490 * - #[], #slice : Returns a substring of symbol
12491 * determined by a given index, start/length, or range, or string.
12492 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12493 * - #encoding: Returns the Encoding object that represents the encoding
12494 * of symbol.
12495 * - #end_with?: Returns +true+ if symbol ends with
12496 * any of the given strings.
12497 * - #match: Returns a MatchData object if symbol
12498 * matches a given Regexp; +nil+ otherwise.
12499 * - #match?: Returns +true+ if symbol
12500 * matches a given Regexp; +false+ otherwise.
12501 * - #length, #size: Returns the number of characters in symbol.
12502 * - #start_with?: Returns +true+ if symbol starts with
12503 * any of the given strings.
12504 *
12505 * === Methods for Comparing
12506 *
12507 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12508 * or larger than a given symbol.
12509 * - #==, #===: Returns +true+ if a given symbol has the same content and
12510 * encoding.
12511 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12512 * symbol is smaller than, equal to, or larger than symbol.
12513 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12514 * after Unicode case folding; +false+ otherwise.
12515 *
12516 * === Methods for Converting
12517 *
12518 * - #capitalize: Returns symbol with the first character upcased
12519 * and all other characters downcased.
12520 * - #downcase: Returns symbol with all characters downcased.
12521 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12522 * - #name: Returns the frozen string corresponding to symbol.
12523 * - #succ, #next: Returns the symbol that is the successor to symbol.
12524 * - #swapcase: Returns symbol with all upcase characters downcased
12525 * and all downcase characters upcased.
12526 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12527 * - #to_s, #id2name: Returns the string corresponding to +self+.
12528 * - #to_sym, #intern: Returns +self+.
12529 * - #upcase: Returns symbol with all characters upcased.
12530 *
12531 */
12532
12533
12534/*
12535 * call-seq:
12536 * symbol == object -> true or false
12537 *
12538 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12539 */
12540
12541#define sym_equal rb_obj_equal
12542
12543static int
12544sym_printable(const char *s, const char *send, rb_encoding *enc)
12545{
12546 while (s < send) {
12547 int n;
12548 int c = rb_enc_precise_mbclen(s, send, enc);
12549
12550 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12551 n = MBCLEN_CHARFOUND_LEN(c);
12552 c = rb_enc_mbc_to_codepoint(s, send, enc);
12553 if (!rb_enc_isprint(c, enc)) return FALSE;
12554 s += n;
12555 }
12556 return TRUE;
12557}
12558
12559int
12560rb_str_symname_p(VALUE sym)
12561{
12562 rb_encoding *enc;
12563 const char *ptr;
12564 long len;
12565 rb_encoding *resenc = rb_default_internal_encoding();
12566
12567 if (resenc == NULL) resenc = rb_default_external_encoding();
12568 enc = STR_ENC_GET(sym);
12569 ptr = RSTRING_PTR(sym);
12570 len = RSTRING_LEN(sym);
12571 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12572 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12573 return FALSE;
12574 }
12575 return TRUE;
12576}
12577
12578VALUE
12579rb_str_quote_unprintable(VALUE str)
12580{
12581 rb_encoding *enc;
12582 const char *ptr;
12583 long len;
12584 rb_encoding *resenc;
12585
12586 Check_Type(str, T_STRING);
12587 resenc = rb_default_internal_encoding();
12588 if (resenc == NULL) resenc = rb_default_external_encoding();
12589 enc = STR_ENC_GET(str);
12590 ptr = RSTRING_PTR(str);
12591 len = RSTRING_LEN(str);
12592 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12593 !sym_printable(ptr, ptr + len, enc)) {
12594 return rb_str_escape(str);
12595 }
12596 return str;
12597}
12598
12599VALUE
12600rb_id_quote_unprintable(ID id)
12601{
12602 VALUE str = rb_id2str(id);
12603 if (!rb_str_symname_p(str)) {
12604 return rb_str_escape(str);
12605 }
12606 return str;
12607}
12608
12609/*
12610 * call-seq:
12611 * inspect -> string
12612 *
12613 * Returns a string representation of +self+ (including the leading colon):
12614 *
12615 * :foo.inspect # => ":foo"
12616 *
12617 * Related: Symbol#to_s, Symbol#name.
12618 *
12619 */
12620
12621static VALUE
12622sym_inspect(VALUE sym)
12623{
12624 VALUE str = rb_sym2str(sym);
12625 const char *ptr;
12626 long len;
12627 char *dest;
12628
12629 if (!rb_str_symname_p(str)) {
12630 str = rb_str_inspect(str);
12631 len = RSTRING_LEN(str);
12632 rb_str_resize(str, len + 1);
12633 dest = RSTRING_PTR(str);
12634 memmove(dest + 1, dest, len);
12635 }
12636 else {
12637 rb_encoding *enc = STR_ENC_GET(str);
12638 VALUE orig_str = str;
12639
12640 len = RSTRING_LEN(orig_str);
12641 str = rb_enc_str_new(0, len + 1, enc);
12642
12643 // Get data pointer after allocation
12644 ptr = RSTRING_PTR(orig_str);
12645 dest = RSTRING_PTR(str);
12646 memcpy(dest + 1, ptr, len);
12647
12648 RB_GC_GUARD(orig_str);
12649 }
12650 dest[0] = ':';
12651
12653
12654 return str;
12655}
12656
12657VALUE
12659{
12660 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12661 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12662 return str;
12663}
12664
12665VALUE
12666rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12667{
12668 VALUE obj;
12669
12670 if (argc < 1) {
12671 rb_raise(rb_eArgError, "no receiver given");
12672 }
12673 obj = argv[0];
12674 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12675}
12676
12677/*
12678 * call-seq:
12679 * succ
12680 *
12681 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12682 *
12683 * :foo.succ # => :fop
12684 *
12685 * Related: String#succ.
12686 */
12687
12688static VALUE
12689sym_succ(VALUE sym)
12690{
12691 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12692}
12693
12694/*
12695 * call-seq:
12696 * symbol <=> object -> -1, 0, +1, or nil
12697 *
12698 * If +object+ is a symbol,
12699 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12700 *
12701 * :bar <=> :foo # => -1
12702 * :foo <=> :foo # => 0
12703 * :foo <=> :bar # => 1
12704 *
12705 * Otherwise, returns +nil+:
12706 *
12707 * :foo <=> 'bar' # => nil
12708 *
12709 * Related: String#<=>.
12710 */
12711
12712static VALUE
12713sym_cmp(VALUE sym, VALUE other)
12714{
12715 if (!SYMBOL_P(other)) {
12716 return Qnil;
12717 }
12718 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12719}
12720
12721/*
12722 * call-seq:
12723 * casecmp(object) -> -1, 0, 1, or nil
12724 *
12725 * :include: doc/symbol/casecmp.rdoc
12726 *
12727 */
12728
12729static VALUE
12730sym_casecmp(VALUE sym, VALUE other)
12731{
12732 if (!SYMBOL_P(other)) {
12733 return Qnil;
12734 }
12735 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12736}
12737
12738/*
12739 * call-seq:
12740 * casecmp?(object) -> true, false, or nil
12741 *
12742 * :include: doc/symbol/casecmp_p.rdoc
12743 *
12744 */
12745
12746static VALUE
12747sym_casecmp_p(VALUE sym, VALUE other)
12748{
12749 if (!SYMBOL_P(other)) {
12750 return Qnil;
12751 }
12752 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12753}
12754
12755/*
12756 * call-seq:
12757 * symbol =~ object -> integer or nil
12758 *
12759 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12760 * including possible updates to global variables;
12761 * see String#=~.
12762 *
12763 */
12764
12765static VALUE
12766sym_match(VALUE sym, VALUE other)
12767{
12768 return rb_str_match(rb_sym2str(sym), other);
12769}
12770
12771/*
12772 * call-seq:
12773 * match(pattern, offset = 0) -> matchdata or nil
12774 * match(pattern, offset = 0) {|matchdata| } -> object
12775 *
12776 * Equivalent to <tt>self.to_s.match</tt>,
12777 * including possible updates to global variables;
12778 * see String#match.
12779 *
12780 */
12781
12782static VALUE
12783sym_match_m(int argc, VALUE *argv, VALUE sym)
12784{
12785 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12786}
12787
12788/*
12789 * call-seq:
12790 * match?(pattern, offset) -> true or false
12791 *
12792 * Equivalent to <tt>sym.to_s.match?</tt>;
12793 * see String#match?.
12794 *
12795 */
12796
12797static VALUE
12798sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12799{
12800 return rb_str_match_m_p(argc, argv, sym);
12801}
12802
12803/*
12804 * call-seq:
12805 * symbol[index] -> string or nil
12806 * symbol[start, length] -> string or nil
12807 * symbol[range] -> string or nil
12808 * symbol[regexp, capture = 0] -> string or nil
12809 * symbol[substring] -> string or nil
12810 *
12811 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12812 *
12813 */
12814
12815static VALUE
12816sym_aref(int argc, VALUE *argv, VALUE sym)
12817{
12818 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12819}
12820
12821/*
12822 * call-seq:
12823 * length -> integer
12824 *
12825 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12826 */
12827
12828static VALUE
12829sym_length(VALUE sym)
12830{
12831 return rb_str_length(rb_sym2str(sym));
12832}
12833
12834/*
12835 * call-seq:
12836 * empty? -> true or false
12837 *
12838 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12839 *
12840 */
12841
12842static VALUE
12843sym_empty(VALUE sym)
12844{
12845 return rb_str_empty(rb_sym2str(sym));
12846}
12847
12848/*
12849 * call-seq:
12850 * upcase(*options) -> symbol
12851 *
12852 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12853 *
12854 * See String#upcase.
12855 *
12856 */
12857
12858static VALUE
12859sym_upcase(int argc, VALUE *argv, VALUE sym)
12860{
12861 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12862}
12863
12864/*
12865 * call-seq:
12866 * downcase(*options) -> symbol
12867 *
12868 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12869 *
12870 * See String#downcase.
12871 *
12872 * Related: Symbol#upcase.
12873 *
12874 */
12875
12876static VALUE
12877sym_downcase(int argc, VALUE *argv, VALUE sym)
12878{
12879 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12880}
12881
12882/*
12883 * call-seq:
12884 * capitalize(*options) -> symbol
12885 *
12886 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12887 *
12888 * See String#capitalize.
12889 *
12890 */
12891
12892static VALUE
12893sym_capitalize(int argc, VALUE *argv, VALUE sym)
12894{
12895 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12896}
12897
12898/*
12899 * call-seq:
12900 * swapcase(*options) -> symbol
12901 *
12902 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12903 *
12904 * See String#swapcase.
12905 *
12906 */
12907
12908static VALUE
12909sym_swapcase(int argc, VALUE *argv, VALUE sym)
12910{
12911 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12912}
12913
12914/*
12915 * call-seq:
12916 * start_with?(*string_or_regexp) -> true or false
12917 *
12918 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12919 *
12920 */
12921
12922static VALUE
12923sym_start_with(int argc, VALUE *argv, VALUE sym)
12924{
12925 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12926}
12927
12928/*
12929 * call-seq:
12930 * end_with?(*strings) -> true or false
12931 *
12932 *
12933 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12934 *
12935 */
12936
12937static VALUE
12938sym_end_with(int argc, VALUE *argv, VALUE sym)
12939{
12940 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12941}
12942
12943/*
12944 * call-seq:
12945 * encoding -> encoding
12946 *
12947 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12948 *
12949 */
12950
12951static VALUE
12952sym_encoding(VALUE sym)
12953{
12954 return rb_obj_encoding(rb_sym2str(sym));
12955}
12956
12957static VALUE
12958string_for_symbol(VALUE name)
12959{
12960 if (!RB_TYPE_P(name, T_STRING)) {
12961 VALUE tmp = rb_check_string_type(name);
12962 if (NIL_P(tmp)) {
12963 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12964 name);
12965 }
12966 name = tmp;
12967 }
12968 return name;
12969}
12970
12971ID
12973{
12974 if (SYMBOL_P(name)) {
12975 return SYM2ID(name);
12976 }
12977 name = string_for_symbol(name);
12978 return rb_intern_str(name);
12979}
12980
12981VALUE
12983{
12984 if (SYMBOL_P(name)) {
12985 return name;
12986 }
12987 name = string_for_symbol(name);
12988 return rb_str_intern(name);
12989}
12990
12991/*
12992 * call-seq:
12993 * Symbol.all_symbols -> array_of_symbols
12994 *
12995 * Returns an array of all symbols currently in Ruby's symbol table:
12996 *
12997 * Symbol.all_symbols.size # => 9334
12998 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12999 *
13000 */
13001
13002static VALUE
13003sym_all_symbols(VALUE _)
13004{
13005 return rb_sym_all_symbols();
13006}
13007
13008VALUE
13009rb_str_to_interned_str(VALUE str)
13010{
13011 return rb_fstring(str);
13012}
13013
13014VALUE
13015rb_interned_str(const char *ptr, long len)
13016{
13017 struct RString fake_str;
13018 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
13019}
13020
13021VALUE
13023{
13024 return rb_interned_str(ptr, strlen(ptr));
13025}
13026
13027VALUE
13028rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
13029{
13030 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
13031 rb_enc_autoload(enc);
13032 }
13033
13034 struct RString fake_str;
13035 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
13036}
13037
13038VALUE
13039rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
13040{
13041 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
13042 rb_enc_autoload(enc);
13043 }
13044
13045 struct RString fake_str;
13046 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
13047}
13048
13049VALUE
13051{
13052 return rb_enc_interned_str(ptr, strlen(ptr), enc);
13053}
13054
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ carries the ASCII-8BIT encoding
 * index and the value lies in 0..0xfe, the byte is appended directly;
 * every other case goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t byte = RB_NUM2SSIZE(codepoint);
        const int fast_path_p = (byte >= 0 && byte < 0xff);

        if (RB_LIKELY(fast_path_p)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
13071
13072void
13073Init_String(void)
13074{
13075 rb_cString = rb_define_class("String", rb_cObject);
13076 struct fstring_table_struct *fstring_table = RTYPEDDATA_GET_DATA(fstring_table_obj);
13077 for (unsigned int i = 0; i < fstring_table->capacity; i++) {
13078 VALUE str = fstring_table->entries[i].str;
13079 if (!str) continue;
13080 RBASIC_SET_CLASS(str, rb_cString);
13081 }
13083 rb_define_alloc_func(rb_cString, empty_str_alloc);
13084 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
13085 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
13086 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
13087 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
13088 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
13089 rb_define_method(rb_cString, "==", rb_str_equal, 1);
13090 rb_define_method(rb_cString, "===", rb_str_equal, 1);
13091 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
13092 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
13093 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
13094 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
13095 rb_define_method(rb_cString, "+", rb_str_plus, 1);
13096 rb_define_method(rb_cString, "*", rb_str_times, 1);
13097 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
13098 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
13099 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
13100 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
13101 rb_define_method(rb_cString, "length", rb_str_length, 0);
13102 rb_define_method(rb_cString, "size", rb_str_length, 0);
13103 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
13104 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
13105 rb_define_method(rb_cString, "=~", rb_str_match, 1);
13106 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
13107 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
13108 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
13109 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
13110 rb_define_method(rb_cString, "next", rb_str_succ, 0);
13111 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
13112 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
13113 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
13114 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
13115 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
13116 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
13117 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
13118 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
13119 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
13120 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
13121 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
13122 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
13123 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
13124 rb_define_method(rb_cString, "scrub", str_scrub, -1);
13125 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
13126 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
13127 rb_define_method(rb_cString, "+@", str_uplus, 0);
13128 rb_define_method(rb_cString, "-@", str_uminus, 0);
13129 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
13130 rb_define_alias(rb_cString, "dedup", "-@");
13131
13132 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
13133 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
13134 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
13135 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
13136 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
13137 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
13138 rb_define_method(rb_cString, "undump", str_undump, 0);
13139
13140 sym_ascii = ID2SYM(rb_intern_const("ascii"));
13141 sym_turkic = ID2SYM(rb_intern_const("turkic"));
13142 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
13143 sym_fold = ID2SYM(rb_intern_const("fold"));
13144
13145 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
13146 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
13147 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
13148 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
13149
13150 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
13151 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
13152 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
13153 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
13154
13155 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
13156 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
13157 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
13158 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
13159 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
13160 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
13161 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
13162 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
13163 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
13164 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
13165 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
13166 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
13167 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
13168 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
13169 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
13170 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
13171 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
13172 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
13173
13174 rb_define_method(rb_cString, "include?", rb_str_include, 1);
13175 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
13176 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
13177
13178 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
13179
13180 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
13181 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
13182 rb_define_method(rb_cString, "center", rb_str_center, -1);
13183
13184 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
13185 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
13186 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
13187 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
13188 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
13189 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
13190 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
13191 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
13192 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
13193
13194 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
13195 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
13196 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
13197 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
13198 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
13199 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
13200 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
13201 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
13202 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
13203
13204 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
13205 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
13206 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
13207 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
13208 rb_define_method(rb_cString, "count", rb_str_count, -1);
13209
13210 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
13211 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
13212 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
13213 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
13214
13215 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
13216 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
13217 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
13218 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
13219 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
13220
13221 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
13222
13223 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
13224 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
13225
13226 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
13227 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
13228
13229 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
13230 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
13231 rb_define_method(rb_cString, "b", rb_str_b, 0);
13232 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
13233 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
13234
13235 /* define UnicodeNormalize module here so that we don't have to look it up */
13236 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
13237 id_normalize = rb_intern_const("normalize");
13238 id_normalized_p = rb_intern_const("normalized?");
13239
13240 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
13241 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
13242 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
13243
13244 rb_fs = Qnil;
13245 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
13246 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
13247 rb_gc_register_address(&rb_fs);
13248
13249 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
13253 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
13254
13255 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
13256 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
13257 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
13258 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13259 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13260 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13261
13262 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13263 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13264 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13265 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13266
13267 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13268 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13269 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13270 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13271 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13272 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13273 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13274
13275 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13276 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13277 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13278 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13279
13280 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13281 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13282
13283 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13284}
13285
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
#define RUBY_ATOMIC_VALUE_CAS(var, oldval, newval)
Identical to RUBY_ATOMIC_CAS, except it expects its arguments are VALUE.
Definition atomic.h:381
#define RUBY_ATOMIC_VALUE_SET(var, val)
Identical to RUBY_ATOMIC_SET, except it expects its arguments are VALUE.
Definition atomic.h:353
std::atomic< unsigned > rb_atomic_t
Type that is eligible for atomic operations.
Definition atomic.h:69
#define RUBY_ATOMIC_FETCH_ADD(var, val)
Atomically replaces the value pointed by var with the result of addition of val to the old value of var.
Definition atomic.h:93
#define RUBY_ATOMIC_VALUE_EXCHANGE(var, val)
Identical to RUBY_ATOMIC_EXCHANGE, except it expects its arguments are VALUE.
Definition atomic.h:367
#define RUBY_ATOMIC_DEC(var)
Atomically decrements the value pointed by var.
Definition atomic.h:198
#define RUBY_ATOMIC_LOAD(var)
Atomic load.
Definition atomic.h:150
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:870
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:456
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:311
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1682
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1465
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1583
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2828
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2649
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3118
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:942
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2907
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:65
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:681
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:649
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2090
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition object.c:2108
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1276
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3504
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:242
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:556
VALUE rb_cSymbol
Symbol class.
Definition string.c:83
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:174
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1264
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:82
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3188
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1670
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:1285
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1535
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3340
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1554
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:13028
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2647
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:4042
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1483
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1775
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1676
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:1304
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:13050
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:1169
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:434
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should allocate.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:1054
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1865
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1044
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1871
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4219
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3716
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1926
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:2070
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1840
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2799
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:4107
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1751
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12658
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2872
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1727
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:2064
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3368
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5743
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4483
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
Definition string.c:3465
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11957
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:2106
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1517
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1339
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1846
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:2309
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4469
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3875
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2736
VALUE rb_str_resurrect(VALUE str)
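I guess there is no use case of this function in extension libraries, but this is a routine identical to rb_str_dup(), except it always creates an instance of rb_cString regardless of the given object's class.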
Definition string.c:2327
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6981
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3473
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:13022
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1757
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:4073
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3415
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4592
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3697
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7702
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:3102
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:13015
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4539
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4356
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4514
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL terminated ASCII string.
Definition string.c:4049
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3590
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:6253
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:12015
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "default external".
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:2020
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method, if any.
Definition string.c:3262
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3562
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3680
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1529
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:3056
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7816
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1739
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:2036
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2750
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition string.c:6171
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9912
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1523
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:884
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:2168
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2100
st_index_t rb_ivar_count(VALUE obj)
Number of instance variables defined on an object.
Definition variable.c:2456
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2175
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3058
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1383
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:972
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12982
ID rb_to_id(VALUE str)
Definition string.c:12972
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1865
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4463
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1769
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:3239
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:3121
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1763
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:3134
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:2097
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
#define TypedData_Make_Struct(klass, type, data_type, sval)
Identical to TypedData_Wrap_Struct, except it allocates a new data region internally instead of takin...
Definition rtypeddata.h:497
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1580
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:541
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
const char * wrap_struct_name
Name of structs of this kind.
Definition rtypeddata.h:210
Definition string.c:8774
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:295
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113