Ruby 3.5.0dev (2025-05-16 revision 06a56a7ffcb053d5bc45b9a984082d9301d6819c)
string.c (06a56a7ffcb053d5bc45b9a984082d9301d6819c)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/thread.h"
46#include "ruby/util.h"
47#include "ruby_assert.h"
48#include "shape.h"
49#include "vm_sync.h"
51
52#if defined HAVE_CRYPT_R
53# if defined HAVE_CRYPT_H
54# include <crypt.h>
55# endif
56#elif !defined HAVE_CRYPT
57# include "missing/crypt.h"
58# define HAVE_CRYPT_R 1
59#endif
60
/* Shorthand for the begin/end offset of capture group `no`.  Both macros
 * expect a variable named `regs` to be in scope where they are expanded. */
61#define BEG(no) (regs->beg[(no)])
62#define END(no) (regs->end[(no)])
63
64#undef rb_str_new
65#undef rb_usascii_str_new
66#undef rb_utf8_str_new
67#undef rb_enc_str_new
68#undef rb_str_new_cstr
69#undef rb_usascii_str_new_cstr
70#undef rb_utf8_str_new_cstr
71#undef rb_enc_str_new_cstr
72#undef rb_external_str_new_cstr
73#undef rb_locale_str_new_cstr
74#undef rb_str_dup_frozen
75#undef rb_str_buf_new_cstr
76#undef rb_str_buf_cat
77#undef rb_str_buf_cat2
78#undef rb_str_cat2
79#undef rb_str_cat_cstr
80#undef rb_fstring_cstr
81
84
85/* Flags of RString
86 *
87 * 0: STR_SHARED (equal to ELTS_SHARED)
88 * The string is shared. The buffer this string points to is owned by
89 * another string (the shared root).
90 * 1: RSTRING_NOEMBED
91 * The string is not embedded. When a string is embedded, the contents
92 * follow the header. When a string is not embedded, the contents are
93 * in a separately allocated buffer.
94 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
95 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
96 * It emits a deprecation warning when mutated for the first time.
97 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
98 * The string was allocated by the `Symbol#to_s` method.
99 * It emits a deprecation warning when mutated for the first time.
100 * 4: STR_PRECOMPUTED_HASH
101 * The string is embedded and has its precomputed hashcode stored
102 * after the terminator.
103 * 5: STR_SHARED_ROOT
104 * Other strings may point to the contents of this string. When this
105 * flag is set, STR_SHARED must not be set.
106 * 6: STR_BORROWED
107 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
108 * to be unshared by rb_str_tmp_frozen_release.
109 * 7: STR_TMPLOCK
110 * The pointer to the buffer is passed to a system call such as
111 * read(2). Any modification and realloc is prohibited.
112 * 8-9: ENC_CODERANGE
113 * Stores the coderange of the string.
114 * 10-16: ENCODING
115 * Stores the encoding of the string.
116 * 17: RSTRING_FSTR
117 * The string is a fstring. The string is deduplicated in the fstring
118 * table.
119 * 18: STR_NOFREE
120 * Do not free this string's buffer when the string is reclaimed
121 * by the garbage collector. Used for when the string buffer is a C
122 * string literal.
123 * 19: STR_FAKESTR
124 * The string is not allocated or managed by the garbage collector.
125 * Typically, the string object header (struct RString) is temporarily
126 * allocated on C stack.
127 */
128
/* NOTE(review): presumably an upper bound on the byte length of a single
 * character -- confirm against its users. */
129#define RUBY_MAX_CHAR_LEN 16
/* String-specific user flag bits; the bit numbers match the FL_USERn index. */
130#define STR_PRECOMPUTED_HASH FL_USER4
131#define STR_SHARED_ROOT FL_USER5
132#define STR_BORROWED FL_USER6
133#define STR_TMPLOCK FL_USER7
134#define STR_NOFREE FL_USER18
135#define STR_FAKESTR FL_USER19
136
/* Flag `str` as not embedded and clear the SHARED, SHARED_ROOT and BORROWED
 * flags. */
137#define STR_SET_NOEMBED(str) do {\
138 FL_SET((str), STR_NOEMBED);\
139 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
140} while (0)
/* Flag `str` as embedded by clearing the NOEMBED, SHARED and NOFREE flags. */
141#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
142
/* Store `n` in the length field of `str`; nothing else is touched. */
143#define STR_SET_LEN(str, n) do { \
144 RSTRING(str)->len = (n); \
145} while (0)
146
147static inline bool
148str_encindex_fastpath(int encindex)
149{
150 // The overwhelming majority of strings are in one of these 3 encodings.
151 switch (encindex) {
152 case ENCINDEX_ASCII_8BIT:
153 case ENCINDEX_UTF_8:
154 case ENCINDEX_US_ASCII:
155 return true;
156 default:
157 return false;
158 }
159}
160
161static inline bool
162str_enc_fastpath(VALUE str)
163{
164 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
165}
166
/* Length in bytes of the terminator of `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
167#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a `termlen`-byte zero terminator at `ptr`.  The first byte is always
 * cleared directly; memset is only used for terminators wider than 1 byte. */
168#define TERM_FILL(ptr, termlen) do {\
169 char *const term_fill_ptr = (ptr);\
170 const int term_fill_len = (termlen);\
171 *term_fill_ptr = '\0';\
172 if (UNLIKELY(term_fill_len > 1))\
173 memset(term_fill_ptr, 0, term_fill_len);\
174} while (0)
175
/*
 * Resize the buffer of `str` so that it holds `capacity` bytes plus the
 * terminator.  RESIZE_CAPA takes the terminator length from the string's
 * encoding; RESIZE_CAPA_TERM takes it from the caller.
 *
 * An embedded string is moved to a newly allocated heap buffer only when the
 * embedded area is too small; otherwise it is left as is.  A heap string is
 * reallocated and must not be shared.
 *
 * Every use of `capacity` and `termlen` is parenthesized so that expression
 * arguments keep their meaning inside the comparison.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Make `str` point at `shared_str` as its shared root: record the root in
 * as.heap.aux.shared (through the write barrier), set STR_SHARED on `str` and
 * STR_SHARED_ROOT on the root.  A root whose class is 0 is also flagged
 * STR_BORROWED.  Nothing happens when `str` is a fake string.  The assertions
 * check that the buffer of `str` lies inside the buffer of `shared_str`. */
200#define STR_SET_SHARED(str, shared_str) do { \
201 if (!FL_TEST(str, STR_FAKESTR)) { \
202 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
203 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
204 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
205 FL_SET((str), STR_SHARED); \
206 FL_SET((shared_str), STR_SHARED_ROOT); \
207 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
208 FL_SET_RAW((shared_str), STR_BORROWED); \
209 } \
210} while (0)
211
/* Heap buffer pointer of a non-embedded string. */
212#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
213#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
214/* TODO: include the terminator size in capa. */
215
/* Encoding of `str` as an rb_encoding pointer. */
216#define STR_ENC_GET(str) get_encoding(str)
217
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING off
 * (the default) only a substring that reaches `end` qualifies; with it on,
 * every substring does. */
218#if !defined SHARABLE_MIDDLE_SUBSTRING
219# define SHARABLE_MIDDLE_SUBSTRING 0
220#endif
221#if !SHARABLE_MIDDLE_SUBSTRING
222#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
223#else
224#define SHARABLE_SUBSTRING_P(beg, len, end) 1
225#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
232}
233
234bool
235rb_str_reembeddable_p(VALUE str)
236{
237 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
238}
239
240static inline size_t
241rb_str_embed_size(long capa)
242{
243 return offsetof(struct RString, as.embed.ary) + capa;
244}
245
246size_t
247rb_str_size_as_embedded(VALUE str)
248{
249 size_t real_size;
250 if (STR_EMBED_P(str)) {
251 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
252 }
253 /* if the string is not currently embedded, but it can be embedded, how
254 * much space would it require */
255 else if (rb_str_reembeddable_p(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
257 }
258 else {
259 real_size = sizeof(struct RString);
260 }
261
262 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
263 real_size += sizeof(st_index_t);
264 }
265
266 return real_size;
267}
268
/* Whether a string of +len+ bytes plus a +termlen+-byte terminator fits in
 * an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
274
275static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
276static VALUE str_new_frozen(VALUE klass, VALUE orig);
277static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
278static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
279static VALUE str_new(VALUE klass, const char *ptr, long len);
280static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
281static inline void str_modifiable(VALUE str);
282static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
283static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
284
285static inline void
286str_make_independent(VALUE str)
287{
288 long len = RSTRING_LEN(str);
289 int termlen = TERM_LEN(str);
290 str_make_independent_expand((str), len, 0L, termlen);
291}
292
293static inline int str_dependent_p(VALUE str);
294
295void
296rb_str_make_independent(VALUE str)
297{
298 if (str_dependent_p(str)) {
299 str_make_independent(str);
300 }
301}
302
303void
304rb_str_make_embedded(VALUE str)
305{
306 RUBY_ASSERT(rb_str_reembeddable_p(str));
307 RUBY_ASSERT(!STR_EMBED_P(str));
308
309 char *buf = RSTRING(str)->as.heap.ptr;
310 long len = RSTRING(str)->len;
311
312 STR_SET_EMBED(str);
313 STR_SET_LEN(str, len);
314
315 if (len > 0) {
316 memcpy(RSTRING_PTR(str), buf, len);
317 ruby_xfree(buf);
318 }
319
320 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
321}
322
/* Debugging aid: print a warning to stderr saying that +func+ is about to
 * return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
332
333/* symbols for [up|down|swap]case/capitalize options */
334static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
335
336static rb_encoding *
337get_encoding(VALUE str)
338{
339 return rb_enc_from_index(ENCODING_GET(str));
340}
341
342static void
343mustnot_broken(VALUE str)
344{
345 if (is_broken_string(str)) {
346 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
347 }
348}
349
350static void
351mustnot_wchar(VALUE str)
352{
353 rb_encoding *enc = STR_ENC_GET(str);
354 if (rb_enc_mbminlen(enc) > 1) {
355 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
356 }
357}
358
359static int fstring_cmp(VALUE a, VALUE b);
360
361static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
362
363#if SIZEOF_LONG == SIZEOF_VOIDP
364#define PRECOMPUTED_FAKESTR_HASH 1
365#else
366#endif
367
#ifdef PRECOMPUTED_FAKESTR_HASH
/*
 * Hash code used by the fstring table.  For fake strings the base hash is
 * read from the capa field, where register_fstring stored it; for real
 * strings it comes from rb_str_hash().
 */
static st_index_t
fstring_hash(VALUE str)
{
    // register_fstring precomputes the hash and stores it in capa for fake strings
    const st_index_t base = FL_TEST_RAW(str, STR_FAKESTR)
        ? (st_index_t)RSTRING(str)->as.heap.aux.capa
        : rb_str_hash(str);

    // rb_str_hash doesn't include the encoding for ascii only strings, so
    // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
    const uint32_t encindex = (uint32_t)ENCODING_GET_INLINED(str);
    return rb_hash_end(rb_hash_uint32(base, encindex));
}
#else
#define fstring_hash rb_str_hash
#endif
387
388static inline bool
389BARE_STRING_P(VALUE str)
390{
391 if (RBASIC_CLASS(str) != rb_cString) return false;
392
393 if (FL_TEST_RAW(str, FL_EXIVAR)) {
394 return rb_ivar_count(str) == 0;
395 }
396 return true;
397}
398
399static inline st_index_t
400str_do_hash(VALUE str)
401{
402 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
403 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
404 if (e && !is_ascii_string(str)) {
405 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
406 }
407 return h;
408}
409
410static VALUE
411str_store_precomputed_hash(VALUE str, st_index_t hash)
412{
413 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
414 RUBY_ASSERT(STR_EMBED_P(str));
415
416#if RUBY_DEBUG
417 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
418 size_t free_bytes = str_embed_capa(str) - used_bytes;
419 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
420#endif
421
422 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
423
424 FL_SET(str, STR_PRECOMPUTED_HASH);
425
426 return str;
427}
428
/* Options handed from register_fstring() to build_fstring(). */
struct fstr_update_arg {
    /* for fake strings: copy the bytes into a new string instead of
     * referencing them through str_new_static() */
    bool copy;
    /* when copying: prefer an embedded string that has room for the
     * precomputed hash */
    bool force_precompute_hash;
};
433
434static VALUE
435build_fstring(VALUE str, struct fstr_update_arg *arg)
436{
437 // Unless the string is empty or binary, its coderange has been precomputed.
438 int coderange = ENC_CODERANGE(str);
439
440 if (FL_TEST_RAW(str, STR_FAKESTR)) {
441 if (arg->copy) {
442 VALUE new_str;
443 long len = RSTRING_LEN(str);
444 long capa = len + sizeof(st_index_t);
445 int term_len = TERM_LEN(str);
446
447 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
448 new_str = str_alloc_embed(rb_cString, capa + term_len);
449 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
450 STR_SET_LEN(new_str, RSTRING_LEN(str));
451 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
452 rb_enc_copy(new_str, str);
453 str_store_precomputed_hash(new_str, str_do_hash(str));
454 }
455 else {
456 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
457 rb_enc_copy(new_str, str);
458#ifdef PRECOMPUTED_FAKESTR_HASH
459 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
460 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
461 }
462#endif
463 }
464 str = new_str;
465 }
466 else {
467 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
468 RSTRING(str)->len,
469 ENCODING_GET(str));
470 }
471 OBJ_FREEZE(str);
472 }
473 else {
474 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
475 str = str_new_frozen(rb_cString, str);
476 }
477 if (STR_SHARED_P(str)) { /* str should not be shared */
478 /* shared substring */
479 str_make_independent(str);
481 }
482 if (!BARE_STRING_P(str)) {
483 str = str_new_frozen(rb_cString, str);
484 }
485 }
486
487 ENC_CODERANGE_SET(str, coderange);
488 RBASIC(str)->flags |= RSTRING_FSTR;
489
492 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
495 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
496
497 return str;
498}
499
500VALUE
501rb_fstring(VALUE str)
502{
503 VALUE fstr;
504 int bare;
505
506 Check_Type(str, T_STRING);
507
508 if (FL_TEST(str, RSTRING_FSTR))
509 return str;
510
511 bare = BARE_STRING_P(str);
512 if (!bare) {
513 if (STR_EMBED_P(str)) {
514 OBJ_FREEZE(str);
515 return str;
516 }
517
518 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
520 return str;
521 }
522 }
523
524 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
525 rb_str_resize(str, RSTRING_LEN(str));
526
527 fstr = register_fstring(str, false, false);
528
529 if (!bare) {
530 str_replace_shared_without_enc(str, fstr);
531 OBJ_FREEZE(str);
532 return str;
533 }
534 return fstr;
535}
536
/* Sentinel values stored in the `str` field of a table entry:
 *   EMPTY     - the slot has never held a string
 *   TOMBSTONE - the slot held a string that has been deleted
 *   MOVED     - the slot has been taken over by a resize in progress */
537#define FSTRING_TABLE_EMPTY Qfalse
538#define FSTRING_TABLE_TOMBSTONE Qtrue
539#define FSTRING_TABLE_MOVED Qundef
540
542 VALUE str;
543 VALUE hash;
544};
545
547 struct fstring_table_entry *entries;
548 unsigned int capacity;
549 unsigned int deleted_entries;
550 rb_atomic_t count; // TODO: pad to own cache line?
551};
552
553static void
554fstring_table_free(void *ptr)
555{
556 struct fstring_table_struct *table = ptr;
557 xfree(table->entries);
558}
559
560static size_t
561fstring_table_size(const void *ptr)
562{
563 const struct fstring_table_struct *table = ptr;
564 return sizeof(struct fstring_table_struct) + sizeof(struct fstring_table_entry) * table->capacity;
565}
566
567// We declare a type for the table so that we can lean on Ruby's GC for deferred reclamation
// No mark function is registered; only the free and size callbacks are set.
568static const rb_data_type_t fstring_table_type = {
569 .wrap_struct_name = "VM/fstring_table",
570 .function = {
571 .dmark = NULL,
572 .dfree = fstring_table_free,
573 .dsize = fstring_table_size,
574 },
575 .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE
576};
577
578
579static VALUE fstring_table_obj;
580
581static VALUE
582new_fstring_table(int capacity)
583{
584 VALUE obj;
585 struct fstring_table_struct *table;
586 obj = TypedData_Make_Struct(0, struct fstring_table_struct, &fstring_table_type, table);
587 table->capacity = capacity;
588 table->count = 0;
589 table->entries = ZALLOC_N(struct fstring_table_entry, capacity);
590 return obj;
591}
592
593void
594Init_fstring_table(void)
595{
596 fstring_table_obj = new_fstring_table(8192);
597 rb_gc_register_address(&fstring_table_obj);
598}
599
600#if 0
601
602// Linear probe
// (Disabled alternative: this variant sits inside `#if 0`; the `#else`
// branch below is the one that gets compiled.)
603struct fstring_table_probe {
604 int idx;
605 int mask;
606};
607
// Start probing at the slot selected by the low bits of hash_code.
608static int
609fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
610{
611 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
612 probe->mask = table->capacity - 1;
613 probe->idx = hash_code & probe->mask;
614 return probe->idx;
615}
616
// Advance to the next slot, wrapping around through the mask.
617static int
618fstring_table_probe_next(struct fstring_table_probe *probe)
619{
620 probe->idx = (probe->idx + 1) & probe->mask;
621 return probe->idx;
622}
623
624#else
625
// Struct containing probe information. Intended that the compiler should always inline this
// Quadratic probing
struct fstring_table_probe {
    int idx;  // slot index currently being examined
    int d;    // step added to idx on the next advance; grows by one each time
    int mask; // capacity - 1, used to wrap idx
};
633
634static int
635fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
636{
637 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
638 probe->d = 0;
639 probe->mask = table->capacity - 1;
640 probe->idx = hash_code & probe->mask;
641 return probe->idx;
642}
643
644static int
645fstring_table_probe_next(struct fstring_table_probe *probe)
646{
647 probe->d++;
648 probe->idx = (probe->idx + probe->d) & probe->mask;
649 return probe->idx;
650}
651#endif
652
/* Atomically load `x` and yield it as a VALUE.  The whole expansion is
 * parenthesized so it behaves as a single operand wherever it is used. */
#define RUBY_ATOMIC_VALUE_LOAD(x) ((VALUE)(RUBY_ATOMIC_PTR_LOAD(x)))
654
655static void
656fstring_insert_on_resize(struct fstring_table_struct *table, VALUE hash_code, VALUE value)
657{
658 struct fstring_table_probe probe;
659 int idx = fstring_table_probe_start(&probe, table, hash_code);
660
661 for (;;) {
662 struct fstring_table_entry *entry = &table->entries[idx];
663 VALUE candidate = entry->str;
664
665 RUBY_ASSERT(candidate != FSTRING_TABLE_TOMBSTONE);
666 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
667
668 if (candidate == FSTRING_TABLE_EMPTY) {
669 table->count++;
670
671 RUBY_ASSERT(table->count < table->capacity / 2);
672 RUBY_ASSERT(entry->hash == 0);
673
674 entry->str = value;
675 entry->hash = hash_code;
676 return;
677 }
678
679 idx = fstring_table_probe_next(&probe);
680 }
681}
682
683// Rebuilds the table
684static void
685fstring_try_resize(VALUE old_table_obj)
686{
687 RB_VM_LOCK_ENTER();
688
689 // Check if another thread has already resized
690 if (RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj) != old_table_obj) {
691 goto end;
692 }
693
694 struct fstring_table_struct *old_table = RTYPEDDATA_GET_DATA(old_table_obj);
695
696 // This may overcount by up to the number of threads concurrently attempting to insert
697 // GC may also happen between now and the table being rebuilt
698 int expected_count = RUBY_ATOMIC_LOAD(old_table->count) - old_table->deleted_entries;
699
700 struct fstring_table_entry *old_entries = old_table->entries;
701 int old_capacity = old_table->capacity;
702 int new_capacity = old_capacity * 2;
703 if (new_capacity > expected_count * 8) {
704 new_capacity = old_capacity / 2;
705 }
706 else if (new_capacity > expected_count * 4) {
707 new_capacity = old_capacity;
708 }
709
710 // May cause GC and therefore deletes, so must hapen first
711 VALUE new_table_obj = new_fstring_table(new_capacity);
712 struct fstring_table_struct *new_table = RTYPEDDATA_GET_DATA(new_table_obj);
713
714 for (int i = 0; i < old_capacity; i++) {
715 struct fstring_table_entry *entry = &old_entries[i];
716 VALUE val = RUBY_ATOMIC_VALUE_EXCHANGE(entry->str, FSTRING_TABLE_MOVED);
717 RUBY_ASSERT(val != FSTRING_TABLE_MOVED);
718 if (val == FSTRING_TABLE_EMPTY) continue;
719 if (val == FSTRING_TABLE_TOMBSTONE) continue;
720 if (rb_objspace_garbage_object_p(val)) continue;
721
722 VALUE hash_code = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
723 if (hash_code == 0) {
724 // Either in-progress insert or extremely unlikely 0 hash
725 // Re-calculate the hash ourselves
726 hash_code = fstring_hash(val);
727 }
728 RUBY_ASSERT(hash_code == fstring_hash(val));
729 fstring_insert_on_resize(new_table, hash_code, val);
730 }
731
732#if 0
733 fprintf(stderr, "resized: %p(%i) -> %p(%i) (count: %i->%i)\n", old_table, old_table->capacity, new_table, new_table->capacity, old_table->count, new_table->count);
734#endif
735
736 RUBY_ATOMIC_VALUE_SET(fstring_table_obj, new_table_obj);
737
738end:
739 RB_GC_GUARD(old_table_obj);
740 RB_VM_LOCK_LEAVE();
741}
742
/*
 * Look up a string equal to `value` (per fstring_cmp) in the current table
 * and return it.  If none exists, turn `value` into a table-ready string
 * with build_fstring() and insert it with a compare-and-swap.
 *
 * hash_code must be fstring_hash(value).  `arg` is passed on to
 * build_fstring().  Returns the string that ends up in the table.
 */
743static VALUE
744fstring_find_or_insert(VALUE hash_code, VALUE value, struct fstr_update_arg *arg)
745{
746 struct fstring_table_probe probe;
// set once build_fstring() has run, so it runs at most once across retries
747 bool inserting = false;
748 int idx;
749 VALUE table_obj;
750 struct fstring_table_struct *table;
751
// Restart point: reload the current table and begin a fresh probe sequence.
752 retry:
753 table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
754 RUBY_ASSERT(table_obj);
755 table = RTYPEDDATA_GET_DATA(table_obj);
756 idx = fstring_table_probe_start(&probe, table, hash_code);
757
758 for (;;) {
759 struct fstring_table_entry *entry = &table->entries[idx];
760 VALUE candidate = RUBY_ATOMIC_VALUE_LOAD(entry->str);
761
762 if (candidate == FSTRING_TABLE_EMPTY) {
763 // Not in table
764 if (!inserting) {
765 // Prepare a string suitable for inserting into the table
766 value = build_fstring(value, arg);
767 RUBY_ASSERT(hash_code == fstring_hash(value));
768 inserting = true;
769 }
770
// Reserve a slot in the count before trying to claim the entry.
771 unsigned int prev_count = RUBY_ATOMIC_FETCH_ADD(table->count, 1);
772
// More than half full: rebuild the table and start over on the new one.
773 if (UNLIKELY(prev_count > table->capacity / 2)) {
774 fstring_try_resize(table_obj);
775 goto retry;
776 }
777
778 VALUE found = RUBY_ATOMIC_VALUE_CAS(entry->str, FSTRING_TABLE_EMPTY, value);
779 if (found == FSTRING_TABLE_EMPTY) {
780 // Success! Our value was inserted
781
782 // Also set the hash code
// (stored after the string, so readers may briefly see a hash of 0)
783 RUBY_ATOMIC_VALUE_SET(entry->hash, hash_code);
784
785 RB_GC_GUARD(table_obj);
786 return value;
787 }
788 else {
789 // Nothing was inserted
790 RUBY_ATOMIC_DEC(table->count); // we didn't end up inserting
791
792 // Another thread won the race, try again at the same location
793 continue;
794 }
795 }
796 else if (candidate == FSTRING_TABLE_TOMBSTONE) {
797 // Deleted entry, continue searching
798 }
799 else if (candidate == FSTRING_TABLE_MOVED) {
800 // Wait
// fstring_try_resize() holds the VM lock while it marks entries MOVED, so
// taking and releasing the lock here blocks until that resize has finished.
801 RB_VM_LOCK_ENTER();
802 RB_VM_LOCK_LEAVE();
803
804 goto retry;
805 }
806 else {
807 VALUE candidate_hash = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
// A hash of 0 is treated as "not stored yet" and falls back to comparing contents.
808 if ((candidate_hash == hash_code || candidate_hash == 0) && !fstring_cmp(candidate, value)) {
809 // We've found a match
810 if (UNLIKELY(rb_objspace_garbage_object_p(candidate))) {
811 // This is a weakref table, so after marking but before sweeping is complete we may find a matching garbage object.
812 // Skip it and mark it as a tombstone to help other threads out
813 RUBY_ATOMIC_VALUE_CAS(entry->str, candidate, FSTRING_TABLE_TOMBSTONE);
814
815 // Fall through and continue our search
816 }
817 else {
818 RB_GC_GUARD(table_obj);
819 return candidate;
820 }
821 }
822 }
823
824 idx = fstring_table_probe_next(&probe);
825 }
826}
827
828
829// Removes an fstring from the table. Compares by identity
830static void
831fstring_delete(VALUE hash_code, VALUE value)
832{
833 // Delete is never called concurrently, so atomic operations are unnecessary
834 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
835 RUBY_ASSERT_ALWAYS(table_obj);
836 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
837
838 struct fstring_table_probe probe;
839 int idx = fstring_table_probe_start(&probe, table, hash_code);
840
841 for (;;) {
842 struct fstring_table_entry *entry = &table->entries[idx];
843 VALUE candidate = entry->str;
844
845 // Allocations should only occur at the beginning of the resize
846 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
847
848 if (candidate == FSTRING_TABLE_EMPTY) {
849 // We didn't find our string to delete
850 return;
851 }
852 else if (candidate == value) {
853 // We found our string, replace it with a tombstone and increment the count
854 entry->str = FSTRING_TABLE_TOMBSTONE;
855 table->deleted_entries++;
856 return;
857 }
858
859 idx = fstring_table_probe_next(&probe);
860 }
861}
862
863static VALUE
864register_fstring(VALUE str, bool copy, bool force_precompute_hash)
865{
866 struct fstr_update_arg args = {
867 .copy = copy,
868 .force_precompute_hash = force_precompute_hash
869 };
870
871#if SIZEOF_VOIDP == SIZEOF_LONG
872 if (FL_TEST_RAW(str, STR_FAKESTR)) {
873 // if the string hasn't been interned, we'll need the hash twice, so we
874 // compute it once and store it in capa
875 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
876 }
877#endif
878
879 VALUE hash_code = fstring_hash(str);
880 VALUE result = fstring_find_or_insert(hash_code, str, &args);
881
882 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
884 RUBY_ASSERT(OBJ_FROZEN(result));
885 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
887
888 return result;
889}
890
891void
892rb_fstring_foreach_with_replace(st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg)
893{
894 // Assume locking and barrier (which there is no assert for)
895 ASSERT_vm_locking();
896
897 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
898 if (!table_obj) {
899 // Table not yet initialized. Nothing to iterate over
900 return;
901 }
902 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
903
904 for (unsigned int i = 0; i < table->capacity; i++) {
905 VALUE key = table->entries[i].str;
906 if(key == FSTRING_TABLE_EMPTY) continue;
907 if(key == FSTRING_TABLE_TOMBSTONE) continue;
908
909 enum st_retval retval;
910 retval = (*func)(key, key, arg, 0);
911
912 if (retval == ST_REPLACE && replace) {
913 st_data_t value = key;
914 retval = (*replace)(&key, &value, arg, TRUE);
915 table->entries[i].str = key;
916 }
917 switch (retval) {
918 case ST_REPLACE:
919 case ST_CONTINUE:
920 break;
921 case ST_CHECK:
922 rb_bug("unsupported");
923 case ST_STOP:
924 return;
925 case ST_DELETE:
926 table->entries[i].str = FSTRING_TABLE_TOMBSTONE;
927 break;
928 }
929 }
930}
931
932bool
933rb_obj_is_fstring_table(VALUE obj)
934{
935 ASSERT_vm_locking();
936
937 return obj == fstring_table_obj;
938}
939
940void
941rb_gc_free_fstring(VALUE obj)
942{
943 // Assume locking and barrier (which there is no assert for)
944 ASSERT_vm_locking();
945
946 VALUE str_hash = fstring_hash(obj);
947 fstring_delete(str_hash, obj);
948
949 RB_DEBUG_COUNTER_INC(obj_str_fstr);
950
951 FL_UNSET(obj, RSTRING_FSTR);
952}
953
954static VALUE
955setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
956{
957 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
958
959 if (!name) {
961 name = "";
962 }
963
964 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
965
966 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
967 fake_str->len = len;
968 fake_str->as.heap.ptr = (char *)name;
969 fake_str->as.heap.aux.capa = len;
970 return (VALUE)fake_str;
971}
972
973/*
974 * set up a fake string which refers a static string literal.
975 */
976VALUE
977rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
978{
979 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
980}
981
982/*
983 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
984 * shared string which refers a static string literal. `ptr` must
985 * point a constant string.
986 */
987VALUE
988rb_fstring_new(const char *ptr, long len)
989{
990 struct RString fake_str;
991 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
992}
993
994VALUE
995rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
996{
997 struct RString fake_str;
998 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
999}
1000
1001VALUE
1002rb_fstring_cstr(const char *ptr)
1003{
1004 return rb_fstring_new(ptr, strlen(ptr));
1005}
1006
1007static int
1008fstring_cmp(VALUE a, VALUE b)
1009{
1010 long alen, blen;
1011 const char *aptr, *bptr;
1012
1015
1016 RSTRING_GETMEM(a, aptr, alen);
1017 RSTRING_GETMEM(b, bptr, blen);
1018 return (alen != blen ||
1019 ENCODING_GET(a) != ENCODING_GET(b) ||
1020 memcmp(aptr, bptr, alen) != 0);
1021}
1022
1023static inline bool
1024single_byte_optimizable(VALUE str)
1025{
1026 int encindex = ENCODING_GET(str);
1027 switch (encindex) {
1028 case ENCINDEX_ASCII_8BIT:
1029 case ENCINDEX_US_ASCII:
1030 return true;
1031 case ENCINDEX_UTF_8:
1032 // For UTF-8 it's worth scanning the string coderange when unknown.
1034 }
1035 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
1036 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
1037 return true;
1038 }
1039
1040 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
1041 return true;
1042 }
1043
1044 /* Conservative. Possibly single byte.
1045 * "\xa1" in Shift_JIS for example. */
1046 return false;
1047}
1048
1050
/*
 * Find the first byte in [p, e) that has its high bit (0x80) set.
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * Whole machine words are tested against NONASCII_MASK; the bytes before the
 * first aligned word and after the last full word are tested one by one.
 */
1051static inline const char *
1052search_nonascii(const char *p, const char *e)
1053{
1054 const uintptr_t *s, *t;
1055
/* NONASCII_MASK: a word with 0x80 in every byte. */
1056#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
1057# if SIZEOF_UINTPTR_T == 8
1058# define NONASCII_MASK UINT64_C(0x8080808080808080)
1059# elif SIZEOF_UINTPTR_T == 4
1060# define NONASCII_MASK UINT32_C(0x80808080)
1061# else
1062# error "don't know what to do."
1063# endif
1064#else
1065# if SIZEOF_UINTPTR_T == 8
1066# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
1067# elif SIZEOF_UINTPTR_T == 4
1068# define NONASCII_MASK 0x80808080UL /* or...? */
1069# else
1070# error "don't know what to do."
1071# endif
1072#endif
1073
1074 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
1075#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking the skipped bytes one by
 * one.  The case labels fall through on purpose. */
1076 if ((uintptr_t)p % SIZEOF_VOIDP) {
1077 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
1078 p += l;
1079 switch (l) {
1080 default: UNREACHABLE;
1081#if SIZEOF_VOIDP > 4
1082 case 7: if (p[-7]&0x80) return p-7;
1083 case 6: if (p[-6]&0x80) return p-6;
1084 case 5: if (p[-5]&0x80) return p-5;
1085 case 4: if (p[-4]&0x80) return p-4;
1086#endif
1087 case 3: if (p[-3]&0x80) return p-3;
1088 case 2: if (p[-2]&0x80) return p-2;
1089 case 1: if (p[-1]&0x80) return p-1;
1090 case 0: break;
1091 }
1092 }
1093#endif
1094#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
1095#define aligned_ptr(value) \
1096 __builtin_assume_aligned((value), sizeof(uintptr_t))
1097#else
1098#define aligned_ptr(value) (uintptr_t *)(value)
1099#endif
1100 s = aligned_ptr(p);
/* t bounds the word loop so that a full word is never read past e */
1101 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
1102#undef aligned_ptr
1103 for (;s < t; s++) {
1104 if (*s & NONASCII_MASK) {
/* locate the first flagged byte within the word: leading zero bits on
 * big-endian, trailing zero bits on little-endian, divided by 8 */
1105#ifdef WORDS_BIGENDIAN
1106 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
1107#else
1108 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
1109#endif
1110 }
1111 }
1112 p = (const char *)s;
1113 }
1114
/* Fewer than a word of bytes remain: check them one by one, again with
 * intentional fall-through. */
1115 switch (e - p) {
1116 default: UNREACHABLE;
1117#if SIZEOF_VOIDP > 4
1118 case 7: if (e[-7]&0x80) return e-7;
1119 case 6: if (e[-6]&0x80) return e-6;
1120 case 5: if (e[-5]&0x80) return e-5;
1121 case 4: if (e[-4]&0x80) return e-4;
1122#endif
1123 case 3: if (e[-3]&0x80) return e-3;
1124 case 2: if (e[-2]&0x80) return e-2;
1125 case 1: if (e[-1]&0x80) return e-1;
1126 case 0: return NULL;
1127 }
1128}
1129
1130static int
1131coderange_scan(const char *p, long len, rb_encoding *enc)
1132{
1133 const char *e = p + len;
1134
1135 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1136 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
1137 p = search_nonascii(p, e);
1139 }
1140
1141 if (rb_enc_asciicompat(enc)) {
1142 p = search_nonascii(p, e);
1143 if (!p) return ENC_CODERANGE_7BIT;
1144 for (;;) {
1145 int ret = rb_enc_precise_mbclen(p, e, enc);
1146 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1147 p += MBCLEN_CHARFOUND_LEN(ret);
1148 if (p == e) break;
1149 p = search_nonascii(p, e);
1150 if (!p) break;
1151 }
1152 }
1153 else {
1154 while (p < e) {
1155 int ret = rb_enc_precise_mbclen(p, e, enc);
1156 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1157 p += MBCLEN_CHARFOUND_LEN(ret);
1158 }
1159 }
1160 return ENC_CODERANGE_VALID;
1161}
1162
1163long
1164rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
1165{
1166 const char *p = s;
1167
1168 if (*cr == ENC_CODERANGE_BROKEN)
1169 return e - s;
1170
1171 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1172 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
1173 if (*cr == ENC_CODERANGE_VALID) return e - s;
1174 p = search_nonascii(p, e);
1176 return e - s;
1177 }
1178 else if (rb_enc_asciicompat(enc)) {
1179 p = search_nonascii(p, e);
1180 if (!p) {
1181 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
1182 return e - s;
1183 }
1184 for (;;) {
1185 int ret = rb_enc_precise_mbclen(p, e, enc);
1186 if (!MBCLEN_CHARFOUND_P(ret)) {
1188 return p - s;
1189 }
1190 p += MBCLEN_CHARFOUND_LEN(ret);
1191 if (p == e) break;
1192 p = search_nonascii(p, e);
1193 if (!p) break;
1194 }
1195 }
1196 else {
1197 while (p < e) {
1198 int ret = rb_enc_precise_mbclen(p, e, enc);
1199 if (!MBCLEN_CHARFOUND_P(ret)) {
1201 return p - s;
1202 }
1203 p += MBCLEN_CHARFOUND_LEN(ret);
1204 }
1205 }
1206 *cr = ENC_CODERANGE_VALID;
1207 return e - s;
1208}
1209
1210static inline void
1211str_enc_copy(VALUE str1, VALUE str2)
1212{
1213 rb_enc_set_index(str1, ENCODING_GET(str2));
1214}
1215
1216/* Like str_enc_copy, but does not check frozen status of str1.
1217 * You should use this only if you're certain that str1 is not frozen. */
1218static inline void
1219str_enc_copy_direct(VALUE str1, VALUE str2)
1220{
1221 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
1222 if (inlined_encoding == ENCODING_INLINE_MAX) {
1223 rb_enc_set_index(str1, rb_enc_get_index(str2));
1224 }
1225 else {
1226 ENCODING_SET_INLINED(str1, inlined_encoding);
1227 }
1228}
1229
1230static void
1231rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
1232{
1233 /* this function is designed for copying encoding and coderange
1234 * from src to new string "dest" which is made from the part of src.
1235 */
1236 str_enc_copy(dest, src);
1237 if (RSTRING_LEN(dest) == 0) {
1238 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
1240 else
1242 return;
1243 }
1244 switch (ENC_CODERANGE(src)) {
1245 case ENC_CODERANGE_7BIT:
1247 break;
1249 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
1250 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
1252 else
1254 break;
1255 default:
1256 break;
1257 }
1258}
1259
1260static void
1261rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
1262{
1263 str_enc_copy(dest, src);
1264 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
1265}
1266
1267static int
1268enc_coderange_scan(VALUE str, rb_encoding *enc)
1269{
1270 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
1271}
1272
1273int
1274rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
1275{
1276 return enc_coderange_scan(str, enc);
1277}
1278
1279int
1281{
1282 int cr = ENC_CODERANGE(str);
1283
1284 if (cr == ENC_CODERANGE_UNKNOWN) {
1285 cr = enc_coderange_scan(str, get_encoding(str));
1286 ENC_CODERANGE_SET(str, cr);
1287 }
1288 return cr;
1289}
1290
1291static inline bool
1292rb_enc_str_asciicompat(VALUE str)
1293{
1294 int encindex = ENCODING_GET_INLINED(str);
1295 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
1296}
1297
1298int
1300{
1301 switch(ENC_CODERANGE(str)) {
1303 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
1304 case ENC_CODERANGE_7BIT:
1305 return true;
1306 default:
1307 return false;
1308 }
1309}
1310
1311static inline void
1312str_mod_check(VALUE s, const char *p, long len)
1313{
1314 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
1315 rb_raise(rb_eRuntimeError, "string modified");
1316 }
1317}
1318
1319static size_t
1320str_capacity(VALUE str, const int termlen)
1321{
1322 if (STR_EMBED_P(str)) {
1323 return str_embed_capa(str) - termlen;
1324 }
1325 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1326 return RSTRING(str)->len;
1327 }
1328 else {
1329 return RSTRING(str)->as.heap.aux.capa;
1330 }
1331}
1332
1333size_t
1335{
1336 return str_capacity(str, TERM_LEN(str));
1337}
1338
1339static inline void
1340must_not_null(const char *ptr)
1341{
1342 if (!ptr) {
1343 rb_raise(rb_eArgError, "NULL pointer given");
1344 }
1345}
1346
1347static inline VALUE
1348str_alloc_embed(VALUE klass, size_t capa)
1349{
1350 size_t size = rb_str_embed_size(capa);
1351 RUBY_ASSERT(size > 0);
1352 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1353
1354 NEWOBJ_OF(str, struct RString, klass,
1356
1357 return (VALUE)str;
1358}
1359
1360static inline VALUE
1361str_alloc_heap(VALUE klass)
1362{
1363 NEWOBJ_OF(str, struct RString, klass,
1364 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1365
1366 return (VALUE)str;
1367}
1368
/*
 * Allocate an embedded string of class klass with a requested capacity of
 * 0 and zero-fill its whole embed area.
 *
 * NOTE(review): the listing numbers jump from 1374 to 1376, so one source
 * line between the memset and the return appears to have been dropped
 * from this copy -- confirm against the upstream file.
 */
1369static inline VALUE
1370empty_str_alloc(VALUE klass)
1371{
1372 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1373 VALUE str = str_alloc_embed(klass, 0);
/* clear all str_embed_capa(str) bytes of the embedded buffer */
1374 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1376 return str;
1377}
1378
1379static VALUE
1380str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1381{
1382 VALUE str;
1383
1384 if (len < 0) {
1385 rb_raise(rb_eArgError, "negative string size (or size too big)");
1386 }
1387
1388 if (enc == NULL) {
1389 enc = rb_ascii8bit_encoding();
1390 }
1391
1392 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1393
1394 int termlen = rb_enc_mbminlen(enc);
1395
1396 if (STR_EMBEDDABLE_P(len, termlen)) {
1397 str = str_alloc_embed(klass, len + termlen);
1398 if (len == 0) {
1399 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1400 }
1401 }
1402 else {
1403 str = str_alloc_heap(klass);
1404 RSTRING(str)->as.heap.aux.capa = len;
1405 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1406 * integer overflow. If we can STATIC_ASSERT that, the following
1407 * mul_add_mul can be reverted to a simple ALLOC_N. */
1408 RSTRING(str)->as.heap.ptr =
1409 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1410 }
1411
1412 rb_enc_raw_set(str, enc);
1413
1414 if (ptr) {
1415 memcpy(RSTRING_PTR(str), ptr, len);
1416 }
1417
1418 STR_SET_LEN(str, len);
1419 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1420 return str;
1421}
1422
1423static VALUE
1424str_new(VALUE klass, const char *ptr, long len)
1425{
1426 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1427}
1428
1429VALUE
1430rb_str_new(const char *ptr, long len)
1431{
1432 return str_new(rb_cString, ptr, len);
1433}
1434
1435VALUE
1436rb_usascii_str_new(const char *ptr, long len)
1437{
1438 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1439}
1440
1441VALUE
1442rb_utf8_str_new(const char *ptr, long len)
1443{
1444 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1445}
1446
1447VALUE
1448rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1449{
1450 return str_enc_new(rb_cString, ptr, len, enc);
1451}
1452
1453VALUE
1455{
1456 must_not_null(ptr);
1457 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1458 * memory regions, and that cannot be detected by the MSAN. Just
1459 * trust the programmer that the argument passed here is a sane C
1460 * string. */
1461 __msan_unpoison_string(ptr);
1462 return rb_str_new(ptr, strlen(ptr));
1463}
1464
1465VALUE
1467{
1468 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1469}
1470
1471VALUE
1473{
1474 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1475}
1476
1477VALUE
1479{
1480 must_not_null(ptr);
1481 if (rb_enc_mbminlen(enc) != 1) {
1482 rb_raise(rb_eArgError, "wchar encoding given");
1483 }
1484 return rb_enc_str_new(ptr, strlen(ptr), enc);
1485}
1486
1487static VALUE
1488str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1489{
1490 VALUE str;
1491
1492 if (len < 0) {
1493 rb_raise(rb_eArgError, "negative string size (or size too big)");
1494 }
1495
1496 if (!ptr) {
1497 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1498 }
1499 else {
1500 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1501 str = str_alloc_heap(klass);
1502 RSTRING(str)->len = len;
1503 RSTRING(str)->as.heap.ptr = (char *)ptr;
1504 RSTRING(str)->as.heap.aux.capa = len;
1505 RBASIC(str)->flags |= STR_NOFREE;
1506 rb_enc_associate_index(str, encindex);
1507 }
1508 return str;
1509}
1510
1511VALUE
1512rb_str_new_static(const char *ptr, long len)
1513{
1514 return str_new_static(rb_cString, ptr, len, 0);
1515}
1516
1517VALUE
1519{
1520 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1521}
1522
1523VALUE
1525{
1526 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1527}
1528
1529VALUE
1531{
1532 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1533}
1534
1535static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1536 rb_encoding *from, rb_encoding *to,
1537 int ecflags, VALUE ecopts);
1538
1539static inline bool
1540is_enc_ascii_string(VALUE str, rb_encoding *enc)
1541{
1542 int encidx = rb_enc_to_index(enc);
1543 if (rb_enc_get_index(str) == encidx)
1544 return is_ascii_string(str);
1545 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1546}
1547
1548VALUE
1549rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1550{
1551 long len;
1552 const char *ptr;
1553 VALUE newstr;
1554
1555 if (!to) return str;
1556 if (!from) from = rb_enc_get(str);
1557 if (from == to) return str;
1558 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1559 rb_is_ascii8bit_enc(to)) {
1560 if (STR_ENC_GET(str) != to) {
1561 str = rb_str_dup(str);
1562 rb_enc_associate(str, to);
1563 }
1564 return str;
1565 }
1566
1567 RSTRING_GETMEM(str, ptr, len);
1568 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1569 from, to, ecflags, ecopts);
1570 if (NIL_P(newstr)) {
1571 /* some error, return original */
1572 return str;
1573 }
1574 return newstr;
1575}
1576
1577VALUE
1578rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1579 rb_encoding *from, int ecflags, VALUE ecopts)
1580{
1581 long olen;
1582
1583 olen = RSTRING_LEN(newstr);
1584 if (ofs < -olen || olen < ofs)
1585 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1586 if (ofs < 0) ofs += olen;
1587 if (!from) {
1588 STR_SET_LEN(newstr, ofs);
1589 return rb_str_cat(newstr, ptr, len);
1590 }
1591
1592 rb_str_modify(newstr);
1593 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1594 rb_enc_get(newstr),
1595 ecflags, ecopts);
1596}
1597
1598VALUE
1599rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1600{
1601 STR_SET_LEN(str, 0);
1602 rb_enc_associate(str, enc);
1603 rb_str_cat(str, ptr, len);
1604 return str;
1605}
1606
1607static VALUE
1608str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1609 rb_encoding *from, rb_encoding *to,
1610 int ecflags, VALUE ecopts)
1611{
1612 rb_econv_t *ec;
1614 long olen;
1615 VALUE econv_wrapper;
1616 const unsigned char *start, *sp;
1617 unsigned char *dest, *dp;
1618 size_t converted_output = (size_t)ofs;
1619
1620 olen = rb_str_capacity(newstr);
1621
1622 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1623 RBASIC_CLEAR_CLASS(econv_wrapper);
1624 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1625 if (!ec) return Qnil;
1626 DATA_PTR(econv_wrapper) = ec;
1627
1628 sp = (unsigned char*)ptr;
1629 start = sp;
1630 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1631 (dp = dest + converted_output),
1632 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1634 /* destination buffer short */
1635 size_t converted_input = sp - start;
1636 size_t rest = len - converted_input;
1637 converted_output = dp - dest;
1638 rb_str_set_len(newstr, converted_output);
1639 if (converted_input && converted_output &&
1640 rest < (LONG_MAX / converted_output)) {
1641 rest = (rest * converted_output) / converted_input;
1642 }
1643 else {
1644 rest = olen;
1645 }
1646 olen += rest < 2 ? 2 : rest;
1647 rb_str_resize(newstr, olen);
1648 }
1649 DATA_PTR(econv_wrapper) = 0;
1650 RB_GC_GUARD(econv_wrapper);
1651 rb_econv_close(ec);
1652 switch (ret) {
1653 case econv_finished:
1654 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1655 rb_str_set_len(newstr, len);
1656 rb_enc_associate(newstr, to);
1657 return newstr;
1658
1659 default:
1660 return Qnil;
1661 }
1662}
1663
1664VALUE
1666{
1667 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1668}
1669
1670VALUE
1672{
1673 rb_encoding *ienc;
1674 VALUE str;
1675 const int eidx = rb_enc_to_index(eenc);
1676
1677 if (!ptr) {
1678 return rb_enc_str_new(ptr, len, eenc);
1679 }
1680
1681 /* ASCII-8BIT case, no conversion */
1682 if ((eidx == rb_ascii8bit_encindex()) ||
1683 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1684 return rb_str_new(ptr, len);
1685 }
1686 /* no default_internal or same encoding, no conversion */
1687 ienc = rb_default_internal_encoding();
1688 if (!ienc || eenc == ienc) {
1689 return rb_enc_str_new(ptr, len, eenc);
1690 }
1691 /* ASCII compatible, and ASCII only string, no conversion in
1692 * default_internal */
1693 if ((eidx == rb_ascii8bit_encindex()) ||
1694 (eidx == rb_usascii_encindex()) ||
1695 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1696 return rb_enc_str_new(ptr, len, ienc);
1697 }
1698 /* convert from the given encoding to default_internal */
1699 str = rb_enc_str_new(NULL, 0, ienc);
1700 /* when the conversion failed for some reason, just ignore the
1701 * default_internal and result in the given encoding as-is. */
1702 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1703 rb_str_initialize(str, ptr, len, eenc);
1704 }
1705 return str;
1706}
1707
1708VALUE
1709rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1710{
1711 int eidx = rb_enc_to_index(eenc);
1712 if (eidx == rb_usascii_encindex() &&
1713 !is_ascii_string(str)) {
1714 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1715 return str;
1716 }
1717 rb_enc_associate_index(str, eidx);
1718 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1719}
1720
1721VALUE
1722rb_external_str_new(const char *ptr, long len)
1723{
1724 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1725}
1726
1727VALUE
1729{
1730 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1731}
1732
1733VALUE
1734rb_locale_str_new(const char *ptr, long len)
1735{
1736 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1737}
1738
1739VALUE
1741{
1742 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1743}
1744
1745VALUE
1747{
1748 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1749}
1750
1751VALUE
1753{
1754 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1755}
1756
1757VALUE
1759{
1760 return rb_str_export_to_enc(str, rb_default_external_encoding());
1761}
1762
1763VALUE
1765{
1766 return rb_str_export_to_enc(str, rb_locale_encoding());
1767}
1768
1769VALUE
1771{
1772 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1773}
1774
1775static VALUE
1776str_replace_shared_without_enc(VALUE str2, VALUE str)
1777{
1778 const int termlen = TERM_LEN(str);
1779 char *ptr;
1780 long len;
1781
1782 RSTRING_GETMEM(str, ptr, len);
1783 if (str_embed_capa(str2) >= len + termlen) {
1784 char *ptr2 = RSTRING(str2)->as.embed.ary;
1785 STR_SET_EMBED(str2);
1786 memcpy(ptr2, RSTRING_PTR(str), len);
1787 TERM_FILL(ptr2+len, termlen);
1788 }
1789 else {
1790 VALUE root;
1791 if (STR_SHARED_P(str)) {
1792 root = RSTRING(str)->as.heap.aux.shared;
1793 RSTRING_GETMEM(str, ptr, len);
1794 }
1795 else {
1796 root = rb_str_new_frozen(str);
1797 RSTRING_GETMEM(root, ptr, len);
1798 }
1799 RUBY_ASSERT(OBJ_FROZEN(root));
1800
1801 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1802 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1803 rb_fatal("about to free a possible shared root");
1804 }
1805 char *ptr2 = STR_HEAP_PTR(str2);
1806 if (ptr2 != ptr) {
1807 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1808 }
1809 }
1810 FL_SET(str2, STR_NOEMBED);
1811 RSTRING(str2)->as.heap.ptr = ptr;
1812 STR_SET_SHARED(str2, root);
1813 }
1814
1815 STR_SET_LEN(str2, len);
1816
1817 return str2;
1818}
1819
1820static VALUE
1821str_replace_shared(VALUE str2, VALUE str)
1822{
1823 str_replace_shared_without_enc(str2, str);
1824 rb_enc_cr_str_exact_copy(str2, str);
1825 return str2;
1826}
1827
1828static VALUE
1829str_new_shared(VALUE klass, VALUE str)
1830{
1831 return str_replace_shared(str_alloc_heap(klass), str);
1832}
1833
1834VALUE
1836{
1837 return str_new_shared(rb_obj_class(str), str);
1838}
1839
1840VALUE
1842{
1843 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1844 return str_new_frozen(rb_obj_class(orig), orig);
1845}
1846
1847static VALUE
1848rb_str_new_frozen_String(VALUE orig)
1849{
1850 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1851 return str_new_frozen(rb_cString, orig);
1852}
1853
1854
1855VALUE
1856rb_str_frozen_bare_string(VALUE orig)
1857{
1858 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1859 return str_new_frozen(rb_cString, orig);
1860}
1861
1862VALUE
1863rb_str_tmp_frozen_acquire(VALUE orig)
1864{
1865 if (OBJ_FROZEN_RAW(orig)) return orig;
1866 return str_new_frozen_buffer(0, orig, FALSE);
1867}
1868
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned frozen string is
 * never embedded.
 *
 * orig itself is returned when it is frozen, not embedded and not
 * re-embeddable.  A shared orig whose root is not embedded is delegated
 * to rb_str_tmp_frozen_acquire().  Otherwise a classless frozen heap
 * string flagged STR_SHARED_ROOT is built, either by copying the bytes
 * of orig or by taking over the heap buffer of orig (orig then becomes a
 * shared string pointing at the new root).
 */
1869VALUE
1870rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1871{
1872 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1873 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1874
1875 VALUE str = str_alloc_heap(0);
1876 OBJ_FREEZE(str);
1877 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1878 FL_SET(str, STR_SHARED_ROOT);
1879
1880 size_t capa = str_capacity(orig, TERM_LEN(orig));
1881
1882 /* If the string is embedded then we want to create a copy that is heap
1883 * allocated. If the string is shared then the shared root must be
1884 * embedded, so we want to create a copy. If the string is a shared root
1885 * then it must be embedded, so we want to create a copy. */
1886 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
/* room for capa bytes plus a terminator is allocated, but only capa bytes
 * are copied: the terminator bytes of the copy are not written here */
1887 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1888 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1889 }
1890 else {
1891 /* orig must be heap allocated and not shared, so we can safely transfer
1892 * the pointer to str. */
1893 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* STR_NOFREE moves from orig to str together with the buffer */
1894 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1895 RBASIC(orig)->flags &= ~STR_NOFREE;
1896 STR_SET_SHARED(orig, str);
1897 }
1898
1899 RSTRING(str)->len = RSTRING(orig)->len;
1900 RSTRING(str)->as.heap.aux.capa = capa;
1901
1902 return str;
1903}
1904
/*
 * Release tmp, a string previously acquired for orig.
 *
 * Nothing is done unless tmp is classless (RBASIC_CLASS(tmp) == 0).  For
 * a heap tmp that is the shared root of orig and is not flagged
 * STR_BORROWED, orig is unshared (it gets the capacity and STR_NOFREE
 * flag of tmp back) and tmp is turned into an empty embedded string.
 *
 * NOTE(review): the listing numbers skip 1912 and 1926, so one source
 * line appears to have been dropped in the embedded branch and one after
 * the STR_NOFREE transfer -- confirm against the upstream file.
 */
1905void
1906rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1907{
1908 if (RBASIC_CLASS(tmp) != 0)
1909 return;
1910
1911 if (STR_EMBED_P(tmp)) {
1913 }
/* orig must still be shared and neither temporarily locked nor frozen */
1914 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1915 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1916 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1917
1918 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1919 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1920 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1921
1922 /* Unshare orig since the root (tmp) only has this one child. */
1923 FL_UNSET_RAW(orig, STR_SHARED);
1924 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1925 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1927
1928 /* Make tmp embedded and empty so it is safe for sweeping. */
1929 STR_SET_EMBED(tmp);
1930 STR_SET_LEN(tmp, 0);
1931 }
1932 }
1933}
1934
1935static VALUE
1936str_new_frozen(VALUE klass, VALUE orig)
1937{
1938 return str_new_frozen_buffer(klass, orig, TRUE);
1939}
1940
1941static VALUE
1942heap_str_make_shared(VALUE klass, VALUE orig)
1943{
1944 RUBY_ASSERT(!STR_EMBED_P(orig));
1945 RUBY_ASSERT(!STR_SHARED_P(orig));
1946
1947 VALUE str = str_alloc_heap(klass);
1948 STR_SET_LEN(str, RSTRING_LEN(orig));
1949 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1950 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1951 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1952 RBASIC(orig)->flags &= ~STR_NOFREE;
1953 STR_SET_SHARED(orig, str);
1954 if (klass == 0)
1955 FL_UNSET_RAW(str, STR_BORROWED);
1956 return str;
1957}
1958
/*
 * Return a frozen string of class klass with the content of orig.
 *
 * copy_encoding selects whether the encoding and code range of orig are
 * carried over; otherwise ASCII-8BIT and a 1-byte terminator are used.
 * Embedded or embeddable content is copied with str_enc_new().  A shared
 * orig reuses its root, either directly or through a new shared string
 * that covers only orig's part of it.  Any other heap string is turned
 * into a shared string by heap_str_make_shared().
 *
 * NOTE(review): the listing numbers skip 1980, so one source line after
 * the three assertions in the shared branch appears to have been dropped
 * -- confirm against the upstream file.
 */
1959static VALUE
1960str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1961{
1962 VALUE str;
1963
1964 long len = RSTRING_LEN(orig);
1965 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1966 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1967
1968 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1969 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1970 RUBY_ASSERT(STR_EMBED_P(str));
1971 }
1972 else {
1973 if (FL_TEST_RAW(orig, STR_SHARED)) {
1974 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's data; rest: bytes after it */
1975 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1976 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1977 RUBY_ASSERT(ofs >= 0);
1978 RUBY_ASSERT(rest >= 0);
1979 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1981
/* the root can stand in for the result only when orig covers all of it
 * and class and encoding match */
1982 if ((ofs > 0) || (rest > 0) ||
1983 (klass != RBASIC(shared)->klass) ||
1984 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1985 str = str_new_shared(klass, shared);
1986 RUBY_ASSERT(!STR_EMBED_P(str));
1987 RSTRING(str)->as.heap.ptr += ofs;
1988 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1989 }
1990 else {
1991 if (RBASIC_CLASS(shared) == 0)
1992 FL_SET_RAW(shared, STR_BORROWED);
1993 return shared;
1994 }
1995 }
/* NOTE(review): when copy_encoding is true this repeats the
 * STR_EMBEDDABLE_P test that already failed above -- confirm whether the
 * branch is reachable in that case */
1996 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1997 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1998 STR_SET_EMBED(str);
1999 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
2000 STR_SET_LEN(str, RSTRING_LEN(orig));
2001 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
2002 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
2003 }
2004 else {
2005 str = heap_str_make_shared(klass, orig);
2006 }
2007 }
2008
2009 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
2010 OBJ_FREEZE(str);
2011 return str;
2012}
2013
2014VALUE
2015rb_str_new_with_class(VALUE obj, const char *ptr, long len)
2016{
2017 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
2018}
2019
2020static VALUE
2021str_new_empty_String(VALUE str)
2022{
2023 VALUE v = rb_str_new(0, 0);
2024 rb_enc_copy(v, str);
2025 return v;
2026}
2027
2028#define STR_BUF_MIN_SIZE 63
2029
2030VALUE
2032{
2033 if (STR_EMBEDDABLE_P(capa, 1)) {
2034 return str_alloc_embed(rb_cString, capa + 1);
2035 }
2036
2037 VALUE str = str_alloc_heap(rb_cString);
2038
2039 RSTRING(str)->as.heap.aux.capa = capa;
2040 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
2041 RSTRING(str)->as.heap.ptr[0] = '\0';
2042
2043 return str;
2044}
2045
2046VALUE
2048{
2049 VALUE str;
2050 long len = strlen(ptr);
2051
2052 str = rb_str_buf_new(len);
2053 rb_str_buf_cat(str, ptr, len);
2054
2055 return str;
2056}
2057
2058VALUE
2060{
2061 return str_new(0, 0, len);
2062}
2063
2064void
2066{
2067 if (STR_EMBED_P(str)) {
2068 RB_DEBUG_COUNTER_INC(obj_str_embed);
2069 }
2070 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
2071 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
2072 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
2073 }
2074 else {
2075 RB_DEBUG_COUNTER_INC(obj_str_ptr);
2076 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2077 }
2078}
2079
2080size_t
2081rb_str_memsize(VALUE str)
2082{
2083 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
2084 return STR_HEAP_SIZE(str);
2085 }
2086 else {
2087 return 0;
2088 }
2089}
2090
2091VALUE
2093{
2094 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
2095}
2096
2097static inline void str_discard(VALUE str);
2098static void str_shared_replace(VALUE str, VALUE str2);
2099
2100void
2102{
2103 if (str != str2) str_shared_replace(str, str2);
2104}
2105
/*
 * Move the content, encoding and code range of str2 into str; str and
 * str2 must be different objects.
 *
 * When the content fits the embed capacity of str it is copied and str2
 * is left untouched.  Otherwise str takes over the heap pointer of str2
 * (an embedded str2 is first copied to a fresh heap buffer) and str2 is
 * left as an empty embedded string.
 */
2106static void
2107str_shared_replace(VALUE str, VALUE str2)
2108{
2109 rb_encoding *enc;
2110 int cr;
2111 int termlen;
2112
2113 RUBY_ASSERT(str2 != str);
/* read encoding and code range of str2 up front, before anything is
 * modified */
2114 enc = STR_ENC_GET(str2);
2115 cr = ENC_CODERANGE(str2);
2116 str_discard(str);
2117 termlen = rb_enc_mbminlen(enc);
2118
2119 STR_SET_LEN(str, RSTRING_LEN(str2));
2120
2121 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
2122 STR_SET_EMBED(str);
/* the copy includes the termlen terminator bytes of str2 */
2123 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
2124 rb_enc_associate(str, enc);
2125 ENC_CODERANGE_SET(str, cr);
2126 }
2127 else {
2128 if (STR_EMBED_P(str2)) {
2129 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
2130 long len = RSTRING_LEN(str2);
2131 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
2132
/* copy the embedded bytes out before as.heap.ptr is written:
 * as.heap.ptr is assigned only after the bytes have been read out of
 * as.embed.ary */
2133 char *new_ptr = ALLOC_N(char, len + termlen);
2134 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
2135 RSTRING(str2)->as.heap.ptr = new_ptr;
2136 STR_SET_LEN(str2, len);
2137 RSTRING(str2)->as.heap.aux.capa = len;
2138 STR_SET_NOEMBED(str2);
2139 }
2140
2141 STR_SET_NOEMBED(str);
2142 FL_UNSET(str, STR_SHARED);
2143 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2144
2145 if (FL_TEST(str2, STR_SHARED)) {
2146 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2147 STR_SET_SHARED(str, shared);
2148 }
2149 else {
2150 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
2151 }
2152
2153 /* abandon str2 */
2154 STR_SET_EMBED(str2);
2155 RSTRING_PTR(str2)[0] = 0;
2156 STR_SET_LEN(str2, 0);
2157 rb_enc_associate(str, enc);
2158 ENC_CODERANGE_SET(str, cr);
2159 }
2160}
2161
2162VALUE
2164{
2165 VALUE str;
2166
2167 if (RB_TYPE_P(obj, T_STRING)) {
2168 return obj;
2169 }
2170 str = rb_funcall(obj, idTo_s, 0);
2171 return rb_obj_as_string_result(str, obj);
2172}
2173
2174VALUE
2175rb_obj_as_string_result(VALUE str, VALUE obj)
2176{
2177 if (!RB_TYPE_P(str, T_STRING))
2178 return rb_any_to_s(obj);
2179 return str;
2180}
2181
/*
 * Make str hold the content, encoding and code range of str2 and return
 * str.  When str2 is a shared string, str is pointed at the same buffer
 * and the same shared root; otherwise the work is left to
 * str_replace_shared().
 *
 * NOTE(review): the listing numbers skip 2190, so one source line right
 * after `shared` is read appears to have been dropped -- confirm against
 * the upstream file.
 */
2182static VALUE
2183str_replace(VALUE str, VALUE str2)
2184{
2185 long len;
2186
2187 len = RSTRING_LEN(str2);
2188 if (STR_SHARED_P(str2)) {
2189 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2191 STR_SET_NOEMBED(str);
2192 STR_SET_LEN(str, len);
2193 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2194 STR_SET_SHARED(str, shared);
2195 rb_enc_cr_str_exact_copy(str, str2);
2196 }
2197 else {
2198 str_replace_shared(str, str2);
2199 }
2200
2201 return str;
2202}
2203
2204static inline VALUE
2205ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
2206{
2207 size_t size = rb_str_embed_size(capa);
2208 RUBY_ASSERT(size > 0);
2209 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
2210
2211 NEWOBJ_OF(str, struct RString, klass,
2213
2214 return (VALUE)str;
2215}
2216
2217static inline VALUE
2218ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
2219{
2220 NEWOBJ_OF(str, struct RString, klass,
2221 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
2222
2223 return (VALUE)str;
2224}
2225
2226static inline VALUE
2227str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
2228{
2229 int encidx = 0;
2230 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
2231 encidx = rb_enc_get_index(str);
2232 flags &= ~ENCODING_MASK;
2233 }
2234 FL_SET_RAW(dup, flags & ~FL_FREEZE);
2235 if (encidx) rb_enc_associate_index(dup, encidx);
2236 return dup;
2237}
2238
2239static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
2240
2241static inline VALUE
2242str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
2243{
2244 VALUE flags = FL_TEST_RAW(str, flag_mask);
2245 long len = RSTRING_LEN(str);
2246
2247 RUBY_ASSERT(STR_EMBED_P(dup));
2248 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
2249 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
2250 STR_SET_LEN(dup, RSTRING_LEN(str));
2251 return str_duplicate_setup_encoding(str, dup, flags);
2252}
2253
/*
 * Fill the heap string dup so that it shares the buffer of the heap
 * string str.  The root is the shared root of str if it has one;
 * otherwise an unfrozen str is first replaced by a frozen copy made with
 * str_new_frozen(), and a frozen str serves as the root itself.
 *
 * NOTE(review): the listing numbers skip 2267, so one source line after
 * the assertion on root appears to have been dropped -- confirm against
 * the upstream file.
 */
2254static inline VALUE
2255str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
2256{
2257 VALUE flags = FL_TEST_RAW(str, flag_mask);
2258 VALUE root = str;
2259 if (FL_TEST_RAW(str, STR_SHARED)) {
2260 root = RSTRING(str)->as.heap.aux.shared;
2261 }
2262 else if (UNLIKELY(!(flags & FL_FREEZE))) {
/* str now names the frozen copy, so the flags are read again from it */
2263 root = str = str_new_frozen(klass, str);
2264 flags = FL_TEST_RAW(str, flag_mask);
2265 }
2266 RUBY_ASSERT(!STR_SHARED_P(root));
2268
2269 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
2270 FL_SET(root, STR_SHARED_ROOT);
2271 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
2272 flags |= RSTRING_NOEMBED | STR_SHARED;
2273
2274 STR_SET_LEN(dup, RSTRING_LEN(str));
2275 return str_duplicate_setup_encoding(str, dup, flags);
2276}
2277
2278static inline VALUE
2279str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
2280{
2281 if (STR_EMBED_P(str)) {
2282 return str_duplicate_setup_embed(klass, str, dup);
2283 }
2284 else {
2285 return str_duplicate_setup_heap(klass, str, dup);
2286 }
2287}
2288
2289static inline VALUE
2290str_duplicate(VALUE klass, VALUE str)
2291{
2292 VALUE dup;
2293 if (STR_EMBED_P(str)) {
2294 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
2295 }
2296 else {
2297 dup = str_alloc_heap(klass);
2298 }
2299
2300 return str_duplicate_setup(klass, str, dup);
2301}
2302
2303VALUE
2305{
2306 return str_duplicate(rb_obj_class(str), str);
2307}
2308
2309/* :nodoc: */
2310VALUE
2311rb_str_dup_m(VALUE str)
2312{
2313 if (LIKELY(BARE_STRING_P(str))) {
2314 return str_duplicate(rb_obj_class(str), str);
2315 }
2316 else {
2317 return rb_obj_dup(str);
2318 }
2319}
2320
2321VALUE
2323{
2324 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2325 return str_duplicate(rb_cString, str);
2326}
2327
2328VALUE
2329rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2330{
2331 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2332 VALUE new_str, klass = rb_cString;
2333
2334 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2335 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2336 str_duplicate_setup_embed(klass, str, new_str);
2337 }
2338 else {
2339 new_str = ec_str_alloc_heap(ec, klass);
2340 str_duplicate_setup_heap(klass, str, new_str);
2341 }
2342 if (chilled) {
2343 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2344 }
2345 return new_str;
2346}
2347
2348VALUE
2349rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2350{
2351 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2352 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2353 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2354 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2355 return rb_str_freeze(str);
2356}
2357
2358/*
2359 * The documentation block below uses an include (instead of inline text)
2360 * because the included text has non-ASCII characters (which are not allowed in a C file).
2361 */
2362
2363/*
2364 *
2365 * call-seq:
2366 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2367 *
2368 * :include: doc/string/new.rdoc
2369 *
2370 */
2371
2372static VALUE
2373rb_str_init(int argc, VALUE *argv, VALUE str)
2374{
2375 static ID keyword_ids[2];
2376 VALUE orig, opt, venc, vcapa;
2377 VALUE kwargs[2];
2378 rb_encoding *enc = 0;
2379 int n;
2380
2381 if (!keyword_ids[0]) {
2382 keyword_ids[0] = rb_id_encoding();
2383 CONST_ID(keyword_ids[1], "capacity");
2384 }
2385
2386 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2387 if (!NIL_P(opt)) {
2388 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2389 venc = kwargs[0];
2390 vcapa = kwargs[1];
2391 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2392 enc = rb_to_encoding(venc);
2393 }
2394 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2395 long capa = NUM2LONG(vcapa);
2396 long len = 0;
2397 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2398
2399 if (capa < STR_BUF_MIN_SIZE) {
2400 capa = STR_BUF_MIN_SIZE;
2401 }
2402 if (n == 1) {
2403 StringValue(orig);
2404 len = RSTRING_LEN(orig);
2405 if (capa < len) {
2406 capa = len;
2407 }
2408 if (orig == str) n = 0;
2409 }
2410 str_modifiable(str);
2411 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2412 /* make noembed always */
2413 const size_t size = (size_t)capa + termlen;
2414 const char *const old_ptr = RSTRING_PTR(str);
2415 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2416 char *new_ptr = ALLOC_N(char, size);
2417 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2418 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2419 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2420 RSTRING(str)->as.heap.ptr = new_ptr;
2421 }
2422 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2423 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2424 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2425 }
2426 STR_SET_LEN(str, len);
2427 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2428 if (n == 1) {
2429 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2430 rb_enc_cr_str_exact_copy(str, orig);
2431 }
2432 FL_SET(str, STR_NOEMBED);
2433 RSTRING(str)->as.heap.aux.capa = capa;
2434 }
2435 else if (n == 1) {
2436 rb_str_replace(str, orig);
2437 }
2438 if (enc) {
2439 rb_enc_associate(str, enc);
2441 }
2442 }
2443 else if (n == 1) {
2444 rb_str_replace(str, orig);
2445 }
2446 return str;
2447}
2448
2449/* :nodoc: */
2450static VALUE
2451rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2452{
2453 if (klass != rb_cString) {
2454 return rb_class_new_instance_pass_kw(argc, argv, klass);
2455 }
2456
2457 static ID keyword_ids[2];
2458 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2459 VALUE kwargs[2];
2460 rb_encoding *enc = NULL;
2461
2462 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2463 if (NIL_P(opt)) {
2464 return rb_class_new_instance_pass_kw(argc, argv, klass);
2465 }
2466
2467 keyword_ids[0] = rb_id_encoding();
2468 CONST_ID(keyword_ids[1], "capacity");
2469 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2470 encoding = kwargs[0];
2471 capacity = kwargs[1];
2472
2473 if (n == 1) {
2474 orig = StringValue(orig);
2475 }
2476 else {
2477 orig = Qnil;
2478 }
2479
2480 if (UNDEF_P(encoding)) {
2481 if (!NIL_P(orig)) {
2482 encoding = rb_obj_encoding(orig);
2483 }
2484 }
2485
2486 if (!UNDEF_P(encoding)) {
2487 enc = rb_to_encoding(encoding);
2488 }
2489
2490 // If capacity is nil, we're basically just duping `orig`.
2491 if (UNDEF_P(capacity)) {
2492 if (NIL_P(orig)) {
2493 VALUE empty_str = str_new(klass, "", 0);
2494 if (enc) {
2495 rb_enc_associate(empty_str, enc);
2496 }
2497 return empty_str;
2498 }
2499 VALUE copy = str_duplicate(klass, orig);
2500 rb_enc_associate(copy, enc);
2501 ENC_CODERANGE_CLEAR(copy);
2502 return copy;
2503 }
2504
2505 long capa = 0;
2506 capa = NUM2LONG(capacity);
2507 if (capa < 0) {
2508 capa = 0;
2509 }
2510
2511 if (!NIL_P(orig)) {
2512 long orig_capa = rb_str_capacity(orig);
2513 if (orig_capa > capa) {
2514 capa = orig_capa;
2515 }
2516 }
2517
2518 VALUE str = str_enc_new(klass, NULL, capa, enc);
2519 STR_SET_LEN(str, 0);
2520 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2521
2522 if (!NIL_P(orig)) {
2523 rb_str_buf_append(str, orig);
2524 }
2525
2526 return str;
2527}
2528
2529#ifdef NONASCII_MASK
2530#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2531
2532/*
2533 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2534 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2535 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2536 *
2537 * if (!(byte & 0x80))
2538 * byte |= 0x40; // turn on bit6
2539 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2540 *
2541 * This function calculates whether a byte is leading or not for all bytes
2542 * in the argument word by concurrently using the above logic, and then
2543 * adds up the number of leading bytes in the word.
2544 */
2545static inline uintptr_t
2546count_utf8_lead_bytes_with_word(const uintptr_t *s)
2547{
2548 uintptr_t d = *s;
2549
2550 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2551 d = (d>>6) | (~d>>7);
2552 d &= NONASCII_MASK >> 7;
2553
2554 /* Gather all bytes. */
2555#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2556 /* use only if it can use POPCNT */
2557 return rb_popcount_intptr(d);
2558#else
2559 d += (d>>8);
2560 d += (d>>16);
2561# if SIZEOF_VOIDP == 8
2562 d += (d>>32);
2563# endif
2564 return (d&0xF);
2565#endif
2566}
2567#endif
2568
2569static inline long
2570enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2571{
2572 long c;
2573 const char *q;
2574
2575 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2576 long diff = (long)(e - p);
2577 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2578 }
2579#ifdef NONASCII_MASK
2580 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2581 uintptr_t len = 0;
2582 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2583 const uintptr_t *s, *t;
2584 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2585 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2586 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2587 while (p < (const char *)s) {
2588 if (is_utf8_lead_byte(*p)) len++;
2589 p++;
2590 }
2591 while (s < t) {
2592 len += count_utf8_lead_bytes_with_word(s);
2593 s++;
2594 }
2595 p = (const char *)s;
2596 }
2597 while (p < e) {
2598 if (is_utf8_lead_byte(*p)) len++;
2599 p++;
2600 }
2601 return (long)len;
2602 }
2603#endif
2604 else if (rb_enc_asciicompat(enc)) {
2605 c = 0;
2606 if (ENC_CODERANGE_CLEAN_P(cr)) {
2607 while (p < e) {
2608 if (ISASCII(*p)) {
2609 q = search_nonascii(p, e);
2610 if (!q)
2611 return c + (e - p);
2612 c += q - p;
2613 p = q;
2614 }
2615 p += rb_enc_fast_mbclen(p, e, enc);
2616 c++;
2617 }
2618 }
2619 else {
2620 while (p < e) {
2621 if (ISASCII(*p)) {
2622 q = search_nonascii(p, e);
2623 if (!q)
2624 return c + (e - p);
2625 c += q - p;
2626 p = q;
2627 }
2628 p += rb_enc_mbclen(p, e, enc);
2629 c++;
2630 }
2631 }
2632 return c;
2633 }
2634
2635 for (c=0; p<e; c++) {
2636 p += rb_enc_mbclen(p, e, enc);
2637 }
2638 return c;
2639}
2640
2641long
2642rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2643{
2644 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2645}
2646
2647/* To get strlen with cr
2648 * Note that given cr is not used.
2649 */
2650long
2651rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2652{
2653 long c;
2654 const char *q;
2655 int ret;
2656
2657 *cr = 0;
2658 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2659 long diff = (long)(e - p);
2660 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2661 }
2662 else if (rb_enc_asciicompat(enc)) {
2663 c = 0;
2664 while (p < e) {
2665 if (ISASCII(*p)) {
2666 q = search_nonascii(p, e);
2667 if (!q) {
2668 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2669 return c + (e - p);
2670 }
2671 c += q - p;
2672 p = q;
2673 }
2674 ret = rb_enc_precise_mbclen(p, e, enc);
2675 if (MBCLEN_CHARFOUND_P(ret)) {
2676 *cr |= ENC_CODERANGE_VALID;
2677 p += MBCLEN_CHARFOUND_LEN(ret);
2678 }
2679 else {
2681 p++;
2682 }
2683 c++;
2684 }
2685 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2686 return c;
2687 }
2688
2689 for (c=0; p<e; c++) {
2690 ret = rb_enc_precise_mbclen(p, e, enc);
2691 if (MBCLEN_CHARFOUND_P(ret)) {
2692 *cr |= ENC_CODERANGE_VALID;
2693 p += MBCLEN_CHARFOUND_LEN(ret);
2694 }
2695 else {
2697 if (p + rb_enc_mbminlen(enc) <= e)
2698 p += rb_enc_mbminlen(enc);
2699 else
2700 p = e;
2701 }
2702 }
2703 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2704 return c;
2705}
2706
2707/* enc must be str's enc or rb_enc_check(str, str2) */
2708static long
2709str_strlen(VALUE str, rb_encoding *enc)
2710{
2711 const char *p, *e;
2712 int cr;
2713
2714 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2715 if (!enc) enc = STR_ENC_GET(str);
2716 p = RSTRING_PTR(str);
2717 e = RSTRING_END(str);
2718 cr = ENC_CODERANGE(str);
2719
2720 if (cr == ENC_CODERANGE_UNKNOWN) {
2721 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2722 if (cr) ENC_CODERANGE_SET(str, cr);
2723 return n;
2724 }
2725 else {
2726 return enc_strlen(p, e, enc, cr);
2727 }
2728}
2729
2730long
2732{
2733 return str_strlen(str, NULL);
2734}
2735
2736/*
2737 * call-seq:
2738 * length -> integer
2739 *
2740 * :include: doc/string/length.rdoc
2741 *
2742 */
2743
2744VALUE
2746{
2747 return LONG2NUM(str_strlen(str, NULL));
2748}
2749
2750/*
2751 * call-seq:
2752 * bytesize -> integer
2753 *
2754 * :include: doc/string/bytesize.rdoc
2755 *
2756 */
2757
2758VALUE
2759rb_str_bytesize(VALUE str)
2760{
2761 return LONG2NUM(RSTRING_LEN(str));
2762}
2763
2764/*
2765 * call-seq:
2766 * empty? -> true or false
2767 *
2768 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2769 *
2770 * "hello".empty? # => false
2771 * " ".empty? # => false
2772 * "".empty? # => true
2773 *
2774 */
2775
2776static VALUE
2777rb_str_empty(VALUE str)
2778{
2779 return RBOOL(RSTRING_LEN(str) == 0);
2780}
2781
2782/*
2783 * call-seq:
2784 * self + other_string -> new_string
2785 *
2786 * Returns a new string containing +other_string+ concatenated to +self+:
2787 *
2788 * 'Hello from ' + self.to_s # => "Hello from main"
2789 *
2790 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2791 */
2792
2793VALUE
2795{
2796 VALUE str3;
2797 rb_encoding *enc;
2798 char *ptr1, *ptr2, *ptr3;
2799 long len1, len2;
2800 int termlen;
2801
2802 StringValue(str2);
2803 enc = rb_enc_check_str(str1, str2);
2804 RSTRING_GETMEM(str1, ptr1, len1);
2805 RSTRING_GETMEM(str2, ptr2, len2);
2806 termlen = rb_enc_mbminlen(enc);
2807 if (len1 > LONG_MAX - len2) {
2808 rb_raise(rb_eArgError, "string size too big");
2809 }
2810 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2811 ptr3 = RSTRING_PTR(str3);
2812 memcpy(ptr3, ptr1, len1);
2813 memcpy(ptr3+len1, ptr2, len2);
2814 TERM_FILL(&ptr3[len1+len2], termlen);
2815
2816 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2818 RB_GC_GUARD(str1);
2819 RB_GC_GUARD(str2);
2820 return str3;
2821}
2822
2823/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2824VALUE
2825rb_str_opt_plus(VALUE str1, VALUE str2)
2826{
2829 long len1, len2;
2830 MAYBE_UNUSED(char) *ptr1, *ptr2;
2831 RSTRING_GETMEM(str1, ptr1, len1);
2832 RSTRING_GETMEM(str2, ptr2, len2);
2833 int enc1 = rb_enc_get_index(str1);
2834 int enc2 = rb_enc_get_index(str2);
2835
2836 if (enc1 < 0) {
2837 return Qundef;
2838 }
2839 else if (enc2 < 0) {
2840 return Qundef;
2841 }
2842 else if (enc1 != enc2) {
2843 return Qundef;
2844 }
2845 else if (len1 > LONG_MAX - len2) {
2846 return Qundef;
2847 }
2848 else {
2849 return rb_str_plus(str1, str2);
2850 }
2851
2852}
2853
2854/*
2855 * call-seq:
2856 * self * n -> new_string
2857 *
2858 * Returns a new string containing +n+ copies of +self+:
2859 *
2860 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2861 * 'No!' * 0 # => ""
2862 *
2863 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2864 */
2865
2866VALUE
2868{
2869 VALUE str2;
2870 long n, len;
2871 char *ptr2;
2872 int termlen;
2873
2874 if (times == INT2FIX(1)) {
2875 return str_duplicate(rb_cString, str);
2876 }
2877 if (times == INT2FIX(0)) {
2878 str2 = str_alloc_embed(rb_cString, 0);
2879 rb_enc_copy(str2, str);
2880 return str2;
2881 }
2882 len = NUM2LONG(times);
2883 if (len < 0) {
2884 rb_raise(rb_eArgError, "negative argument");
2885 }
2886 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2887 if (STR_EMBEDDABLE_P(len, 1)) {
2888 str2 = str_alloc_embed(rb_cString, len + 1);
2889 memset(RSTRING_PTR(str2), 0, len + 1);
2890 }
2891 else {
2892 str2 = str_alloc_heap(rb_cString);
2893 RSTRING(str2)->as.heap.aux.capa = len;
2894 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2895 }
2896 STR_SET_LEN(str2, len);
2897 rb_enc_copy(str2, str);
2898 return str2;
2899 }
2900 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2901 rb_raise(rb_eArgError, "argument too big");
2902 }
2903
2904 len *= RSTRING_LEN(str);
2905 termlen = TERM_LEN(str);
2906 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2907 ptr2 = RSTRING_PTR(str2);
2908 if (len) {
2909 n = RSTRING_LEN(str);
2910 memcpy(ptr2, RSTRING_PTR(str), n);
2911 while (n <= len/2) {
2912 memcpy(ptr2 + n, ptr2, n);
2913 n *= 2;
2914 }
2915 memcpy(ptr2 + n, ptr2, len-n);
2916 }
2917 STR_SET_LEN(str2, len);
2918 TERM_FILL(&ptr2[len], termlen);
2919 rb_enc_cr_str_copy_for_substr(str2, str);
2920
2921 return str2;
2922}
2923
2924/*
2925 * call-seq:
2926 * self % object -> new_string
2927 *
2928 * Returns the result of formatting +object+ into the format specifications
2929 * contained in +self+
2930 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2931 *
2932 * '%05d' % 123 # => "00123"
2933 *
2934 * If +self+ contains multiple format specifications,
2935 * +object+ must be an array or hash containing the objects to be formatted:
2936 *
2937 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2938 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2939 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2940 *
2941 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2942 */
2943
2944static VALUE
2945rb_str_format_m(VALUE str, VALUE arg)
2946{
2947 VALUE tmp = rb_check_array_type(arg);
2948
2949 if (!NIL_P(tmp)) {
2950 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2951 }
2952 return rb_str_format(1, &arg, str);
2953}
2954
2955static inline void
2956rb_check_lockedtmp(VALUE str)
2957{
2958 if (FL_TEST(str, STR_TMPLOCK)) {
2959 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2960 }
2961}
2962
// If none of these flags are set, we know we have a modifiable string.
2964// If any is set, we need to do more detailed checks.
2965#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2966static inline void
2967str_modifiable(VALUE str)
2968{
2969 RUBY_ASSERT(ruby_thread_has_gvl_p());
2970
2971 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2972 if (CHILLED_STRING_P(str)) {
2973 CHILLED_STRING_MUTATED(str);
2974 }
2975 rb_check_lockedtmp(str);
2976 rb_check_frozen(str);
2977 }
2978}
2979
2980static inline int
2981str_dependent_p(VALUE str)
2982{
2983 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2984 return FALSE;
2985 }
2986 else {
2987 return TRUE;
2988 }
2989}
2990
2991// If none of these flags are set, we know we have an independent string.
2992// If any is set, we need to do more detailed checks.
2993#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2994static inline int
2995str_independent(VALUE str)
2996{
2997 RUBY_ASSERT(ruby_thread_has_gvl_p());
2998
2999 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
3000 str_modifiable(str);
3001 return !str_dependent_p(str);
3002 }
3003 return TRUE;
3004}
3005
3006static void
3007str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
3008{
3009 RUBY_ASSERT(ruby_thread_has_gvl_p());
3010
3011 char *ptr;
3012 char *oldptr;
3013 long capa = len + expand;
3014
3015 if (len > capa) len = capa;
3016
3017 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
3018 ptr = RSTRING(str)->as.heap.ptr;
3019 STR_SET_EMBED(str);
3020 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
3021 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3022 STR_SET_LEN(str, len);
3023 return;
3024 }
3025
3026 ptr = ALLOC_N(char, (size_t)capa + termlen);
3027 oldptr = RSTRING_PTR(str);
3028 if (oldptr) {
3029 memcpy(ptr, oldptr, len);
3030 }
3031 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
3032 xfree(oldptr);
3033 }
3034 STR_SET_NOEMBED(str);
3035 FL_UNSET(str, STR_SHARED|STR_NOFREE);
3036 TERM_FILL(ptr + len, termlen);
3037 RSTRING(str)->as.heap.ptr = ptr;
3038 STR_SET_LEN(str, len);
3039 RSTRING(str)->as.heap.aux.capa = capa;
3040}
3041
3042void
3043rb_str_modify(VALUE str)
3044{
3045 if (!str_independent(str))
3046 str_make_independent(str);
3048}
3049
3050void
3052{
3053 RUBY_ASSERT(ruby_thread_has_gvl_p());
3054
3055 int termlen = TERM_LEN(str);
3056 long len = RSTRING_LEN(str);
3057
3058 if (expand < 0) {
3059 rb_raise(rb_eArgError, "negative expanding string size");
3060 }
3061 if (expand >= LONG_MAX - len) {
3062 rb_raise(rb_eArgError, "string size too big");
3063 }
3064
3065 if (!str_independent(str)) {
3066 str_make_independent_expand(str, len, expand, termlen);
3067 }
3068 else if (expand > 0) {
3069 RESIZE_CAPA_TERM(str, len + expand, termlen);
3070 }
3072}
3073
3074/* As rb_str_modify(), but don't clear coderange */
3075static void
3076str_modify_keep_cr(VALUE str)
3077{
3078 if (!str_independent(str))
3079 str_make_independent(str);
3081 /* Force re-scan later */
3083}
3084
3085static inline void
3086str_discard(VALUE str)
3087{
3088 str_modifiable(str);
3089 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
3090 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
3091 RSTRING(str)->as.heap.ptr = 0;
3092 STR_SET_LEN(str, 0);
3093 }
3094}
3095
3096void
3098{
3099 int encindex = rb_enc_get_index(str);
3100
3101 if (RB_UNLIKELY(encindex == -1)) {
3102 rb_raise(rb_eTypeError, "not encoding capable object");
3103 }
3104
3105 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
3106 return;
3107 }
3108
3109 rb_encoding *enc = rb_enc_from_index(encindex);
3110 if (!rb_enc_asciicompat(enc)) {
3111 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
3112 }
3113}
3114
3115VALUE
3117{
3118 RUBY_ASSERT(ruby_thread_has_gvl_p());
3119
3120 VALUE s = *ptr;
3121 if (!RB_TYPE_P(s, T_STRING)) {
3122 s = rb_str_to_str(s);
3123 *ptr = s;
3124 }
3125 return s;
3126}
3127
3128char *
3130{
3131 VALUE str = rb_string_value(ptr);
3132 return RSTRING_PTR(str);
3133}
3134
/*
 * Return 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ examines nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
3143
3144static const char *
3145str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
3146{
3147 const char *e = s + len;
3148
3149 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
3150 if (zero_filled(s, minlen)) return s;
3151 }
3152 return 0;
3153}
3154
3155static char *
3156str_fill_term(VALUE str, char *s, long len, int termlen)
3157{
3158 /* This function assumes that (capa + termlen) bytes of memory
3159 * is allocated, like many other functions in this file.
3160 */
3161 if (str_dependent_p(str)) {
3162 if (!zero_filled(s + len, termlen))
3163 str_make_independent_expand(str, len, 0L, termlen);
3164 }
3165 else {
3166 TERM_FILL(s + len, termlen);
3167 return s;
3168 }
3169 return RSTRING_PTR(str);
3170}
3171
3172void
3173rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
3174{
3175 long capa = str_capacity(str, oldtermlen) + oldtermlen;
3176 long len = RSTRING_LEN(str);
3177
3178 RUBY_ASSERT(capa >= len);
3179 if (capa - len < termlen) {
3180 rb_check_lockedtmp(str);
3181 str_make_independent_expand(str, len, 0L, termlen);
3182 }
3183 else if (str_dependent_p(str)) {
3184 if (termlen > oldtermlen)
3185 str_make_independent_expand(str, len, 0L, termlen);
3186 }
3187 else {
3188 if (!STR_EMBED_P(str)) {
3189 /* modify capa instead of realloc */
3190 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
3191 RSTRING(str)->as.heap.aux.capa = capa - termlen;
3192 }
3193 if (termlen > oldtermlen) {
3194 TERM_FILL(RSTRING_PTR(str) + len, termlen);
3195 }
3196 }
3197
3198 return;
3199}
3200
3201static char *
3202str_null_check(VALUE str, int *w)
3203{
3204 char *s = RSTRING_PTR(str);
3205 long len = RSTRING_LEN(str);
3206 rb_encoding *enc = rb_enc_get(str);
3207 const int minlen = rb_enc_mbminlen(enc);
3208
3209 if (minlen > 1) {
3210 *w = 1;
3211 if (str_null_char(s, len, minlen, enc)) {
3212 return NULL;
3213 }
3214 return str_fill_term(str, s, len, minlen);
3215 }
3216 *w = 0;
3217 if (!s || memchr(s, 0, len)) {
3218 return NULL;
3219 }
3220 if (s[len]) {
3221 s = str_fill_term(str, s, len, minlen);
3222 }
3223 return s;
3224}
3225
3226char *
3227rb_str_to_cstr(VALUE str)
3228{
3229 int w;
3230 return str_null_check(str, &w);
3231}
3232
3233char *
3235{
3236 VALUE str = rb_string_value(ptr);
3237 int w;
3238 char *s = str_null_check(str, &w);
3239 if (!s) {
3240 if (w) {
3241 rb_raise(rb_eArgError, "string contains null char");
3242 }
3243 rb_raise(rb_eArgError, "string contains null byte");
3244 }
3245 return s;
3246}
3247
3248char *
3249rb_str_fill_terminator(VALUE str, const int newminlen)
3250{
3251 char *s = RSTRING_PTR(str);
3252 long len = RSTRING_LEN(str);
3253 return str_fill_term(str, s, len, newminlen);
3254}
3255
3256VALUE
3258{
3259 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
3260 return str;
3261}
3262
3263/*
3264 * call-seq:
3265 * String.try_convert(object) -> object, new_string, or nil
3266 *
3267 * Attempts to convert the given +object+ to a string.
3268 *
3269 * If +object+ is already a string, returns +object+, unmodified.
3270 *
3271 * Otherwise if +object+ responds to <tt>:to_str</tt>,
3272 * calls <tt>object.to_str</tt> and returns the result.
3273 *
3274 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
3275 *
3276 * Raises an exception unless <tt>object.to_str</tt> returns a string.
3277 */
3278static VALUE
3279rb_str_s_try_convert(VALUE dummy, VALUE str)
3280{
3281 return rb_check_string_type(str);
3282}
3283
3284static char*
3285str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3286{
3287 long nth = *nthp;
3288 if (rb_enc_mbmaxlen(enc) == 1) {
3289 p += nth;
3290 }
3291 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3292 p += nth * rb_enc_mbmaxlen(enc);
3293 }
3294 else if (rb_enc_asciicompat(enc)) {
3295 const char *p2, *e2;
3296 int n;
3297
3298 while (p < e && 0 < nth) {
3299 e2 = p + nth;
3300 if (e < e2) {
3301 *nthp = nth;
3302 return (char *)e;
3303 }
3304 if (ISASCII(*p)) {
3305 p2 = search_nonascii(p, e2);
3306 if (!p2) {
3307 nth -= e2 - p;
3308 *nthp = nth;
3309 return (char *)e2;
3310 }
3311 nth -= p2 - p;
3312 p = p2;
3313 }
3314 n = rb_enc_mbclen(p, e, enc);
3315 p += n;
3316 nth--;
3317 }
3318 *nthp = nth;
3319 if (nth != 0) {
3320 return (char *)e;
3321 }
3322 return (char *)p;
3323 }
3324 else {
3325 while (p < e && nth--) {
3326 p += rb_enc_mbclen(p, e, enc);
3327 }
3328 }
3329 if (p > e) p = e;
3330 *nthp = nth;
3331 return (char*)p;
3332}
3333
3334char*
3335rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3336{
3337 return str_nth_len(p, e, &nth, enc);
3338}
3339
3340static char*
3341str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3342{
3343 if (singlebyte)
3344 p += nth;
3345 else {
3346 p = str_nth_len(p, e, &nth, enc);
3347 }
3348 if (!p) return 0;
3349 if (p > e) p = e;
3350 return (char *)p;
3351}
3352
3353/* char offset to byte offset */
3354static long
3355str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3356{
3357 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3358 if (!pp) return e - p;
3359 return pp - p;
3360}
3361
3362long
3363rb_str_offset(VALUE str, long pos)
3364{
3365 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3366 STR_ENC_GET(str), single_byte_optimizable(str));
3367}
3368
3369#ifdef NONASCII_MASK
3370static char *
3371str_utf8_nth(const char *p, const char *e, long *nthp)
3372{
3373 long nth = *nthp;
3374 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3375 const uintptr_t *s, *t;
3376 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3377 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3378 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3379 while (p < (const char *)s) {
3380 if (is_utf8_lead_byte(*p)) nth--;
3381 p++;
3382 }
3383 do {
3384 nth -= count_utf8_lead_bytes_with_word(s);
3385 s++;
3386 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3387 p = (char *)s;
3388 }
3389 while (p < e) {
3390 if (is_utf8_lead_byte(*p)) {
3391 if (nth == 0) break;
3392 nth--;
3393 }
3394 p++;
3395 }
3396 *nthp = nth;
3397 return (char *)p;
3398}
3399
/* Byte distance from +p+ to the position +nth+ UTF-8 characters further on. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
3406#endif
3407
3408/* byte offset to char offset */
3409long
3410rb_str_sublen(VALUE str, long pos)
3411{
3412 if (single_byte_optimizable(str) || pos < 0)
3413 return pos;
3414 else {
3415 char *p = RSTRING_PTR(str);
3416 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3417 }
3418}
3419
3420static VALUE
3421str_subseq(VALUE str, long beg, long len)
3422{
3423 VALUE str2;
3424
3425 RUBY_ASSERT(beg >= 0);
3426 RUBY_ASSERT(len >= 0);
3427 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3428
3429 const int termlen = TERM_LEN(str);
3430 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3431 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3432 RB_GC_GUARD(str);
3433 return str2;
3434 }
3435
3436 str2 = str_alloc_heap(rb_cString);
3437 if (str_embed_capa(str2) >= len + termlen) {
3438 char *ptr2 = RSTRING(str2)->as.embed.ary;
3439 STR_SET_EMBED(str2);
3440 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3441 TERM_FILL(ptr2+len, termlen);
3442
3443 STR_SET_LEN(str2, len);
3444 RB_GC_GUARD(str);
3445 }
3446 else {
3447 str_replace_shared(str2, str);
3448 RUBY_ASSERT(!STR_EMBED_P(str2));
3449 ENC_CODERANGE_CLEAR(str2);
3450 RSTRING(str2)->as.heap.ptr += beg;
3451 if (RSTRING_LEN(str2) > len) {
3452 STR_SET_LEN(str2, len);
3453 }
3454 }
3455
3456 return str2;
3457}
3458
3459VALUE
3460rb_str_subseq(VALUE str, long beg, long len)
3461{
3462 VALUE str2 = str_subseq(str, beg, len);
3463 rb_enc_cr_str_copy_for_substr(str2, str);
3464 return str2;
3465}
3466
3467char *
3468rb_str_subpos(VALUE str, long beg, long *lenp)
3469{
3470 long len = *lenp;
3471 long slen = -1L;
3472 const long blen = RSTRING_LEN(str);
3473 rb_encoding *enc = STR_ENC_GET(str);
3474 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3475
3476 if (len < 0) return 0;
3477 if (beg < 0 && -beg < 0) return 0;
3478 if (!blen) {
3479 len = 0;
3480 }
3481 if (single_byte_optimizable(str)) {
3482 if (beg > blen) return 0;
3483 if (beg < 0) {
3484 beg += blen;
3485 if (beg < 0) return 0;
3486 }
3487 if (len > blen - beg)
3488 len = blen - beg;
3489 if (len < 0) return 0;
3490 p = s + beg;
3491 goto end;
3492 }
3493 if (beg < 0) {
3494 if (len > -beg) len = -beg;
3495 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3496 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3497 beg = -beg;
3498 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3499 p = e;
3500 if (!p) return 0;
3501 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3502 if (!p) return 0;
3503 len = e - p;
3504 goto end;
3505 }
3506 else {
3507 slen = str_strlen(str, enc);
3508 beg += slen;
3509 if (beg < 0) return 0;
3510 p = s + beg;
3511 if (len == 0) goto end;
3512 }
3513 }
3514 else if (beg > 0 && beg > blen) {
3515 return 0;
3516 }
3517 if (len == 0) {
3518 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3519 p = s + beg;
3520 }
3521#ifdef NONASCII_MASK
3522 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3523 enc == rb_utf8_encoding()) {
3524 p = str_utf8_nth(s, e, &beg);
3525 if (beg > 0) return 0;
3526 len = str_utf8_offset(p, e, len);
3527 }
3528#endif
3529 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3530 int char_sz = rb_enc_mbmaxlen(enc);
3531
3532 p = s + beg * char_sz;
3533 if (p > e) {
3534 return 0;
3535 }
3536 else if (len * char_sz > e - p)
3537 len = e - p;
3538 else
3539 len *= char_sz;
3540 }
3541 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3542 if (beg > 0) return 0;
3543 len = 0;
3544 }
3545 else {
3546 len = str_offset(p, e, len, enc, 0);
3547 }
3548 end:
3549 *lenp = len;
3550 RB_GC_GUARD(str);
3551 return p;
3552}
3553
3554static VALUE str_substr(VALUE str, long beg, long len, int empty);
3555
3556VALUE
3557rb_str_substr(VALUE str, long beg, long len)
3558{
3559 return str_substr(str, beg, len, TRUE);
3560}
3561
3562VALUE
3563rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3564{
3565 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3566}
3567
3568static VALUE
3569str_substr(VALUE str, long beg, long len, int empty)
3570{
3571 char *p = rb_str_subpos(str, beg, &len);
3572
3573 if (!p) return Qnil;
3574 if (!len && !empty) return Qnil;
3575
3576 beg = p - RSTRING_PTR(str);
3577
3578 VALUE str2 = str_subseq(str, beg, len);
3579 rb_enc_cr_str_copy_for_substr(str2, str);
3580 return str2;
3581}
3582
3583/* :nodoc: */
3584VALUE
3586{
3587 if (CHILLED_STRING_P(str)) {
3588 FL_UNSET_RAW(str, STR_CHILLED);
3589 }
3590
3591 if (OBJ_FROZEN(str)) return str;
3592 rb_str_resize(str, RSTRING_LEN(str));
3593 return rb_obj_freeze(str);
3594}
3595
3596/*
3597 * call-seq:
3598 * +string -> new_string or self
3599 *
3600 * Returns +self+ if +self+ is not frozen and can be mutated
3601 * without warning issuance.
3602 *
3603 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3604 *
3605 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3606 */
3607static VALUE
3608str_uplus(VALUE str)
3609{
3610 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3611 return rb_str_dup(str);
3612 }
3613 else {
3614 return str;
3615 }
3616}
3617
3618/*
3619 * call-seq:
3620 * -self -> frozen_string
3621 *
3622 * Returns a frozen string equal to +self+.
3623 *
3624 * The returned string is +self+ if and only if all of the following are true:
3625 *
3626 * - +self+ is already frozen.
3627 * - +self+ is an instance of \String (rather than of a subclass of \String)
3628 * - +self+ has no instance variables set on it.
3629 *
3630 * Otherwise, the returned string is a frozen copy of +self+.
3631 *
3632 * Returning +self+, when possible, saves duplicating +self+;
3633 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3634 *
3635 * It may also save duplicating other, already-existing, strings:
3636 *
3637 * s0 = 'foo'
3638 * s1 = 'foo'
3639 * s0.object_id == s1.object_id # => false
3640 * (-s0).object_id == (-s1).object_id # => true
3641 *
3642 * Note that method #-@ is convenient for defining a constant:
3643 *
3644 * FileName = -'config/database.yml'
3645 *
3646 * While its alias #dedup is better suited for chaining:
3647 *
3648 * 'foo'.dedup.gsub!('o')
3649 *
3650 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3651 */
3652static VALUE
3653str_uminus(VALUE str)
3654{
3655 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3656 str = rb_str_dup(str);
3657 }
3658 return rb_fstring(str);
3659}
3660
/* rb_str_dup_frozen is provided as an alias of rb_str_new_frozen
 * (presumably RUBY_ALIAS_FUNCTION emits the forwarding definition --
 * confirm against the macro).  The #define makes every later use of the
 * name in this file refer to rb_str_new_frozen directly. */
3661RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3662#define rb_str_dup_frozen rb_str_new_frozen
3663
/*
 * Sets the STR_TMPLOCK flag on the string and returns it.
 * Raises RuntimeError when the flag is already set.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str` is used below as the receiver.
 */
3664VALUE
3666{
3667 if (FL_TEST(str, STR_TMPLOCK)) {
3668 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3669 }
3670 FL_SET(str, STR_TMPLOCK);
3671 return str;
3672}
3673
/*
 * Clears the STR_TMPLOCK flag on the string and returns it.
 * Raises RuntimeError when the flag is not currently set.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str` is used below as the receiver.
 */
3674VALUE
3676{
3677 if (!FL_TEST(str, STR_TMPLOCK)) {
3678 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3679 }
3680 FL_UNSET(str, STR_TMPLOCK);
3681 return str;
3682}
3683
3684VALUE
3685rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3686{
3687 rb_str_locktmp(str);
3688 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3689}
3690
/*
 * Sets the byte length of the string to `len` without reallocating, and
 * rewrites the terminator at the new end.
 *
 * Raises RuntimeError for a shared string; calls rb_bug() when `len` is
 * negative or larger than the current capacity.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing, and several branches below contain only a
 * comment (their statements appear to be missing).  Verify against the
 * canonical source before relying on this text.
 */
3691void
3693{
3694 RUBY_ASSERT(ruby_thread_has_gvl_p());
3695
3696 long capa;
3697 const int termlen = TERM_LEN(str);
3698
3699 str_modifiable(str);
3700 if (STR_SHARED_P(str)) {
3701 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3702 }
    /* capa is assigned inside the condition so it can be reported below. */
3703 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3704 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3705 }
3706
    /* Adjust the cached code range to the new length, based on the code
     * range the string had before the change. */
3707 int cr = ENC_CODERANGE(str);
3708 if (len == 0) {
3709 /* Empty string does not contain non-ASCII */
3711 }
3712 else if (cr == ENC_CODERANGE_UNKNOWN) {
3713 /* Leave unknown. */
3714 }
3715 else if (len > RSTRING_LEN(str)) {
    /* Growing: only the bytes between the old end and the new end need
     * to be examined when the old contents were clean. */
3716 if (ENC_CODERANGE_CLEAN_P(cr)) {
3717 /* Update the coderange regarding the extended part. */
3718 const char *const prev_end = RSTRING_END(str);
3719 const char *const new_end = RSTRING_PTR(str) + len;
3720 rb_encoding *enc = rb_enc_get(str);
3721 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3722 ENC_CODERANGE_SET(str, cr);
3723 }
3724 else if (cr == ENC_CODERANGE_BROKEN) {
3725 /* May be valid now, by appended part. */
3727 }
3728 }
3729 else if (len < RSTRING_LEN(str)) {
3730 if (cr != ENC_CODERANGE_7BIT) {
3731 /* ASCII-only string is keeping after truncated. Valid
3732 * and broken may be invalid or valid, leave unknown. */
3734 }
3735 }
3736
    /* Commit the new length and write termlen terminator bytes after it. */
3737 STR_SET_LEN(str, len);
3738 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3739}
3740
/*
 * Changes the byte length of +str+ to +len+, switching between embedded
 * and heap storage and reallocating as needed, and returns +str+.
 *
 * Raises ArgumentError when +len+ is negative.
 *
 * NOTE(review): the body of the first `if` after the local declarations
 * is empty in this listing (a line appears to be missing).  Verify against
 * the canonical source.
 */
3741VALUE
3742rb_str_resize(VALUE str, long len)
3743{
3744 if (len < 0) {
3745 rb_raise(rb_eArgError, "negative string size (or size too big)");
3746 }
3747
    /* Remember whether the buffer is exclusively ours; this decides below
     * whether the old heap buffer may be freed or reallocated in place. */
3748 int independent = str_independent(str);
3749 long slen = RSTRING_LEN(str);
3750 const int termlen = TERM_LEN(str);
3751
    /* Shrinking, or growing a string whose terminator is wider than one
     * byte. */
3752 if (slen > len || (termlen != 1 && slen < len)) {
3754 }
3755
3756 {
3757 long capa;
3758 if (STR_EMBED_P(str)) {
    /* Embedded string: stay embedded when the new length plus terminator
     * still fits, otherwise move to an independent, expanded buffer. */
3759 if (len == slen) return str;
3760 if (str_embed_capa(str) >= len + termlen) {
3761 STR_SET_LEN(str, len);
3762 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3763 return str;
3764 }
3765 str_make_independent_expand(str, slen, len - slen, termlen);
3766 }
3767 else if (str_embed_capa(str) >= len + termlen) {
    /* Heap string that now fits in the embedded area: copy at most
     * min(slen, len) bytes over and free the old heap buffer only if it
     * was not shared. */
3768 char *ptr = STR_HEAP_PTR(str);
3769 STR_SET_EMBED(str);
3770 if (slen > len) slen = len;
3771 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3772 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3773 STR_SET_LEN(str, len);
3774 if (independent) ruby_xfree(ptr);
3775 return str;
3776 }
3777 else if (!independent) {
    /* Non-independent heap string: take a private buffer of the new size. */
3778 if (len == slen) return str;
3779 str_make_independent_expand(str, slen, len - slen, termlen);
3780 }
    /* Independent heap string: reallocate when the capacity is too small,
     * or when the slack (capa - len) exceeds min(len, 1024). */
3781 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3782 (capa - len) > (len < 1024 ? len : 1024)) {
3783 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3784 (size_t)len + termlen, STR_HEAP_SIZE(str));
3785 RSTRING(str)->as.heap.aux.capa = len;
3786 }
3787 else if (len == slen) return str;
3788 STR_SET_LEN(str, len);
3789 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3790 }
3791 return str;
3792}
3793
3794static void
3795str_ensure_available_capa(VALUE str, long len)
3796{
3797 str_modify_keep_cr(str);
3798
3799 const int termlen = TERM_LEN(str);
3800 long olen = RSTRING_LEN(str);
3801
3802 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3803 rb_raise(rb_eArgError, "string sizes too big");
3804 }
3805
3806 long total = olen + len;
3807 long capa = str_capacity(str, termlen);
3808
3809 if (capa < total) {
3810 if (total >= LONG_MAX / 2) {
3811 capa = total;
3812 }
3813 while (total > capa) {
3814 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3815 }
3816 RESIZE_CAPA_TERM(str, capa, termlen);
3817 }
3818}
3819
3820static VALUE
3821str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3822{
3823 if (keep_cr) {
3824 str_modify_keep_cr(str);
3825 }
3826 else {
3827 rb_str_modify(str);
3828 }
3829 if (len == 0) return 0;
3830
3831 long total, olen, off = -1;
3832 char *sptr;
3833 const int termlen = TERM_LEN(str);
3834
3835 RSTRING_GETMEM(str, sptr, olen);
3836 if (ptr >= sptr && ptr <= sptr + olen) {
3837 off = ptr - sptr;
3838 }
3839
3840 long capa = str_capacity(str, termlen);
3841
3842 if (olen > LONG_MAX - len) {
3843 rb_raise(rb_eArgError, "string sizes too big");
3844 }
3845 total = olen + len;
3846 if (capa < total) {
3847 if (total >= LONG_MAX / 2) {
3848 capa = total;
3849 }
3850 while (total > capa) {
3851 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3852 }
3853 RESIZE_CAPA_TERM(str, capa, termlen);
3854 sptr = RSTRING_PTR(str);
3855 }
3856 if (off != -1) {
3857 ptr = sptr + off;
3858 }
3859 memcpy(sptr + olen, ptr, len);
3860 STR_SET_LEN(str, total);
3861 TERM_FILL(sptr + total, termlen); /* sentinel */
3862
3863 return str;
3864}
3865
/* Shorthands for str_buf_cat4() with keep_cr == false.
 * str_buf_cat2 takes its length from rb_strlen_lit(ptr), so it is meant
 * for string-literal arguments (presumably -- confirm rb_strlen_lit). */
3866#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3867#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3868
3869VALUE
3870rb_str_cat(VALUE str, const char *ptr, long len)
3871{
3872 if (len == 0) return str;
3873 if (len < 0) {
3874 rb_raise(rb_eArgError, "negative string size (or size too big)");
3875 }
3876 return str_buf_cat(str, ptr, len);
3877}
3878
3879VALUE
3880rb_str_cat_cstr(VALUE str, const char *ptr)
3881{
3882 must_not_null(ptr);
3883 return rb_str_buf_cat(str, ptr, strlen(ptr));
3884}
3885
/*
 * Appends the single byte +byte+ to +str+, whose encoding is asserted to
 * be ASCII-8BIT or US-ASCII.  A US-ASCII string is re-associated with
 * ASCII-8BIT when a non-ASCII byte is appended.
 *
 * NOTE(review): the two branches of the final ISASCII() test contain no
 * code-range statement in this listing (lines appear to be missing).
 * Verify against the canonical source.
 */
3886static void
3887rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3888{
3889 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3890
3891 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3892 if (UNLIKELY(!str_independent(str))) {
3893 str_make_independent(str);
3894 }
3895
3896 long string_length = -1;
3897 const int null_terminator_length = 1;
3898 char *sptr;
3899 RSTRING_GETMEM(str, sptr, string_length);
3900
3901 // Ensure the resulting string wouldn't be too long.
3902 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3903 rb_raise(rb_eArgError, "string sizes too big");
3904 }
3905
3906 long string_capacity = str_capacity(str, null_terminator_length);
3907
3908 // Get the code range before any modifications since those might clear the code range.
3909 int cr = ENC_CODERANGE(str);
3910
3911 // Check if the string has spare string_capacity to write the new byte.
3912 if (LIKELY(string_capacity >= string_length + 1)) {
3913 // In fast path we can write the new byte and note the string's new length.
3914 sptr[string_length] = byte;
3915 STR_SET_LEN(str, string_length + 1);
3916 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3917 }
3918 else {
3919 // If there's not enough string_capacity, make a call into the general string concatenation function.
3920 str_buf_cat(str, (char *)&byte, 1);
3921 }
3922
3923 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3924 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3925 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3926 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3927 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3928 if (ISASCII(byte)) {
3930 }
3931 else {
3933
3934 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3935 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3936 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3937 }
3938 }
3939 }
3940}
3941
/* Alternative public names: rb_str_buf_cat for rb_str_cat, and
 * rb_str_buf_cat2 / rb_str_cat2 for rb_str_cat_cstr (presumably
 * RUBY_ALIAS_FUNCTION emits the forwarding definitions -- confirm against
 * the macro). */
3942RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3943RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3944RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3945
/*
 * Appends +len+ bytes at +ptr+ to +str+, taking encodings and code ranges
 * into account, and returns +str+.
 *
 *   ptr_encindex - encoding index of the bytes at +ptr+
 *   ptr_cr       - their code range, or ENC_CODERANGE_UNKNOWN
 *   ptr_cr_ret   - when non-NULL, receives the code range of +ptr+ as
 *                  known after any scan performed here
 *
 * Raises Encoding::CompatibilityError when the two sides cannot be
 * combined, and ArgumentError when +len+ is negative.
 *
 * NOTE(review): a line appears to be missing after the final rb_raise()
 * in this listing.  Verify against the canonical source.
 */
3946static VALUE
3947rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3948 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3949{
3950 int str_encindex = ENCODING_GET(str);
3951 int res_encindex;
3952 int str_cr, res_cr;
3953 rb_encoding *str_enc, *ptr_enc;
3954
    /* An empty receiver is treated as 7-bit regardless of its flags. */
3955 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3956
3957 if (str_encindex == ptr_encindex) {
    /* Same encoding: scan ptr only when the receiver's code range is
     * known and ptr's is not. */
3958 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3959 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3960 }
3961 }
3962 else {
3963 str_enc = rb_enc_from_index(str_encindex);
3964 ptr_enc = rb_enc_from_index(ptr_encindex);
3965 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
    /* At least one side is not ASCII-compatible: only trivial cases
     * are accepted. */
3966 if (len == 0)
3967 return str;
3968 if (RSTRING_LEN(str) == 0) {
    /* Empty receiver: take the bytes and adopt ptr's encoding and
     * code range, adjusting the terminator width to match. */
3969 rb_str_buf_cat(str, ptr, len);
3970 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3971 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3972 return str;
3973 }
3974 goto incompatible;
3975 }
3976 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3977 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3978 }
3979 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3980 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3981 str_cr = rb_enc_str_coderange(str);
3982 }
3983 }
3984 }
3985 if (ptr_cr_ret)
3986 *ptr_cr_ret = ptr_cr;
3987
    /* Different encodings can only be mixed when at least one side is
     * 7-bit. */
3988 if (str_encindex != ptr_encindex &&
3989 str_cr != ENC_CODERANGE_7BIT &&
3990 ptr_cr != ENC_CODERANGE_7BIT) {
3991 str_enc = rb_enc_from_index(str_encindex);
3992 ptr_enc = rb_enc_from_index(ptr_encindex);
3993 goto incompatible;
3994 }
3995
    /* Decide the encoding index and code range of the result. */
3996 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3997 res_encindex = str_encindex;
3998 res_cr = ENC_CODERANGE_UNKNOWN;
3999 }
4000 else if (str_cr == ENC_CODERANGE_7BIT) {
4001 if (ptr_cr == ENC_CODERANGE_7BIT) {
4002 res_encindex = str_encindex;
4003 res_cr = ENC_CODERANGE_7BIT;
4004 }
4005 else {
    /* 7-bit receiver, non-7-bit argument: the result takes the
     * argument's encoding and code range. */
4006 res_encindex = ptr_encindex;
4007 res_cr = ptr_cr;
4008 }
4009 }
4010 else if (str_cr == ENC_CODERANGE_VALID) {
4011 res_encindex = str_encindex;
4012 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
4013 res_cr = str_cr;
4014 else
4015 res_cr = ptr_cr;
4016 }
4017 else { /* str_cr == ENC_CODERANGE_BROKEN */
4018 res_encindex = str_encindex;
4019 res_cr = str_cr;
4020 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
4021 }
4022
4023 if (len < 0) {
4024 rb_raise(rb_eArgError, "negative string size (or size too big)");
4025 }
4026 str_buf_cat(str, ptr, len);
4027 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
4028 return str;
4029
4030 incompatible:
4031 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
4032 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
4034}
4035
4036VALUE
4037rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
4038{
4039 return rb_enc_cr_str_buf_cat(str, ptr, len,
4040 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
4041}
4042
/*
 * Appends the NUL-terminated ASCII string at `ptr` to `str`, whatever the
 * encoding of `str`, and returns `str`.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str` and `ptr` are the names used below.
 */
4043VALUE
4045{
4046 /* ptr must reference NUL terminated ASCII string. */
4047 int encindex = ENCODING_GET(str);
4048 rb_encoding *enc = rb_enc_from_index(encindex);
4049 if (rb_enc_asciicompat(enc)) {
    /* ASCII-compatible receiver: append the bytes as they are, declared
     * as 7-bit in the receiver's own encoding. */
4050 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
4051 encindex, ENC_CODERANGE_7BIT, 0);
4052 }
4053 else {
    /* Otherwise encode each input byte as one character of the receiver's
     * encoding and append the characters one at a time. */
4054 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
4055 while (*ptr) {
4056 unsigned int c = (unsigned char)*ptr;
4057 int len = rb_enc_codelen(c, enc);
4058 rb_enc_mbcput(c, buf, enc);
4059 rb_enc_cr_str_buf_cat(str, buf, len,
4060 encindex, ENC_CODERANGE_VALID, 0);
4061 ptr++;
4062 }
4063 return str;
4064 }
4065}
4066
/*
 * Appends the string `str2` to `str` and returns `str`.  Two fast paths
 * skip the general encoding negotiation; everything else goes through
 * rb_enc_cr_str_buf_cat().
 *
 * NOTE(review): the declarator line (function name and parameter list)
 * and the `case` label that precedes the second fast path are absent from
 * this listing.  Verify against the canonical source.
 */
4067VALUE
4069{
4070 int str2_cr = rb_enc_str_coderange(str2);
4071
4072 if (str_enc_fastpath(str)) {
4073 switch (str2_cr) {
4074 case ENC_CODERANGE_7BIT:
4075 // If RHS is 7bit we can do simple concatenation
4076 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
4077 RB_GC_GUARD(str2);
4078 return str;
4080 // If RHS is valid, we can do simple concatenation if encodings are the same
4081 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
4082 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
    /* The receiver's code range was kept by the call above; combine it
     * with the argument's unless it is already VALID. */
4083 int str_cr = ENC_CODERANGE(str);
4084 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
4085 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
4086 }
4087 RB_GC_GUARD(str2);
4088 return str;
4089 }
4090 }
4091 }
4092
    /* General path; str2_cr is updated with the code range determined for
     * str2 along the way. */
4093 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
4094 ENCODING_GET(str2), str2_cr, &str2_cr);
4095
    /* Store that code range back on str2. */
4096 ENC_CODERANGE_SET(str2, str2_cr);
4097
4098 return str;
4099}
4100
/*
 * Converts `str2` with StringValue() and appends it to `str` through
 * rb_str_buf_append(), returning that call's result.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str` and `str2` are the names used below.
 */
4101VALUE
4103{
4104 StringValue(str2);
4105 return rb_str_buf_append(str, str2);
4106}
4107
4108VALUE
4109rb_str_concat_literals(size_t num, const VALUE *strary)
4110{
4111 VALUE str;
4112 size_t i, s = 0;
4113 unsigned long len = 1;
4114
4115 if (UNLIKELY(!num)) return rb_str_new(0, 0);
4116 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
4117
4118 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
4119 str = rb_str_buf_new(len);
4120 str_enc_copy_direct(str, strary[0]);
4121
4122 for (i = s; i < num; ++i) {
4123 const VALUE v = strary[i];
4124 int encidx = ENCODING_GET(v);
4125
4126 rb_str_buf_append(str, v);
4127 if (encidx != ENCINDEX_US_ASCII) {
4128 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
4129 rb_enc_set_index(str, encidx);
4130 }
4131 }
4132 return str;
4133}
4134
4135/*
4136 * call-seq:
4137 * concat(*objects) -> string
4138 *
4139 * Concatenates each object in +objects+ to +self+ and returns +self+:
4140 *
4141 * s = 'foo'
4142 * s.concat('bar', 'baz') # => "foobarbaz"
4143 * s # => "foobarbaz"
4144 *
4145 * For each given object +object+ that is an Integer,
4146 * the value is considered a codepoint and converted to a character before concatenation:
4147 *
4148 * s = 'foo'
4149 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
4150 *
4151 * Related: String#<<, which takes a single argument.
4152 */
4153static VALUE
4154rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
4155{
4156 str_modifiable(str);
4157
4158 if (argc == 1) {
4159 return rb_str_concat(str, argv[0]);
4160 }
4161 else if (argc > 1) {
4162 int i;
4163 VALUE arg_str = rb_str_tmp_new(0);
4164 rb_enc_copy(arg_str, str);
4165 for (i = 0; i < argc; i++) {
4166 rb_str_concat(arg_str, argv[i]);
4167 }
4168 rb_str_buf_append(str, arg_str);
4169 }
4170
4171 return str;
4172}
4173
4174/*
4175 * call-seq:
4176 * append_as_bytes(*objects) -> string
4177 *
4178 * Concatenates each object in +objects+ into +self+ without any encoding
4179 * validation or conversion and returns +self+:
4180 *
4181 * s = 'foo'
4182 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
4183 * s.valid_encoding? # => false
4184 * s.append_as_bytes("\xAC 12")
4185 * s.valid_encoding? # => true
4186 *
4187 * For each given object +object+ that is an Integer,
4188 * the value is considered a Byte. If the Integer is bigger
4189 * than one byte, only the lower byte is considered, similar to String#setbyte:
4190 *
4191 * s = ""
4192 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
4193 *
4194 * Related: String#<<, String#concat, which do an encoding aware concatenation.
4195 */
4196
4197VALUE
4198rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
4199{
    /* NOTE(review): three lines appear to be missing from this listing:
     * the first argument of the rb_raise() call, the `case` label before
     * the ASCII-8BIT check in the code-range switch, and the statement
     * between the clear_cr and keep_cr labels.  Verify against the
     * canonical source. */
4200 long needed_capacity = 0;
4201 volatile VALUE t0;
4202 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
4203
    /* Pass 1: record each argument's type and add up the bytes needed
     * (one per Integer, the byte length per String).  Any other type is
     * rejected before the receiver is touched. */
4204 for (int index = 0; index < argc; index++) {
4205 VALUE obj = argv[index];
4206 enum ruby_value_type type = types[index] = rb_type(obj);
4207 switch (type) {
4208 case T_FIXNUM:
4209 case T_BIGNUM:
4210 needed_capacity++;
4211 break;
4212 case T_STRING:
4213 needed_capacity += RSTRING_LEN(obj);
4214 break;
4215 default:
4216 rb_raise(
4218 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
4219 rb_obj_class(obj)
4220 );
4221 break;
4222 }
4223 }
4224
    /* Reserve room once, then write directly after the current end. */
4225 str_ensure_available_capa(str, needed_capacity);
4226 char *sptr = RSTRING_END(str);
4227
    /* Pass 2: copy the bytes, using the types recorded in pass 1. */
4228 for (int index = 0; index < argc; index++) {
4229 VALUE obj = argv[index];
4230 enum ruby_value_type type = types[index];
4231 switch (type) {
4232 case T_FIXNUM:
4233 case T_BIGNUM: {
    /* Keep only the low byte; the masked value is stored back in argv
     * so the code-range pass below sees the same byte. */
4234 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
4235 char byte = (char)(NUM2INT(obj) & 0xFF);
4236 *sptr = byte;
4237 sptr++;
4238 break;
4239 }
4240 case T_STRING: {
4241 const char *ptr;
4242 long len;
4243 RSTRING_GETMEM(obj, ptr, len);
4244 memcpy(sptr, ptr, len);
4245 sptr += len;
4246 break;
4247 }
4248 default:
4249 rb_bug("append_as_bytes arguments should have been validated");
4250 }
4251 }
4252
4253 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
4254 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
4255
    /* Update the cached code range, starting from the receiver's current
     * one. */
4256 int cr = ENC_CODERANGE(str);
4257 switch (cr) {
4258 case ENC_CODERANGE_7BIT: {
    /* Jump to clear_cr as soon as one appended piece is not known to be
     * 7-bit. */
4259 for (int index = 0; index < argc; index++) {
4260 VALUE obj = argv[index];
4261 enum ruby_value_type type = types[index];
4262 switch (type) {
4263 case T_FIXNUM:
4264 case T_BIGNUM: {
4265 if (!ISASCII(NUM2INT(obj))) {
4266 goto clear_cr;
4267 }
4268 break;
4269 }
4270 case T_STRING: {
4271 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
4272 goto clear_cr;
4273 }
4274 break;
4275 }
4276 default:
4277 rb_bug("append_as_bytes arguments should have been validated");
4278 }
4279 }
    /* NOTE(review): this break leaves the switch, after which control
     * runs through RB_GC_GUARD() into the clear_cr label below, so the
     * all-ASCII case also reaches clear_cr -- confirm this is intended. */
4280 break;
4281 }
4283 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4284 goto keep_cr;
4285 }
4286 else {
4287 goto clear_cr;
4288 }
4289 break;
4290 default:
4291 goto clear_cr;
4292 break;
4293 }
4294
4295 RB_GC_GUARD(t0);
4296
4297 clear_cr:
4298 // If no fast path was hit, we clear the coderange.
4299 // append_as_bytes is predominantly meant to be used in
4300 // buffering situation, hence it's likely the coderange
4301 // will never be scanned, so it's not worth spending time
4302 // precomputing the coderange except for simple and common
4303 // situations.
4305 keep_cr:
4306 return str;
4307}
4308
4309/*
4310 * call-seq:
4311 * self << object -> self
4312 *
4313 * Appends a string representation of +object+ to +self+;
4314 * returns +self+.
4315 *
4316 * If +object+ is a string, appends it to +self+:
4317 *
4318 * s = 'foo'
4319 * s << 'bar' # => "foobar"
4320 * s # => "foobar"
4321 *
4322 * If +object+ is an integer,
4323 * its value is considered a codepoint;
4324 * converts the value to a character before concatenating:
4325 *
4326 * s = 'foo'
4327 * s << 33 # => "foo!"
4328 *
4329 * Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4330 * and the encoding of +self+ is Encoding::US_ASCII,
4331 * changes the encoding to Encoding::ASCII_8BIT:
4332 *
4333 * s = 'foo'.encode(Encoding::US_ASCII)
4334 * s.encoding # => #<Encoding:US-ASCII>
4335 * s << 0xff # => "foo\xFF"
4336 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4337 *
4338 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4339 *
4340 * s = 'foo'
4341 * s.encoding # => #<Encoding:UTF-8>
4342 * s << 0x00110000 # 1114112 out of char range (RangeError)
4343 * s = 'foo'.encode(Encoding::EUC_JP)
4344 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4345 *
4346 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4347 */
4348VALUE
4350{
    /* NOTE(review): the declarator line (function name and parameter
     * list) is absent from this listing (`str1` is the receiver and
     * `str2` the appended object), and the bodies of the two code-range
     * branches near the end are empty here.  Verify against the canonical
     * source. */
4351 unsigned int code;
4352 rb_encoding *enc = STR_ENC_GET(str1);
4353 int encidx;
4354
    /* An Integer argument is a codepoint and must fit in an unsigned int;
     * anything else is appended as a string. */
4355 if (RB_INTEGER_TYPE_P(str2)) {
4356 if (rb_num_to_uint(str2, &code) == 0) {
4357 }
4358 else if (FIXNUM_P(str2)) {
4359 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4360 }
4361 else {
4362 rb_raise(rb_eRangeError, "bignum out of char range");
4363 }
4364 }
4365 else {
4366 return rb_str_append(str1, str2);
4367 }
4368
    /* A non-negative index means the receiver is ASCII-8BIT or US-ASCII
     * and the code can be appended as one raw byte. */
4369 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4370
4371 if (encidx >= 0) {
4372 rb_str_buf_cat_byte(str1, (unsigned char)code);
4373 }
4374 else {
4375 long pos = RSTRING_LEN(str1);
4376 int cr = ENC_CODERANGE(str1);
4377 int len;
4378 char *buf;
4379
    /* Byte length of the codepoint in the receiver's encoding; error
     * values and zero are rejected with RangeError. */
4380 switch (len = rb_enc_codelen(code, enc)) {
4381 case ONIGERR_INVALID_CODE_POINT_VALUE:
4382 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4383 break;
4384 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4385 case 0:
4386 rb_raise(rb_eRangeError, "%u out of char range", code);
4387 break;
4388 }
    /* Encode into a scratch buffer and check that the bytes form exactly
     * one character of the expected length before touching the string. */
4389 buf = ALLOCA_N(char, len + 1);
4390 rb_enc_mbcput(code, buf, enc);
4391 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4392 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4393 }
    /* Grow the string by len bytes and copy the encoded character in. */
4394 rb_str_resize(str1, pos+len);
4395 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4396 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4398 }
4399 else if (cr == ENC_CODERANGE_BROKEN) {
4401 }
4402 ENC_CODERANGE_SET(str1, cr);
4403 }
4404 return str1;
4405}
4406
4407int
4408rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4409{
4410 int encidx = rb_enc_to_index(enc);
4411
4412 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4413 /* US-ASCII automatically extended to ASCII-8BIT */
4414 if (code > 0xFF) {
4415 rb_raise(rb_eRangeError, "%u out of char range", code);
4416 }
4417 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4418 return ENCINDEX_ASCII_8BIT;
4419 }
4420 return encidx;
4421 }
4422 else {
4423 return -1;
4424 }
4425}
4426
4427/*
4428 * call-seq:
4429 * prepend(*other_strings) -> string
4430 *
4431 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4432 *
4433 * s = 'foo'
4434 * s.prepend('bar', 'baz') # => "barbazfoo"
4435 * s # => "barbazfoo"
4436 *
4437 * Related: String#concat.
4438 */
4439
4440static VALUE
4441rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4442{
4443 str_modifiable(str);
4444
4445 if (argc == 1) {
4446 rb_str_update(str, 0L, 0L, argv[0]);
4447 }
4448 else if (argc > 1) {
4449 int i;
4450 VALUE arg_str = rb_str_tmp_new(0);
4451 rb_enc_copy(arg_str, str);
4452 for (i = 0; i < argc; i++) {
4453 rb_str_append(arg_str, argv[i]);
4454 }
4455 rb_str_update(str, 0L, 0L, arg_str);
4456 }
4457
4458 return str;
4459}
4460
/*
 * Returns the hash value of the string.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str` is the name used below.
 */
4461st_index_t
4463{
    /* When STR_PRECOMPUTED_HASH is set, the hash is read from the bytes
     * stored right after the string's terminator instead of being
     * recomputed. */
4464 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4465 st_index_t precomputed_hash;
4466 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4467
4468 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4469 return precomputed_hash;
4470 }
4471
4472 return str_do_hash(str);
4473}
4474
/*
 * Returns 0 when the two strings have the same length, are comparable per
 * rb_str_comparable() and hold identical bytes; returns non-zero
 * otherwise.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str1` and `str2` are the names used below.
 */
4475int
4477{
4478 long len1, len2;
4479 const char *ptr1, *ptr2;
4480 RSTRING_GETMEM(str1, ptr1, len1);
4481 RSTRING_GETMEM(str2, ptr2, len2);
4482 return (len1 != len2 ||
4483 !rb_str_comparable(str1, str2) ||
4484 memcmp(ptr1, ptr2, len1) != 0);
4485}
4486
4487/*
4488 * call-seq:
4489 * hash -> integer
4490 *
4491 * Returns the integer hash value for +self+.
4492 * The value is based on the length, content and encoding of +self+.
4493 *
4494 * Related: Object#hash.
4495 */
4496
4497static VALUE
4498rb_str_hash_m(VALUE str)
4499{
4500 st_index_t hval = rb_str_hash(str);
4501 return ST2FIX(hval);
4502}
4503
/* Smaller of two values.  Each argument may be evaluated twice, so pass
 * only expressions without side effects. */
4504#define lesser(a,b) (((a)>(b))?(b):(a))
4505
/*
 * Returns TRUE when the contents of the two strings may be compared byte
 * for byte, FALSE otherwise.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str1` and `str2` are the names used below.
 */
4506int
4508{
4509 int idx1, idx2;
4510 int rc1, rc2;
4511
    /* An empty string is comparable with anything. */
4512 if (RSTRING_LEN(str1) == 0) return TRUE;
4513 if (RSTRING_LEN(str2) == 0) return TRUE;
4514 idx1 = ENCODING_GET(str1);
4515 idx2 = ENCODING_GET(str2);
4516 if (idx1 == idx2) return TRUE;
    /* Different encodings: comparable only when both are 7-bit, or when
     * one is 7-bit and the other's encoding is ASCII-compatible. */
4517 rc1 = rb_enc_str_coderange(str1);
4518 rc2 = rb_enc_str_coderange(str2);
4519 if (rc1 == ENC_CODERANGE_7BIT) {
4520 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4521 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4522 return TRUE;
4523 }
4524 if (rc2 == ENC_CODERANGE_7BIT) {
4525 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4526 return TRUE;
4527 }
4528 return FALSE;
4529}
4530
/*
 * Three-way comparison of two strings; returns -1, 0 or 1.
 *
 * Bytes are compared over the shorter length first, then the lengths.
 * Strings with identical bytes that are not comparable per
 * rb_str_comparable() are ordered by their encoding indexes.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; `str1` and `str2` are the names used below.
 */
4531int
4533{
4534 long len1, len2;
4535 const char *ptr1, *ptr2;
4536 int retval;
4537
4538 if (str1 == str2) return 0;
4539 RSTRING_GETMEM(str1, ptr1, len1);
4540 RSTRING_GETMEM(str2, ptr2, len2);
    /* Same buffer start, or equal bytes over the common prefix. */
4541 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4542 if (len1 == len2) {
4543 if (!rb_str_comparable(str1, str2)) {
4544 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4545 return 1;
4546 return -1;
4547 }
4548 return 0;
4549 }
    /* Equal prefix: the longer string sorts after the shorter one. */
4550 if (len1 > len2) return 1;
4551 return -1;
4552 }
    /* Normalize the memcmp() result to 1 or -1. */
4553 if (retval > 0) return 1;
4554 return -1;
4555}
4556
4557/*
4558 * call-seq:
4559 * self == object -> true or false
4560 *
4561 * Returns whether +object+ is equal to +self+.
4562 *
4563 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4564 *
4565 * s = 'foo'
4566 * s == 'foo' # => true
4567 * s == 'food' # => false
4568 * s == 'FOO' # => false
4569 *
4570 * Returns +false+ if the two strings' encodings are not compatible:
4571 *
4572 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4573 *
4574 * When +object+ is not a string:
4575 *
4576 * - If +object+ responds to method <tt>to_str</tt>,
4577 * <tt>object == self</tt> is called and its return value is returned.
4578 * - If +object+ does not respond to <tt>to_str</tt>,
4579 * +false+ is returned.
4580 *
4581 * Related: {Comparing}[rdoc-ref:String@Comparing].
4582 */
4583
4584VALUE
4586{
    /* NOTE(review): the declarator line (function name and parameter
     * list) is absent from this listing; `str1` is the receiver and
     * `str2` the object it is compared with. */

    /* The same object is always equal to itself. */
4587 if (str1 == str2) return Qtrue;
4588 if (!RB_TYPE_P(str2, T_STRING)) {
    /* Not a String: equal only if it responds to to_str and its own
     * equality check against str1 says so. */
4589 if (!rb_respond_to(str2, idTo_str)) {
4590 return Qfalse;
4591 }
4592 return rb_equal(str2, str1);
4593 }
4594 return rb_str_eql_internal(str1, str2);
4595}
4596
4597/*
4598 * call-seq:
4599 * eql?(object) -> true or false
4600 *
4601 * Returns +true+ if +object+ has the same length and content
4602 * as +self+; +false+ otherwise:
4603 *
4604 * s = 'foo'
4605 * s.eql?('foo') # => true
4606 * s.eql?('food') # => false
4607 * s.eql?('FOO') # => false
4608 *
4609 * Returns +false+ if the two strings' encodings are not compatible:
4610 *
4611 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4612 *
4613 */
4614
4615VALUE
4616rb_str_eql(VALUE str1, VALUE str2)
4617{
4618 if (str1 == str2) return Qtrue;
4619 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4620 return rb_str_eql_internal(str1, str2);
4621}
4622
4623/*
4624 * call-seq:
4625 * self <=> other_string -> -1, 0, 1, or nil
4626 *
4627 * Compares +self+ and +other_string+, returning:
4628 *
4629 * - -1 if +other_string+ is larger.
4630 * - 0 if the two are equal.
4631 * - 1 if +other_string+ is smaller.
4632 * - +nil+ if the two are incomparable.
4633 *
4634 * Examples:
4635 *
4636 * 'foo' <=> 'foo' # => 0
4637 * 'foo' <=> 'food' # => -1
4638 * 'food' <=> 'foo' # => 1
4639 * 'FOO' <=> 'foo' # => -1
4640 * 'foo' <=> 'FOO' # => 1
4641 * 'foo' <=> 1 # => nil
4642 *
4643 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4644 */
4645
4646static VALUE
4647rb_str_cmp_m(VALUE str1, VALUE str2)
4648{
4649 int result;
4650 VALUE s = rb_check_string_type(str2);
4651 if (NIL_P(s)) {
4652 return rb_invcmp(str1, str2);
4653 }
4654 result = rb_str_cmp(str1, s);
4655 return INT2FIX(result);
4656}
4657
/* Forward declarations: both helpers are defined further down but are
 * called by rb_str_casecmp() and rb_str_casecmp_p() before that point. */
4658static VALUE str_casecmp(VALUE str1, VALUE str2);
4659static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4660
4661/*
4662 * call-seq:
4663 * casecmp(other_string) -> -1, 0, 1, or nil
4664 *
4665 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4666 *
4667 * - -1 if <tt>other_string.downcase</tt> is larger.
4668 * - 0 if the two are equal.
4669 * - 1 if <tt>other_string.downcase</tt> is smaller.
4670 * - +nil+ if the two are incomparable.
4671 *
4672 * Examples:
4673 *
4674 * 'foo'.casecmp('foo') # => 0
4675 * 'foo'.casecmp('food') # => -1
4676 * 'food'.casecmp('foo') # => 1
4677 * 'FOO'.casecmp('foo') # => 0
4678 * 'foo'.casecmp('FOO') # => 0
4679 * 'foo'.casecmp(1) # => nil
4680 *
4681 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4682 *
4683 * Related: String#casecmp?.
4684 *
4685 */
4686
4687static VALUE
4688rb_str_casecmp(VALUE str1, VALUE str2)
4689{
4690 VALUE s = rb_check_string_type(str2);
4691 if (NIL_P(s)) {
4692 return Qnil;
4693 }
4694 return str_casecmp(str1, s);
4695}
4696
4697static VALUE
4698str_casecmp(VALUE str1, VALUE str2)
4699{
4700 long len;
4701 rb_encoding *enc;
4702 const char *p1, *p1end, *p2, *p2end;
4703
4704 enc = rb_enc_compatible(str1, str2);
4705 if (!enc) {
4706 return Qnil;
4707 }
4708
4709 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4710 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4711 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4712 while (p1 < p1end && p2 < p2end) {
4713 if (*p1 != *p2) {
4714 unsigned int c1 = TOLOWER(*p1 & 0xff);
4715 unsigned int c2 = TOLOWER(*p2 & 0xff);
4716 if (c1 != c2)
4717 return INT2FIX(c1 < c2 ? -1 : 1);
4718 }
4719 p1++;
4720 p2++;
4721 }
4722 }
4723 else {
4724 while (p1 < p1end && p2 < p2end) {
4725 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4726 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4727
4728 if (0 <= c1 && 0 <= c2) {
4729 c1 = TOLOWER(c1);
4730 c2 = TOLOWER(c2);
4731 if (c1 != c2)
4732 return INT2FIX(c1 < c2 ? -1 : 1);
4733 }
4734 else {
4735 int r;
4736 l1 = rb_enc_mbclen(p1, p1end, enc);
4737 l2 = rb_enc_mbclen(p2, p2end, enc);
4738 len = l1 < l2 ? l1 : l2;
4739 r = memcmp(p1, p2, len);
4740 if (r != 0)
4741 return INT2FIX(r < 0 ? -1 : 1);
4742 if (l1 != l2)
4743 return INT2FIX(l1 < l2 ? -1 : 1);
4744 }
4745 p1 += l1;
4746 p2 += l2;
4747 }
4748 }
4749 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4750 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4751 return INT2FIX(-1);
4752}
4753
4754/*
4755 * call-seq:
4756 * casecmp?(other_string) -> true, false, or nil
4757 *
4758 * Returns +true+ if +self+ and +other_string+ are equal after
4759 * Unicode case folding, otherwise +false+:
4760 *
4761 * 'foo'.casecmp?('foo') # => true
4762 * 'foo'.casecmp?('food') # => false
4763 * 'food'.casecmp?('foo') # => false
4764 * 'FOO'.casecmp?('foo') # => true
4765 * 'foo'.casecmp?('FOO') # => true
4766 *
4767 * Returns +nil+ if the two values are incomparable:
4768 *
4769 * 'foo'.casecmp?(1) # => nil
4770 *
4771 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4772 *
4773 * Related: String#casecmp.
4774 *
4775 */
4776
4777static VALUE
4778rb_str_casecmp_p(VALUE str1, VALUE str2)
4779{
4780 VALUE s = rb_check_string_type(str2);
4781 if (NIL_P(s)) {
4782 return Qnil;
4783 }
4784 return str_casecmp_p(str1, s);
4785}
4786
4787static VALUE
4788str_casecmp_p(VALUE str1, VALUE str2)
4789{
4790 rb_encoding *enc;
4791 VALUE folded_str1, folded_str2;
4792 VALUE fold_opt = sym_fold;
4793
4794 enc = rb_enc_compatible(str1, str2);
4795 if (!enc) {
4796 return Qnil;
4797 }
4798
4799 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4800 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4801
4802 return rb_str_eql(folded_str1, folded_str2);
4803}
4804
4805static long
4806strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4807 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4808{
4809 const char *search_start = str_ptr;
4810 long pos, search_len = str_len - offset;
4811
4812 for (;;) {
4813 const char *t;
4814 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4815 if (pos < 0) return pos;
4816 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4817 if (t == search_start + pos) break;
4818 search_len -= t - search_start;
4819 if (search_len <= 0) return -1;
4820 offset += t - search_start;
4821 search_start = t;
4822 }
4823 return pos + offset;
4824}
4825
4826/* found index in byte */
/* Both macros call rb_strseq_index(); they differ only in the in_byte
 * flag (0 for rb_str_index, 1 for rb_str_byteindex), which tells it
 * whether `offset` is already a byte offset. */
4827#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4828#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4829
4830static long
4831rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4832{
4833 const char *str_ptr, *str_ptr_end, *sub_ptr;
4834 long str_len, sub_len;
4835 rb_encoding *enc;
4836
4837 enc = rb_enc_check(str, sub);
4838 if (is_broken_string(sub)) return -1;
4839
4840 str_ptr = RSTRING_PTR(str);
4841 str_ptr_end = RSTRING_END(str);
4842 str_len = RSTRING_LEN(str);
4843 sub_ptr = RSTRING_PTR(sub);
4844 sub_len = RSTRING_LEN(sub);
4845
4846 if (str_len < sub_len) return -1;
4847
4848 if (offset != 0) {
4849 long str_len_char, sub_len_char;
4850 int single_byte = single_byte_optimizable(str);
4851 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4852 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4853 if (offset < 0) {
4854 offset += str_len_char;
4855 if (offset < 0) return -1;
4856 }
4857 if (str_len_char - offset < sub_len_char) return -1;
4858 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4859 str_ptr += offset;
4860 }
4861 if (sub_len == 0) return offset;
4862
4863 /* need proceed one character at a time */
4864 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4865}
4866
4867
4868/*
4869 * call-seq:
4870 * index(substring, offset = 0) -> integer or nil
4871 * index(regexp, offset = 0) -> integer or nil
4872 *
4873 * :include: doc/string/index.rdoc
4874 *
4875 */
4876
4877static VALUE
4878rb_str_index_m(int argc, VALUE *argv, VALUE str)
4879{
4880 VALUE sub;
4881 VALUE initpos;
4882 rb_encoding *enc = STR_ENC_GET(str);
4883 long pos;
4884
4885 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4886 long slen = str_strlen(str, enc); /* str's enc */
4887 pos = NUM2LONG(initpos);
4888 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4889 if (RB_TYPE_P(sub, T_REGEXP)) {
4891 }
4892 return Qnil;
4893 }
4894 }
4895 else {
4896 pos = 0;
4897 }
4898
4899 if (RB_TYPE_P(sub, T_REGEXP)) {
4900 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4901 enc, single_byte_optimizable(str));
4902
4903 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4904 VALUE match = rb_backref_get();
4905 struct re_registers *regs = RMATCH_REGS(match);
4906 pos = rb_str_sublen(str, BEG(0));
4907 return LONG2NUM(pos);
4908 }
4909 }
4910 else {
4911 StringValue(sub);
4912 pos = rb_str_index(str, sub, pos);
4913 if (pos >= 0) {
4914 pos = rb_str_sublen(str, pos);
4915 return LONG2NUM(pos);
4916 }
4917 }
4918 return Qnil;
4919}
4920
4921/* Ensure that the given pos is a valid character boundary.
4922 * Note that in this function, "character" means a code point
4923 * (Unicode scalar value), not a grapheme cluster.
4924 */
4925static void
4926str_ensure_byte_pos(VALUE str, long pos)
4927{
4928 if (!single_byte_optimizable(str)) {
4929 const char *s = RSTRING_PTR(str);
4930 const char *e = RSTRING_END(str);
4931 const char *p = s + pos;
4932 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4933 rb_raise(rb_eIndexError,
4934 "offset %ld does not land on character boundary", pos);
4935 }
4936 }
4937}
4938
4939/*
4940 * call-seq:
4941 * byteindex(substring, offset = 0) -> integer or nil
4942 * byteindex(regexp, offset = 0) -> integer or nil
4943 *
4944 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4945 * or +nil+ if none found:
4946 *
4947 * 'foo'.byteindex('f') # => 0
4948 * 'foo'.byteindex('o') # => 1
4949 * 'foo'.byteindex('oo') # => 1
4950 * 'foo'.byteindex('ooo') # => nil
4951 *
4952 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4953 * or +nil+ if none found:
4954 *
4955 * 'foo'.byteindex(/f/) # => 0
4956 * 'foo'.byteindex(/o/) # => 1
4957 * 'foo'.byteindex(/oo/) # => 1
4958 * 'foo'.byteindex(/ooo/) # => nil
4959 *
4960 * Integer argument +offset+, if given, specifies the byte-based position in the
4961 * string to begin the search:
4962 *
4963 * 'foo'.byteindex('o', 1) # => 1
4964 * 'foo'.byteindex('o', 2) # => 2
4965 * 'foo'.byteindex('o', 3) # => nil
4966 *
4967 * If +offset+ is negative, counts backward from the end of +self+:
4968 *
4969 * 'foo'.byteindex('o', -1) # => 2
4970 * 'foo'.byteindex('o', -2) # => 1
4971 * 'foo'.byteindex('o', -3) # => 1
4972 * 'foo'.byteindex('o', -4) # => nil
4973 *
4974 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4975 * raised.
4976 *
4977 * Related: String#index, String#byterindex.
4978 */
4979
4980static VALUE
4981rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4982{
4983 VALUE sub;
4984 VALUE initpos;
4985 long pos;
4986
4987 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4988 long slen = RSTRING_LEN(str);
4989 pos = NUM2LONG(initpos);
4990 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4991 if (RB_TYPE_P(sub, T_REGEXP)) {
4993 }
4994 return Qnil;
4995 }
4996 }
4997 else {
4998 pos = 0;
4999 }
5000
5001 str_ensure_byte_pos(str, pos);
5002
5003 if (RB_TYPE_P(sub, T_REGEXP)) {
5004 if (rb_reg_search(sub, str, pos, 0) >= 0) {
5005 VALUE match = rb_backref_get();
5006 struct re_registers *regs = RMATCH_REGS(match);
5007 pos = BEG(0);
5008 return LONG2NUM(pos);
5009 }
5010 }
5011 else {
5012 StringValue(sub);
5013 pos = rb_str_byteindex(str, sub, pos);
5014 if (pos >= 0) return LONG2NUM(pos);
5015 }
5016 return Qnil;
5017}
5018
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined.
 *
 * Scans the search_len bytes starting at search_str from the end towards
 * the beginning and returns a pointer to the last byte equal to +chr+, or a
 * null pointer when no byte matches or search_len is not positive.
 * As with memrchr(3), +chr+ is converted to unsigned char before comparing,
 * so a value taken from a (possibly signed) char still matches.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr;

    if (search_len <= 0) return NULL;

    ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
5031
5032static long
5033str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
5034{
5035 char *hit, *adjusted;
5036 int c;
5037 long slen, searchlen;
5038 char *sbeg, *e, *t;
5039
5040 sbeg = RSTRING_PTR(str);
5041 slen = RSTRING_LEN(sub);
5042 if (slen == 0) return s - sbeg;
5043 e = RSTRING_END(str);
5044 t = RSTRING_PTR(sub);
5045 c = *t & 0xff;
5046 searchlen = s - sbeg + 1;
5047
5048 if (memcmp(s, t, slen) == 0) {
5049 return s - sbeg;
5050 }
5051
5052 do {
5053 hit = memrchr(sbeg, c, searchlen);
5054 if (!hit) break;
5055 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
5056 if (hit != adjusted) {
5057 searchlen = adjusted - sbeg;
5058 continue;
5059 }
5060 if (memcmp(hit, t, slen) == 0)
5061 return hit - sbeg;
5062 searchlen = adjusted - sbeg;
5063 } while (searchlen > 0);
5064
5065 return -1;
5066}
5067
5068/* found index in byte */
5069static long
5070rb_str_rindex(VALUE str, VALUE sub, long pos)
5071{
5072 long len, slen;
5073 char *sbeg, *s;
5074 rb_encoding *enc;
5075 int singlebyte;
5076
5077 enc = rb_enc_check(str, sub);
5078 if (is_broken_string(sub)) return -1;
5079 singlebyte = single_byte_optimizable(str);
5080 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
5081 slen = str_strlen(sub, enc); /* rb_enc_check */
5082
5083 /* substring longer than string */
5084 if (len < slen) return -1;
5085 if (len - pos < slen) pos = len - slen;
5086 if (len == 0) return pos;
5087
5088 sbeg = RSTRING_PTR(str);
5089
5090 if (pos == 0) {
5091 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5092 return 0;
5093 else
5094 return -1;
5095 }
5096
5097 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
5098 return str_rindex(str, sub, s, enc);
5099}
5100
5101/*
5102 * call-seq:
5103 * rindex(substring, offset = self.length) -> integer or nil
5104 * rindex(regexp, offset = self.length) -> integer or nil
5105 *
5106 * Returns the Integer index of the _last_ occurrence of the given +substring+,
5107 * or +nil+ if none found:
5108 *
5109 * 'foo'.rindex('f') # => 0
5110 * 'foo'.rindex('o') # => 2
5111 * 'foo'.rindex('oo') # => 1
5112 * 'foo'.rindex('ooo') # => nil
5113 *
5114 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
5115 * or +nil+ if none found:
5116 *
5117 * 'foo'.rindex(/f/) # => 0
5118 * 'foo'.rindex(/o/) # => 2
5119 * 'foo'.rindex(/oo/) # => 1
5120 * 'foo'.rindex(/ooo/) # => nil
5121 *
5122 * The _last_ match means starting at the possible last position, not
5123 * the last of longest matches.
5124 *
5125 * 'foo'.rindex(/o+/) # => 2
5126 * $~ #=> #<MatchData "o">
5127 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
5130 *
5131 * 'foo'.rindex(/(?<!o)o+/) # => 1
5132 * $~ #=> #<MatchData "oo">
5133 *
 * Or use String#index with a negative lookahead:
5135 *
5136 * 'foo'.index(/o+(?!.*o)/) # => 1
5137 * $~ #=> #<MatchData "oo">
5138 *
5139 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
5140 * string to _end_ the search:
5141 *
5142 * 'foo'.rindex('o', 0) # => nil
5143 * 'foo'.rindex('o', 1) # => 1
5144 * 'foo'.rindex('o', 2) # => 2
5145 * 'foo'.rindex('o', 3) # => 2
5146 *
5147 * If +offset+ is a negative Integer, the maximum starting position in the
5148 * string to _end_ the search is the sum of the string's length and +offset+:
5149 *
5150 * 'foo'.rindex('o', -1) # => 2
5151 * 'foo'.rindex('o', -2) # => 1
5152 * 'foo'.rindex('o', -3) # => nil
5153 * 'foo'.rindex('o', -4) # => nil
5154 *
5155 * Related: String#index.
5156 */
5157
5158static VALUE
5159rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
5160{
5161 VALUE sub;
5162 VALUE initpos;
5163 rb_encoding *enc = STR_ENC_GET(str);
5164 long pos, len = str_strlen(str, enc); /* str's enc */
5165
5166 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5167 pos = NUM2LONG(initpos);
5168 if (pos < 0 && (pos += len) < 0) {
5169 if (RB_TYPE_P(sub, T_REGEXP)) {
5171 }
5172 return Qnil;
5173 }
5174 if (pos > len) pos = len;
5175 }
5176 else {
5177 pos = len;
5178 }
5179
5180 if (RB_TYPE_P(sub, T_REGEXP)) {
5181 /* enc = rb_enc_check(str, sub); */
5182 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
5183 enc, single_byte_optimizable(str));
5184
5185 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5186 VALUE match = rb_backref_get();
5187 struct re_registers *regs = RMATCH_REGS(match);
5188 pos = rb_str_sublen(str, BEG(0));
5189 return LONG2NUM(pos);
5190 }
5191 }
5192 else {
5193 StringValue(sub);
5194 pos = rb_str_rindex(str, sub, pos);
5195 if (pos >= 0) {
5196 pos = rb_str_sublen(str, pos);
5197 return LONG2NUM(pos);
5198 }
5199 }
5200 return Qnil;
5201}
5202
5203static long
5204rb_str_byterindex(VALUE str, VALUE sub, long pos)
5205{
5206 long len, slen;
5207 char *sbeg, *s;
5208 rb_encoding *enc;
5209
5210 enc = rb_enc_check(str, sub);
5211 if (is_broken_string(sub)) return -1;
5212 len = RSTRING_LEN(str);
5213 slen = RSTRING_LEN(sub);
5214
5215 /* substring longer than string */
5216 if (len < slen) return -1;
5217 if (len - pos < slen) pos = len - slen;
5218 if (len == 0) return pos;
5219
5220 sbeg = RSTRING_PTR(str);
5221
5222 if (pos == 0) {
5223 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5224 return 0;
5225 else
5226 return -1;
5227 }
5228
5229 s = sbeg + pos;
5230 return str_rindex(str, sub, s, enc);
5231}
5232
5233
5234/*
5235 * call-seq:
5236 * byterindex(substring, offset = self.bytesize) -> integer or nil
5237 * byterindex(regexp, offset = self.bytesize) -> integer or nil
5238 *
5239 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
5240 * or +nil+ if none found:
5241 *
5242 * 'foo'.byterindex('f') # => 0
5243 * 'foo'.byterindex('o') # => 2
5244 * 'foo'.byterindex('oo') # => 1
5245 * 'foo'.byterindex('ooo') # => nil
5246 *
5247 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
5248 * or +nil+ if none found:
5249 *
5250 * 'foo'.byterindex(/f/) # => 0
5251 * 'foo'.byterindex(/o/) # => 2
5252 * 'foo'.byterindex(/oo/) # => 1
5253 * 'foo'.byterindex(/ooo/) # => nil
5254 *
5255 * The _last_ match means starting at the possible last position, not
5256 * the last of longest matches.
5257 *
5258 * 'foo'.byterindex(/o+/) # => 2
5259 * $~ #=> #<MatchData "o">
5260 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
5263 *
5264 * 'foo'.byterindex(/(?<!o)o+/) # => 1
5265 * $~ #=> #<MatchData "oo">
5266 *
 * Or use String#byteindex with a negative lookahead:
5268 *
5269 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
5270 * $~ #=> #<MatchData "oo">
5271 *
5272 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
5273 * string to _end_ the search:
5274 *
5275 * 'foo'.byterindex('o', 0) # => nil
5276 * 'foo'.byterindex('o', 1) # => 1
5277 * 'foo'.byterindex('o', 2) # => 2
5278 * 'foo'.byterindex('o', 3) # => 2
5279 *
 * If +offset+ is a negative Integer, the maximum starting byte-based position in the
 * string to _end_ the search is the sum of the string's byte length and +offset+:
5282 *
5283 * 'foo'.byterindex('o', -1) # => 2
5284 * 'foo'.byterindex('o', -2) # => 1
5285 * 'foo'.byterindex('o', -3) # => nil
5286 * 'foo'.byterindex('o', -4) # => nil
5287 *
5288 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
5289 * raised.
5290 *
5291 * Related: String#byteindex.
5292 */
5293
5294static VALUE
5295rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5296{
5297 VALUE sub;
5298 VALUE initpos;
5299 long pos, len = RSTRING_LEN(str);
5300
5301 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5302 pos = NUM2LONG(initpos);
5303 if (pos < 0 && (pos += len) < 0) {
5304 if (RB_TYPE_P(sub, T_REGEXP)) {
5306 }
5307 return Qnil;
5308 }
5309 if (pos > len) pos = len;
5310 }
5311 else {
5312 pos = len;
5313 }
5314
5315 str_ensure_byte_pos(str, pos);
5316
5317 if (RB_TYPE_P(sub, T_REGEXP)) {
5318 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5319 VALUE match = rb_backref_get();
5320 struct re_registers *regs = RMATCH_REGS(match);
5321 pos = BEG(0);
5322 return LONG2NUM(pos);
5323 }
5324 }
5325 else {
5326 StringValue(sub);
5327 pos = rb_str_byterindex(str, sub, pos);
5328 if (pos >= 0) return LONG2NUM(pos);
5329 }
5330 return Qnil;
5331}
5332
5333/*
5334 * call-seq:
5335 * self =~ object -> integer or nil
5336 *
5337 * When +object+ is a Regexp, returns the index of the first substring in +self+
5338 * matched by +object+,
5339 * or +nil+ if no match is found;
5340 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5341 *
5342 * 'foo' =~ /f/ # => 0
5343 * $~ # => #<MatchData "f">
5344 * 'foo' =~ /o/ # => 1
5345 * $~ # => #<MatchData "o">
5346 * 'foo' =~ /x/ # => nil
5347 * $~ # => nil
5348 *
5349 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5350 * (see Regexp#=~):
5351 *
5352 * number = nil
5353 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5354 * number # => nil # Not assigned.
5355 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5356 * number # => "9" # Assigned.
5357 *
5358 * If +object+ is not a Regexp, returns the value
5359 * returned by <tt>object =~ self</tt>.
5360 *
5361 * Related: see {Querying}[rdoc-ref:String@Querying].
5362 */
5363
5364static VALUE
5365rb_str_match(VALUE x, VALUE y)
5366{
5367 switch (OBJ_BUILTIN_TYPE(y)) {
5368 case T_STRING:
5369 rb_raise(rb_eTypeError, "type mismatch: String given");
5370
5371 case T_REGEXP:
5372 return rb_reg_match(y, x);
5373
5374 default:
5375 return rb_funcall(y, idEqTilde, 1, x);
5376 }
5377}
5378
5379
5380static VALUE get_pat(VALUE);
5381
5382
5383/*
5384 * call-seq:
5385 * match(pattern, offset = 0) -> matchdata or nil
5386 * match(pattern, offset = 0) {|matchdata| ... } -> object
5387 *
5388 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5389 *
5390 * Note: also updates Regexp@Global+Variables.
5391 *
5392 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5393 * regexp = Regexp.new(pattern)
5394 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5395 * (see Regexp#match):
5396 * matchdata = regexp.match(self)
5397 *
5398 * With no block given, returns the computed +matchdata+:
5399 *
5400 * 'foo'.match('f') # => #<MatchData "f">
5401 * 'foo'.match('o') # => #<MatchData "o">
5402 * 'foo'.match('x') # => nil
5403 *
5404 * If Integer argument +offset+ is given, the search begins at index +offset+:
5405 *
5406 * 'foo'.match('f', 1) # => nil
5407 * 'foo'.match('o', 1) # => #<MatchData "o">
5408 *
5409 * With a block given, calls the block with the computed +matchdata+
5410 * and returns the block's return value:
5411 *
5412 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5413 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5414 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5415 *
5416 */
5417
5418static VALUE
5419rb_str_match_m(int argc, VALUE *argv, VALUE str)
5420{
5421 VALUE re, result;
5422 if (argc < 1)
5423 rb_check_arity(argc, 1, 2);
5424 re = argv[0];
5425 argv[0] = str;
5426 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5427 if (!NIL_P(result) && rb_block_given_p()) {
5428 return rb_yield(result);
5429 }
5430 return result;
5431}
5432
5433/*
5434 * call-seq:
5435 * match?(pattern, offset = 0) -> true or false
5436 *
5437 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5438 *
5439 * Note: does not update Regexp@Global+Variables.
5440 *
5441 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5442 * regexp = Regexp.new(pattern)
5443 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5445 * +false+ otherwise:
5446 *
5447 * 'foo'.match?(/o/) # => true
5448 * 'foo'.match?('o') # => true
5449 * 'foo'.match?(/x/) # => false
5450 *
5451 * If Integer argument +offset+ is given, the search begins at index +offset+:
5452 * 'foo'.match?('f', 1) # => false
5453 * 'foo'.match?('o', 1) # => true
5454 *
5455 */
5456
5457static VALUE
5458rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5459{
5460 VALUE re;
5461 rb_check_arity(argc, 1, 2);
5462 re = get_pat(argv[0]);
5463 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5464}
5465
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* input or result is not a usable character */
    NEIGHBOR_FOUND = 1,    /* the buffer now holds the neighboring character */
    NEIGHBOR_WRAPPED = 2   /* no neighbor of the same byte length exists */
};
5471
5472static enum neighbor_char
5473enc_succ_char(char *p, long len, rb_encoding *enc)
5474{
5475 long i;
5476 int l;
5477
5478 if (rb_enc_mbminlen(enc) > 1) {
5479 /* wchar, trivial case */
5480 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5481 if (!MBCLEN_CHARFOUND_P(r)) {
5482 return NEIGHBOR_NOT_CHAR;
5483 }
5484 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5485 l = rb_enc_code_to_mbclen(c, enc);
5486 if (!l) return NEIGHBOR_NOT_CHAR;
5487 if (l != len) return NEIGHBOR_WRAPPED;
5488 rb_enc_mbcput(c, p, enc);
5489 r = rb_enc_precise_mbclen(p, p + len, enc);
5490 if (!MBCLEN_CHARFOUND_P(r)) {
5491 return NEIGHBOR_NOT_CHAR;
5492 }
5493 return NEIGHBOR_FOUND;
5494 }
5495 while (1) {
5496 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5497 p[i] = '\0';
5498 if (i < 0)
5499 return NEIGHBOR_WRAPPED;
5500 ++((unsigned char*)p)[i];
5501 l = rb_enc_precise_mbclen(p, p+len, enc);
5502 if (MBCLEN_CHARFOUND_P(l)) {
5503 l = MBCLEN_CHARFOUND_LEN(l);
5504 if (l == len) {
5505 return NEIGHBOR_FOUND;
5506 }
5507 else {
5508 memset(p+l, 0xff, len-l);
5509 }
5510 }
5511 if (MBCLEN_INVALID_P(l) && i < len-1) {
5512 long len2;
5513 int l2;
5514 for (len2 = len-1; 0 < len2; len2--) {
5515 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5516 if (!MBCLEN_INVALID_P(l2))
5517 break;
5518 }
5519 memset(p+len2+1, 0xff, len-(len2+1));
5520 }
5521 }
5522}
5523
5524static enum neighbor_char
5525enc_pred_char(char *p, long len, rb_encoding *enc)
5526{
5527 long i;
5528 int l;
5529 if (rb_enc_mbminlen(enc) > 1) {
5530 /* wchar, trivial case */
5531 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5532 if (!MBCLEN_CHARFOUND_P(r)) {
5533 return NEIGHBOR_NOT_CHAR;
5534 }
5535 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5536 if (!c) return NEIGHBOR_NOT_CHAR;
5537 --c;
5538 l = rb_enc_code_to_mbclen(c, enc);
5539 if (!l) return NEIGHBOR_NOT_CHAR;
5540 if (l != len) return NEIGHBOR_WRAPPED;
5541 rb_enc_mbcput(c, p, enc);
5542 r = rb_enc_precise_mbclen(p, p + len, enc);
5543 if (!MBCLEN_CHARFOUND_P(r)) {
5544 return NEIGHBOR_NOT_CHAR;
5545 }
5546 return NEIGHBOR_FOUND;
5547 }
5548 while (1) {
5549 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5550 p[i] = '\xff';
5551 if (i < 0)
5552 return NEIGHBOR_WRAPPED;
5553 --((unsigned char*)p)[i];
5554 l = rb_enc_precise_mbclen(p, p+len, enc);
5555 if (MBCLEN_CHARFOUND_P(l)) {
5556 l = MBCLEN_CHARFOUND_LEN(l);
5557 if (l == len) {
5558 return NEIGHBOR_FOUND;
5559 }
5560 else {
5561 memset(p+l, 0, len-l);
5562 }
5563 }
5564 if (MBCLEN_INVALID_P(l) && i < len-1) {
5565 long len2;
5566 int l2;
5567 for (len2 = len-1; 0 < len2; len2--) {
5568 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5569 if (!MBCLEN_INVALID_P(l2))
5570 break;
5571 }
5572 memset(p+len2+1, 0, len-(len2+1));
5573 }
5574 }
5575}
5576
5577/*
5578 overwrite +p+ by succeeding letter in +enc+ and returns
5579 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5580 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5581 assuming each ranges are successive, and mbclen
5582 never change in each ranges.
5583 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5584 character.
5585 */
5586static enum neighbor_char
5587enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5588{
5589 enum neighbor_char ret;
5590 unsigned int c;
5591 int ctype;
5592 int range;
5593 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5594
5595 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5596 int try;
5597 const int max_gaps = 1;
5598
5599 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5600 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5601 ctype = ONIGENC_CTYPE_DIGIT;
5602 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5603 ctype = ONIGENC_CTYPE_ALPHA;
5604 else
5605 return NEIGHBOR_NOT_CHAR;
5606
5607 MEMCPY(save, p, char, len);
5608 for (try = 0; try <= max_gaps; ++try) {
5609 ret = enc_succ_char(p, len, enc);
5610 if (ret == NEIGHBOR_FOUND) {
5611 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5612 if (rb_enc_isctype(c, ctype, enc))
5613 return NEIGHBOR_FOUND;
5614 }
5615 }
5616 MEMCPY(p, save, char, len);
5617 range = 1;
5618 while (1) {
5619 MEMCPY(save, p, char, len);
5620 ret = enc_pred_char(p, len, enc);
5621 if (ret == NEIGHBOR_FOUND) {
5622 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5623 if (!rb_enc_isctype(c, ctype, enc)) {
5624 MEMCPY(p, save, char, len);
5625 break;
5626 }
5627 }
5628 else {
5629 MEMCPY(p, save, char, len);
5630 break;
5631 }
5632 range++;
5633 }
5634 if (range == 1) {
5635 return NEIGHBOR_NOT_CHAR;
5636 }
5637
5638 if (ctype != ONIGENC_CTYPE_DIGIT) {
5639 MEMCPY(carry, p, char, len);
5640 return NEIGHBOR_WRAPPED;
5641 }
5642
5643 MEMCPY(carry, p, char, len);
5644 enc_succ_char(carry, len, enc);
5645 return NEIGHBOR_WRAPPED;
5646}
5647
5648
5649static VALUE str_succ(VALUE str);
5650
5651/*
5652 * call-seq:
5653 * succ -> new_str
5654 *
5655 * Returns the successor to +self+. The successor is calculated by
5656 * incrementing characters.
5657 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
5660 *
5661 * 'THX1138'.succ # => "THX1139"
5662 * '<<koala>>'.succ # => "<<koalb>>"
5663 * '***'.succ # => '**+'
5664 *
5665 * The successor to a digit is another digit, "carrying" to the next-left
5666 * character for a "rollover" from 9 to 0, and prepending another digit
5667 * if necessary:
5668 *
5669 * '00'.succ # => "01"
5670 * '09'.succ # => "10"
5671 * '99'.succ # => "100"
5672 *
5673 * The successor to a letter is another letter of the same case,
5674 * carrying to the next-left character for a rollover,
5675 * and prepending another same-case letter if necessary:
5676 *
5677 * 'aa'.succ # => "ab"
5678 * 'az'.succ # => "ba"
5679 * 'zz'.succ # => "aaa"
5680 * 'AA'.succ # => "AB"
5681 * 'AZ'.succ # => "BA"
5682 * 'ZZ'.succ # => "AAA"
5683 *
5684 * The successor to a non-alphanumeric character is the next character
5685 * in the underlying character set's collating sequence,
5686 * carrying to the next-left character for a rollover,
5687 * and prepending another character if necessary:
5688 *
5689 * s = 0.chr * 3
5690 * s # => "\x00\x00\x00"
5691 * s.succ # => "\x00\x00\x01"
5692 * s = 255.chr * 3
5693 * s # => "\xFF\xFF\xFF"
5694 * s.succ # => "\x01\x00\x00\x00"
5695 *
5696 * Carrying can occur between and among mixtures of alphanumeric characters:
5697 *
5698 * s = 'zz99zz99'
5699 * s.succ # => "aaa00aa00"
5700 * s = '99zz99zz'
5701 * s.succ # => "100aa00aa"
5702 *
5703 * The successor to an empty +String+ is a new empty +String+:
5704 *
5705 * ''.succ # => ""
5706 *
5707 */
5708
5709VALUE
5711{
5712 VALUE str;
5713 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5714 rb_enc_cr_str_copy_for_substr(str, orig);
5715 return str_succ(str);
5716}
5717
5718static VALUE
5719str_succ(VALUE str)
5720{
5721 rb_encoding *enc;
5722 char *sbeg, *s, *e, *last_alnum = 0;
5723 int found_alnum = 0;
5724 long l, slen;
5725 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5726 long carry_pos = 0, carry_len = 1;
5727 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5728
5729 slen = RSTRING_LEN(str);
5730 if (slen == 0) return str;
5731
5732 enc = STR_ENC_GET(str);
5733 sbeg = RSTRING_PTR(str);
5734 s = e = sbeg + slen;
5735
5736 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5737 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5738 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5739 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5740 break;
5741 }
5742 }
5743 l = rb_enc_precise_mbclen(s, e, enc);
5744 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5745 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5746 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5747 switch (neighbor) {
5748 case NEIGHBOR_NOT_CHAR:
5749 continue;
5750 case NEIGHBOR_FOUND:
5751 return str;
5752 case NEIGHBOR_WRAPPED:
5753 last_alnum = s;
5754 break;
5755 }
5756 found_alnum = 1;
5757 carry_pos = s - sbeg;
5758 carry_len = l;
5759 }
5760 if (!found_alnum) { /* str contains no alnum */
5761 s = e;
5762 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5763 enum neighbor_char neighbor;
5764 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5765 l = rb_enc_precise_mbclen(s, e, enc);
5766 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5767 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5768 MEMCPY(tmp, s, char, l);
5769 neighbor = enc_succ_char(tmp, l, enc);
5770 switch (neighbor) {
5771 case NEIGHBOR_FOUND:
5772 MEMCPY(s, tmp, char, l);
5773 return str;
5774 break;
5775 case NEIGHBOR_WRAPPED:
5776 MEMCPY(s, tmp, char, l);
5777 break;
5778 case NEIGHBOR_NOT_CHAR:
5779 break;
5780 }
5781 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5782 /* wrapped to \0...\0. search next valid char. */
5783 enc_succ_char(s, l, enc);
5784 }
5785 if (!rb_enc_asciicompat(enc)) {
5786 MEMCPY(carry, s, char, l);
5787 carry_len = l;
5788 }
5789 carry_pos = s - sbeg;
5790 }
5792 }
5793 RESIZE_CAPA(str, slen + carry_len);
5794 sbeg = RSTRING_PTR(str);
5795 s = sbeg + carry_pos;
5796 memmove(s + carry_len, s, slen - carry_pos);
5797 memmove(s, carry, carry_len);
5798 slen += carry_len;
5799 STR_SET_LEN(str, slen);
5800 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5802 return str;
5803}
5804
5805
5806/*
5807 * call-seq:
5808 * succ! -> self
5809 *
5810 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5811 */
5812
5813static VALUE
5814rb_str_succ_bang(VALUE str)
5815{
5816 rb_str_modify(str);
5817 str_succ(str);
5818 return str;
5819}
5820
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT (trivially so when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5830
5831static int
5832str_upto_i(VALUE str, VALUE arg)
5833{
5834 rb_yield(str);
5835 return 0;
5836}
5837
5838/*
5839 * call-seq:
5840 * upto(other_string, exclusive = false) {|string| ... } -> self
5841 * upto(other_string, exclusive = false) -> new_enumerator
5842 *
5843 * With a block given, calls the block with each +String+ value
5844 * returned by successive calls to String#succ;
5845 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5846 * the sequence terminates when value +other_string+ is reached;
5847 * returns +self+:
5848 *
5849 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5850 * Output:
5851 *
5852 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5853 *
5854 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5855 *
5856 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5857 *
5858 * Output:
5859 *
5860 * a8 a9 b0 b1 b2 b3 b4 b5
5861 *
5862 * If +other_string+ would not be reached, does not call the block:
5863 *
5864 * '25'.upto('5') {|s| fail s }
5865 * 'aa'.upto('a') {|s| fail s }
5866 *
5867 * With no block given, returns a new Enumerator:
5868 *
5869 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5870 *
5871 */
5872
5873static VALUE
5874rb_str_upto(int argc, VALUE *argv, VALUE beg)
5875{
5876 VALUE end, exclusive;
5877
5878 rb_scan_args(argc, argv, "11", &end, &exclusive);
5879 RETURN_ENUMERATOR(beg, argc, argv);
5880 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5881}
5882
5883VALUE
5884rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5885{
5886 VALUE current, after_end;
5887 ID succ;
5888 int n, ascii;
5889 rb_encoding *enc;
5890
5891 CONST_ID(succ, "succ");
5892 StringValue(end);
5893 enc = rb_enc_check(beg, end);
5894 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5895 /* single character */
5896 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5897 char c = RSTRING_PTR(beg)[0];
5898 char e = RSTRING_PTR(end)[0];
5899
5900 if (c > e || (excl && c == e)) return beg;
5901 for (;;) {
5902 VALUE str = rb_enc_str_new(&c, 1, enc);
5904 if ((*each)(str, arg)) break;
5905 if (!excl && c == e) break;
5906 c++;
5907 if (excl && c == e) break;
5908 }
5909 return beg;
5910 }
5911 /* both edges are all digits */
5912 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5913 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5914 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5915 VALUE b, e;
5916 int width;
5917
5918 width = RSTRING_LENINT(beg);
5919 b = rb_str_to_inum(beg, 10, FALSE);
5920 e = rb_str_to_inum(end, 10, FALSE);
5921 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5922 long bi = FIX2LONG(b);
5923 long ei = FIX2LONG(e);
5924 rb_encoding *usascii = rb_usascii_encoding();
5925
5926 while (bi <= ei) {
5927 if (excl && bi == ei) break;
5928 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5929 bi++;
5930 }
5931 }
5932 else {
5933 ID op = excl ? '<' : idLE;
5934 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5935
5936 args[0] = INT2FIX(width);
5937 while (rb_funcall(b, op, 1, e)) {
5938 args[1] = b;
5939 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5940 b = rb_funcallv(b, succ, 0, 0);
5941 }
5942 }
5943 return beg;
5944 }
5945 /* normal case */
5946 n = rb_str_cmp(beg, end);
5947 if (n > 0 || (excl && n == 0)) return beg;
5948
5949 after_end = rb_funcallv(end, succ, 0, 0);
5950 current = str_duplicate(rb_cString, beg);
5951 while (!rb_str_equal(current, after_end)) {
5952 VALUE next = Qnil;
5953 if (excl || !rb_str_equal(current, end))
5954 next = rb_funcallv(current, succ, 0, 0);
5955 if ((*each)(current, arg)) break;
5956 if (NIL_P(next)) break;
5957 current = next;
5958 StringValue(current);
5959 if (excl && rb_str_equal(current, end)) break;
5960 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5961 break;
5962 }
5963
5964 return beg;
5965}
5966
5967VALUE
5968rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5969{
5970 VALUE current;
5971 ID succ;
5972
5973 CONST_ID(succ, "succ");
5974 /* both edges are all digits */
5975 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5976 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5977 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5978 int width = RSTRING_LENINT(beg);
5979 b = rb_str_to_inum(beg, 10, FALSE);
5980 if (FIXNUM_P(b)) {
5981 long bi = FIX2LONG(b);
5982 rb_encoding *usascii = rb_usascii_encoding();
5983
5984 while (FIXABLE(bi)) {
5985 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5986 bi++;
5987 }
5988 b = LONG2NUM(bi);
5989 }
5990 args[0] = INT2FIX(width);
5991 while (1) {
5992 args[1] = b;
5993 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5994 b = rb_funcallv(b, succ, 0, 0);
5995 }
5996 }
5997 /* normal case */
5998 current = str_duplicate(rb_cString, beg);
5999 while (1) {
6000 VALUE next = rb_funcallv(current, succ, 0, 0);
6001 if ((*each)(current, arg)) break;
6002 current = next;
6003 StringValue(current);
6004 if (RSTRING_LEN(current) == 0)
6005 break;
6006 }
6007
6008 return beg;
6009}
6010
6011static int
6012include_range_i(VALUE str, VALUE arg)
6013{
6014 VALUE *argp = (VALUE *)arg;
6015 if (!rb_equal(str, *argp)) return 0;
6016 *argp = Qnil;
6017 return 1;
6018}
6019
6020VALUE
6021rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
6022{
6023 beg = rb_str_new_frozen(beg);
6024 StringValue(end);
6025 end = rb_str_new_frozen(end);
6026 if (NIL_P(val)) return Qfalse;
6027 val = rb_check_string_type(val);
6028 if (NIL_P(val)) return Qfalse;
6029 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
6030 rb_enc_asciicompat(STR_ENC_GET(end)) &&
6031 rb_enc_asciicompat(STR_ENC_GET(val))) {
6032 const char *bp = RSTRING_PTR(beg);
6033 const char *ep = RSTRING_PTR(end);
6034 const char *vp = RSTRING_PTR(val);
6035 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
6036 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
6037 return Qfalse;
6038 else {
6039 char b = *bp;
6040 char e = *ep;
6041 char v = *vp;
6042
6043 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
6044 if (b <= v && v < e) return Qtrue;
6045 return RBOOL(!RTEST(exclusive) && v == e);
6046 }
6047 }
6048 }
6049#if 0
6050 /* both edges are all digits */
6051 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
6052 all_digits_p(bp, RSTRING_LEN(beg)) &&
6053 all_digits_p(ep, RSTRING_LEN(end))) {
6054 /* TODO */
6055 }
6056#endif
6057 }
6058 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
6059
6060 return RBOOL(NIL_P(val));
6061}
6062
6063static VALUE
6064rb_str_subpat(VALUE str, VALUE re, VALUE backref)
6065{
6066 if (rb_reg_search(re, str, 0, 0) >= 0) {
6067 VALUE match = rb_backref_get();
6068 int nth = rb_reg_backref_number(match, backref);
6069 return rb_reg_nth_match(nth, match);
6070 }
6071 return Qnil;
6072}
6073
6074static VALUE
6075rb_str_aref(VALUE str, VALUE indx)
6076{
6077 long idx;
6078
6079 if (FIXNUM_P(indx)) {
6080 idx = FIX2LONG(indx);
6081 }
6082 else if (RB_TYPE_P(indx, T_REGEXP)) {
6083 return rb_str_subpat(str, indx, INT2FIX(0));
6084 }
6085 else if (RB_TYPE_P(indx, T_STRING)) {
6086 if (rb_str_index(str, indx, 0) != -1)
6087 return str_duplicate(rb_cString, indx);
6088 return Qnil;
6089 }
6090 else {
6091 /* check if indx is Range */
6092 long beg, len = str_strlen(str, NULL);
6093 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6094 case Qfalse:
6095 break;
6096 case Qnil:
6097 return Qnil;
6098 default:
6099 return rb_str_substr(str, beg, len);
6100 }
6101 idx = NUM2LONG(indx);
6102 }
6103
6104 return str_substr(str, idx, 1, FALSE);
6105}
6106
6107
6108/*
6109 * call-seq:
6110 * self[index] -> new_string or nil
6111 * self[start, length] -> new_string or nil
6112 * self[range] -> new_string or nil
6113 * self[regexp, capture = 0] -> new_string or nil
6114 * self[substring] -> new_string or nil
6115 *
6116 * Returns the substring of +self+ specified by the arguments.
6117 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
6118 *
6119 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6120 */
6121
6122static VALUE
6123rb_str_aref_m(int argc, VALUE *argv, VALUE str)
6124{
6125 if (argc == 2) {
6126 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6127 return rb_str_subpat(str, argv[0], argv[1]);
6128 }
6129 else {
6130 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
6131 }
6132 }
6133 rb_check_arity(argc, 1, 2);
6134 return rb_str_aref(str, argv[0]);
6135}
6136
6137VALUE
6139{
6140 char *ptr = RSTRING_PTR(str);
6141 long olen = RSTRING_LEN(str), nlen;
6142
6143 str_modifiable(str);
6144 if (len > olen) len = olen;
6145 nlen = olen - len;
6146 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
6147 char *oldptr = ptr;
6148 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
6149 STR_SET_EMBED(str);
6150 ptr = RSTRING(str)->as.embed.ary;
6151 memmove(ptr, oldptr + len, nlen);
6152 if (fl == STR_NOEMBED) xfree(oldptr);
6153 }
6154 else {
6155 if (!STR_SHARED_P(str)) {
6156 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
6157 rb_enc_cr_str_exact_copy(shared, str);
6158 OBJ_FREEZE(shared);
6159 }
6160 ptr = RSTRING(str)->as.heap.ptr += len;
6161 }
6162 STR_SET_LEN(str, nlen);
6163
6164 if (!SHARABLE_MIDDLE_SUBSTRING) {
6165 TERM_FILL(ptr + nlen, TERM_LEN(str));
6166 }
6168 return str;
6169}
6170
6171static void
6172rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
6173{
6174 char *sptr;
6175 long slen;
6176 int cr;
6177
6178 if (beg == 0 && vlen == 0) {
6179 rb_str_drop_bytes(str, len);
6180 return;
6181 }
6182
6183 str_modify_keep_cr(str);
6184 RSTRING_GETMEM(str, sptr, slen);
6185 if (len < vlen) {
6186 /* expand string */
6187 RESIZE_CAPA(str, slen + vlen - len);
6188 sptr = RSTRING_PTR(str);
6189 }
6190
6192 cr = rb_enc_str_coderange(val);
6193 else
6195
6196 if (vlen != len) {
6197 memmove(sptr + beg + vlen,
6198 sptr + beg + len,
6199 slen - (beg + len));
6200 }
6201 if (vlen < beg && len < 0) {
6202 MEMZERO(sptr + slen, char, -len);
6203 }
6204 if (vlen > 0) {
6205 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
6206 }
6207 slen += vlen - len;
6208 STR_SET_LEN(str, slen);
6209 TERM_FILL(&sptr[slen], TERM_LEN(str));
6210 ENC_CODERANGE_SET(str, cr);
6211}
6212
6213static inline void
6214rb_str_update_0(VALUE str, long beg, long len, VALUE val)
6215{
6216 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
6217}
6218
6219void
6220rb_str_update(VALUE str, long beg, long len, VALUE val)
6221{
6222 long slen;
6223 char *p, *e;
6224 rb_encoding *enc;
6225 int singlebyte = single_byte_optimizable(str);
6226 int cr;
6227
6228 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6229
6230 StringValue(val);
6231 enc = rb_enc_check(str, val);
6232 slen = str_strlen(str, enc); /* rb_enc_check */
6233
6234 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6235 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6236 }
6237 if (beg < 0) {
6238 beg += slen;
6239 }
6240 RUBY_ASSERT(beg >= 0);
6241 RUBY_ASSERT(beg <= slen);
6242
6243 if (len > slen - beg) {
6244 len = slen - beg;
6245 }
6246 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
6247 if (!p) p = RSTRING_END(str);
6248 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
6249 if (!e) e = RSTRING_END(str);
6250 /* error check */
6251 beg = p - RSTRING_PTR(str); /* physical position */
6252 len = e - p; /* physical length */
6253 rb_str_update_0(str, beg, len, val);
6254 rb_enc_associate(str, enc);
6256 if (cr != ENC_CODERANGE_BROKEN)
6257 ENC_CODERANGE_SET(str, cr);
6258}
6259
6260static void
6261rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
6262{
6263 int nth;
6264 VALUE match;
6265 long start, end, len;
6266 rb_encoding *enc;
6267 struct re_registers *regs;
6268
6269 if (rb_reg_search(re, str, 0, 0) < 0) {
6270 rb_raise(rb_eIndexError, "regexp not matched");
6271 }
6272 match = rb_backref_get();
6273 nth = rb_reg_backref_number(match, backref);
6274 regs = RMATCH_REGS(match);
6275 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
6276 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
6277 }
6278 if (nth < 0) {
6279 nth += regs->num_regs;
6280 }
6281
6282 start = BEG(nth);
6283 if (start == -1) {
6284 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
6285 }
6286 end = END(nth);
6287 len = end - start;
6288 StringValue(val);
6289 enc = rb_enc_check_str(str, val);
6290 rb_str_update_0(str, start, len, val);
6291 rb_enc_associate(str, enc);
6292}
6293
6294static VALUE
6295rb_str_aset(VALUE str, VALUE indx, VALUE val)
6296{
6297 long idx, beg;
6298
6299 switch (TYPE(indx)) {
6300 case T_REGEXP:
6301 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6302 return val;
6303
6304 case T_STRING:
6305 beg = rb_str_index(str, indx, 0);
6306 if (beg < 0) {
6307 rb_raise(rb_eIndexError, "string not matched");
6308 }
6309 beg = rb_str_sublen(str, beg);
6310 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6311 return val;
6312
6313 default:
6314 /* check if indx is Range */
6315 {
6316 long beg, len;
6317 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6318 rb_str_update(str, beg, len, val);
6319 return val;
6320 }
6321 }
6322 /* FALLTHROUGH */
6323
6324 case T_FIXNUM:
6325 idx = NUM2LONG(indx);
6326 rb_str_update(str, idx, 1, val);
6327 return val;
6328 }
6329}
6330
6331/*
6332 * call-seq:
6333 * self[index] = new_string
6334 * self[start, length] = new_string
6335 * self[range] = new_string
6336 * self[regexp, capture = 0] = new_string
6337 * self[substring] = new_string
6338 *
6339 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6340 * See {String Slices}[rdoc-ref:String@String+Slices].
6341 *
6342 * A few examples:
6343 *
6344 * s = 'foo'
6345 * s[2] = 'rtune' # => "rtune"
6346 * s # => "fortune"
6347 * s[1, 5] = 'init' # => "init"
6348 * s # => "finite"
6349 * s[3..4] = 'al' # => "al"
6350 * s # => "finale"
6351 * s[/e$/] = 'ly' # => "ly"
6352 * s # => "finally"
6353 * s['lly'] = 'ncial' # => "ncial"
6354 * s # => "financial"
6355 *
6356 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6357 */
6358
6359static VALUE
6360rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6361{
6362 if (argc == 3) {
6363 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6364 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6365 }
6366 else {
6367 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6368 }
6369 return argv[2];
6370 }
6371 rb_check_arity(argc, 2, 3);
6372 return rb_str_aset(str, argv[0], argv[1]);
6373}
6374
6375/*
6376 * call-seq:
6377 * insert(index, other_string) -> self
6378 *
6379 * Inserts the given +other_string+ into +self+; returns +self+.
6380 *
6381 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
6382 *
6383 * 'foo'.insert(1, 'bar') # => "fbaroo"
6384 *
6385 * If the Integer +index+ is negative, counts backward from the end of +self+
6386 * and inserts +other_string+ at offset <tt>index+1</tt>
6387 * (that is, _after_ <tt>self[index]</tt>):
6388 *
6389 * 'foo'.insert(-2, 'bar') # => "fobaro"
6390 *
6391 */
6392
6393static VALUE
6394rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6395{
6396 long pos = NUM2LONG(idx);
6397
6398 if (pos == -1) {
6399 return rb_str_append(str, str2);
6400 }
6401 else if (pos < 0) {
6402 pos++;
6403 }
6404 rb_str_update(str, pos, 0, str2);
6405 return str;
6406}
6407
6408
6409/*
6410 * call-seq:
6411 * slice!(index) -> new_string or nil
6412 * slice!(start, length) -> new_string or nil
6413 * slice!(range) -> new_string or nil
6414 * slice!(regexp, capture = 0) -> new_string or nil
6415 * slice!(substring) -> new_string or nil
6416 *
6417 * Removes and returns the substring of +self+ specified by the arguments.
6418 * See {String Slices}[rdoc-ref:String@String+Slices].
6419 *
6420 * A few examples:
6421 *
6422 * string = "This is a string"
6423 * string.slice!(2) #=> "i"
6424 * string.slice!(3..6) #=> " is "
6425 * string.slice!(/s.*t/) #=> "sa st"
6426 * string.slice!("r") #=> "r"
6427 * string #=> "Thing"
6428 *
6429 */
6430
6431static VALUE
6432rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6433{
6434 VALUE result = Qnil;
6435 VALUE indx;
6436 long beg, len = 1;
6437 char *p;
6438
6439 rb_check_arity(argc, 1, 2);
6440 str_modify_keep_cr(str);
6441 indx = argv[0];
6442 if (RB_TYPE_P(indx, T_REGEXP)) {
6443 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6444 VALUE match = rb_backref_get();
6445 struct re_registers *regs = RMATCH_REGS(match);
6446 int nth = 0;
6447 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6448 if ((nth += regs->num_regs) <= 0) return Qnil;
6449 }
6450 else if (nth >= regs->num_regs) return Qnil;
6451 beg = BEG(nth);
6452 len = END(nth) - beg;
6453 goto subseq;
6454 }
6455 else if (argc == 2) {
6456 beg = NUM2LONG(indx);
6457 len = NUM2LONG(argv[1]);
6458 goto num_index;
6459 }
6460 else if (FIXNUM_P(indx)) {
6461 beg = FIX2LONG(indx);
6462 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6463 if (!len) return Qnil;
6464 beg = p - RSTRING_PTR(str);
6465 goto subseq;
6466 }
6467 else if (RB_TYPE_P(indx, T_STRING)) {
6468 beg = rb_str_index(str, indx, 0);
6469 if (beg == -1) return Qnil;
6470 len = RSTRING_LEN(indx);
6471 result = str_duplicate(rb_cString, indx);
6472 goto squash;
6473 }
6474 else {
6475 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6476 case Qnil:
6477 return Qnil;
6478 case Qfalse:
6479 beg = NUM2LONG(indx);
6480 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6481 if (!len) return Qnil;
6482 beg = p - RSTRING_PTR(str);
6483 goto subseq;
6484 default:
6485 goto num_index;
6486 }
6487 }
6488
6489 num_index:
6490 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6491 beg = p - RSTRING_PTR(str);
6492
6493 subseq:
6494 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6495 rb_enc_cr_str_copy_for_substr(result, str);
6496
6497 squash:
6498 if (len > 0) {
6499 if (beg == 0) {
6500 rb_str_drop_bytes(str, len);
6501 }
6502 else {
6503 char *sptr = RSTRING_PTR(str);
6504 long slen = RSTRING_LEN(str);
6505 if (beg + len > slen) /* pathological check */
6506 len = slen - beg;
6507 memmove(sptr + beg,
6508 sptr + beg + len,
6509 slen - (beg + len));
6510 slen -= len;
6511 STR_SET_LEN(str, slen);
6512 TERM_FILL(&sptr[slen], TERM_LEN(str));
6513 }
6514 }
6515 return result;
6516}
6517
6518static VALUE
6519get_pat(VALUE pat)
6520{
6521 VALUE val;
6522
6523 switch (OBJ_BUILTIN_TYPE(pat)) {
6524 case T_REGEXP:
6525 return pat;
6526
6527 case T_STRING:
6528 break;
6529
6530 default:
6531 val = rb_check_string_type(pat);
6532 if (NIL_P(val)) {
6533 Check_Type(pat, T_REGEXP);
6534 }
6535 pat = val;
6536 }
6537
6538 return rb_reg_regcomp(pat);
6539}
6540
6541static VALUE
6542get_pat_quoted(VALUE pat, int check)
6543{
6544 VALUE val;
6545
6546 switch (OBJ_BUILTIN_TYPE(pat)) {
6547 case T_REGEXP:
6548 return pat;
6549
6550 case T_STRING:
6551 break;
6552
6553 default:
6554 val = rb_check_string_type(pat);
6555 if (NIL_P(val)) {
6556 Check_Type(pat, T_REGEXP);
6557 }
6558 pat = val;
6559 }
6560 if (check && is_broken_string(pat)) {
6561 rb_exc_raise(rb_reg_check_preprocess(pat));
6562 }
6563 return pat;
6564}
6565
6566static long
6567rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6568{
6569 if (BUILTIN_TYPE(pat) == T_STRING) {
6570 pos = rb_str_byteindex(str, pat, pos);
6571 if (set_backref_str) {
6572 if (pos >= 0) {
6573 str = rb_str_new_frozen_String(str);
6574 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6575 if (match) {
6576 *match = match_data;
6577 }
6578 }
6579 else {
6581 }
6582 }
6583 return pos;
6584 }
6585 else {
6586 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6587 }
6588}
6589
6590static long
6591rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6592{
6593 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6594}
6595
6596
6597/*
6598 * call-seq:
6599 * sub!(pattern, replacement) -> self or nil
6600 * sub!(pattern) {|match| ... } -> self or nil
6601 *
6602 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6603 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6604 *
6605 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6606 *
6607 * Related: String#sub, String#gsub, String#gsub!.
6608 *
6609 */
6610
6611static VALUE
6612rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6613{
6614 VALUE pat, repl, hash = Qnil;
6615 int iter = 0;
6616 long plen;
6617 int min_arity = rb_block_given_p() ? 1 : 2;
6618 long beg;
6619
6620 rb_check_arity(argc, min_arity, 2);
6621 if (argc == 1) {
6622 iter = 1;
6623 }
6624 else {
6625 repl = argv[1];
6626 hash = rb_check_hash_type(argv[1]);
6627 if (NIL_P(hash)) {
6628 StringValue(repl);
6629 }
6630 }
6631
6632 pat = get_pat_quoted(argv[0], 1);
6633
6634 str_modifiable(str);
6635 beg = rb_pat_search(pat, str, 0, 1);
6636 if (beg >= 0) {
6637 rb_encoding *enc;
6638 int cr = ENC_CODERANGE(str);
6639 long beg0, end0;
6640 VALUE match, match0 = Qnil;
6641 struct re_registers *regs;
6642 char *p, *rp;
6643 long len, rlen;
6644
6645 match = rb_backref_get();
6646 regs = RMATCH_REGS(match);
6647 if (RB_TYPE_P(pat, T_STRING)) {
6648 beg0 = beg;
6649 end0 = beg0 + RSTRING_LEN(pat);
6650 match0 = pat;
6651 }
6652 else {
6653 beg0 = BEG(0);
6654 end0 = END(0);
6655 if (iter) match0 = rb_reg_nth_match(0, match);
6656 }
6657
6658 if (iter || !NIL_P(hash)) {
6659 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6660
6661 if (iter) {
6662 repl = rb_obj_as_string(rb_yield(match0));
6663 }
6664 else {
6665 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6666 repl = rb_obj_as_string(repl);
6667 }
6668 str_mod_check(str, p, len);
6669 rb_check_frozen(str);
6670 }
6671 else {
6672 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6673 }
6674
6675 enc = rb_enc_compatible(str, repl);
6676 if (!enc) {
6677 rb_encoding *str_enc = STR_ENC_GET(str);
6678 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6679 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6680 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6681 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6682 rb_enc_inspect_name(str_enc),
6683 rb_enc_inspect_name(STR_ENC_GET(repl)));
6684 }
6685 enc = STR_ENC_GET(repl);
6686 }
6687 rb_str_modify(str);
6688 rb_enc_associate(str, enc);
6690 int cr2 = ENC_CODERANGE(repl);
6691 if (cr2 == ENC_CODERANGE_BROKEN ||
6692 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6694 else
6695 cr = cr2;
6696 }
6697 plen = end0 - beg0;
6698 rlen = RSTRING_LEN(repl);
6699 len = RSTRING_LEN(str);
6700 if (rlen > plen) {
6701 RESIZE_CAPA(str, len + rlen - plen);
6702 }
6703 p = RSTRING_PTR(str);
6704 if (rlen != plen) {
6705 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6706 }
6707 rp = RSTRING_PTR(repl);
6708 memmove(p + beg0, rp, rlen);
6709 len += rlen - plen;
6710 STR_SET_LEN(str, len);
6711 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6712 ENC_CODERANGE_SET(str, cr);
6713
6714 RB_GC_GUARD(match);
6715
6716 return str;
6717 }
6718 return Qnil;
6719}
6720
6721
6722/*
6723 * call-seq:
6724 * sub(pattern, replacement) -> new_string
6725 * sub(pattern) {|match| ... } -> new_string
6726 *
6727 * Returns a copy of +self+ with only the first occurrence
6728 * (not all occurrences) of the given +pattern+ replaced.
6729 *
6730 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6731 *
6732 * Related: String#sub!, String#gsub, String#gsub!.
6733 *
6734 */
6735
6736static VALUE
6737rb_str_sub(int argc, VALUE *argv, VALUE str)
6738{
6739 str = str_duplicate(rb_cString, str);
6740 rb_str_sub_bang(argc, argv, str);
6741 return str;
6742}
6743
6744static VALUE
6745str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6746{
6747 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6748 long beg, beg0, end0;
6749 long offset, blen, slen, len, last;
6750 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6751 char *sp, *cp;
6752 int need_backref_str = -1;
6753 rb_encoding *str_enc;
6754
6755 switch (argc) {
6756 case 1:
6757 RETURN_ENUMERATOR(str, argc, argv);
6758 mode = ITER;
6759 break;
6760 case 2:
6761 repl = argv[1];
6762 hash = rb_check_hash_type(argv[1]);
6763 if (NIL_P(hash)) {
6764 StringValue(repl);
6765 }
6766 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6767 mode = FAST_MAP;
6768 }
6769 else {
6770 mode = MAP;
6771 }
6772 break;
6773 default:
6774 rb_error_arity(argc, 1, 2);
6775 }
6776
6777 pat = get_pat_quoted(argv[0], 1);
6778 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6779
6780 if (beg < 0) {
6781 if (bang) return Qnil; /* no match, no substitution */
6782 return str_duplicate(rb_cString, str);
6783 }
6784
6785 offset = 0;
6786 blen = RSTRING_LEN(str) + 30; /* len + margin */
6787 dest = rb_str_buf_new(blen);
6788 sp = RSTRING_PTR(str);
6789 slen = RSTRING_LEN(str);
6790 cp = sp;
6791 str_enc = STR_ENC_GET(str);
6792 rb_enc_associate(dest, str_enc);
6793 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6794
6795 do {
6796 struct re_registers *regs = RMATCH_REGS(match);
6797 if (RB_TYPE_P(pat, T_STRING)) {
6798 beg0 = beg;
6799 end0 = beg0 + RSTRING_LEN(pat);
6800 match0 = pat;
6801 }
6802 else {
6803 beg0 = BEG(0);
6804 end0 = END(0);
6805 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6806 }
6807
6808 if (mode != STR) {
6809 if (mode == ITER) {
6810 val = rb_obj_as_string(rb_yield(match0));
6811 }
6812 else {
6813 struct RString fake_str;
6814 VALUE key;
6815 if (mode == FAST_MAP) {
6816 // It is safe to use a fake_str here because we established that it won't escape,
6817 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6818 // default proc.
6819 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6820 }
6821 else {
6822 key = rb_str_subseq(str, beg0, end0 - beg0);
6823 }
6824 val = rb_hash_aref(hash, key);
6825 val = rb_obj_as_string(val);
6826 }
6827 str_mod_check(str, sp, slen);
6828 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6829 rb_raise(rb_eRuntimeError, "block should not cheat");
6830 }
6831 }
6832 else if (need_backref_str) {
6833 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6834 if (need_backref_str < 0) {
6835 need_backref_str = val != repl;
6836 }
6837 }
6838 else {
6839 val = repl;
6840 }
6841
6842 len = beg0 - offset; /* copy pre-match substr */
6843 if (len) {
6844 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6845 }
6846
6847 rb_str_buf_append(dest, val);
6848
6849 last = offset;
6850 offset = end0;
6851 if (beg0 == end0) {
6852 /*
6853 * Always consume at least one character of the input string
6854 * in order to prevent infinite loops.
6855 */
6856 if (RSTRING_LEN(str) <= end0) break;
6857 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6858 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6859 offset = end0 + len;
6860 }
6861 cp = RSTRING_PTR(str) + offset;
6862 if (offset > RSTRING_LEN(str)) break;
6863
6864 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6865 if (mode != FAST_MAP && mode != STR) {
6866 match = Qnil;
6867 }
6868 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6869
6870 RB_GC_GUARD(match);
6871 } while (beg >= 0);
6872
6873 if (RSTRING_LEN(str) > offset) {
6874 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6875 }
6876 rb_pat_search0(pat, str, last, 1, &match);
6877 if (bang) {
6878 str_shared_replace(str, dest);
6879 }
6880 else {
6881 str = dest;
6882 }
6883
6884 return str;
6885}
6886
6887
6888/*
6889 * call-seq:
6890 * gsub!(pattern, replacement) -> self or nil
6891 * gsub!(pattern) {|match| ... } -> self or nil
6892 * gsub!(pattern) -> an_enumerator
6893 *
6894 * Performs the specified substring replacement(s) on +self+;
6895 * returns +self+ if any replacement occurred, +nil+ otherwise.
6896 *
6897 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6898 *
6899 * Returns an Enumerator if no +replacement+ and no block given.
6900 *
6901 * Related: String#sub, String#gsub, String#sub!.
6902 *
6903 */
6904
6905static VALUE
6906rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6907{
6908 str_modify_keep_cr(str);
6909 return str_gsub(argc, argv, str, 1);
6910}
6911
6912
6913/*
6914 * call-seq:
6915 * gsub(pattern, replacement) -> new_string
6916 * gsub(pattern) {|match| ... } -> new_string
6917 * gsub(pattern) -> enumerator
6918 *
6919 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6920 *
6921 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6922 *
6923 * Returns an Enumerator if no +replacement+ and no block given.
6924 *
6925 * Related: String#sub, String#sub!, String#gsub!.
6926 *
6927 */
6928
6929static VALUE
6930rb_str_gsub(int argc, VALUE *argv, VALUE str)
6931{
6932 return str_gsub(argc, argv, str, 0);
6933}
6934
6935
6936/*
6937 * call-seq:
6938 * replace(other_string) -> self
6939 *
6940 * Replaces the contents of +self+ with the contents of +other_string+:
6941 *
6942 * s = 'foo' # => "foo"
6943 * s.replace('bar') # => "bar"
6944 *
6945 */
6946
6947VALUE
6949{
6950 str_modifiable(str);
6951 if (str == str2) return str;
6952
6953 StringValue(str2);
6954 str_discard(str);
6955 return str_replace(str, str2);
6956}
6957
6958/*
6959 * call-seq:
6960 * clear -> self
6961 *
6962 * Removes the contents of +self+:
6963 *
6964 * s = 'foo' # => "foo"
6965 * s.clear # => ""
6966 *
6967 */
6968
6969static VALUE
6970rb_str_clear(VALUE str)
6971{
6972 str_discard(str);
6973 STR_SET_EMBED(str);
6974 STR_SET_LEN(str, 0);
6975 RSTRING_PTR(str)[0] = 0;
6976 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6978 else
6980 return str;
6981}
6982
6983/*
6984 * call-seq:
6985 * chr -> string
6986 *
6987 * Returns a string containing the first character of +self+:
6988 *
6989 * s = 'foo' # => "foo"
6990 * s.chr # => "f"
6991 *
6992 */
6993
6994static VALUE
6995rb_str_chr(VALUE str)
6996{
6997 return rb_str_substr(str, 0, 1);
6998}
6999
7000/*
7001 * call-seq:
7002 * getbyte(index) -> integer or nil
7003 *
7004 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
7005 *
7006 * s = 'abcde' # => "abcde"
7007 * s.getbyte(0) # => 97
7008 * s.getbyte(-1) # => 101
7009 * s.getbyte(5) # => nil
7010 *
7011 * Related: String#setbyte.
7012 */
7013VALUE
7014rb_str_getbyte(VALUE str, VALUE index)
7015{
7016 long pos = NUM2LONG(index);
7017
7018 if (pos < 0)
7019 pos += RSTRING_LEN(str);
7020 if (pos < 0 || RSTRING_LEN(str) <= pos)
7021 return Qnil;
7022
7023 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
7024}
7025
7026/*
7027 * call-seq:
7028 * setbyte(index, integer) -> integer
7029 *
7030 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
7031 *
7032 * s = 'abcde' # => "abcde"
7033 * s.setbyte(0, 98) # => 98
7034 * s # => "bbcde"
7035 *
7036 * Related: String#getbyte.
7037 */
7038VALUE
7039rb_str_setbyte(VALUE str, VALUE index, VALUE value)
7040{
7041 long pos = NUM2LONG(index);
7042 long len = RSTRING_LEN(str);
7043 char *ptr, *head, *left = 0;
7044 rb_encoding *enc;
7045 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
7046
7047 if (pos < -len || len <= pos)
7048 rb_raise(rb_eIndexError, "index %ld out of string", pos);
7049 if (pos < 0)
7050 pos += len;
7051
7052 VALUE v = rb_to_int(value);
7053 VALUE w = rb_int_and(v, INT2FIX(0xff));
7054 char byte = (char)(NUM2INT(w) & 0xFF);
7055
7056 if (!str_independent(str))
7057 str_make_independent(str);
7058 enc = STR_ENC_GET(str);
7059 head = RSTRING_PTR(str);
7060 ptr = &head[pos];
7061 if (!STR_EMBED_P(str)) {
7062 cr = ENC_CODERANGE(str);
7063 switch (cr) {
7064 case ENC_CODERANGE_7BIT:
7065 left = ptr;
7066 *ptr = byte;
7067 if (ISASCII(byte)) goto end;
7068 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7069 if (!MBCLEN_CHARFOUND_P(nlen))
7071 else
7073 goto end;
7075 left = rb_enc_left_char_head(head, ptr, head+len, enc);
7076 width = rb_enc_precise_mbclen(left, head+len, enc);
7077 *ptr = byte;
7078 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7079 if (!MBCLEN_CHARFOUND_P(nlen))
7081 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
7083 goto end;
7084 }
7085 }
7087 *ptr = byte;
7088
7089 end:
7090 return value;
7091}
7092
7093static VALUE
7094str_byte_substr(VALUE str, long beg, long len, int empty)
7095{
7096 long n = RSTRING_LEN(str);
7097
7098 if (beg > n || len < 0) return Qnil;
7099 if (beg < 0) {
7100 beg += n;
7101 if (beg < 0) return Qnil;
7102 }
7103 if (len > n - beg)
7104 len = n - beg;
7105 if (len <= 0) {
7106 if (!empty) return Qnil;
7107 len = 0;
7108 }
7109
7110 VALUE str2 = str_subseq(str, beg, len);
7111
7112 str_enc_copy_direct(str2, str);
7113
7114 if (RSTRING_LEN(str2) == 0) {
7115 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
7117 else
7119 }
7120 else {
7121 switch (ENC_CODERANGE(str)) {
7122 case ENC_CODERANGE_7BIT:
7124 break;
7125 default:
7127 break;
7128 }
7129 }
7130
7131 return str2;
7132}
7133
7134VALUE
7135rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
7136{
7137 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
7138}
7139
7140static VALUE
7141str_byte_aref(VALUE str, VALUE indx)
7142{
7143 long idx;
7144 if (FIXNUM_P(indx)) {
7145 idx = FIX2LONG(indx);
7146 }
7147 else {
7148 /* check if indx is Range */
7149 long beg, len = RSTRING_LEN(str);
7150
7151 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
7152 case Qfalse:
7153 break;
7154 case Qnil:
7155 return Qnil;
7156 default:
7157 return str_byte_substr(str, beg, len, TRUE);
7158 }
7159
7160 idx = NUM2LONG(indx);
7161 }
7162 return str_byte_substr(str, idx, 1, FALSE);
7163}
7164
7165/*
7166 * call-seq:
7167 * byteslice(index, length = 1) -> string or nil
7168 * byteslice(range) -> string or nil
7169 *
7170 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
7171 *
7172 * With integer arguments +index+ and +length+ given,
7173 * returns the substring beginning at the given +index+
7174 * of the given +length+ (if possible),
7175 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
7176 *
7177 * s = '0123456789' # => "0123456789"
7178 * s.byteslice(2) # => "2"
7179 * s.byteslice(200) # => nil
7180 * s.byteslice(4, 3) # => "456"
7181 * s.byteslice(4, 30) # => "456789"
7182 * s.byteslice(4, -1) # => nil
7183 * s.byteslice(40, 2) # => nil
7184 *
7185 * In either case above, counts backwards from the end of +self+
7186 * if +index+ is negative:
7187 *
7188 * s = '0123456789' # => "0123456789"
7189 * s.byteslice(-4) # => "6"
7190 * s.byteslice(-4, 3) # => "678"
7191 *
7192 * With Range argument +range+ given, returns
7193 * <tt>byteslice(range.begin, range.size)</tt>:
7194 *
7195 * s = '0123456789' # => "0123456789"
7196 * s.byteslice(4..6) # => "456"
7197 * s.byteslice(-6..-4) # => "456"
7198 * s.byteslice(5..2) # => "" # range.size is zero.
7199 * s.byteslice(40..42) # => nil
7200 *
7201 * In all cases, a returned string has the same encoding as +self+:
7202 *
7203 * s.encoding # => #<Encoding:UTF-8>
7204 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
7205 *
7206 */
7207
7208static VALUE
7209rb_str_byteslice(int argc, VALUE *argv, VALUE str)
7210{
7211 if (argc == 2) {
7212 long beg = NUM2LONG(argv[0]);
7213 long len = NUM2LONG(argv[1]);
7214 return str_byte_substr(str, beg, len, TRUE);
7215 }
7216 rb_check_arity(argc, 1, 2);
7217 return str_byte_aref(str, argv[0]);
7218}
7219
7220static void
7221str_check_beg_len(VALUE str, long *beg, long *len)
7222{
7223 long end, slen = RSTRING_LEN(str);
7224
7225 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
7226 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
7227 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
7228 }
7229 if (*beg < 0) {
7230 *beg += slen;
7231 }
7232 RUBY_ASSERT(*beg >= 0);
7233 RUBY_ASSERT(*beg <= slen);
7234
7235 if (*len > slen - *beg) {
7236 *len = slen - *beg;
7237 }
7238 end = *beg + *len;
7239 str_ensure_byte_pos(str, *beg);
7240 str_ensure_byte_pos(str, end);
7241}
7242
7243/*
7244 * call-seq:
7245 * bytesplice(index, length, str) -> string
7246 * bytesplice(index, length, str, str_index, str_length) -> string
7247 * bytesplice(range, str) -> string
7248 * bytesplice(range, str, str_range) -> string
7249 *
7250 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
7251 * The portion of the string affected is determined using
7252 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
7253 * If the replacement string is not the same length as the text it is replacing,
7254 * the string will be adjusted accordingly.
7255 *
7256 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
7257 *
 * The forms that take an Integer will raise an IndexError if the value is out
 * of range; the Range forms will raise a RangeError.
7260 * If the beginning or ending offset does not land on character (codepoint)
7261 * boundary, an IndexError will be raised.
7262 */
7263
7264static VALUE
7265rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
7266{
7267 long beg, len, vbeg, vlen;
7268 VALUE val;
7269 int cr;
7270
7271 rb_check_arity(argc, 2, 5);
7272 if (!(argc == 2 || argc == 3 || argc == 5)) {
7273 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
7274 }
7275 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
7276 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
7277 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7278 rb_builtin_class_name(argv[0]));
7279 }
7280 val = argv[1];
7281 StringValue(val);
7282 if (argc == 2) {
7283 /* bytesplice(range, str) */
7284 vbeg = 0;
7285 vlen = RSTRING_LEN(val);
7286 }
7287 else {
7288 /* bytesplice(range, str, str_range) */
7289 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
7290 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7291 rb_builtin_class_name(argv[2]));
7292 }
7293 }
7294 }
7295 else {
7296 beg = NUM2LONG(argv[0]);
7297 len = NUM2LONG(argv[1]);
7298 val = argv[2];
7299 StringValue(val);
7300 if (argc == 3) {
7301 /* bytesplice(index, length, str) */
7302 vbeg = 0;
7303 vlen = RSTRING_LEN(val);
7304 }
7305 else {
7306 /* bytesplice(index, length, str, str_index, str_length) */
7307 vbeg = NUM2LONG(argv[3]);
7308 vlen = NUM2LONG(argv[4]);
7309 }
7310 }
7311 str_check_beg_len(str, &beg, &len);
7312 str_check_beg_len(val, &vbeg, &vlen);
7313 str_modify_keep_cr(str);
7314
7315 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
7316 rb_enc_associate(str, rb_enc_check(str, val));
7317 }
7318
7319 rb_str_update_1(str, beg, len, val, vbeg, vlen);
7321 if (cr != ENC_CODERANGE_BROKEN)
7322 ENC_CODERANGE_SET(str, cr);
7323 return str;
7324}
7325
7326/*
7327 * call-seq:
7328 * reverse -> string
7329 *
7330 * Returns a new string with the characters from +self+ in reverse order.
7331 *
7332 * 'stressed'.reverse # => "desserts"
7333 *
7334 */
7335
7336static VALUE
7337rb_str_reverse(VALUE str)
7338{
7339 rb_encoding *enc;
7340 VALUE rev;
7341 char *s, *e, *p;
7342 int cr;
7343
7344 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7345 enc = STR_ENC_GET(str);
7346 rev = rb_str_new(0, RSTRING_LEN(str));
7347 s = RSTRING_PTR(str); e = RSTRING_END(str);
7348 p = RSTRING_END(rev);
7349 cr = ENC_CODERANGE(str);
7350
7351 if (RSTRING_LEN(str) > 1) {
7352 if (single_byte_optimizable(str)) {
7353 while (s < e) {
7354 *--p = *s++;
7355 }
7356 }
7357 else if (cr == ENC_CODERANGE_VALID) {
7358 while (s < e) {
7359 int clen = rb_enc_fast_mbclen(s, e, enc);
7360
7361 p -= clen;
7362 memcpy(p, s, clen);
7363 s += clen;
7364 }
7365 }
7366 else {
7367 cr = rb_enc_asciicompat(enc) ?
7369 while (s < e) {
7370 int clen = rb_enc_mbclen(s, e, enc);
7371
7372 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7373 p -= clen;
7374 memcpy(p, s, clen);
7375 s += clen;
7376 }
7377 }
7378 }
7379 STR_SET_LEN(rev, RSTRING_LEN(str));
7380 str_enc_copy_direct(rev, str);
7381 ENC_CODERANGE_SET(rev, cr);
7382
7383 return rev;
7384}
7385
7386
7387/*
7388 * call-seq:
7389 * reverse! -> self
7390 *
7391 * Returns +self+ with its characters reversed:
7392 *
7393 * s = 'stressed'
7394 * s.reverse! # => "desserts"
7395 * s # => "desserts"
7396 *
7397 */
7398
7399static VALUE
7400rb_str_reverse_bang(VALUE str)
7401{
7402 if (RSTRING_LEN(str) > 1) {
7403 if (single_byte_optimizable(str)) {
7404 char *s, *e, c;
7405
7406 str_modify_keep_cr(str);
7407 s = RSTRING_PTR(str);
7408 e = RSTRING_END(str) - 1;
7409 while (s < e) {
7410 c = *s;
7411 *s++ = *e;
7412 *e-- = c;
7413 }
7414 }
7415 else {
7416 str_shared_replace(str, rb_str_reverse(str));
7417 }
7418 }
7419 else {
7420 str_modify_keep_cr(str);
7421 }
7422 return str;
7423}
7424
7425
7426/*
7427 * call-seq:
7428 * include?(other_string) -> true or false
7429 *
7430 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7431 *
7432 * s = 'foo'
7433 * s.include?('f') # => true
7434 * s.include?('fo') # => true
7435 * s.include?('food') # => false
7436 *
7437 */
7438
7439VALUE
7440rb_str_include(VALUE str, VALUE arg)
7441{
7442 long i;
7443
7444 StringValue(arg);
7445 i = rb_str_index(str, arg, 0);
7446
7447 return RBOOL(i != -1);
7448}
7449
7450
7451/*
7452 * call-seq:
7453 * to_i(base = 10) -> integer
7454 *
7455 * Returns the result of interpreting leading characters in +self+
7456 * as an integer in the given +base+ (which must be in (0, 2..36)):
7457 *
7458 * '123456'.to_i # => 123456
7459 * '123def'.to_i(16) # => 1195503
7460 *
 * With +base+ zero, +self+ may contain leading characters
 * to specify the actual base:
7463 *
7464 * '123def'.to_i(0) # => 123
7465 * '0123def'.to_i(0) # => 83
7466 * '0b123def'.to_i(0) # => 1
7467 * '0o123def'.to_i(0) # => 83
7468 * '0d123def'.to_i(0) # => 123
7469 * '0x123def'.to_i(0) # => 1195503
7470 *
7471 * Characters past a leading valid number (in the given +base+) are ignored:
7472 *
7473 * '12.345'.to_i # => 12
7474 * '12345'.to_i(2) # => 1
7475 *
7476 * Returns zero if there is no leading valid number:
7477 *
7478 * 'abcdef'.to_i # => 0
7479 * '2'.to_i(2) # => 0
7480 *
7481 */
7482
7483static VALUE
7484rb_str_to_i(int argc, VALUE *argv, VALUE str)
7485{
7486 int base = 10;
7487
7488 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7489 rb_raise(rb_eArgError, "invalid radix %d", base);
7490 }
7491 return rb_str_to_inum(str, base, FALSE);
7492}
7493
7494
7495/*
7496 * call-seq:
7497 * to_f -> float
7498 *
7499 * Returns the result of interpreting leading characters in +self+ as a Float:
7500 *
7501 * '3.14159'.to_f # => 3.14159
7502 * '1.234e-2'.to_f # => 0.01234
7503 *
 * Characters past a leading valid number are ignored:
7505 *
7506 * '3.14 (pi to two places)'.to_f # => 3.14
7507 *
7508 * Returns zero if there is no leading valid number:
7509 *
7510 * 'abcdef'.to_f # => 0.0
7511 *
7512 */
7513
7514static VALUE
7515rb_str_to_f(VALUE str)
7516{
7517 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7518}
7519
7520
7521/*
7522 * call-seq:
7523 * to_s -> self or string
7524 *
7525 * Returns +self+ if +self+ is a +String+,
7526 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7527 */
7528
7529static VALUE
7530rb_str_to_s(VALUE str)
7531{
7532 if (rb_obj_class(str) != rb_cString) {
7533 return str_duplicate(rb_cString, str);
7534 }
7535 return str;
7536}
7537
#if 0
/* Disabled helper: encodes codepoint +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7549
7550#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7551
7552int
7553rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7554{
7555 char buf[CHAR_ESC_LEN + 1];
7556 int l;
7557
7558#if SIZEOF_INT > 4
7559 c &= 0xffffffff;
7560#endif
7561 if (unicode_p) {
7562 if (c < 0x7F && ISPRINT(c)) {
7563 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7564 }
7565 else if (c < 0x10000) {
7566 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7567 }
7568 else {
7569 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7570 }
7571 }
7572 else {
7573 if (c < 0x100) {
7574 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7575 }
7576 else {
7577 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7578 }
7579 }
7580 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7581 rb_str_buf_cat(result, buf, l);
7582 return l;
7583}
7584
/*
 * Returns the backslash notation for the control character +c+, or NULL
 * when +c+ has no dedicated notation.
 */
const char *
ruby_escaped_char(int c)
{
    /* Indexed by character code; entries not listed are NULL. */
    static const char *const notation[0x80] = {
        ['\0']   = "\\0",
        ['\n']   = "\\n",
        ['\r']   = "\\r",
        ['\t']   = "\\t",
        ['\f']   = "\\f",
        ['\013'] = "\\v",
        ['\010'] = "\\b",
        ['\007'] = "\\a",
        ['\033'] = "\\e",
        ['\x7f'] = "\\c?",
    };

    if (c < 0 || c >= 0x80) {
        return NULL;
    }
    return notation[c];
}
7602
7603VALUE
7604rb_str_escape(VALUE str)
7605{
7606 int encidx = ENCODING_GET(str);
7607 rb_encoding *enc = rb_enc_from_index(encidx);
7608 const char *p = RSTRING_PTR(str);
7609 const char *pend = RSTRING_END(str);
7610 const char *prev = p;
7611 char buf[CHAR_ESC_LEN + 1];
7612 VALUE result = rb_str_buf_new(0);
7613 int unicode_p = rb_enc_unicode_p(enc);
7614 int asciicompat = rb_enc_asciicompat(enc);
7615
7616 while (p < pend) {
7617 unsigned int c;
7618 const char *cc;
7619 int n = rb_enc_precise_mbclen(p, pend, enc);
7620 if (!MBCLEN_CHARFOUND_P(n)) {
7621 if (p > prev) str_buf_cat(result, prev, p - prev);
7622 n = rb_enc_mbminlen(enc);
7623 if (pend < p + n)
7624 n = (int)(pend - p);
7625 while (n--) {
7626 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7627 str_buf_cat(result, buf, strlen(buf));
7628 prev = ++p;
7629 }
7630 continue;
7631 }
7632 n = MBCLEN_CHARFOUND_LEN(n);
7633 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7634 p += n;
7635 cc = ruby_escaped_char(c);
7636 if (cc) {
7637 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7638 str_buf_cat(result, cc, strlen(cc));
7639 prev = p;
7640 }
7641 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7642 }
7643 else {
7644 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7645 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7646 prev = p;
7647 }
7648 }
7649 if (p > prev) str_buf_cat(result, prev, p - prev);
7650 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7651
7652 return result;
7653}
7654
7655/*
7656 * call-seq:
7657 * inspect -> string
7658 *
7659 * Returns a printable version of +self+, enclosed in double-quotes,
7660 * and with special characters escaped:
7661 *
7662 * s = "foo\tbar\tbaz\n"
7663 * s.inspect
7664 * # => "\"foo\\tbar\\tbaz\\n\""
7665 *
7666 */
7667
7668VALUE
7670{
7671 int encidx = ENCODING_GET(str);
7672 rb_encoding *enc = rb_enc_from_index(encidx);
7673 const char *p, *pend, *prev;
7674 char buf[CHAR_ESC_LEN + 1];
7675 VALUE result = rb_str_buf_new(0);
7676 rb_encoding *resenc = rb_default_internal_encoding();
7677 int unicode_p = rb_enc_unicode_p(enc);
7678 int asciicompat = rb_enc_asciicompat(enc);
7679
7680 if (resenc == NULL) resenc = rb_default_external_encoding();
7681 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7682 rb_enc_associate(result, resenc);
7683 str_buf_cat2(result, "\"");
7684
7685 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7686 prev = p;
7687 while (p < pend) {
7688 unsigned int c, cc;
7689 int n;
7690
7691 n = rb_enc_precise_mbclen(p, pend, enc);
7692 if (!MBCLEN_CHARFOUND_P(n)) {
7693 if (p > prev) str_buf_cat(result, prev, p - prev);
7694 n = rb_enc_mbminlen(enc);
7695 if (pend < p + n)
7696 n = (int)(pend - p);
7697 while (n--) {
7698 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7699 str_buf_cat(result, buf, strlen(buf));
7700 prev = ++p;
7701 }
7702 continue;
7703 }
7704 n = MBCLEN_CHARFOUND_LEN(n);
7705 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7706 p += n;
7707 if ((asciicompat || unicode_p) &&
7708 (c == '"'|| c == '\\' ||
7709 (c == '#' &&
7710 p < pend &&
7711 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7712 (cc = rb_enc_codepoint(p,pend,enc),
7713 (cc == '$' || cc == '@' || cc == '{'))))) {
7714 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7715 str_buf_cat2(result, "\\");
7716 if (asciicompat || enc == resenc) {
7717 prev = p - n;
7718 continue;
7719 }
7720 }
7721 switch (c) {
7722 case '\n': cc = 'n'; break;
7723 case '\r': cc = 'r'; break;
7724 case '\t': cc = 't'; break;
7725 case '\f': cc = 'f'; break;
7726 case '\013': cc = 'v'; break;
7727 case '\010': cc = 'b'; break;
7728 case '\007': cc = 'a'; break;
7729 case 033: cc = 'e'; break;
7730 default: cc = 0; break;
7731 }
7732 if (cc) {
7733 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7734 buf[0] = '\\';
7735 buf[1] = (char)cc;
7736 str_buf_cat(result, buf, 2);
7737 prev = p;
7738 continue;
7739 }
7740 /* The special casing of 0x85 (NEXT_LINE) here is because
7741 * Oniguruma historically treats it as printable, but it
7742 * doesn't match the print POSIX bracket class or character
7743 * property in regexps.
7744 *
7745 * See Ruby Bug #16842 for details:
7746 * https://bugs.ruby-lang.org/issues/16842
7747 */
7748 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7749 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7750 continue;
7751 }
7752 else {
7753 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7754 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7755 prev = p;
7756 continue;
7757 }
7758 }
7759 if (p > prev) str_buf_cat(result, prev, p - prev);
7760 str_buf_cat2(result, "\"");
7761
7762 return result;
7763}
7764
/* True when +p+ lies before +e+ and points at '$', '@' or '{'. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7766
7767/*
7768 * call-seq:
7769 * dump -> string
7770 *
7771 * Returns a printable version of +self+, enclosed in double-quotes,
7772 * with special characters escaped, and with non-printing characters
7773 * replaced by hexadecimal notation:
7774 *
7775 * "hello \n ''".dump # => "\"hello \\n ''\""
7776 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7777 *
7778 * Related: String#undump (inverse of String#dump).
7779 *
7780 */
7781
7782VALUE
7784{
7785 int encidx = rb_enc_get_index(str);
7786 rb_encoding *enc = rb_enc_from_index(encidx);
7787 long len;
7788 const char *p, *pend;
7789 char *q, *qend;
7790 VALUE result;
7791 int u8 = (encidx == rb_utf8_encindex());
7792 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7793
7794 len = 2; /* "" */
7795 if (!rb_enc_asciicompat(enc)) {
7796 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7797 len += strlen(enc->name);
7798 }
7799
7800 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7801 while (p < pend) {
7802 int clen;
7803 unsigned char c = *p++;
7804
7805 switch (c) {
7806 case '"': case '\\':
7807 case '\n': case '\r':
7808 case '\t': case '\f':
7809 case '\013': case '\010': case '\007': case '\033':
7810 clen = 2;
7811 break;
7812
7813 case '#':
7814 clen = IS_EVSTR(p, pend) ? 2 : 1;
7815 break;
7816
7817 default:
7818 if (ISPRINT(c)) {
7819 clen = 1;
7820 }
7821 else {
7822 if (u8 && c > 0x7F) { /* \u notation */
7823 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7824 if (MBCLEN_CHARFOUND_P(n)) {
7825 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7826 if (cc <= 0xFFFF)
7827 clen = 6; /* \uXXXX */
7828 else if (cc <= 0xFFFFF)
7829 clen = 9; /* \u{XXXXX} */
7830 else
7831 clen = 10; /* \u{XXXXXX} */
7832 p += MBCLEN_CHARFOUND_LEN(n)-1;
7833 break;
7834 }
7835 }
7836 clen = 4; /* \xNN */
7837 }
7838 break;
7839 }
7840
7841 if (clen > LONG_MAX - len) {
7842 rb_raise(rb_eRuntimeError, "string size too big");
7843 }
7844 len += clen;
7845 }
7846
7847 result = rb_str_new(0, len);
7848 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7849 q = RSTRING_PTR(result); qend = q + len + 1;
7850
7851 *q++ = '"';
7852 while (p < pend) {
7853 unsigned char c = *p++;
7854
7855 if (c == '"' || c == '\\') {
7856 *q++ = '\\';
7857 *q++ = c;
7858 }
7859 else if (c == '#') {
7860 if (IS_EVSTR(p, pend)) *q++ = '\\';
7861 *q++ = '#';
7862 }
7863 else if (c == '\n') {
7864 *q++ = '\\';
7865 *q++ = 'n';
7866 }
7867 else if (c == '\r') {
7868 *q++ = '\\';
7869 *q++ = 'r';
7870 }
7871 else if (c == '\t') {
7872 *q++ = '\\';
7873 *q++ = 't';
7874 }
7875 else if (c == '\f') {
7876 *q++ = '\\';
7877 *q++ = 'f';
7878 }
7879 else if (c == '\013') {
7880 *q++ = '\\';
7881 *q++ = 'v';
7882 }
7883 else if (c == '\010') {
7884 *q++ = '\\';
7885 *q++ = 'b';
7886 }
7887 else if (c == '\007') {
7888 *q++ = '\\';
7889 *q++ = 'a';
7890 }
7891 else if (c == '\033') {
7892 *q++ = '\\';
7893 *q++ = 'e';
7894 }
7895 else if (ISPRINT(c)) {
7896 *q++ = c;
7897 }
7898 else {
7899 *q++ = '\\';
7900 if (u8) {
7901 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7902 if (MBCLEN_CHARFOUND_P(n)) {
7903 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7904 p += n;
7905 if (cc <= 0xFFFF)
7906 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7907 else
7908 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7909 q += strlen(q);
7910 continue;
7911 }
7912 }
7913 snprintf(q, qend-q, "x%02X", c);
7914 q += 3;
7915 }
7916 }
7917 *q++ = '"';
7918 *q = '\0';
7919 if (!rb_enc_asciicompat(enc)) {
7920 snprintf(q, qend-q, nonascii_suffix, enc->name);
7921 encidx = rb_ascii8bit_encindex();
7922 }
7923 /* result from dump is ASCII */
7924 rb_enc_associate_index(result, encidx);
7926 return result;
7927}
7928
/*
 * Maps the letter that follows a backslash in a simple escape
 * ('n', 'r', 't', 'f', 'v', 'b', 'a', 'e') to the character it denotes.
 * Returns -1 for any other input, so that control never reaches the end
 * of the function without a return value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7952
7953static void
7954undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7955{
7956 const char *s = *ss;
7957 unsigned int c;
7958 int codelen;
7959 size_t hexlen;
7960 unsigned char buf[6];
7961 static rb_encoding *enc_utf8 = NULL;
7962
7963 switch (*s) {
7964 case '\\':
7965 case '"':
7966 case '#':
7967 rb_str_cat(undumped, s, 1); /* cat itself */
7968 s++;
7969 break;
7970 case 'n':
7971 case 'r':
7972 case 't':
7973 case 'f':
7974 case 'v':
7975 case 'b':
7976 case 'a':
7977 case 'e':
7978 *buf = unescape_ascii(*s);
7979 rb_str_cat(undumped, (char *)buf, 1);
7980 s++;
7981 break;
7982 case 'u':
7983 if (*binary) {
7984 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7985 }
7986 *utf8 = true;
7987 if (++s >= s_end) {
7988 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7989 }
7990 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7991 if (*penc != enc_utf8) {
7992 *penc = enc_utf8;
7993 rb_enc_associate(undumped, enc_utf8);
7994 }
7995 if (*s == '{') { /* handle \u{...} form */
7996 s++;
7997 for (;;) {
7998 if (s >= s_end) {
7999 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
8000 }
8001 if (*s == '}') {
8002 s++;
8003 break;
8004 }
8005 if (ISSPACE(*s)) {
8006 s++;
8007 continue;
8008 }
8009 c = scan_hex(s, s_end-s, &hexlen);
8010 if (hexlen == 0 || hexlen > 6) {
8011 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
8012 }
8013 if (c > 0x10ffff) {
8014 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
8015 }
8016 if (0xd800 <= c && c <= 0xdfff) {
8017 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
8018 }
8019 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
8020 rb_str_cat(undumped, (char *)buf, codelen);
8021 s += hexlen;
8022 }
8023 }
8024 else { /* handle \uXXXX form */
8025 c = scan_hex(s, 4, &hexlen);
8026 if (hexlen != 4) {
8027 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
8028 }
8029 if (0xd800 <= c && c <= 0xdfff) {
8030 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
8031 }
8032 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
8033 rb_str_cat(undumped, (char *)buf, codelen);
8034 s += hexlen;
8035 }
8036 break;
8037 case 'x':
8038 if (*utf8) {
8039 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
8040 }
8041 *binary = true;
8042 if (++s >= s_end) {
8043 rb_raise(rb_eRuntimeError, "invalid hex escape");
8044 }
8045 *buf = scan_hex(s, 2, &hexlen);
8046 if (hexlen != 2) {
8047 rb_raise(rb_eRuntimeError, "invalid hex escape");
8048 }
8049 rb_str_cat(undumped, (char *)buf, 1);
8050 s += hexlen;
8051 break;
8052 default:
8053 rb_str_cat(undumped, s-1, 2);
8054 s++;
8055 }
8056
8057 *ss = s;
8058}
8059
8060static VALUE rb_str_is_ascii_only_p(VALUE str);
8061
8062/*
8063 * call-seq:
8064 * undump -> string
8065 *
8066 * Returns an unescaped version of +self+:
8067 *
8068 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
8069 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
8070 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
8071 * s_undumped == s_orig # => true
8072 *
8073 * Related: String#dump (inverse of String#undump).
8074 *
8075 */
8076
8077static VALUE
8078str_undump(VALUE str)
8079{
8080 const char *s = RSTRING_PTR(str);
8081 const char *s_end = RSTRING_END(str);
8082 rb_encoding *enc = rb_enc_get(str);
8083 VALUE undumped = rb_enc_str_new(s, 0L, enc);
8084 bool utf8 = false;
8085 bool binary = false;
8086 int w;
8087
8089 if (rb_str_is_ascii_only_p(str) == Qfalse) {
8090 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
8091 }
8092 if (!str_null_check(str, &w)) {
8093 rb_raise(rb_eRuntimeError, "string contains null byte");
8094 }
8095 if (RSTRING_LEN(str) < 2) goto invalid_format;
8096 if (*s != '"') goto invalid_format;
8097
8098 /* strip '"' at the start */
8099 s++;
8100
8101 for (;;) {
8102 if (s >= s_end) {
8103 rb_raise(rb_eRuntimeError, "unterminated dumped string");
8104 }
8105
8106 if (*s == '"') {
8107 /* epilogue */
8108 s++;
8109 if (s == s_end) {
8110 /* ascii compatible dumped string */
8111 break;
8112 }
8113 else {
8114 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
8115 static const char dup_suffix[] = ".dup";
8116 const char *encname;
8117 int encidx;
8118 ptrdiff_t size;
8119
8120 /* check separately for strings dumped by older versions */
8121 size = sizeof(dup_suffix) - 1;
8122 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
8123
8124 size = sizeof(force_encoding_suffix) - 1;
8125 if (s_end - s <= size) goto invalid_format;
8126 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
8127 s += size;
8128
8129 if (utf8) {
8130 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
8131 }
8132
8133 encname = s;
8134 s = memchr(s, '"', s_end-s);
8135 size = s - encname;
8136 if (!s) goto invalid_format;
8137 if (s_end - s != 2) goto invalid_format;
8138 if (s[0] != '"' || s[1] != ')') goto invalid_format;
8139
8140 encidx = rb_enc_find_index2(encname, (long)size);
8141 if (encidx < 0) {
8142 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
8143 }
8144 rb_enc_associate_index(undumped, encidx);
8145 }
8146 break;
8147 }
8148
8149 if (*s == '\\') {
8150 s++;
8151 if (s >= s_end) {
8152 rb_raise(rb_eRuntimeError, "invalid escape");
8153 }
8154 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
8155 }
8156 else {
8157 rb_str_cat(undumped, s++, 1);
8158 }
8159 }
8160
8161 RB_GC_GUARD(str);
8162
8163 return undumped;
8164invalid_format:
8165 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
8166}
8167
8168static void
8169rb_str_check_dummy_enc(rb_encoding *enc)
8170{
8171 if (rb_enc_dummy_p(enc)) {
8172 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
8173 rb_enc_name(enc));
8174 }
8175}
8176
8177static rb_encoding *
8178str_true_enc(VALUE str)
8179{
8180 rb_encoding *enc = STR_ENC_GET(str);
8181 rb_str_check_dummy_enc(enc);
8182 return enc;
8183}
8184
8185static OnigCaseFoldType
8186check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
8187{
8188 if (argc==0)
8189 return flags;
8190 if (argc>2)
8191 rb_raise(rb_eArgError, "too many options");
8192 if (argv[0]==sym_turkic) {
8193 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8194 if (argc==2) {
8195 if (argv[1]==sym_lithuanian)
8196 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8197 else
8198 rb_raise(rb_eArgError, "invalid second option");
8199 }
8200 }
8201 else if (argv[0]==sym_lithuanian) {
8202 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8203 if (argc==2) {
8204 if (argv[1]==sym_turkic)
8205 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8206 else
8207 rb_raise(rb_eArgError, "invalid second option");
8208 }
8209 }
8210 else if (argc>1)
8211 rb_raise(rb_eArgError, "too many options");
8212 else if (argv[0]==sym_ascii)
8213 flags |= ONIGENC_CASE_ASCII_ONLY;
8214 else if (argv[0]==sym_fold) {
8215 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
8216 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
8217 else
8218 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
8219 }
8220 else
8221 rb_raise(rb_eArgError, "invalid option");
8222 return flags;
8223}
8224
8225static inline bool
8226case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
8227{
8228 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
8229 return true;
8230 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
8231}
8232
/* 20 should be long enough to absorb any kind of single character length increase */
8234#define CASE_MAPPING_ADDITIONAL_LENGTH 20
8235#ifndef CASEMAP_DEBUG
8236# define CASEMAP_DEBUG 0
8237#endif
8238
8239struct mapping_buffer;
8240typedef struct mapping_buffer {
8241 size_t capa;
8242 size_t used;
8243 struct mapping_buffer *next;
8244 OnigUChar space[FLEX_ARY_LEN];
8246
8247static void
8248mapping_buffer_free(void *p)
8249{
8250 mapping_buffer *previous_buffer;
8251 mapping_buffer *current_buffer = p;
8252 while (current_buffer) {
8253 previous_buffer = current_buffer;
8254 current_buffer = current_buffer->next;
8255 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
8256 }
8257}
8258
8259static const rb_data_type_t mapping_buffer_type = {
8260 "mapping_buffer",
8261 {0, mapping_buffer_free,},
8262 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
8263};
8264
8265static VALUE
8266rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
8267{
8268 VALUE target;
8269
8270 const OnigUChar *source_current, *source_end;
8271 int target_length = 0;
8272 VALUE buffer_anchor;
8273 mapping_buffer *current_buffer = 0;
8274 mapping_buffer **pre_buffer;
8275 size_t buffer_count = 0;
8276 int buffer_length_or_invalid;
8277
8278 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
8279
8280 source_current = (OnigUChar*)RSTRING_PTR(source);
8281 source_end = (OnigUChar*)RSTRING_END(source);
8282
8283 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
8284 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
8285 while (source_current < source_end) {
8286 /* increase multiplier using buffer count to converge quickly */
8287 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
8288 if (CASEMAP_DEBUG) {
8289 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
8290 }
8291 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
8292 *pre_buffer = current_buffer;
8293 pre_buffer = &current_buffer->next;
8294 current_buffer->next = NULL;
8295 current_buffer->capa = capa;
8296 buffer_length_or_invalid = enc->case_map(flags,
8297 &source_current, source_end,
8298 current_buffer->space,
8299 current_buffer->space+current_buffer->capa,
8300 enc);
8301 if (buffer_length_or_invalid < 0) {
8302 current_buffer = DATA_PTR(buffer_anchor);
8303 DATA_PTR(buffer_anchor) = 0;
8304 mapping_buffer_free(current_buffer);
8305 rb_raise(rb_eArgError, "input string invalid");
8306 }
8307 target_length += current_buffer->used = buffer_length_or_invalid;
8308 }
8309 if (CASEMAP_DEBUG) {
8310 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
8311 }
8312
8313 if (buffer_count==1) {
8314 target = rb_str_new((const char*)current_buffer->space, target_length);
8315 }
8316 else {
8317 char *target_current;
8318
8319 target = rb_str_new(0, target_length);
8320 target_current = RSTRING_PTR(target);
8321 current_buffer = DATA_PTR(buffer_anchor);
8322 while (current_buffer) {
8323 memcpy(target_current, current_buffer->space, current_buffer->used);
8324 target_current += current_buffer->used;
8325 current_buffer = current_buffer->next;
8326 }
8327 }
8328 current_buffer = DATA_PTR(buffer_anchor);
8329 DATA_PTR(buffer_anchor) = 0;
8330 mapping_buffer_free(current_buffer);
8331
8332 RB_GC_GUARD(buffer_anchor);
8333
8334 /* TODO: check about string terminator character */
8335 str_enc_copy_direct(target, source);
8336 /*ENC_CODERANGE_SET(mapped, cr);*/
8337
8338 return target;
8339}
8340
8341static VALUE
8342rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8343{
8344 const OnigUChar *source_current, *source_end;
8345 OnigUChar *target_current, *target_end;
8346 long old_length = RSTRING_LEN(source);
8347 int length_or_invalid;
8348
8349 if (old_length == 0) return Qnil;
8350
8351 source_current = (OnigUChar*)RSTRING_PTR(source);
8352 source_end = (OnigUChar*)RSTRING_END(source);
8353 if (source == target) {
8354 target_current = (OnigUChar*)source_current;
8355 target_end = (OnigUChar*)source_end;
8356 }
8357 else {
8358 target_current = (OnigUChar*)RSTRING_PTR(target);
8359 target_end = (OnigUChar*)RSTRING_END(target);
8360 }
8361
8362 length_or_invalid = onigenc_ascii_only_case_map(flags,
8363 &source_current, source_end,
8364 target_current, target_end, enc);
8365 if (length_or_invalid < 0)
8366 rb_raise(rb_eArgError, "input string invalid");
8367 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8368 fprintf(stderr, "problem with rb_str_ascii_casemap"
8369 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8370 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8371 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8372 }
8373
8374 str_enc_copy(target, source);
8375
8376 return target;
8377}
8378
8379static bool
8380upcase_single(VALUE str)
8381{
8382 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8383 bool modified = false;
8384
8385 while (s < send) {
8386 unsigned int c = *(unsigned char*)s;
8387
8388 if ('a' <= c && c <= 'z') {
8389 *s = 'A' + (c - 'a');
8390 modified = true;
8391 }
8392 s++;
8393 }
8394 return modified;
8395}
8396
8397/*
8398 * call-seq:
8399 * upcase!(*options) -> self or nil
8400 *
8401 * Upcases the characters in +self+;
8402 * returns +self+ if any changes were made, +nil+ otherwise:
8403 *
8404 * s = 'Hello World!' # => "Hello World!"
8405 * s.upcase! # => "HELLO WORLD!"
8406 * s # => "HELLO WORLD!"
8407 * s.upcase! # => nil
8408 *
8409 * The casing may be affected by the given +options+;
8410 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8411 *
8412 * Related: String#upcase, String#downcase, String#downcase!.
8413 *
8414 */
8415
8416static VALUE
8417rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8418{
8419 rb_encoding *enc;
8420 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8421
8422 flags = check_case_options(argc, argv, flags);
8423 str_modify_keep_cr(str);
8424 enc = str_true_enc(str);
8425 if (case_option_single_p(flags, enc, str)) {
8426 if (upcase_single(str))
8427 flags |= ONIGENC_CASE_MODIFIED;
8428 }
8429 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8430 rb_str_ascii_casemap(str, str, &flags, enc);
8431 else
8432 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8433
8434 if (ONIGENC_CASE_MODIFIED&flags) return str;
8435 return Qnil;
8436}
8437
8438
8439/*
8440 * call-seq:
8441 * upcase(*options) -> string
8442 *
8443 * Returns a string containing the upcased characters in +self+:
8444 *
8445 * s = 'Hello World!' # => "Hello World!"
8446 * s.upcase # => "HELLO WORLD!"
8447 *
8448 * The casing may be affected by the given +options+;
8449 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8450 *
8451 * Related: String#upcase!, String#downcase, String#downcase!.
8452 *
8453 */
8454
8455static VALUE
8456rb_str_upcase(int argc, VALUE *argv, VALUE str)
8457{
8458 rb_encoding *enc;
8459 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8460 VALUE ret;
8461
8462 flags = check_case_options(argc, argv, flags);
8463 enc = str_true_enc(str);
8464 if (case_option_single_p(flags, enc, str)) {
8465 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8466 str_enc_copy_direct(ret, str);
8467 upcase_single(ret);
8468 }
8469 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8470 ret = rb_str_new(0, RSTRING_LEN(str));
8471 rb_str_ascii_casemap(str, ret, &flags, enc);
8472 }
8473 else {
8474 ret = rb_str_casemap(str, &flags, enc);
8475 }
8476
8477 return ret;
8478}
8479
8480static bool
8481downcase_single(VALUE str)
8482{
8483 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8484 bool modified = false;
8485
8486 while (s < send) {
8487 unsigned int c = *(unsigned char*)s;
8488
8489 if ('A' <= c && c <= 'Z') {
8490 *s = 'a' + (c - 'A');
8491 modified = true;
8492 }
8493 s++;
8494 }
8495
8496 return modified;
8497}
8498
8499/*
8500 * call-seq:
8501 * downcase!(*options) -> self or nil
8502 *
8503 * Downcases the characters in +self+;
8504 * returns +self+ if any changes were made, +nil+ otherwise:
8505 *
8506 * s = 'Hello World!' # => "Hello World!"
8507 * s.downcase! # => "hello world!"
8508 * s # => "hello world!"
8509 * s.downcase! # => nil
8510 *
8511 * The casing may be affected by the given +options+;
8512 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8513 *
8514 * Related: String#downcase, String#upcase, String#upcase!.
8515 *
8516 */
8517
8518static VALUE
8519rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8520{
8521 rb_encoding *enc;
8522 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8523
8524 flags = check_case_options(argc, argv, flags);
8525 str_modify_keep_cr(str);
8526 enc = str_true_enc(str);
8527 if (case_option_single_p(flags, enc, str)) {
8528 if (downcase_single(str))
8529 flags |= ONIGENC_CASE_MODIFIED;
8530 }
8531 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8532 rb_str_ascii_casemap(str, str, &flags, enc);
8533 else
8534 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8535
8536 if (ONIGENC_CASE_MODIFIED&flags) return str;
8537 return Qnil;
8538}
8539
8540
8541/*
8542 * call-seq:
8543 * downcase(*options) -> string
8544 *
8545 * Returns a string containing the downcased characters in +self+:
8546 *
8547 * s = 'Hello World!' # => "Hello World!"
8548 * s.downcase # => "hello world!"
8549 *
8550 * The casing may be affected by the given +options+;
8551 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8552 *
8553 * Related: String#downcase!, String#upcase, String#upcase!.
8554 *
8555 */
8556
8557static VALUE
8558rb_str_downcase(int argc, VALUE *argv, VALUE str)
8559{
8560 rb_encoding *enc;
8561 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8562 VALUE ret;
8563
8564 flags = check_case_options(argc, argv, flags);
8565 enc = str_true_enc(str);
8566 if (case_option_single_p(flags, enc, str)) {
8567 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8568 str_enc_copy_direct(ret, str);
8569 downcase_single(ret);
8570 }
8571 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8572 ret = rb_str_new(0, RSTRING_LEN(str));
8573 rb_str_ascii_casemap(str, ret, &flags, enc);
8574 }
8575 else {
8576 ret = rb_str_casemap(str, &flags, enc);
8577 }
8578
8579 return ret;
8580}
8581
8582
8583/*
8584 * call-seq:
8585 * capitalize!(*options) -> self or nil
8586 *
8587 * Upcases the first character in +self+;
8588 * downcases the remaining characters;
8589 * returns +self+ if any changes were made, +nil+ otherwise:
8590 *
8591 * s = 'hello World!' # => "hello World!"
8592 * s.capitalize! # => "Hello world!"
8593 * s # => "Hello world!"
8594 * s.capitalize! # => nil
8595 *
8596 * The casing may be affected by the given +options+;
8597 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8598 *
8599 * Related: String#capitalize.
8600 *
8601 */
8602
8603static VALUE
8604rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8605{
8606 rb_encoding *enc;
8607 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8608
8609 flags = check_case_options(argc, argv, flags);
8610 str_modify_keep_cr(str);
8611 enc = str_true_enc(str);
8612 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8613 if (flags&ONIGENC_CASE_ASCII_ONLY)
8614 rb_str_ascii_casemap(str, str, &flags, enc);
8615 else
8616 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8617
8618 if (ONIGENC_CASE_MODIFIED&flags) return str;
8619 return Qnil;
8620}
8621
8622
8623/*
8624 * call-seq:
8625 * capitalize(*options) -> string
8626 *
8627 * Returns a string containing the characters in +self+;
8628 * the first character is upcased;
8629 * the remaining characters are downcased:
8630 *
8631 * s = 'hello World!' # => "hello World!"
8632 * s.capitalize # => "Hello world!"
8633 *
8634 * The casing may be affected by the given +options+;
8635 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8636 *
8637 * Related: String#capitalize!.
8638 *
8639 */
8640
8641static VALUE
8642rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8643{
8644 rb_encoding *enc;
8645 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8646 VALUE ret;
8647
8648 flags = check_case_options(argc, argv, flags);
8649 enc = str_true_enc(str);
8650 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8651 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8652 ret = rb_str_new(0, RSTRING_LEN(str));
8653 rb_str_ascii_casemap(str, ret, &flags, enc);
8654 }
8655 else {
8656 ret = rb_str_casemap(str, &flags, enc);
8657 }
8658 return ret;
8659}
8660
8661
8662/*
8663 * call-seq:
8664 * swapcase!(*options) -> self or nil
8665 *
8666 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
8668 * returns +self+ if any changes were made, +nil+ otherwise:
8669 *
8670 * s = 'Hello World!' # => "Hello World!"
8671 * s.swapcase! # => "hELLO wORLD!"
8672 * s # => "hELLO wORLD!"
8673 * ''.swapcase! # => nil
8674 *
8675 * The casing may be affected by the given +options+;
8676 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8677 *
8678 * Related: String#swapcase.
8679 *
8680 */
8681
8682static VALUE
8683rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8684{
8685 rb_encoding *enc;
8686 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8687
8688 flags = check_case_options(argc, argv, flags);
8689 str_modify_keep_cr(str);
8690 enc = str_true_enc(str);
8691 if (flags&ONIGENC_CASE_ASCII_ONLY)
8692 rb_str_ascii_casemap(str, str, &flags, enc);
8693 else
8694 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8695
8696 if (ONIGENC_CASE_MODIFIED&flags) return str;
8697 return Qnil;
8698}
8699
8700
8701/*
8702 * call-seq:
8703 * swapcase(*options) -> string
8704 *
8705 * Returns a string containing the characters in +self+, with cases reversed;
8706 * each uppercase character is downcased;
8707 * each lowercase character is upcased:
8708 *
8709 * s = 'Hello World!' # => "Hello World!"
8710 * s.swapcase # => "hELLO wORLD!"
8711 *
8712 * The casing may be affected by the given +options+;
8713 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8714 *
8715 * Related: String#swapcase!.
8716 *
8717 */
8718
8719static VALUE
8720rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8721{
8722 rb_encoding *enc;
8723 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8724 VALUE ret;
8725
8726 flags = check_case_options(argc, argv, flags);
8727 enc = str_true_enc(str);
8728 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8729 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8730 ret = rb_str_new(0, RSTRING_LEN(str));
8731 rb_str_ascii_casemap(str, ret, &flags, enc);
8732 }
8733 else {
8734 ret = rb_str_casemap(str, &flags, enc);
8735 }
8736 return ret;
8737}
8738
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style selector string, advanced by trnext().
 */
struct tr {
    int gen;            /* nonzero while a "lo-hi" range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* one past the last byte of the selector */
};
8746
/*
 * Produces the next code point described by the selector cursor +t+,
 * decoding the selector text with +enc+.
 *
 * Returns the code point, or (unsigned int)-1 once the selector text is
 * exhausted.  Raises ArgumentError for a range whose start is greater
 * than its end.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next literal from the text */
            if (t->p == t->pend) return -1;
            /* a backslash that is not the final character is skipped, so
             * the character after it is taken literally */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' that is not the final character introduces a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* only ASCII endpoints are echoed in the message */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        /* start expanding: later calls yield now+1 .. max */
                        t->gen = 1;
                        t->max = c;
                    }
                    /* equal endpoints: a one-element range, nothing to expand */
                }
            }
            return t->now;
        }
        else {
            /* inside a range: step to the next code point, skipping those
             * for which the encoding reports no positive byte length */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    /* reached the upper bound while skipping: leave range
                     * mode and continue with the rest of the text */
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* upper bound reached: emit it and leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8802
8803static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8804
8805static VALUE
8806tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8807{
8808 const unsigned int errc = -1;
8809 unsigned int trans[256];
8810 rb_encoding *enc, *e1, *e2;
8811 struct tr trsrc, trrepl;
8812 int cflag = 0;
8813 unsigned int c, c0, last = 0;
8814 int modify = 0, i, l;
8815 unsigned char *s, *send;
8816 VALUE hash = 0;
8817 int singlebyte = single_byte_optimizable(str);
8818 int termlen;
8819 int cr;
8820
8821#define CHECK_IF_ASCII(c) \
8822 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8823 (cr = ENC_CODERANGE_VALID) : 0)
8824
8825 StringValue(src);
8826 StringValue(repl);
8827 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8828 if (RSTRING_LEN(repl) == 0) {
8829 return rb_str_delete_bang(1, &src, str);
8830 }
8831
8832 cr = ENC_CODERANGE(str);
8833 e1 = rb_enc_check(str, src);
8834 e2 = rb_enc_check(str, repl);
8835 if (e1 == e2) {
8836 enc = e1;
8837 }
8838 else {
8839 enc = rb_enc_check(src, repl);
8840 }
8841 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8842 if (RSTRING_LEN(src) > 1 &&
8843 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8844 trsrc.p + l < trsrc.pend) {
8845 cflag = 1;
8846 trsrc.p += l;
8847 }
8848 trrepl.p = RSTRING_PTR(repl);
8849 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8850 trsrc.gen = trrepl.gen = 0;
8851 trsrc.now = trrepl.now = 0;
8852 trsrc.max = trrepl.max = 0;
8853
8854 if (cflag) {
8855 for (i=0; i<256; i++) {
8856 trans[i] = 1;
8857 }
8858 while ((c = trnext(&trsrc, enc)) != errc) {
8859 if (c < 256) {
8860 trans[c] = errc;
8861 }
8862 else {
8863 if (!hash) hash = rb_hash_new();
8864 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8865 }
8866 }
8867 while ((c = trnext(&trrepl, enc)) != errc)
8868 /* retrieve last replacer */;
8869 last = trrepl.now;
8870 for (i=0; i<256; i++) {
8871 if (trans[i] != errc) {
8872 trans[i] = last;
8873 }
8874 }
8875 }
8876 else {
8877 unsigned int r;
8878
8879 for (i=0; i<256; i++) {
8880 trans[i] = errc;
8881 }
8882 while ((c = trnext(&trsrc, enc)) != errc) {
8883 r = trnext(&trrepl, enc);
8884 if (r == errc) r = trrepl.now;
8885 if (c < 256) {
8886 trans[c] = r;
8887 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8888 }
8889 else {
8890 if (!hash) hash = rb_hash_new();
8891 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8892 }
8893 }
8894 }
8895
8896 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8897 cr = ENC_CODERANGE_7BIT;
8898 str_modify_keep_cr(str);
8899 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8900 termlen = rb_enc_mbminlen(enc);
8901 if (sflag) {
8902 int clen, tlen;
8903 long offset, max = RSTRING_LEN(str);
8904 unsigned int save = -1;
8905 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8906
8907 while (s < send) {
8908 int may_modify = 0;
8909
8910 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8911 if (!MBCLEN_CHARFOUND_P(r)) {
8912 xfree(buf);
8913 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8914 }
8915 clen = MBCLEN_CHARFOUND_LEN(r);
8916 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8917
8918 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8919
8920 s += clen;
8921 if (c < 256) {
8922 c = trans[c];
8923 }
8924 else if (hash) {
8925 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8926 if (NIL_P(tmp)) {
8927 if (cflag) c = last;
8928 else c = errc;
8929 }
8930 else if (cflag) c = errc;
8931 else c = NUM2INT(tmp);
8932 }
8933 else {
8934 c = errc;
8935 }
8936 if (c != (unsigned int)-1) {
8937 if (save == c) {
8938 CHECK_IF_ASCII(c);
8939 continue;
8940 }
8941 save = c;
8942 tlen = rb_enc_codelen(c, enc);
8943 modify = 1;
8944 }
8945 else {
8946 save = -1;
8947 c = c0;
8948 if (enc != e1) may_modify = 1;
8949 }
8950 if ((offset = t - buf) + tlen > max) {
8951 size_t MAYBE_UNUSED(old) = max + termlen;
8952 max = offset + tlen + (send - s);
8953 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8954 t = buf + offset;
8955 }
8956 rb_enc_mbcput(c, t, enc);
8957 if (may_modify && memcmp(s, t, tlen) != 0) {
8958 modify = 1;
8959 }
8960 CHECK_IF_ASCII(c);
8961 t += tlen;
8962 }
8963 if (!STR_EMBED_P(str)) {
8964 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8965 }
8966 TERM_FILL((char *)t, termlen);
8967 RSTRING(str)->as.heap.ptr = (char *)buf;
8968 STR_SET_LEN(str, t - buf);
8969 STR_SET_NOEMBED(str);
8970 RSTRING(str)->as.heap.aux.capa = max;
8971 }
8972 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8973 while (s < send) {
8974 c = (unsigned char)*s;
8975 if (trans[c] != errc) {
8976 if (!cflag) {
8977 c = trans[c];
8978 *s = c;
8979 modify = 1;
8980 }
8981 else {
8982 *s = last;
8983 modify = 1;
8984 }
8985 }
8986 CHECK_IF_ASCII(c);
8987 s++;
8988 }
8989 }
8990 else {
8991 int clen, tlen;
8992 long offset, max = (long)((send - s) * 1.2);
8993 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8994
8995 while (s < send) {
8996 int may_modify = 0;
8997
8998 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8999 if (!MBCLEN_CHARFOUND_P(r)) {
9000 xfree(buf);
9001 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
9002 }
9003 clen = MBCLEN_CHARFOUND_LEN(r);
9004 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
9005
9006 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
9007
9008 if (c < 256) {
9009 c = trans[c];
9010 }
9011 else if (hash) {
9012 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
9013 if (NIL_P(tmp)) {
9014 if (cflag) c = last;
9015 else c = errc;
9016 }
9017 else if (cflag) c = errc;
9018 else c = NUM2INT(tmp);
9019 }
9020 else {
9021 c = cflag ? last : errc;
9022 }
9023 if (c != errc) {
9024 tlen = rb_enc_codelen(c, enc);
9025 modify = 1;
9026 }
9027 else {
9028 c = c0;
9029 if (enc != e1) may_modify = 1;
9030 }
9031 if ((offset = t - buf) + tlen > max) {
9032 size_t MAYBE_UNUSED(old) = max + termlen;
9033 max = offset + tlen + (long)((send - s) * 1.2);
9034 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
9035 t = buf + offset;
9036 }
9037 if (s != t) {
9038 rb_enc_mbcput(c, t, enc);
9039 if (may_modify && memcmp(s, t, tlen) != 0) {
9040 modify = 1;
9041 }
9042 }
9043 CHECK_IF_ASCII(c);
9044 s += clen;
9045 t += tlen;
9046 }
9047 if (!STR_EMBED_P(str)) {
9048 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
9049 }
9050 TERM_FILL((char *)t, termlen);
9051 RSTRING(str)->as.heap.ptr = (char *)buf;
9052 STR_SET_LEN(str, t - buf);
9053 STR_SET_NOEMBED(str);
9054 RSTRING(str)->as.heap.aux.capa = max;
9055 }
9056
9057 if (modify) {
9058 if (cr != ENC_CODERANGE_BROKEN)
9059 ENC_CODERANGE_SET(str, cr);
9060 rb_enc_associate(str, enc);
9061 return str;
9062 }
9063 return Qnil;
9064}
9065
9066
9067/*
9068 * call-seq:
9069 * tr!(selector, replacements) -> self or nil
9070 *
9071 * Like String#tr, but modifies +self+ in place.
9072 * Returns +self+ if any changes were made, +nil+ otherwise.
9073 *
9074 */
9075
9076static VALUE
9077rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
9078{
9079 return tr_trans(str, src, repl, 0);
9080}
9081
9082
9083/*
9084 * call-seq:
9085 * tr(selector, replacements) -> new_string
9086 *
9087 * Returns a copy of +self+ with each character specified by string +selector+
9088 * translated to the corresponding character in string +replacements+.
9089 * The correspondence is _positional_:
9090 *
9091 * - Each occurrence of the first character specified by +selector+
9092 * is translated to the first character in +replacements+.
9093 * - Each occurrence of the second character specified by +selector+
9094 * is translated to the second character in +replacements+.
9095 * - And so on.
9096 *
9097 * Example:
9098 *
9099 * 'hello'.tr('el', 'ip') #=> "hippo"
9100 *
9101 * If +replacements+ is shorter than +selector+,
9102 * it is implicitly padded with its own last character:
9103 *
9104 * 'hello'.tr('aeiou', '-') # => "h-ll-"
9105 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
9106 *
9107 * Arguments +selector+ and +replacements+ must be valid character selectors
9108 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
9109 * and may use any of its valid forms, including negation, ranges, and escaping:
9110 *
9111 * # Negation.
9112 * 'hello'.tr('^aeiou', '-') # => "-e--o"
9113 * # Ranges.
9114 * 'ibm'.tr('b-z', 'a-z') # => "hal"
9115 * # Escapes.
9116 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
9117 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
9118 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
9119 *
9120 */
9121
9122static VALUE
9123rb_str_tr(VALUE str, VALUE src, VALUE repl)
9124{
9125 str = str_duplicate(rb_cString, str);
9126 tr_trans(str, src, repl, 0);
9127 return str;
9128}
9129
/* Number of code points covered by the flat byte table. */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* One extra slot, at index TR_TABLE_MAX, holds the negation flag. */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string +str+ into the selection state shared by
 * delete!/squeeze!/count.
 *
 * stable   - flat table: stable[c] is nonzero while code point c
 *            (c < TR_TABLE_MAX) is selected by every selector seen so
 *            far; stable[TR_TABLE_MAX] carries the negation flag that
 *            tr_find uses as the default for wider code points.
 * first    - nonzero for the first selector, which (re)initializes stable.
 * tablep   - hash of wide code points listed by non-negated selectors
 *            (callers pass &del).
 * ctablep  - hash of wide code points listed by negated selectors
 *            (callers pass &nodel).
 * enc      - encoding used to decode +str+.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;   /* trnext's end-of-selector value */
    char buf[TR_TABLE_MAX];         /* this selector's own flat table */
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;    /* hash being filled / hash it is checked against */
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* a leading '^' negates the selector when the string is longer than 1 */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        /* start with everything selected; later selectors only narrow it */
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        /* a non-negated selector clears the negation flag */
        stable[TR_TABLE_MAX] = 0;
    }
    /* negated: everything selected unless listed; plain: the reverse */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* pick the hash to fill the first time a wide code point is met */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* negated: keep adding to the existing *ctablep, if any */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* plain: a fresh hash replaces *tablep; the old one is
                     * only consulted through ptable below */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            /* with a previous hash: plain selectors keep only keys that
             * were already in it, negated selectors add only keys that
             * were not yet in it */
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* narrow the accumulated table by this selector's table */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* a plain selector that filled no hash clears *tablep */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
9194
9195
9196static int
9197tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
9198{
9199 if (c < TR_TABLE_MAX) {
9200 return table[c] != 0;
9201 }
9202 else {
9203 VALUE v = UINT2NUM(c);
9204
9205 if (del) {
9206 if (!NIL_P(rb_hash_lookup(del, v)) &&
9207 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
9208 return TRUE;
9209 }
9210 }
9211 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
9212 return FALSE;
9213 }
9214 return table[TR_TABLE_MAX] ? TRUE : FALSE;
9215 }
9216}
9217
9218/*
9219 * call-seq:
9220 * delete!(*selectors) -> self or nil
9221 *
9222 * Like String#delete, but modifies +self+ in place.
9223 * Returns +self+ if any changes were made, +nil+ otherwise.
9224 *
9225 */
9226
9227static VALUE
9228rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
9229{
9230 char squeez[TR_TABLE_SIZE];
9231 rb_encoding *enc = 0;
9232 char *s, *send, *t;
9233 VALUE del = 0, nodel = 0;
9234 int modify = 0;
9235 int i, ascompat, cr;
9236
9237 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
9239 for (i=0; i<argc; i++) {
9240 VALUE s = argv[i];
9241
9242 StringValue(s);
9243 enc = rb_enc_check(str, s);
9244 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9245 }
9246
9247 str_modify_keep_cr(str);
9248 ascompat = rb_enc_asciicompat(enc);
9249 s = t = RSTRING_PTR(str);
9250 send = RSTRING_END(str);
9251 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
9252 while (s < send) {
9253 unsigned int c;
9254 int clen;
9255
9256 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9257 if (squeez[c]) {
9258 modify = 1;
9259 }
9260 else {
9261 if (t != s) *t = c;
9262 t++;
9263 }
9264 s++;
9265 }
9266 else {
9267 c = rb_enc_codepoint_len(s, send, &clen, enc);
9268
9269 if (tr_find(c, squeez, del, nodel)) {
9270 modify = 1;
9271 }
9272 else {
9273 if (t != s) rb_enc_mbcput(c, t, enc);
9274 t += clen;
9276 }
9277 s += clen;
9278 }
9279 }
9280 TERM_FILL(t, TERM_LEN(str));
9281 STR_SET_LEN(str, t - RSTRING_PTR(str));
9282 ENC_CODERANGE_SET(str, cr);
9283
9284 if (modify) return str;
9285 return Qnil;
9286}
9287
9288
9289/*
9290 * call-seq:
9291 * delete(*selectors) -> new_string
9292 *
9293 * Returns a copy of +self+ with characters specified by +selectors+ removed
9294 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9295 *
9296 * "hello".delete "l","lo" #=> "heo"
9297 * "hello".delete "lo" #=> "he"
9298 * "hello".delete "aeiou", "^e" #=> "hell"
9299 * "hello".delete "ej-m" #=> "ho"
9300 *
9301 */
9302
9303static VALUE
9304rb_str_delete(int argc, VALUE *argv, VALUE str)
9305{
9306 str = str_duplicate(rb_cString, str);
9307 rb_str_delete_bang(argc, argv, str);
9308 return str;
9309}
9310
9311
9312/*
9313 * call-seq:
9314 * squeeze!(*selectors) -> self or nil
9315 *
9316 * Like String#squeeze, but modifies +self+ in place.
9317 * Returns +self+ if any changes were made, +nil+ otherwise.
9318 */
9319
9320static VALUE
9321rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
9322{
9323 char squeez[TR_TABLE_SIZE];
9324 rb_encoding *enc = 0;
9325 VALUE del = 0, nodel = 0;
9326 unsigned char *s, *send, *t;
9327 int i, modify = 0;
9328 int ascompat, singlebyte = single_byte_optimizable(str);
9329 unsigned int save;
9330
9331 if (argc == 0) {
9332 enc = STR_ENC_GET(str);
9333 }
9334 else {
9335 for (i=0; i<argc; i++) {
9336 VALUE s = argv[i];
9337
9338 StringValue(s);
9339 enc = rb_enc_check(str, s);
9340 if (singlebyte && !single_byte_optimizable(s))
9341 singlebyte = 0;
9342 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9343 }
9344 }
9345
9346 str_modify_keep_cr(str);
9347 s = t = (unsigned char *)RSTRING_PTR(str);
9348 if (!s || RSTRING_LEN(str) == 0) return Qnil;
9349 send = (unsigned char *)RSTRING_END(str);
9350 save = -1;
9351 ascompat = rb_enc_asciicompat(enc);
9352
9353 if (singlebyte) {
9354 while (s < send) {
9355 unsigned int c = *s++;
9356 if (c != save || (argc > 0 && !squeez[c])) {
9357 *t++ = save = c;
9358 }
9359 }
9360 }
9361 else {
9362 while (s < send) {
9363 unsigned int c;
9364 int clen;
9365
9366 if (ascompat && (c = *s) < 0x80) {
9367 if (c != save || (argc > 0 && !squeez[c])) {
9368 *t++ = save = c;
9369 }
9370 s++;
9371 }
9372 else {
9373 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9374
9375 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9376 if (t != s) rb_enc_mbcput(c, t, enc);
9377 save = c;
9378 t += clen;
9379 }
9380 s += clen;
9381 }
9382 }
9383 }
9384
9385 TERM_FILL((char *)t, TERM_LEN(str));
9386 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9387 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9388 modify = 1;
9389 }
9390
9391 if (modify) return str;
9392 return Qnil;
9393}
9394
9395
9396/*
9397 * call-seq:
9398 * squeeze(*selectors) -> new_string
9399 *
9400 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9401 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9402 *
9403 * "Squeezed" means that each multiple-character run of a selected character
9404 * is squeezed down to a single character;
9405 * with no arguments given, squeezes all characters:
9406 *
9407 * "yellow moon".squeeze #=> "yelow mon"
9408 * " now is the".squeeze(" ") #=> " now is the"
9409 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9410 *
9411 */
9412
9413static VALUE
9414rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9415{
9416 str = str_duplicate(rb_cString, str);
9417 rb_str_squeeze_bang(argc, argv, str);
9418 return str;
9419}
9420
9421
9422/*
9423 * call-seq:
9424 * tr_s!(selector, replacements) -> self or nil
9425 *
9426 * Like String#tr_s, but modifies +self+ in place.
9427 * Returns +self+ if any changes were made, +nil+ otherwise.
9428 *
9429 * Related: String#squeeze!.
9430 */
9431
9432static VALUE
9433rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9434{
9435 return tr_trans(str, src, repl, 1);
9436}
9437
9438
9439/*
9440 * call-seq:
9441 * tr_s(selector, replacements) -> string
9442 *
9443 * Like String#tr, but also squeezes the modified portions of the translated string;
9444 * returns a new string (translated and squeezed).
9445 *
9446 * 'hello'.tr_s('l', 'r') #=> "hero"
9447 * 'hello'.tr_s('el', '-') #=> "h-o"
9448 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9449 *
9450 * Related: String#squeeze.
9451 *
9452 */
9453
9454static VALUE
9455rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9456{
9457 str = str_duplicate(rb_cString, str);
9458 tr_trans(str, src, repl, 1);
9459 return str;
9460}
9461
9462
9463/*
9464 * call-seq:
9465 * count(*selectors) -> integer
9466 *
9467 * Returns the total number of characters in +self+
9468 * that are specified by the given +selectors+
9469 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9470 *
9471 * a = "hello world"
9472 * a.count "lo" #=> 5
9473 * a.count "lo", "o" #=> 2
9474 * a.count "hello", "^l" #=> 4
9475 * a.count "ej-m" #=> 4
9476 *
9477 * "hello^world".count "\\^aeiou" #=> 4
9478 * "hello-world".count "a\\-eo" #=> 4
9479 *
9480 * c = "hello world\\r\\n"
9481 * c.count "\\" #=> 2
9482 * c.count "\\A" #=> 0
9483 * c.count "X-\\w" #=> 3
9484 */
9485
9486static VALUE
9487rb_str_count(int argc, VALUE *argv, VALUE str)
9488{
9489 char table[TR_TABLE_SIZE];
9490 rb_encoding *enc = 0;
9491 VALUE del = 0, nodel = 0, tstr;
9492 char *s, *send;
9493 int i;
9494 int ascompat;
9495 size_t n = 0;
9496
9498
9499 tstr = argv[0];
9500 StringValue(tstr);
9501 enc = rb_enc_check(str, tstr);
9502 if (argc == 1) {
9503 const char *ptstr;
9504 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9505 (ptstr = RSTRING_PTR(tstr),
9506 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9507 !is_broken_string(str)) {
9508 int clen;
9509 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9510
9511 s = RSTRING_PTR(str);
9512 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9513 send = RSTRING_END(str);
9514 while (s < send) {
9515 if (*(unsigned char*)s++ == c) n++;
9516 }
9517 return SIZET2NUM(n);
9518 }
9519 }
9520
9521 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9522 for (i=1; i<argc; i++) {
9523 tstr = argv[i];
9524 StringValue(tstr);
9525 enc = rb_enc_check(str, tstr);
9526 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9527 }
9528
9529 s = RSTRING_PTR(str);
9530 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9531 send = RSTRING_END(str);
9532 ascompat = rb_enc_asciicompat(enc);
9533 while (s < send) {
9534 unsigned int c;
9535
9536 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9537 if (table[c]) {
9538 n++;
9539 }
9540 s++;
9541 }
9542 else {
9543 int clen;
9544 c = rb_enc_codepoint_len(s, send, &clen, enc);
9545 if (tr_find(c, table, del, nodel)) {
9546 n++;
9547 }
9548 s += clen;
9549 }
9550 }
9551
9552 return SIZET2NUM(n);
9553}
9554
9555static VALUE
9556rb_fs_check(VALUE val)
9557{
9558 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9559 val = rb_check_string_type(val);
9560 if (NIL_P(val)) return 0;
9561 }
9562 return val;
9563}
9564
/*
 * Byte-indexed whitespace table: 1 for the six bytes listed below, 0 for
 * every other byte value (including NUL).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

/* Looks +c+ up in isspacetable after reducing it to an unsigned byte. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9585
9586static long
9587split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9588{
9589 if (empty_count >= 0 && len == 0) {
9590 return empty_count + 1;
9591 }
9592 if (empty_count > 0) {
9593 /* make different substrings */
9594 if (result) {
9595 do {
9596 rb_ary_push(result, str_new_empty_String(str));
9597 } while (--empty_count > 0);
9598 }
9599 else {
9600 do {
9601 rb_yield(str_new_empty_String(str));
9602 } while (--empty_count > 0);
9603 }
9604 }
9605 str = rb_str_subseq(str, beg, len);
9606 if (result) {
9607 rb_ary_push(result, str);
9608 }
9609 else {
9610 rb_yield(str);
9611 }
9612 return empty_count;
9613}
9614
/* How String#split interprets its pattern. */
typedef enum {
    SPLIT_TYPE_AWK,     /* pattern is a single space (or no pattern at all) */
    SPLIT_TYPE_STRING,  /* default for a String pattern */
    SPLIT_TYPE_REGEXP,  /* default for a Regexp pattern */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9618
9619static split_type_t
9620literal_split_pattern(VALUE spat, split_type_t default_type)
9621{
9622 rb_encoding *enc = STR_ENC_GET(spat);
9623 const char *ptr;
9624 long len;
9625 RSTRING_GETMEM(spat, ptr, len);
9626 if (len == 0) {
9627 /* Special case - split into chars */
9628 return SPLIT_TYPE_CHARS;
9629 }
9630 else if (rb_enc_asciicompat(enc)) {
9631 if (len == 1 && ptr[0] == ' ') {
9632 return SPLIT_TYPE_AWK;
9633 }
9634 }
9635 else {
9636 int l;
9637 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9638 return SPLIT_TYPE_AWK;
9639 }
9640 }
9641 return default_type;
9642}
9643
9644/*
9645 * call-seq:
9646 * split(field_sep = $;, limit = 0) -> array
9647 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9648 *
9649 * :include: doc/string/split.rdoc
9650 *
9651 */
9652
9653static VALUE
9654rb_str_split_m(int argc, VALUE *argv, VALUE str)
9655{
9656 rb_encoding *enc;
9657 VALUE spat;
9658 VALUE limit;
9659 split_type_t split_type;
9660 long beg, end, i = 0, empty_count = -1;
9661 int lim = 0;
9662 VALUE result, tmp;
9663
9664 result = rb_block_given_p() ? Qfalse : Qnil;
9665 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9666 lim = NUM2INT(limit);
9667 if (lim <= 0) limit = Qnil;
9668 else if (lim == 1) {
9669 if (RSTRING_LEN(str) == 0)
9670 return result ? rb_ary_new2(0) : str;
9671 tmp = str_duplicate(rb_cString, str);
9672 if (!result) {
9673 rb_yield(tmp);
9674 return str;
9675 }
9676 return rb_ary_new3(1, tmp);
9677 }
9678 i = 1;
9679 }
9680 if (NIL_P(limit) && !lim) empty_count = 0;
9681
9682 enc = STR_ENC_GET(str);
9683 split_type = SPLIT_TYPE_REGEXP;
9684 if (!NIL_P(spat)) {
9685 spat = get_pat_quoted(spat, 0);
9686 }
9687 else if (NIL_P(spat = rb_fs)) {
9688 split_type = SPLIT_TYPE_AWK;
9689 }
9690 else if (!(spat = rb_fs_check(spat))) {
9691 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9692 }
9693 else {
9694 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9695 }
9696 if (split_type != SPLIT_TYPE_AWK) {
9697 switch (BUILTIN_TYPE(spat)) {
9698 case T_REGEXP:
9699 rb_reg_options(spat); /* check if uninitialized */
9700 tmp = RREGEXP_SRC(spat);
9701 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9702 if (split_type == SPLIT_TYPE_AWK) {
9703 spat = tmp;
9704 split_type = SPLIT_TYPE_STRING;
9705 }
9706 break;
9707
9708 case T_STRING:
9709 mustnot_broken(spat);
9710 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9711 break;
9712
9713 default:
9715 }
9716 }
9717
9718#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9719
9720 beg = 0;
9721 char *ptr = RSTRING_PTR(str);
9722 char *eptr = RSTRING_END(str);
9723 if (split_type == SPLIT_TYPE_AWK) {
9724 char *bptr = ptr;
9725 int skip = 1;
9726 unsigned int c;
9727
9728 if (result) result = rb_ary_new();
9729 end = beg;
9730 if (is_ascii_string(str)) {
9731 while (ptr < eptr) {
9732 c = (unsigned char)*ptr++;
9733 if (skip) {
9734 if (ascii_isspace(c)) {
9735 beg = ptr - bptr;
9736 }
9737 else {
9738 end = ptr - bptr;
9739 skip = 0;
9740 if (!NIL_P(limit) && lim <= i) break;
9741 }
9742 }
9743 else if (ascii_isspace(c)) {
9744 SPLIT_STR(beg, end-beg);
9745 skip = 1;
9746 beg = ptr - bptr;
9747 if (!NIL_P(limit)) ++i;
9748 }
9749 else {
9750 end = ptr - bptr;
9751 }
9752 }
9753 }
9754 else {
9755 while (ptr < eptr) {
9756 int n;
9757
9758 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9759 ptr += n;
9760 if (skip) {
9761 if (rb_isspace(c)) {
9762 beg = ptr - bptr;
9763 }
9764 else {
9765 end = ptr - bptr;
9766 skip = 0;
9767 if (!NIL_P(limit) && lim <= i) break;
9768 }
9769 }
9770 else if (rb_isspace(c)) {
9771 SPLIT_STR(beg, end-beg);
9772 skip = 1;
9773 beg = ptr - bptr;
9774 if (!NIL_P(limit)) ++i;
9775 }
9776 else {
9777 end = ptr - bptr;
9778 }
9779 }
9780 }
9781 }
9782 else if (split_type == SPLIT_TYPE_STRING) {
9783 char *str_start = ptr;
9784 char *substr_start = ptr;
9785 char *sptr = RSTRING_PTR(spat);
9786 long slen = RSTRING_LEN(spat);
9787
9788 if (result) result = rb_ary_new();
9789 mustnot_broken(str);
9790 enc = rb_enc_check(str, spat);
9791 while (ptr < eptr &&
9792 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9793 /* Check we are at the start of a char */
9794 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9795 if (t != ptr + end) {
9796 ptr = t;
9797 continue;
9798 }
9799 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9800 ptr += end + slen;
9801 substr_start = ptr;
9802 if (!NIL_P(limit) && lim <= ++i) break;
9803 }
9804 beg = ptr - str_start;
9805 }
9806 else if (split_type == SPLIT_TYPE_CHARS) {
9807 char *str_start = ptr;
9808 int n;
9809
9810 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9811 mustnot_broken(str);
9812 enc = rb_enc_get(str);
9813 while (ptr < eptr &&
9814 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9815 SPLIT_STR(ptr - str_start, n);
9816 ptr += n;
9817 if (!NIL_P(limit) && lim <= ++i) break;
9818 }
9819 beg = ptr - str_start;
9820 }
9821 else {
9822 if (result) result = rb_ary_new();
9823 long len = RSTRING_LEN(str);
9824 long start = beg;
9825 long idx;
9826 int last_null = 0;
9827 struct re_registers *regs;
9828 VALUE match = 0;
9829
9830 for (; rb_reg_search(spat, str, start, 0) >= 0;
9831 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9832 match = rb_backref_get();
9833 if (!result) rb_match_busy(match);
9834 regs = RMATCH_REGS(match);
9835 end = BEG(0);
9836 if (start == end && BEG(0) == END(0)) {
9837 if (!ptr) {
9838 SPLIT_STR(0, 0);
9839 break;
9840 }
9841 else if (last_null == 1) {
9842 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9843 beg = start;
9844 }
9845 else {
9846 if (start == len)
9847 start++;
9848 else
9849 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9850 last_null = 1;
9851 continue;
9852 }
9853 }
9854 else {
9855 SPLIT_STR(beg, end-beg);
9856 beg = start = END(0);
9857 }
9858 last_null = 0;
9859
9860 for (idx=1; idx < regs->num_regs; idx++) {
9861 if (BEG(idx) == -1) continue;
9862 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9863 }
9864 if (!NIL_P(limit) && lim <= ++i) break;
9865 }
9866 if (match) rb_match_unbusy(match);
9867 }
9868 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9869 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9870 }
9871
9872 return result ? result : str;
9873}
9874
9875VALUE
9876rb_str_split(VALUE str, const char *sep0)
9877{
9878 VALUE sep;
9879
9880 StringValue(str);
9881 sep = rb_str_new_cstr(sep0);
9882 return rb_str_split_m(1, &sep, str);
9883}
9884
/* Evaluates to 0 when a block is given, else to a new array with capacity +size+; +m+ is unused. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
9886
9887static inline int
9888enumerator_element(VALUE ary, VALUE e)
9889{
9890 if (ary) {
9891 rb_ary_push(ary, e);
9892 return 0;
9893 }
9894 else {
9895 rb_yield(e);
9896 return 1;
9897 }
9898}
9899
9900#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9901
9902static const char *
9903chomp_newline(const char *p, const char *e, rb_encoding *enc)
9904{
9905 const char *prev = rb_enc_prev_char(p, e, e, enc);
9906 if (rb_enc_is_newline(prev, e, enc)) {
9907 e = prev;
9908 prev = rb_enc_prev_char(p, e, e, enc);
9909 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9910 e = prev;
9911 }
9912 return e;
9913}
9914
9915static VALUE
9916get_rs(void)
9917{
9918 VALUE rs = rb_rs;
9919 if (!NIL_P(rs) &&
9920 (!RB_TYPE_P(rs, T_STRING) ||
9921 RSTRING_LEN(rs) != 1 ||
9922 RSTRING_PTR(rs)[0] != '\n')) {
9923 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9924 }
9925 return rs;
9926}
9927
9928#define rb_rs get_rs()
9929
9930static VALUE
9931rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9932{
9933 rb_encoding *enc;
9934 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9935 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9936 long pos, len, rslen;
9937 int rsnewline = 0;
9938
9939 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9940 rs = rb_rs;
9941 if (!NIL_P(opts)) {
9942 static ID keywords[1];
9943 if (!keywords[0]) {
9944 keywords[0] = rb_intern_const("chomp");
9945 }
9946 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9947 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9948 }
9949
9950 if (NIL_P(rs)) {
9951 if (!ENUM_ELEM(ary, str)) {
9952 return ary;
9953 }
9954 else {
9955 return orig;
9956 }
9957 }
9958
9959 if (!RSTRING_LEN(str)) goto end;
9960 str = rb_str_new_frozen(str);
9961 ptr = subptr = RSTRING_PTR(str);
9962 pend = RSTRING_END(str);
9963 len = RSTRING_LEN(str);
9964 StringValue(rs);
9965 rslen = RSTRING_LEN(rs);
9966
9967 if (rs == rb_default_rs)
9968 enc = rb_enc_get(str);
9969 else
9970 enc = rb_enc_check(str, rs);
9971
9972 if (rslen == 0) {
9973 /* paragraph mode */
9974 int n;
9975 const char *eol = NULL;
9976 subend = subptr;
9977 while (subend < pend) {
9978 long chomp_rslen = 0;
9979 do {
9980 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9981 n = 0;
9982 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9983 if (rb_enc_is_newline(subend + n, pend, enc)) {
9984 if (eol == subend) break;
9985 subend += rslen;
9986 if (subptr) {
9987 eol = subend;
9988 chomp_rslen = -rslen;
9989 }
9990 }
9991 else {
9992 if (!subptr) subptr = subend;
9993 subend += rslen;
9994 }
9995 rslen = 0;
9996 } while (subend < pend);
9997 if (!subptr) break;
9998 if (rslen == 0) chomp_rslen = 0;
9999 line = rb_str_subseq(str, subptr - ptr,
10000 subend - subptr + (chomp ? chomp_rslen : rslen));
10001 if (ENUM_ELEM(ary, line)) {
10002 str_mod_check(str, ptr, len);
10003 }
10004 subptr = eol = NULL;
10005 }
10006 goto end;
10007 }
10008 else {
10009 rsptr = RSTRING_PTR(rs);
10010 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
10011 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
10012 rsnewline = 1;
10013 }
10014 }
10015
10016 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
10017 rs = rb_str_new(rsptr, rslen);
10018 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
10019 rsptr = RSTRING_PTR(rs);
10020 rslen = RSTRING_LEN(rs);
10021 }
10022
10023 while (subptr < pend) {
10024 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
10025 if (pos < 0) break;
10026 hit = subptr + pos;
10027 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
10028 if (hit != adjusted) {
10029 subptr = adjusted;
10030 continue;
10031 }
10032 subend = hit += rslen;
10033 if (chomp) {
10034 if (rsnewline) {
10035 subend = chomp_newline(subptr, subend, enc);
10036 }
10037 else {
10038 subend -= rslen;
10039 }
10040 }
10041 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
10042 if (ENUM_ELEM(ary, line)) {
10043 str_mod_check(str, ptr, len);
10044 }
10045 subptr = hit;
10046 }
10047
10048 if (subptr != pend) {
10049 if (chomp) {
10050 if (rsnewline) {
10051 pend = chomp_newline(subptr, pend, enc);
10052 }
10053 else if (pend - subptr >= rslen &&
10054 memcmp(pend - rslen, rsptr, rslen) == 0) {
10055 pend -= rslen;
10056 }
10057 }
10058 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
10059 ENUM_ELEM(ary, line);
10060 RB_GC_GUARD(str);
10061 }
10062
10063 end:
10064 if (ary)
10065 return ary;
10066 else
10067 return orig;
10068}
10069
10070/*
10071 * call-seq:
10072 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
10073 * each_line(line_sep = $/, chomp: false) -> enumerator
10074 *
10075 * :include: doc/string/each_line.rdoc
10076 *
10077 */
10078
10079static VALUE
10080rb_str_each_line(int argc, VALUE *argv, VALUE str)
10081{
10082 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
10083 return rb_str_enumerate_lines(argc, argv, str, 0);
10084}
10085
10086/*
10087 * call-seq:
10088 * lines(Line_sep = $/, chomp: false) -> array_of_strings
10089 *
10090 * Forms substrings ("lines") of +self+ according to the given arguments
10091 * (see String#each_line for details); returns the lines in an array.
10092 *
10093 */
10094
10095static VALUE
10096rb_str_lines(int argc, VALUE *argv, VALUE str)
10097{
10098 VALUE ary = WANTARRAY("lines", 0);
10099 return rb_str_enumerate_lines(argc, argv, str, ary);
10100}
10101
10102static VALUE
10103rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
10104{
10105 return LONG2FIX(RSTRING_LEN(str));
10106}
10107
10108static VALUE
10109rb_str_enumerate_bytes(VALUE str, VALUE ary)
10110{
10111 long i;
10112
10113 for (i=0; i<RSTRING_LEN(str); i++) {
10114 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
10115 }
10116 if (ary)
10117 return ary;
10118 else
10119 return str;
10120}
10121
10122/*
10123 * call-seq:
10124 * each_byte {|byte| ... } -> self
10125 * each_byte -> enumerator
10126 *
10127 * :include: doc/string/each_byte.rdoc
10128 *
10129 */
10130
10131static VALUE
10132rb_str_each_byte(VALUE str)
10133{
10134 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
10135 return rb_str_enumerate_bytes(str, 0);
10136}
10137
10138/*
10139 * call-seq:
10140 * bytes -> array_of_bytes
10141 *
10142 * :include: doc/string/bytes.rdoc
10143 *
10144 */
10145
10146static VALUE
10147rb_str_bytes(VALUE str)
10148{
10149 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
10150 return rb_str_enumerate_bytes(str, ary);
10151}
10152
10153static VALUE
10154rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
10155{
10156 return rb_str_length(str);
10157}
10158
10159static VALUE
10160rb_str_enumerate_chars(VALUE str, VALUE ary)
10161{
10162 VALUE orig = str;
10163 long i, len, n;
10164 const char *ptr;
10165 rb_encoding *enc;
10166
10167 str = rb_str_new_frozen(str);
10168 ptr = RSTRING_PTR(str);
10169 len = RSTRING_LEN(str);
10170 enc = rb_enc_get(str);
10171
10173 for (i = 0; i < len; i += n) {
10174 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
10175 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10176 }
10177 }
10178 else {
10179 for (i = 0; i < len; i += n) {
10180 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
10181 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10182 }
10183 }
10184 RB_GC_GUARD(str);
10185 if (ary)
10186 return ary;
10187 else
10188 return orig;
10189}
10190
10191/*
10192 * call-seq:
10193 * each_char {|c| ... } -> self
10194 * each_char -> enumerator
10195 *
10196 * :include: doc/string/each_char.rdoc
10197 *
10198 */
10199
10200static VALUE
10201rb_str_each_char(VALUE str)
10202{
10203 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10204 return rb_str_enumerate_chars(str, 0);
10205}
10206
10207/*
10208 * call-seq:
10209 * chars -> array_of_characters
10210 *
10211 * :include: doc/string/chars.rdoc
10212 *
10213 */
10214
10215static VALUE
10216rb_str_chars(VALUE str)
10217{
10218 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
10219 return rb_str_enumerate_chars(str, ary);
10220}
10221
10222static VALUE
10223rb_str_enumerate_codepoints(VALUE str, VALUE ary)
10224{
10225 VALUE orig = str;
10226 int n;
10227 unsigned int c;
10228 const char *ptr, *end;
10229 rb_encoding *enc;
10230
10231 if (single_byte_optimizable(str))
10232 return rb_str_enumerate_bytes(str, ary);
10233
10234 str = rb_str_new_frozen(str);
10235 ptr = RSTRING_PTR(str);
10236 end = RSTRING_END(str);
10237 enc = STR_ENC_GET(str);
10238
10239 while (ptr < end) {
10240 c = rb_enc_codepoint_len(ptr, end, &n, enc);
10241 ENUM_ELEM(ary, UINT2NUM(c));
10242 ptr += n;
10243 }
10244 RB_GC_GUARD(str);
10245 if (ary)
10246 return ary;
10247 else
10248 return orig;
10249}
10250
10251/*
10252 * call-seq:
10253 * each_codepoint {|integer| ... } -> self
10254 * each_codepoint -> enumerator
10255 *
10256 * :include: doc/string/each_codepoint.rdoc
10257 *
10258 */
10259
10260static VALUE
10261rb_str_each_codepoint(VALUE str)
10262{
10263 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10264 return rb_str_enumerate_codepoints(str, 0);
10265}
10266
10267/*
10268 * call-seq:
10269 * codepoints -> array_of_integers
10270 *
10271 * :include: doc/string/codepoints.rdoc
10272 *
10273 */
10274
10275static VALUE
10276rb_str_codepoints(VALUE str)
10277{
10278 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
10279 return rb_str_enumerate_codepoints(str, ary);
10280}
10281
10282static regex_t *
10283get_reg_grapheme_cluster(rb_encoding *enc)
10284{
10285 int encidx = rb_enc_to_index(enc);
10286
10287 const OnigUChar source_ascii[] = "\\X";
10288 const OnigUChar *source = source_ascii;
10289 size_t source_len = sizeof(source_ascii) - 1;
10290
10291 switch (encidx) {
10292#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
10293#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
10294#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
10295#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
10296#define CASE_UTF(e) \
10297 case ENCINDEX_UTF_##e: { \
10298 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
10299 source = source_UTF_##e; \
10300 source_len = sizeof(source_UTF_##e); \
10301 break; \
10302 }
10303 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
10304#undef CASE_UTF
10305#undef CHARS_16BE
10306#undef CHARS_16LE
10307#undef CHARS_32BE
10308#undef CHARS_32LE
10309 }
10310
10311 regex_t *reg_grapheme_cluster;
10312 OnigErrorInfo einfo;
10313 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
10314 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
10315 if (r) {
10316 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
10317 onig_error_code_to_str(message, r, &einfo);
10318 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
10319 }
10320
10321 return reg_grapheme_cluster;
10322}
10323
10324static regex_t *
10325get_cached_reg_grapheme_cluster(rb_encoding *enc)
10326{
10327 int encidx = rb_enc_to_index(enc);
10328 static regex_t *reg_grapheme_cluster_utf8 = NULL;
10329
10330 if (encidx == rb_utf8_encindex()) {
10331 if (!reg_grapheme_cluster_utf8) {
10332 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
10333 }
10334
10335 return reg_grapheme_cluster_utf8;
10336 }
10337
10338 return NULL;
10339}
10340
10341static VALUE
10342rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10343{
10344 size_t grapheme_cluster_count = 0;
10345 rb_encoding *enc = get_encoding(str);
10346 const char *ptr, *end;
10347
10348 if (!rb_enc_unicode_p(enc)) {
10349 return rb_str_length(str);
10350 }
10351
10352 bool cached_reg_grapheme_cluster = true;
10353 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10354 if (!reg_grapheme_cluster) {
10355 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10356 cached_reg_grapheme_cluster = false;
10357 }
10358
10359 ptr = RSTRING_PTR(str);
10360 end = RSTRING_END(str);
10361
10362 while (ptr < end) {
10363 OnigPosition len = onig_match(reg_grapheme_cluster,
10364 (const OnigUChar *)ptr, (const OnigUChar *)end,
10365 (const OnigUChar *)ptr, NULL, 0);
10366 if (len <= 0) break;
10367 grapheme_cluster_count++;
10368 ptr += len;
10369 }
10370
10371 if (!cached_reg_grapheme_cluster) {
10372 onig_free(reg_grapheme_cluster);
10373 }
10374
10375 return SIZET2NUM(grapheme_cluster_count);
10376}
10377
10378static VALUE
10379rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10380{
10381 VALUE orig = str;
10382 rb_encoding *enc = get_encoding(str);
10383 const char *ptr0, *ptr, *end;
10384
10385 if (!rb_enc_unicode_p(enc)) {
10386 return rb_str_enumerate_chars(str, ary);
10387 }
10388
10389 if (!ary) str = rb_str_new_frozen(str);
10390
10391 bool cached_reg_grapheme_cluster = true;
10392 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10393 if (!reg_grapheme_cluster) {
10394 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10395 cached_reg_grapheme_cluster = false;
10396 }
10397
10398 ptr0 = ptr = RSTRING_PTR(str);
10399 end = RSTRING_END(str);
10400
10401 while (ptr < end) {
10402 OnigPosition len = onig_match(reg_grapheme_cluster,
10403 (const OnigUChar *)ptr, (const OnigUChar *)end,
10404 (const OnigUChar *)ptr, NULL, 0);
10405 if (len <= 0) break;
10406 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10407 ptr += len;
10408 }
10409
10410 if (!cached_reg_grapheme_cluster) {
10411 onig_free(reg_grapheme_cluster);
10412 }
10413
10414 RB_GC_GUARD(str);
10415 if (ary)
10416 return ary;
10417 else
10418 return orig;
10419}
10420
10421/*
10422 * call-seq:
10423 * each_grapheme_cluster {|gc| ... } -> self
10424 * each_grapheme_cluster -> enumerator
10425 *
10426 * :include: doc/string/each_grapheme_cluster.rdoc
10427 *
10428 */
10429
10430static VALUE
10431rb_str_each_grapheme_cluster(VALUE str)
10432{
10433 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10434 return rb_str_enumerate_grapheme_clusters(str, 0);
10435}
10436
10437/*
10438 * call-seq:
10439 * grapheme_clusters -> array_of_grapheme_clusters
10440 *
10441 * :include: doc/string/grapheme_clusters.rdoc
10442 *
10443 */
10444
10445static VALUE
10446rb_str_grapheme_clusters(VALUE str)
10447{
10448 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10449 return rb_str_enumerate_grapheme_clusters(str, ary);
10450}
10451
10452static long
10453chopped_length(VALUE str)
10454{
10455 rb_encoding *enc = STR_ENC_GET(str);
10456 const char *p, *p2, *beg, *end;
10457
10458 beg = RSTRING_PTR(str);
10459 end = beg + RSTRING_LEN(str);
10460 if (beg >= end) return 0;
10461 p = rb_enc_prev_char(beg, end, end, enc);
10462 if (!p) return 0;
10463 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10464 p2 = rb_enc_prev_char(beg, p, end, enc);
10465 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10466 }
10467 return p - beg;
10468}
10469
10470/*
10471 * call-seq:
10472 * chop! -> self or nil
10473 *
10474 * Like String#chop, but modifies +self+ in place;
10475 * returns +nil+ if +self+ is empty, +self+ otherwise.
10476 *
10477 * Related: String#chomp!.
10478 */
10479
10480static VALUE
10481rb_str_chop_bang(VALUE str)
10482{
10483 str_modify_keep_cr(str);
10484 if (RSTRING_LEN(str) > 0) {
10485 long len;
10486 len = chopped_length(str);
10487 STR_SET_LEN(str, len);
10488 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10489 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10491 }
10492 return str;
10493 }
10494 return Qnil;
10495}
10496
10497
10498/*
10499 * call-seq:
10500 * chop -> new_string
10501 *
10502 * :include: doc/string/chop.rdoc
10503 *
10504 */
10505
10506static VALUE
10507rb_str_chop(VALUE str)
10508{
10509 return rb_str_subseq(str, 0, chopped_length(str));
10510}
10511
10512static long
10513smart_chomp(VALUE str, const char *e, const char *p)
10514{
10515 rb_encoding *enc = rb_enc_get(str);
10516 if (rb_enc_mbminlen(enc) > 1) {
10517 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10518 if (rb_enc_is_newline(pp, e, enc)) {
10519 e = pp;
10520 }
10521 pp = e - rb_enc_mbminlen(enc);
10522 if (pp >= p) {
10523 pp = rb_enc_left_char_head(p, pp, e, enc);
10524 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10525 e = pp;
10526 }
10527 }
10528 }
10529 else {
10530 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10531 case '\n':
10532 if (--e > p && *(e-1) == '\r') {
10533 --e;
10534 }
10535 break;
10536 case '\r':
10537 --e;
10538 break;
10539 }
10540 }
10541 return e - p;
10542}
10543
10544static long
10545chompped_length(VALUE str, VALUE rs)
10546{
10547 rb_encoding *enc;
10548 int newline;
10549 char *pp, *e, *rsptr;
10550 long rslen;
10551 char *const p = RSTRING_PTR(str);
10552 long len = RSTRING_LEN(str);
10553
10554 if (len == 0) return 0;
10555 e = p + len;
10556 if (rs == rb_default_rs) {
10557 return smart_chomp(str, e, p);
10558 }
10559
10560 enc = rb_enc_get(str);
10561 RSTRING_GETMEM(rs, rsptr, rslen);
10562 if (rslen == 0) {
10563 if (rb_enc_mbminlen(enc) > 1) {
10564 while (e > p) {
10565 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10566 if (!rb_enc_is_newline(pp, e, enc)) break;
10567 e = pp;
10568 pp -= rb_enc_mbminlen(enc);
10569 if (pp >= p) {
10570 pp = rb_enc_left_char_head(p, pp, e, enc);
10571 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10572 e = pp;
10573 }
10574 }
10575 }
10576 }
10577 else {
10578 while (e > p && *(e-1) == '\n') {
10579 --e;
10580 if (e > p && *(e-1) == '\r')
10581 --e;
10582 }
10583 }
10584 return e - p;
10585 }
10586 if (rslen > len) return len;
10587
10588 enc = rb_enc_get(rs);
10589 newline = rsptr[rslen-1];
10590 if (rslen == rb_enc_mbminlen(enc)) {
10591 if (rslen == 1) {
10592 if (newline == '\n')
10593 return smart_chomp(str, e, p);
10594 }
10595 else {
10596 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10597 return smart_chomp(str, e, p);
10598 }
10599 }
10600
10601 enc = rb_enc_check(str, rs);
10602 if (is_broken_string(rs)) {
10603 return len;
10604 }
10605 pp = e - rslen;
10606 if (p[len-1] == newline &&
10607 (rslen <= 1 ||
10608 memcmp(rsptr, pp, rslen) == 0)) {
10609 if (at_char_boundary(p, pp, e, enc))
10610 return len - rslen;
10611 RB_GC_GUARD(rs);
10612 }
10613 return len;
10614}
10615
10621static VALUE
10622chomp_rs(int argc, const VALUE *argv)
10623{
10624 rb_check_arity(argc, 0, 1);
10625 if (argc > 0) {
10626 VALUE rs = argv[0];
10627 if (!NIL_P(rs)) StringValue(rs);
10628 return rs;
10629 }
10630 else {
10631 return rb_rs;
10632 }
10633}
10634
10635VALUE
10636rb_str_chomp_string(VALUE str, VALUE rs)
10637{
10638 long olen = RSTRING_LEN(str);
10639 long len = chompped_length(str, rs);
10640 if (len >= olen) return Qnil;
10641 str_modify_keep_cr(str);
10642 STR_SET_LEN(str, len);
10643 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10644 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10646 }
10647 return str;
10648}
10649
10650/*
10651 * call-seq:
10652 * chomp!(line_sep = $/) -> self or nil
10653 *
10654 * Like String#chomp, but modifies +self+ in place;
10655 * returns +nil+ if no modification made, +self+ otherwise.
10656 *
10657 */
10658
10659static VALUE
10660rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10661{
10662 VALUE rs;
10663 str_modifiable(str);
10664 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10665 rs = chomp_rs(argc, argv);
10666 if (NIL_P(rs)) return Qnil;
10667 return rb_str_chomp_string(str, rs);
10668}
10669
10670
10671/*
10672 * call-seq:
10673 * chomp(line_sep = $/) -> new_string
10674 *
10675 * :include: doc/string/chomp.rdoc
10676 *
10677 */
10678
10679static VALUE
10680rb_str_chomp(int argc, VALUE *argv, VALUE str)
10681{
10682 VALUE rs = chomp_rs(argc, argv);
10683 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10684 return rb_str_subseq(str, 0, chompped_length(str, rs));
10685}
10686
10687static long
10688lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10689{
10690 const char *const start = s;
10691
10692 if (!s || s >= e) return 0;
10693
10694 /* remove spaces at head */
10695 if (single_byte_optimizable(str)) {
10696 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10697 }
10698 else {
10699 while (s < e) {
10700 int n;
10701 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10702
10703 if (cc && !rb_isspace(cc)) break;
10704 s += n;
10705 }
10706 }
10707 return s - start;
10708}
10709
10710/*
10711 * call-seq:
10712 * lstrip! -> self or nil
10713 *
10714 * Like String#lstrip, except that any modifications are made in +self+;
10715 * returns +self+ if any modification are made, +nil+ otherwise.
10716 *
10717 * Related: String#rstrip!, String#strip!.
10718 */
10719
10720static VALUE
10721rb_str_lstrip_bang(VALUE str)
10722{
10723 rb_encoding *enc;
10724 char *start, *s;
10725 long olen, loffset;
10726
10727 str_modify_keep_cr(str);
10728 enc = STR_ENC_GET(str);
10729 RSTRING_GETMEM(str, start, olen);
10730 loffset = lstrip_offset(str, start, start+olen, enc);
10731 if (loffset > 0) {
10732 long len = olen-loffset;
10733 s = start + loffset;
10734 memmove(start, s, len);
10735 STR_SET_LEN(str, len);
10736 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10737 return str;
10738 }
10739 return Qnil;
10740}
10741
10742
10743/*
10744 * call-seq:
10745 * lstrip -> new_string
10746 *
10747 * Returns a copy of +self+ with leading whitespace removed;
10748 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10749 *
10750 * whitespace = "\x00\t\n\v\f\r "
10751 * s = whitespace + 'abc' + whitespace
10752 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10753 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10754 *
10755 * Related: String#rstrip, String#strip.
10756 */
10757
10758static VALUE
10759rb_str_lstrip(VALUE str)
10760{
10761 char *start;
10762 long len, loffset;
10763 RSTRING_GETMEM(str, start, len);
10764 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10765 if (loffset <= 0) return str_duplicate(rb_cString, str);
10766 return rb_str_subseq(str, loffset, len - loffset);
10767}
10768
10769static long
10770rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10771{
10772 const char *t;
10773
10774 rb_str_check_dummy_enc(enc);
10776 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10777 }
10778 if (!s || s >= e) return 0;
10779 t = e;
10780
10781 /* remove trailing spaces or '\0's */
10782 if (single_byte_optimizable(str)) {
10783 unsigned char c;
10784 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10785 }
10786 else {
10787 char *tp;
10788
10789 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10790 unsigned int c = rb_enc_codepoint(tp, e, enc);
10791 if (c && !rb_isspace(c)) break;
10792 t = tp;
10793 }
10794 }
10795 return e - t;
10796}
10797
10798/*
10799 * call-seq:
10800 * rstrip! -> self or nil
10801 *
10802 * Like String#rstrip, except that any modifications are made in +self+;
10803 * returns +self+ if any modification are made, +nil+ otherwise.
10804 *
10805 * Related: String#lstrip!, String#strip!.
10806 */
10807
10808static VALUE
10809rb_str_rstrip_bang(VALUE str)
10810{
10811 rb_encoding *enc;
10812 char *start;
10813 long olen, roffset;
10814
10815 str_modify_keep_cr(str);
10816 enc = STR_ENC_GET(str);
10817 RSTRING_GETMEM(str, start, olen);
10818 roffset = rstrip_offset(str, start, start+olen, enc);
10819 if (roffset > 0) {
10820 long len = olen - roffset;
10821
10822 STR_SET_LEN(str, len);
10823 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10824 return str;
10825 }
10826 return Qnil;
10827}
10828
10829
10830/*
10831 * call-seq:
10832 * rstrip -> new_string
10833 *
10834 * Returns a copy of the receiver with trailing whitespace removed;
10835 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10836 *
10837 * whitespace = "\x00\t\n\v\f\r "
10838 * s = whitespace + 'abc' + whitespace
10839 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10840 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10841 *
10842 * Related: String#lstrip, String#strip.
10843 */
10844
10845static VALUE
10846rb_str_rstrip(VALUE str)
10847{
10848 rb_encoding *enc;
10849 char *start;
10850 long olen, roffset;
10851
10852 enc = STR_ENC_GET(str);
10853 RSTRING_GETMEM(str, start, olen);
10854 roffset = rstrip_offset(str, start, start+olen, enc);
10855
10856 if (roffset <= 0) return str_duplicate(rb_cString, str);
10857 return rb_str_subseq(str, 0, olen-roffset);
10858}
10859
10860
10861/*
10862 * call-seq:
10863 * strip! -> self or nil
10864 *
10865 * Like String#strip, except that any modifications are made in +self+;
10866 * returns +self+ if any modification are made, +nil+ otherwise.
10867 *
10868 * Related: String#lstrip!, String#strip!.
10869 */
10870
10871static VALUE
10872rb_str_strip_bang(VALUE str)
10873{
10874 char *start;
10875 long olen, loffset, roffset;
10876 rb_encoding *enc;
10877
10878 str_modify_keep_cr(str);
10879 enc = STR_ENC_GET(str);
10880 RSTRING_GETMEM(str, start, olen);
10881 loffset = lstrip_offset(str, start, start+olen, enc);
10882 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10883
10884 if (loffset > 0 || roffset > 0) {
10885 long len = olen-roffset;
10886 if (loffset > 0) {
10887 len -= loffset;
10888 memmove(start, start + loffset, len);
10889 }
10890 STR_SET_LEN(str, len);
10891 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10892 return str;
10893 }
10894 return Qnil;
10895}
10896
10897
10898/*
10899 * call-seq:
10900 * strip -> new_string
10901 *
10902 * Returns a copy of the receiver with leading and trailing whitespace removed;
10903 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10904 *
10905 * whitespace = "\x00\t\n\v\f\r "
10906 * s = whitespace + 'abc' + whitespace
10907 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10908 * s.strip # => "abc"
10909 *
10910 * Related: String#lstrip, String#rstrip.
10911 */
10912
10913static VALUE
10914rb_str_strip(VALUE str)
10915{
10916 char *start;
10917 long olen, loffset, roffset;
10918 rb_encoding *enc = STR_ENC_GET(str);
10919
10920 RSTRING_GETMEM(str, start, olen);
10921 loffset = lstrip_offset(str, start, start+olen, enc);
10922 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10923
10924 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10925 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10926}
10927
10928static VALUE
10929scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10930{
10931 VALUE result = Qnil;
10932 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10933 if (pos >= 0) {
10934 VALUE match;
10935 struct re_registers *regs;
10936 if (BUILTIN_TYPE(pat) == T_STRING) {
10937 regs = NULL;
10938 end = pos + RSTRING_LEN(pat);
10939 }
10940 else {
10941 match = rb_backref_get();
10942 regs = RMATCH_REGS(match);
10943 pos = BEG(0);
10944 end = END(0);
10945 }
10946
10947 if (pos == end) {
10948 rb_encoding *enc = STR_ENC_GET(str);
10949 /*
10950 * Always consume at least one character of the input string
10951 */
10952 if (RSTRING_LEN(str) > end)
10953 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10954 RSTRING_END(str), enc);
10955 else
10956 *start = end + 1;
10957 }
10958 else {
10959 *start = end;
10960 }
10961
10962 if (!regs || regs->num_regs == 1) {
10963 result = rb_str_subseq(str, pos, end - pos);
10964 return result;
10965 }
10966 else {
10967 result = rb_ary_new2(regs->num_regs);
10968 for (int i = 1; i < regs->num_regs; i++) {
10969 VALUE s = Qnil;
10970 if (BEG(i) >= 0) {
10971 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10972 }
10973
10974 rb_ary_push(result, s);
10975 }
10976 }
10977
10978 RB_GC_GUARD(match);
10979 }
10980
10981 return result;
10982}
10983
10984
10985/*
10986 * call-seq:
10987 * scan(string_or_regexp) -> array
10988 * scan(string_or_regexp) {|matches| ... } -> self
10989 *
10990 * Matches a pattern against +self+; the pattern is:
10991 *
10992 * - +string_or_regexp+ itself, if it is a Regexp.
10993 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10994 *
10995 * Iterates through +self+, generating a collection of matching results:
10996 *
10997 * - If the pattern contains no groups, each result is the
10998 * matched string, <code>$&</code>.
10999 * - If the pattern contains groups, each result is an array
11000 * containing one entry per group.
11001 *
11002 * With no block given, returns an array of the results:
11003 *
11004 * s = 'cruel world'
11005 * s.scan(/\w+/) # => ["cruel", "world"]
11006 * s.scan(/.../) # => ["cru", "el ", "wor"]
11007 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
11008 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
11009 *
11010 * With a block given, calls the block with each result; returns +self+:
11011 *
11012 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
11013 * print "\n"
11014 * s.scan(/(.)(.)/) {|x,y| print y, x }
11015 * print "\n"
11016 *
11017 * Output:
11018 *
11019 * <<cruel>> <<world>>
11020 * rceu lowlr
11021 *
11022 */
11023
11024static VALUE
11025rb_str_scan(VALUE str, VALUE pat)
11026{
11027 VALUE result;
11028 long start = 0;
11029 long last = -1, prev = 0;
11030 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
11031
11032 pat = get_pat_quoted(pat, 1);
11033 mustnot_broken(str);
11034 if (!rb_block_given_p()) {
11035 VALUE ary = rb_ary_new();
11036
11037 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
11038 last = prev;
11039 prev = start;
11040 rb_ary_push(ary, result);
11041 }
11042 if (last >= 0) rb_pat_search(pat, str, last, 1);
11043 else rb_backref_set(Qnil);
11044 return ary;
11045 }
11046
11047 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
11048 last = prev;
11049 prev = start;
11050 rb_yield(result);
11051 str_mod_check(str, p, len);
11052 }
11053 if (last >= 0) rb_pat_search(pat, str, last, 1);
11054 return str;
11055}
11056
11057
11058/*
11059 * call-seq:
11060 * hex -> integer
11061 *
11062 * Interprets the leading substring of +self+ as a string of hexadecimal digits
11063 * (with an optional sign and an optional <code>0x</code>) and returns the
11064 * corresponding number;
11065 * returns zero if there is no such leading substring:
11066 *
11067 * '0x0a'.hex # => 10
11068 * '-1234'.hex # => -4660
11069 * '0'.hex # => 0
11070 * 'non-numeric'.hex # => 0
11071 *
11072 * Related: String#oct.
11073 *
11074 */
11075
11076static VALUE
11077rb_str_hex(VALUE str)
11078{
11079 return rb_str_to_inum(str, 16, FALSE);
11080}
11081
11082
11083/*
11084 * call-seq:
11085 * oct -> integer
11086 *
11087 * Interprets the leading substring of +self+ as a string of octal digits
11088 * (with an optional sign) and returns the corresponding number;
11089 * returns zero if there is no such leading substring:
11090 *
11091 * '123'.oct # => 83
11092 * '-377'.oct # => -255
11093 * '0377non-numeric'.oct # => 255
11094 * 'non-numeric'.oct # => 0
11095 *
11096 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
11097 * see Kernel#Integer.
11098 *
11099 * Related: String#hex.
11100 *
11101 */
11102
11103static VALUE
11104rb_str_oct(VALUE str)
11105{
11106 return rb_str_to_inum(str, -8, FALSE);
11107}
11108
11109#ifndef HAVE_CRYPT_R
11110# include "ruby/thread_native.h"
11111# include "ruby/atomic.h"
11112
11113static struct {
11114 rb_nativethread_lock_t lock;
11115} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
11116
11117static void
11118crypt_mutex_initialize(void)
11119{
11120}
11121#endif
11122
11123/*
11124 * call-seq:
11125 * crypt(salt_str) -> new_string
11126 *
11127 * Returns the string generated by calling <code>crypt(3)</code>
11128 * standard library function with <code>str</code> and
11129 * <code>salt_str</code>, in this order, as its arguments. Please do
11130 * not use this method any longer. It is legacy; provided only for
11131 * backward compatibility with ruby scripts in earlier days. It is
11132 * bad to use in contemporary programs for several reasons:
11133 *
11134 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
11135 * run on. The generated string lacks data portability.
11136 *
11137 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
11138 * (i.e. silently ends up in unexpected results).
11139 *
11140 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
11141 * thread safe.
11142 *
11143 * * So-called "traditional" usage of <code>crypt(3)</code> is very
11144 * very very weak. According to its manpage, Linux's traditional
11145 * <code>crypt(3)</code> output has only 2**56 variations; too
11146 * easy to brute force today. And this is the default behaviour.
11147 *
11148 * * In order to make things robust some OSes implement so-called
11149 * "modular" usage. To go through, you have to do a complex
11150 * build-up of the <code>salt_str</code> parameter, by hand.
11151 * Failure in generation of a proper salt string tends not to
11152 * yield any errors; typos in parameters are normally not
11153 * detectable.
11154 *
11155 * * For instance, in the following example, the second invocation
11156 * of String#crypt is wrong; it has a typo in "round=" (lacks
11157 * "s"). However the call does not fail and something unexpected
11158 * is generated.
11159 *
11160 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
11161 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
11162 *
11163 * * Even in the "modular" mode, some hash functions are considered
11164 * archaic and no longer recommended at all; for instance module
11165 * <code>$1$</code> is officially abandoned by its author: see
11166 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
11167 * instance module <code>$3$</code> is considered completely
11168 * broken: see the manpage of FreeBSD.
11169 *
11170 * * On some OS such as Mac OS, there is no modular mode. Yet, as
11171 * written above, <code>crypt(3)</code> on Mac OS never fails.
11172 * This means even if you build up a proper salt string it
11173 * generates a traditional DES hash anyways, and there is no way
11174 * for you to be aware of it.
11175 *
11176 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
11177 *
11178 * If for some reason you cannot migrate to other secure contemporary
11179 * password hashing algorithms, install the string-crypt gem and
11180 * <code>require 'string/crypt'</code> to continue using it.
11181 */
11182
11183static VALUE
11184rb_str_crypt(VALUE str, VALUE salt)
11185{
11186#ifdef HAVE_CRYPT_R
11187 VALUE databuf;
11188 struct crypt_data *data;
11189# define CRYPT_END() ALLOCV_END(databuf)
11190#else
11191 extern char *crypt(const char *, const char *);
11192# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
11193#endif
11194 VALUE result;
11195 const char *s, *saltp;
11196 char *res;
11197#ifdef BROKEN_CRYPT
11198 char salt_8bit_clean[3];
11199#endif
11200
11201 StringValue(salt);
11202 mustnot_wchar(str);
11203 mustnot_wchar(salt);
11204 s = StringValueCStr(str);
11205 saltp = RSTRING_PTR(salt);
11206 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
11207 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
11208 }
11209
11210#ifdef BROKEN_CRYPT
11211 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
11212 salt_8bit_clean[0] = saltp[0] & 0x7f;
11213 salt_8bit_clean[1] = saltp[1] & 0x7f;
11214 salt_8bit_clean[2] = '\0';
11215 saltp = salt_8bit_clean;
11216 }
11217#endif
11218#ifdef HAVE_CRYPT_R
11219 data = ALLOCV(databuf, sizeof(struct crypt_data));
11220# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
11221 data->initialized = 0;
11222# endif
11223 res = crypt_r(s, saltp, data);
11224#else
11225 crypt_mutex_initialize();
11226 rb_nativethread_lock_lock(&crypt_mutex.lock);
11227 res = crypt(s, saltp);
11228#endif
11229 if (!res) {
11230 int err = errno;
11231 CRYPT_END();
11232 rb_syserr_fail(err, "crypt");
11233 }
11234 result = rb_str_new_cstr(res);
11235 CRYPT_END();
11236 return result;
11237}
11238
11239
11240/*
11241 * call-seq:
11242 * ord -> integer
11243 *
11244 * :include: doc/string/ord.rdoc
11245 *
11246 */
11247
11248static VALUE
11249rb_str_ord(VALUE s)
11250{
11251 unsigned int c;
11252
11253 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11254 return UINT2NUM(c);
11255}
11256/*
11257 * call-seq:
11258 * sum(n = 16) -> integer
11259 *
11260 * :include: doc/string/sum.rdoc
11261 *
11262 */
11263
11264static VALUE
11265rb_str_sum(int argc, VALUE *argv, VALUE str)
11266{
11267 int bits = 16;
11268 char *ptr, *p, *pend;
11269 long len;
11270 VALUE sum = INT2FIX(0);
11271 unsigned long sum0 = 0;
11272
11273 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11274 bits = 0;
11275 }
11276 ptr = p = RSTRING_PTR(str);
11277 len = RSTRING_LEN(str);
11278 pend = p + len;
11279
11280 while (p < pend) {
11281 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11282 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11283 str_mod_check(str, ptr, len);
11284 sum0 = 0;
11285 }
11286 sum0 += (unsigned char)*p;
11287 p++;
11288 }
11289
11290 if (bits == 0) {
11291 if (sum0) {
11292 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11293 }
11294 }
11295 else {
11296 if (sum == INT2FIX(0)) {
11297 if (bits < (int)sizeof(long)*CHAR_BIT) {
11298 sum0 &= (((unsigned long)1)<<bits)-1;
11299 }
11300 sum = LONG2FIX(sum0);
11301 }
11302 else {
11303 VALUE mod;
11304
11305 if (sum0) {
11306 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11307 }
11308
11309 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11310 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11311 sum = rb_funcall(sum, '&', 1, mod);
11312 }
11313 }
11314 return sum;
11315}
11316
11317static VALUE
11318rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11319{
11320 rb_encoding *enc;
11321 VALUE w;
11322 long width, len, flen = 1, fclen = 1;
11323 VALUE res;
11324 char *p;
11325 const char *f = " ";
11326 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11327 VALUE pad;
11328 int singlebyte = 1, cr;
11329 int termlen;
11330
11331 rb_scan_args(argc, argv, "11", &w, &pad);
11332 enc = STR_ENC_GET(str);
11333 termlen = rb_enc_mbminlen(enc);
11334 width = NUM2LONG(w);
11335 if (argc == 2) {
11336 StringValue(pad);
11337 enc = rb_enc_check(str, pad);
11338 f = RSTRING_PTR(pad);
11339 flen = RSTRING_LEN(pad);
11340 fclen = str_strlen(pad, enc); /* rb_enc_check */
11341 singlebyte = single_byte_optimizable(pad);
11342 if (flen == 0 || fclen == 0) {
11343 rb_raise(rb_eArgError, "zero width padding");
11344 }
11345 }
11346 len = str_strlen(str, enc); /* rb_enc_check */
11347 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11348 n = width - len;
11349 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11350 rlen = n - llen;
11351 cr = ENC_CODERANGE(str);
11352 if (flen > 1) {
11353 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11354 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11355 }
11356 size = RSTRING_LEN(str);
11357 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11358 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11359 (len += llen2 + rlen2) >= LONG_MAX - size) {
11360 rb_raise(rb_eArgError, "argument too big");
11361 }
11362 len += size;
11363 res = str_enc_new(rb_cString, 0, len, enc);
11364 p = RSTRING_PTR(res);
11365 if (flen <= 1) {
11366 memset(p, *f, llen);
11367 p += llen;
11368 }
11369 else {
11370 while (llen >= fclen) {
11371 memcpy(p,f,flen);
11372 p += flen;
11373 llen -= fclen;
11374 }
11375 if (llen > 0) {
11376 memcpy(p, f, llen2);
11377 p += llen2;
11378 }
11379 }
11380 memcpy(p, RSTRING_PTR(str), size);
11381 p += size;
11382 if (flen <= 1) {
11383 memset(p, *f, rlen);
11384 p += rlen;
11385 }
11386 else {
11387 while (rlen >= fclen) {
11388 memcpy(p,f,flen);
11389 p += flen;
11390 rlen -= fclen;
11391 }
11392 if (rlen > 0) {
11393 memcpy(p, f, rlen2);
11394 p += rlen2;
11395 }
11396 }
11397 TERM_FILL(p, termlen);
11398 STR_SET_LEN(res, p-RSTRING_PTR(res));
11399
11400 if (argc == 2)
11401 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11402 if (cr != ENC_CODERANGE_BROKEN)
11403 ENC_CODERANGE_SET(res, cr);
11404
11405 RB_GC_GUARD(pad);
11406 return res;
11407}
11408
11409
11410/*
11411 * call-seq:
11412 * ljust(size, pad_string = ' ') -> new_string
11413 *
11414 * :include: doc/string/ljust.rdoc
11415 *
11416 * Related: String#rjust, String#center.
11417 *
11418 */
11419
11420static VALUE
11421rb_str_ljust(int argc, VALUE *argv, VALUE str)
11422{
11423 return rb_str_justify(argc, argv, str, 'l');
11424}
11425
11426/*
11427 * call-seq:
11428 * rjust(size, pad_string = ' ') -> new_string
11429 *
11430 * :include: doc/string/rjust.rdoc
11431 *
11432 * Related: String#ljust, String#center.
11433 *
11434 */
11435
11436static VALUE
11437rb_str_rjust(int argc, VALUE *argv, VALUE str)
11438{
11439 return rb_str_justify(argc, argv, str, 'r');
11440}
11441
11442
11443/*
11444 * call-seq:
11445 * center(size, pad_string = ' ') -> new_string
11446 *
11447 * :include: doc/string/center.rdoc
11448 *
11449 * Related: String#ljust, String#rjust.
11450 *
11451 */
11452
11453static VALUE
11454rb_str_center(int argc, VALUE *argv, VALUE str)
11455{
11456 return rb_str_justify(argc, argv, str, 'c');
11457}
11458
11459/*
11460 * call-seq:
11461 * partition(string_or_regexp) -> [head, match, tail]
11462 *
11463 * :include: doc/string/partition.rdoc
11464 *
11465 */
11466
11467static VALUE
11468rb_str_partition(VALUE str, VALUE sep)
11469{
11470 long pos;
11471
11472 sep = get_pat_quoted(sep, 0);
11473 if (RB_TYPE_P(sep, T_REGEXP)) {
11474 if (rb_reg_search(sep, str, 0, 0) < 0) {
11475 goto failed;
11476 }
11477 VALUE match = rb_backref_get();
11478 struct re_registers *regs = RMATCH_REGS(match);
11479
11480 pos = BEG(0);
11481 sep = rb_str_subseq(str, pos, END(0) - pos);
11482 }
11483 else {
11484 pos = rb_str_index(str, sep, 0);
11485 if (pos < 0) goto failed;
11486 }
11487 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11488 sep,
11489 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11490 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11491
11492 failed:
11493 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11494}
11495
11496/*
11497 * call-seq:
11498 * rpartition(sep) -> [head, match, tail]
11499 *
11500 * :include: doc/string/rpartition.rdoc
11501 *
11502 */
11503
11504static VALUE
11505rb_str_rpartition(VALUE str, VALUE sep)
11506{
11507 long pos = RSTRING_LEN(str);
11508
11509 sep = get_pat_quoted(sep, 0);
11510 if (RB_TYPE_P(sep, T_REGEXP)) {
11511 if (rb_reg_search(sep, str, pos, 1) < 0) {
11512 goto failed;
11513 }
11514 VALUE match = rb_backref_get();
11515 struct re_registers *regs = RMATCH_REGS(match);
11516
11517 pos = BEG(0);
11518 sep = rb_str_subseq(str, pos, END(0) - pos);
11519 }
11520 else {
11521 pos = rb_str_sublen(str, pos);
11522 pos = rb_str_rindex(str, sep, pos);
11523 if (pos < 0) {
11524 goto failed;
11525 }
11526 }
11527
11528 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11529 sep,
11530 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11531 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11532 failed:
11533 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11534}
11535
11536/*
11537 * call-seq:
11538 * start_with?(*string_or_regexp) -> true or false
11539 *
11540 * :include: doc/string/start_with_p.rdoc
11541 *
11542 */
11543
11544static VALUE
11545rb_str_start_with(int argc, VALUE *argv, VALUE str)
11546{
11547 int i;
11548
11549 for (i=0; i<argc; i++) {
11550 VALUE tmp = argv[i];
11551 if (RB_TYPE_P(tmp, T_REGEXP)) {
11552 if (rb_reg_start_with_p(tmp, str))
11553 return Qtrue;
11554 }
11555 else {
11556 const char *p, *s, *e;
11557 long slen, tlen;
11558 rb_encoding *enc;
11559
11560 StringValue(tmp);
11561 enc = rb_enc_check(str, tmp);
11562 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11563 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11564 p = RSTRING_PTR(str);
11565 e = p + slen;
11566 s = p + tlen;
11567 if (!at_char_right_boundary(p, s, e, enc))
11568 continue;
11569 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11570 return Qtrue;
11571 }
11572 }
11573 return Qfalse;
11574}
11575
11576/*
11577 * call-seq:
11578 * end_with?(*strings) -> true or false
11579 *
11580 * :include: doc/string/end_with_p.rdoc
11581 *
11582 */
11583
11584static VALUE
11585rb_str_end_with(int argc, VALUE *argv, VALUE str)
11586{
11587 int i;
11588
11589 for (i=0; i<argc; i++) {
11590 VALUE tmp = argv[i];
11591 const char *p, *s, *e;
11592 long slen, tlen;
11593 rb_encoding *enc;
11594
11595 StringValue(tmp);
11596 enc = rb_enc_check(str, tmp);
11597 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11598 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11599 p = RSTRING_PTR(str);
11600 e = p + slen;
11601 s = e - tlen;
11602 if (!at_char_boundary(p, s, e, enc))
11603 continue;
11604 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11605 return Qtrue;
11606 }
11607 return Qfalse;
11608}
11609
11619static long
11620deleted_prefix_length(VALUE str, VALUE prefix)
11621{
11622 const char *strptr, *prefixptr;
11623 long olen, prefixlen;
11624 rb_encoding *enc = rb_enc_get(str);
11625
11626 StringValue(prefix);
11627
11628 if (!is_broken_string(prefix) ||
11629 !rb_enc_asciicompat(enc) ||
11630 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11631 enc = rb_enc_check(str, prefix);
11632 }
11633
11634 /* return 0 if not start with prefix */
11635 prefixlen = RSTRING_LEN(prefix);
11636 if (prefixlen <= 0) return 0;
11637 olen = RSTRING_LEN(str);
11638 if (olen < prefixlen) return 0;
11639 strptr = RSTRING_PTR(str);
11640 prefixptr = RSTRING_PTR(prefix);
11641 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11642 if (is_broken_string(prefix)) {
11643 if (!is_broken_string(str)) {
11644 /* prefix in a valid string cannot be broken */
11645 return 0;
11646 }
11647 const char *strend = strptr + olen;
11648 const char *after_prefix = strptr + prefixlen;
11649 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11650 /* prefix does not end at char-boundary */
11651 return 0;
11652 }
11653 }
11654 /* prefix part in `str` also should be valid. */
11655
11656 return prefixlen;
11657}
11658
11659/*
11660 * call-seq:
11661 * delete_prefix!(prefix) -> self or nil
11662 *
11663 * Like String#delete_prefix, except that +self+ is modified in place.
11664 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11665 *
11666 */
11667
11668static VALUE
11669rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11670{
11671 long prefixlen;
11672 str_modify_keep_cr(str);
11673
11674 prefixlen = deleted_prefix_length(str, prefix);
11675 if (prefixlen <= 0) return Qnil;
11676
11677 return rb_str_drop_bytes(str, prefixlen);
11678}
11679
11680/*
11681 * call-seq:
11682 * delete_prefix(prefix) -> new_string
11683 *
11684 * :include: doc/string/delete_prefix.rdoc
11685 *
11686 */
11687
11688static VALUE
11689rb_str_delete_prefix(VALUE str, VALUE prefix)
11690{
11691 long prefixlen;
11692
11693 prefixlen = deleted_prefix_length(str, prefix);
11694 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11695
11696 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11697}
11698
11708static long
11709deleted_suffix_length(VALUE str, VALUE suffix)
11710{
11711 const char *strptr, *suffixptr;
11712 long olen, suffixlen;
11713 rb_encoding *enc;
11714
11715 StringValue(suffix);
11716 if (is_broken_string(suffix)) return 0;
11717 enc = rb_enc_check(str, suffix);
11718
11719 /* return 0 if not start with suffix */
11720 suffixlen = RSTRING_LEN(suffix);
11721 if (suffixlen <= 0) return 0;
11722 olen = RSTRING_LEN(str);
11723 if (olen < suffixlen) return 0;
11724 strptr = RSTRING_PTR(str);
11725 suffixptr = RSTRING_PTR(suffix);
11726 const char *strend = strptr + olen;
11727 const char *before_suffix = strend - suffixlen;
11728 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11729 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11730
11731 return suffixlen;
11732}
11733
11734/*
11735 * call-seq:
11736 * delete_suffix!(suffix) -> self or nil
11737 *
11738 * Like String#delete_suffix, except that +self+ is modified in place.
11739 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11740 *
11741 */
11742
11743static VALUE
11744rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11745{
11746 long olen, suffixlen, len;
11747 str_modifiable(str);
11748
11749 suffixlen = deleted_suffix_length(str, suffix);
11750 if (suffixlen <= 0) return Qnil;
11751
11752 olen = RSTRING_LEN(str);
11753 str_modify_keep_cr(str);
11754 len = olen - suffixlen;
11755 STR_SET_LEN(str, len);
11756 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11757 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11759 }
11760 return str;
11761}
11762
11763/*
11764 * call-seq:
11765 * delete_suffix(suffix) -> new_string
11766 *
11767 * :include: doc/string/delete_suffix.rdoc
11768 *
11769 */
11770
11771static VALUE
11772rb_str_delete_suffix(VALUE str, VALUE suffix)
11773{
11774 long suffixlen;
11775
11776 suffixlen = deleted_suffix_length(str, suffix);
11777 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11778
11779 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11780}
11781
11782void
11783rb_str_setter(VALUE val, ID id, VALUE *var)
11784{
11785 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11786 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11787 }
11788 *var = val;
11789}
11790
11791static void
11792rb_fs_setter(VALUE val, ID id, VALUE *var)
11793{
11794 val = rb_fs_check(val);
11795 if (!val) {
11796 rb_raise(rb_eTypeError,
11797 "value of %"PRIsVALUE" must be String or Regexp",
11798 rb_id2str(id));
11799 }
11800 if (!NIL_P(val)) {
11801 rb_warn_deprecated("'$;'", NULL);
11802 }
11803 *var = val;
11804}
11805
11806
11807/*
11808 * call-seq:
11809 * force_encoding(encoding) -> self
11810 *
11811 * :include: doc/string/force_encoding.rdoc
11812 *
11813 */
11814
11815static VALUE
11816rb_str_force_encoding(VALUE str, VALUE enc)
11817{
11818 str_modifiable(str);
11819
11820 rb_encoding *encoding = rb_to_encoding(enc);
11821 int idx = rb_enc_to_index(encoding);
11822
11823 // If the encoding is unchanged, we do nothing.
11824 if (ENCODING_GET(str) == idx) {
11825 return str;
11826 }
11827
11828 rb_enc_associate_index(str, idx);
11829
11830 // If the coderange was 7bit and the new encoding is ASCII-compatible
11831 // we can keep the coderange.
11832 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11833 return str;
11834 }
11835
11837 return str;
11838}
11839
11840/*
11841 * call-seq:
11842 * b -> string
11843 *
11844 * :include: doc/string/b.rdoc
11845 *
11846 */
11847
11848static VALUE
11849rb_str_b(VALUE str)
11850{
11851 VALUE str2;
11852 if (STR_EMBED_P(str)) {
11853 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11854 }
11855 else {
11856 str2 = str_alloc_heap(rb_cString);
11857 }
11858 str_replace_shared_without_enc(str2, str);
11859
11860 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11861 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11862 // If we know the receiver's code range then we know the result's code range.
11863 int cr = ENC_CODERANGE(str);
11864 switch (cr) {
11865 case ENC_CODERANGE_7BIT:
11867 break;
11871 break;
11872 default:
11873 ENC_CODERANGE_CLEAR(str2);
11874 break;
11875 }
11876 }
11877
11878 return str2;
11879}
11880
11881/*
11882 * call-seq:
11883 * valid_encoding? -> true or false
11884 *
11885 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11886 *
11887 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11888 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11889 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11890 */
11891
11892static VALUE
11893rb_str_valid_encoding_p(VALUE str)
11894{
11895 int cr = rb_enc_str_coderange(str);
11896
11897 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11898}
11899
11900/*
11901 * call-seq:
11902 * ascii_only? -> true or false
11903 *
11904 * Returns +true+ if +self+ contains only ASCII characters,
11905 * +false+ otherwise:
11906 *
11907 * 'abc'.ascii_only? # => true
11908 * "abc\u{6666}".ascii_only? # => false
11909 *
11910 */
11911
11912static VALUE
11913rb_str_is_ascii_only_p(VALUE str)
11914{
11915 int cr = rb_enc_str_coderange(str);
11916
11917 return RBOOL(cr == ENC_CODERANGE_7BIT);
11918}
11919
11920VALUE
11922{
11923 static const char ellipsis[] = "...";
11924 const long ellipsislen = sizeof(ellipsis) - 1;
11925 rb_encoding *const enc = rb_enc_get(str);
11926 const long blen = RSTRING_LEN(str);
11927 const char *const p = RSTRING_PTR(str), *e = p + blen;
11928 VALUE estr, ret = 0;
11929
11930 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11931 if (len * rb_enc_mbminlen(enc) >= blen ||
11932 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11933 ret = str;
11934 }
11935 else if (len <= ellipsislen ||
11936 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11937 if (rb_enc_asciicompat(enc)) {
11938 ret = rb_str_new(ellipsis, len);
11939 rb_enc_associate(ret, enc);
11940 }
11941 else {
11942 estr = rb_usascii_str_new(ellipsis, len);
11943 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11944 }
11945 }
11946 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11947 rb_str_cat(ret, ellipsis, ellipsislen);
11948 }
11949 else {
11950 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11951 rb_enc_from_encoding(enc), 0, Qnil);
11952 rb_str_append(ret, estr);
11953 }
11954 return ret;
11955}
11956
11957static VALUE
11958str_compat_and_valid(VALUE str, rb_encoding *enc)
11959{
11960 int cr;
11961 str = StringValue(str);
11962 cr = rb_enc_str_coderange(str);
11963 if (cr == ENC_CODERANGE_BROKEN) {
11964 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11965 }
11966 else {
11967 rb_encoding *e = STR_ENC_GET(str);
11968 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11969 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11970 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11971 }
11972 }
11973 return str;
11974}
11975
11976static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11977
11978VALUE
11980{
11981 rb_encoding *enc = STR_ENC_GET(str);
11982 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11983}
11984
11985VALUE
11986rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11987{
11988 int cr = ENC_CODERANGE_UNKNOWN;
11989 if (enc == STR_ENC_GET(str)) {
11990 /* cached coderange makes sense only when enc equals the
11991 * actual encoding of str */
11992 cr = ENC_CODERANGE(str);
11993 }
11994 return enc_str_scrub(enc, str, repl, cr);
11995}
11996
11997static VALUE
11998enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11999{
12000 int encidx;
12001 VALUE buf = Qnil;
12002 const char *rep, *p, *e, *p1, *sp;
12003 long replen = -1;
12004 long slen;
12005
12006 if (rb_block_given_p()) {
12007 if (!NIL_P(repl))
12008 rb_raise(rb_eArgError, "both of block and replacement given");
12009 replen = 0;
12010 }
12011
12012 if (ENC_CODERANGE_CLEAN_P(cr))
12013 return Qnil;
12014
12015 if (!NIL_P(repl)) {
12016 repl = str_compat_and_valid(repl, enc);
12017 }
12018
12019 if (rb_enc_dummy_p(enc)) {
12020 return Qnil;
12021 }
12022 encidx = rb_enc_to_index(enc);
12023
12024#define DEFAULT_REPLACE_CHAR(str) do { \
12025 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
12026 rep = replace; replen = (int)sizeof(replace); \
12027 } while (0)
12028
12029 slen = RSTRING_LEN(str);
12030 p = RSTRING_PTR(str);
12031 e = RSTRING_END(str);
12032 p1 = p;
12033 sp = p;
12034
12035 if (rb_enc_asciicompat(enc)) {
12036 int rep7bit_p;
12037 if (!replen) {
12038 rep = NULL;
12039 rep7bit_p = FALSE;
12040 }
12041 else if (!NIL_P(repl)) {
12042 rep = RSTRING_PTR(repl);
12043 replen = RSTRING_LEN(repl);
12044 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
12045 }
12046 else if (encidx == rb_utf8_encindex()) {
12047 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
12048 rep7bit_p = FALSE;
12049 }
12050 else {
12051 DEFAULT_REPLACE_CHAR("?");
12052 rep7bit_p = TRUE;
12053 }
12054 cr = ENC_CODERANGE_7BIT;
12055
12056 p = search_nonascii(p, e);
12057 if (!p) {
12058 p = e;
12059 }
12060 while (p < e) {
12061 int ret = rb_enc_precise_mbclen(p, e, enc);
12062 if (MBCLEN_NEEDMORE_P(ret)) {
12063 break;
12064 }
12065 else if (MBCLEN_CHARFOUND_P(ret)) {
12067 p += MBCLEN_CHARFOUND_LEN(ret);
12068 }
12069 else if (MBCLEN_INVALID_P(ret)) {
12070 /*
12071 * p1~p: valid ascii/multibyte chars
12072 * p ~e: invalid bytes + unknown bytes
12073 */
12074 long clen = rb_enc_mbmaxlen(enc);
12075 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12076 if (p > p1) {
12077 rb_str_buf_cat(buf, p1, p - p1);
12078 }
12079
12080 if (e - p < clen) clen = e - p;
12081 if (clen <= 2) {
12082 clen = 1;
12083 }
12084 else {
12085 const char *q = p;
12086 clen--;
12087 for (; clen > 1; clen--) {
12088 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12089 if (MBCLEN_NEEDMORE_P(ret)) break;
12090 if (MBCLEN_INVALID_P(ret)) continue;
12092 }
12093 }
12094 if (rep) {
12095 rb_str_buf_cat(buf, rep, replen);
12096 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12097 }
12098 else {
12099 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12100 str_mod_check(str, sp, slen);
12101 repl = str_compat_and_valid(repl, enc);
12102 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12105 }
12106 p += clen;
12107 p1 = p;
12108 p = search_nonascii(p, e);
12109 if (!p) {
12110 p = e;
12111 break;
12112 }
12113 }
12114 else {
12116 }
12117 }
12118 if (NIL_P(buf)) {
12119 if (p == e) {
12120 ENC_CODERANGE_SET(str, cr);
12121 return Qnil;
12122 }
12123 buf = rb_str_buf_new(RSTRING_LEN(str));
12124 }
12125 if (p1 < p) {
12126 rb_str_buf_cat(buf, p1, p - p1);
12127 }
12128 if (p < e) {
12129 if (rep) {
12130 rb_str_buf_cat(buf, rep, replen);
12131 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12132 }
12133 else {
12134 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12135 str_mod_check(str, sp, slen);
12136 repl = str_compat_and_valid(repl, enc);
12137 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12140 }
12141 }
12142 }
12143 else {
12144 /* ASCII incompatible */
12145 long mbminlen = rb_enc_mbminlen(enc);
12146 if (!replen) {
12147 rep = NULL;
12148 }
12149 else if (!NIL_P(repl)) {
12150 rep = RSTRING_PTR(repl);
12151 replen = RSTRING_LEN(repl);
12152 }
12153 else if (encidx == ENCINDEX_UTF_16BE) {
12154 DEFAULT_REPLACE_CHAR("\xFF\xFD");
12155 }
12156 else if (encidx == ENCINDEX_UTF_16LE) {
12157 DEFAULT_REPLACE_CHAR("\xFD\xFF");
12158 }
12159 else if (encidx == ENCINDEX_UTF_32BE) {
12160 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
12161 }
12162 else if (encidx == ENCINDEX_UTF_32LE) {
12163 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
12164 }
12165 else {
12166 DEFAULT_REPLACE_CHAR("?");
12167 }
12168
12169 while (p < e) {
12170 int ret = rb_enc_precise_mbclen(p, e, enc);
12171 if (MBCLEN_NEEDMORE_P(ret)) {
12172 break;
12173 }
12174 else if (MBCLEN_CHARFOUND_P(ret)) {
12175 p += MBCLEN_CHARFOUND_LEN(ret);
12176 }
12177 else if (MBCLEN_INVALID_P(ret)) {
12178 const char *q = p;
12179 long clen = rb_enc_mbmaxlen(enc);
12180 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12181 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
12182
12183 if (e - p < clen) clen = e - p;
12184 if (clen <= mbminlen * 2) {
12185 clen = mbminlen;
12186 }
12187 else {
12188 clen -= mbminlen;
12189 for (; clen > mbminlen; clen-=mbminlen) {
12190 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12191 if (MBCLEN_NEEDMORE_P(ret)) break;
12192 if (MBCLEN_INVALID_P(ret)) continue;
12194 }
12195 }
12196 if (rep) {
12197 rb_str_buf_cat(buf, rep, replen);
12198 }
12199 else {
12200 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12201 str_mod_check(str, sp, slen);
12202 repl = str_compat_and_valid(repl, enc);
12203 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12204 }
12205 p += clen;
12206 p1 = p;
12207 }
12208 else {
12210 }
12211 }
12212 if (NIL_P(buf)) {
12213 if (p == e) {
12215 return Qnil;
12216 }
12217 buf = rb_str_buf_new(RSTRING_LEN(str));
12218 }
12219 if (p1 < p) {
12220 rb_str_buf_cat(buf, p1, p - p1);
12221 }
12222 if (p < e) {
12223 if (rep) {
12224 rb_str_buf_cat(buf, rep, replen);
12225 }
12226 else {
12227 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12228 str_mod_check(str, sp, slen);
12229 repl = str_compat_and_valid(repl, enc);
12230 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12231 }
12232 }
12234 }
12235 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12236 return buf;
12237}
12238
12239/*
12240 * call-seq:
12241 * scrub(replacement_string = default_replacement) -> new_string
12242 * scrub{|bytes| ... } -> new_string
12243 *
12244 * :include: doc/string/scrub.rdoc
12245 *
12246 */
12247static VALUE
12248str_scrub(int argc, VALUE *argv, VALUE str)
12249{
12250 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12251 VALUE new = rb_str_scrub(str, repl);
12252 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12253}
12254
12255/*
12256 * call-seq:
12257 * scrub! -> self
12258 * scrub!(replacement_string = default_replacement) -> self
12259 * scrub!{|bytes| ... } -> self
12260 *
12261 * Like String#scrub, except that any replacements are made in +self+.
12262 *
12263 */
12264static VALUE
12265str_scrub_bang(int argc, VALUE *argv, VALUE str)
12266{
12267 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12268 VALUE new = rb_str_scrub(str, repl);
12269 if (!NIL_P(new)) rb_str_replace(str, new);
12270 return str;
12271}
12272
/* Method IDs "normalize" and "normalized?"; both are assigned in Init_String. */
12273static ID id_normalize;
12274static ID id_normalized_p;
/* The UnicodeNormalize module object, defined in Init_String. */
12275static VALUE mUnicodeNormalize;
12276
12277static VALUE
12278unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12279{
12280 static int UnicodeNormalizeRequired = 0;
12281 VALUE argv2[2];
12282
12283 if (!UnicodeNormalizeRequired) {
12284 rb_require("unicode_normalize/normalize.rb");
12285 UnicodeNormalizeRequired = 1;
12286 }
12287 argv2[0] = str;
12288 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12289 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12290}
12291
12292/*
12293 * call-seq:
12294 * unicode_normalize(form = :nfc) -> string
12295 *
12296 * Returns a copy of +self+ with
12297 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
12298 *
12299 * Argument +form+ must be one of the following symbols
12300 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
12301 *
12302 * - +:nfc+: Canonical decomposition, followed by canonical composition.
12303 * - +:nfd+: Canonical decomposition.
12304 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
12305 * - +:nfkd+: Compatibility decomposition.
12306 *
12307 * The encoding of +self+ must be one of:
12308 *
12309 * - Encoding::UTF_8
12310 * - Encoding::UTF_16BE
12311 * - Encoding::UTF_16LE
12312 * - Encoding::UTF_32BE
12313 * - Encoding::UTF_32LE
12314 * - Encoding::GB18030
12315 * - Encoding::UCS_2BE
12316 * - Encoding::UCS_4BE
12317 *
12318 * Examples:
12319 *
12320 * "a\u0300".unicode_normalize # => "à" (one code point: U+00E0)
12321 * "\u00E0".unicode_normalize(:nfd) # => "à" (two code points: U+0061 U+0300)
12322 *
12323 * Related: String#unicode_normalize!, String#unicode_normalized?.
12324 */
12325static VALUE
12326rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12327{
12328 return unicode_normalize_common(argc, argv, str, id_normalize);
12329}
12330
12331/*
12332 * call-seq:
12333 * unicode_normalize!(form = :nfc) -> self
12334 *
12335 * Like String#unicode_normalize, except that the normalization
12336 * is performed on +self+.
12337 *
12338 * Related: String#unicode_normalized?.
12339 *
12340 */
12341static VALUE
12342rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12343{
12344 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12345}
12346
12347/* call-seq:
12348 * unicode_normalized?(form = :nfc) -> true or false
12349 *
12350 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12351 * +false+ otherwise.
12352 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12353 *
12354 * Examples:
12355 *
12356 * "a\u0300".unicode_normalized? # => false
12357 * "a\u0300".unicode_normalized?(:nfd) # => true
12358 * "\u00E0".unicode_normalized? # => true
12359 * "\u00E0".unicode_normalized?(:nfd) # => false
12360 *
12361 *
12362 * Raises an exception if +self+ is not in a Unicode encoding:
12363 *
12364 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12365 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12366 *
12367 * Related: String#unicode_normalize, String#unicode_normalize!.
12368 *
12369 */
12370static VALUE
12371rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12372{
12373 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12374}
12375
12376/**********************************************************************
12377 * Document-class: Symbol
12378 *
12379 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12380 *
12381 * You can create a +Symbol+ object explicitly with:
12382 *
12383 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12384 *
12385 * The same +Symbol+ object will be
12386 * created for a given name or string for the duration of a program's
12387 * execution, regardless of the context or meaning of that name. Thus
12388 * if <code>Fred</code> is a constant in one context, a method in
12389 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12390 * will be the same object in all three contexts.
12391 *
12392 * module One
12393 * class Fred
12394 * end
12395 * $f1 = :Fred
12396 * end
12397 * module Two
12398 * Fred = 1
12399 * $f2 = :Fred
12400 * end
12401 * def Fred()
12402 * end
12403 * $f3 = :Fred
12404 * $f1.object_id #=> 2514190
12405 * $f2.object_id #=> 2514190
12406 * $f3.object_id #=> 2514190
12407 *
12408 * Constant, method, and variable names are returned as symbols:
12409 *
12410 * module One
12411 * Two = 2
12412 * def three; 3 end
12413 * @four = 4
12414 * @@five = 5
12415 * $six = 6
12416 * end
12417 * seven = 7
12418 *
12419 * One.constants
12420 * # => [:Two]
12421 * One.instance_methods(true)
12422 * # => [:three]
12423 * One.instance_variables
12424 * # => [:@four]
12425 * One.class_variables
12426 * # => [:@@five]
12427 * global_variables.grep(/six/)
12428 * # => [:$six]
12429 * local_variables
12430 * # => [:seven]
12431 *
12432 * A +Symbol+ object differs from a String object in that
12433 * a +Symbol+ object represents an identifier, while a String object
12434 * represents text or data.
12435 *
12436 * == What's Here
12437 *
12438 * First, what's elsewhere. Class +Symbol+:
12439 *
12440 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12441 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12442 *
12443 * Here, class +Symbol+ provides methods that are useful for:
12444 *
12445 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12446 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12447 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12448 *
12449 * === Methods for Querying
12450 *
12451 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12452 * - #=~: Returns the index of the first substring in symbol that matches a
12453 * given Regexp or other object; returns +nil+ if no match is found.
12454 * - #[], #slice : Returns a substring of symbol
12455 * determined by a given index, start/length, or range, or string.
12456 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12457 * - #encoding: Returns the Encoding object that represents the encoding
12458 * of symbol.
12459 * - #end_with?: Returns +true+ if symbol ends with
12460 * any of the given strings.
12461 * - #match: Returns a MatchData object if symbol
12462 * matches a given Regexp; +nil+ otherwise.
12463 * - #match?: Returns +true+ if symbol
12464 * matches a given Regexp; +false+ otherwise.
12465 * - #length, #size: Returns the number of characters in symbol.
12466 * - #start_with?: Returns +true+ if symbol starts with
12467 * any of the given strings.
12468 *
12469 * === Methods for Comparing
12470 *
12471 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12472 * or larger than symbol.
12473 * - #==, #===: Returns +true+ if a given symbol has the same content and
12474 * encoding.
12475 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12476 * symbol is smaller than, equal to, or larger than symbol.
12477 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12478 * after Unicode case folding; +false+ otherwise.
12479 *
12480 * === Methods for Converting
12481 *
12482 * - #capitalize: Returns symbol with the first character upcased
12483 * and all other characters downcased.
12484 * - #downcase: Returns symbol with all characters downcased.
12485 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12486 * - #name: Returns the frozen string corresponding to symbol.
12487 * - #succ, #next: Returns the symbol that is the successor to symbol.
12488 * - #swapcase: Returns symbol with all upcase characters downcased
12489 * and all downcase characters upcased.
12490 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12491 * - #to_s, #id2name: Returns the string corresponding to +self+.
12492 * - #to_sym, #intern: Returns +self+.
12493 * - #upcase: Returns symbol with all characters upcased.
12494 *
12495 */
12496
12497
12498/*
12499 * call-seq:
12500 * symbol == object -> true or false
12501 *
12502 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12503 */
12504
12505#define sym_equal rb_obj_equal /* alias registered as Symbol#== and Symbol#=== in Init_String */
12506
12507static int
12508sym_printable(const char *s, const char *send, rb_encoding *enc)
12509{
12510 while (s < send) {
12511 int n;
12512 int c = rb_enc_precise_mbclen(s, send, enc);
12513
12514 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12515 n = MBCLEN_CHARFOUND_LEN(c);
12516 c = rb_enc_mbc_to_codepoint(s, send, enc);
12517 if (!rb_enc_isprint(c, enc)) return FALSE;
12518 s += n;
12519 }
12520 return TRUE;
12521}
12522
12523int
12524rb_str_symname_p(VALUE sym)
12525{
12526 rb_encoding *enc;
12527 const char *ptr;
12528 long len;
12529 rb_encoding *resenc = rb_default_internal_encoding();
12530
12531 if (resenc == NULL) resenc = rb_default_external_encoding();
12532 enc = STR_ENC_GET(sym);
12533 ptr = RSTRING_PTR(sym);
12534 len = RSTRING_LEN(sym);
12535 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12536 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12537 return FALSE;
12538 }
12539 return TRUE;
12540}
12541
12542VALUE
12543rb_str_quote_unprintable(VALUE str)
12544{
12545 rb_encoding *enc;
12546 const char *ptr;
12547 long len;
12548 rb_encoding *resenc;
12549
12550 Check_Type(str, T_STRING);
12551 resenc = rb_default_internal_encoding();
12552 if (resenc == NULL) resenc = rb_default_external_encoding();
12553 enc = STR_ENC_GET(str);
12554 ptr = RSTRING_PTR(str);
12555 len = RSTRING_LEN(str);
12556 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12557 !sym_printable(ptr, ptr + len, enc)) {
12558 return rb_str_escape(str);
12559 }
12560 return str;
12561}
12562
12563VALUE
12564rb_id_quote_unprintable(ID id)
12565{
12566 VALUE str = rb_id2str(id);
12567 if (!rb_str_symname_p(str)) {
12568 return rb_str_escape(str);
12569 }
12570 return str;
12571}
12572
12573/*
12574 * call-seq:
12575 * inspect -> string
12576 *
12577 * Returns a string representation of +self+ (including the leading colon):
12578 *
12579 * :foo.inspect # => ":foo"
12580 *
12581 * Related: Symbol#to_s, Symbol#name.
12582 *
12583 */
12584
12585static VALUE
12586sym_inspect(VALUE sym)
12587{
12588 VALUE str = rb_sym2str(sym);
12589 const char *ptr;
12590 long len;
12591 char *dest;
12592
12593 if (!rb_str_symname_p(str)) {
12594 str = rb_str_inspect(str);
12595 len = RSTRING_LEN(str);
12596 rb_str_resize(str, len + 1);
12597 dest = RSTRING_PTR(str);
12598 memmove(dest + 1, dest, len);
12599 }
12600 else {
12601 rb_encoding *enc = STR_ENC_GET(str);
12602 VALUE orig_str = str;
12603
12604 len = RSTRING_LEN(orig_str);
12605 str = rb_enc_str_new(0, len + 1, enc);
12606
12607 // Get data pointer after allocation
12608 ptr = RSTRING_PTR(orig_str);
12609 dest = RSTRING_PTR(str);
12610 memcpy(dest + 1, ptr, len);
12611
12612 RB_GC_GUARD(orig_str);
12613 }
12614 dest[0] = ':';
12615
12617
12618 return str;
12619}
12620
12621VALUE
12623{
12624 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12625 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12626 return str;
12627}
12628
12629VALUE
12630rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12631{
12632 VALUE obj;
12633
12634 if (argc < 1) {
12635 rb_raise(rb_eArgError, "no receiver given");
12636 }
12637 obj = argv[0];
12638 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12639}
12640
12641/*
12642 * call-seq:
12643 * succ -> symbol
12644 *
12645 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12646 *
12647 * :foo.succ # => :fop
12648 *
12649 * Related: String#succ.
12650 */
12651
12652static VALUE
12653sym_succ(VALUE sym)
12654{
12655 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12656}
12657
12658/*
12659 * call-seq:
12660 * symbol <=> object -> -1, 0, +1, or nil
12661 *
12662 * If +object+ is a symbol,
12663 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12664 *
12665 * :bar <=> :foo # => -1
12666 * :foo <=> :foo # => 0
12667 * :foo <=> :bar # => 1
12668 *
12669 * Otherwise, returns +nil+:
12670 *
12671 * :foo <=> 'bar' # => nil
12672 *
12673 * Related: String#<=>.
12674 */
12675
12676static VALUE
12677sym_cmp(VALUE sym, VALUE other)
12678{
12679 if (!SYMBOL_P(other)) {
12680 return Qnil;
12681 }
12682 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12683}
12684
12685/*
12686 * call-seq:
12687 * casecmp(object) -> -1, 0, 1, or nil
12688 *
12689 * :include: doc/symbol/casecmp.rdoc
12690 *
12691 */
12692
12693static VALUE
12694sym_casecmp(VALUE sym, VALUE other)
12695{
12696 if (!SYMBOL_P(other)) {
12697 return Qnil;
12698 }
12699 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12700}
12701
12702/*
12703 * call-seq:
12704 * casecmp?(object) -> true, false, or nil
12705 *
12706 * :include: doc/symbol/casecmp_p.rdoc
12707 *
12708 */
12709
12710static VALUE
12711sym_casecmp_p(VALUE sym, VALUE other)
12712{
12713 if (!SYMBOL_P(other)) {
12714 return Qnil;
12715 }
12716 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12717}
12718
12719/*
12720 * call-seq:
12721 * symbol =~ object -> integer or nil
12722 *
12723 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12724 * including possible updates to global variables;
12725 * see String#=~.
12726 *
12727 */
12728
12729static VALUE
12730sym_match(VALUE sym, VALUE other)
12731{
12732 return rb_str_match(rb_sym2str(sym), other);
12733}
12734
12735/*
12736 * call-seq:
12737 * match(pattern, offset = 0) -> matchdata or nil
12738 * match(pattern, offset = 0) {|matchdata| } -> object
12739 *
12740 * Equivalent to <tt>self.to_s.match</tt>,
12741 * including possible updates to global variables;
12742 * see String#match.
12743 *
12744 */
12745
12746static VALUE
12747sym_match_m(int argc, VALUE *argv, VALUE sym)
12748{
12749 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12750}
12751
12752/*
12753 * call-seq:
12754 * match?(pattern, offset = 0) -> true or false
12755 *
12756 * Equivalent to <tt>sym.to_s.match?</tt>;
12757 * see String#match?.
12758 *
12759 */
12760
12761static VALUE
12762sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12763{
12764 return rb_str_match_m_p(argc, argv, sym);
12765}
12766
12767/*
12768 * call-seq:
12769 * symbol[index] -> string or nil
12770 * symbol[start, length] -> string or nil
12771 * symbol[range] -> string or nil
12772 * symbol[regexp, capture = 0] -> string or nil
12773 * symbol[substring] -> string or nil
12774 *
12775 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12776 *
12777 */
12778
12779static VALUE
12780sym_aref(int argc, VALUE *argv, VALUE sym)
12781{
12782 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12783}
12784
12785/*
12786 * call-seq:
12787 * length -> integer
12788 *
12789 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12790 */
12791
12792static VALUE
12793sym_length(VALUE sym)
12794{
12795 return rb_str_length(rb_sym2str(sym));
12796}
12797
12798/*
12799 * call-seq:
12800 * empty? -> true or false
12801 *
12802 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12803 *
12804 */
12805
12806static VALUE
12807sym_empty(VALUE sym)
12808{
12809 return rb_str_empty(rb_sym2str(sym));
12810}
12811
12812/*
12813 * call-seq:
12814 * upcase(*options) -> symbol
12815 *
12816 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12817 *
12818 * See String#upcase.
12819 *
12820 */
12821
12822static VALUE
12823sym_upcase(int argc, VALUE *argv, VALUE sym)
12824{
12825 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12826}
12827
12828/*
12829 * call-seq:
12830 * downcase(*options) -> symbol
12831 *
12832 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12833 *
12834 * See String#downcase.
12835 *
12836 * Related: Symbol#upcase.
12837 *
12838 */
12839
12840static VALUE
12841sym_downcase(int argc, VALUE *argv, VALUE sym)
12842{
12843 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12844}
12845
12846/*
12847 * call-seq:
12848 * capitalize(*options) -> symbol
12849 *
12850 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12851 *
12852 * See String#capitalize.
12853 *
12854 */
12855
12856static VALUE
12857sym_capitalize(int argc, VALUE *argv, VALUE sym)
12858{
12859 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12860}
12861
12862/*
12863 * call-seq:
12864 * swapcase(*options) -> symbol
12865 *
12866 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12867 *
12868 * See String#swapcase.
12869 *
12870 */
12871
12872static VALUE
12873sym_swapcase(int argc, VALUE *argv, VALUE sym)
12874{
12875 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12876}
12877
12878/*
12879 * call-seq:
12880 * start_with?(*string_or_regexp) -> true or false
12881 *
12882 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12883 *
12884 */
12885
12886static VALUE
12887sym_start_with(int argc, VALUE *argv, VALUE sym)
12888{
12889 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12890}
12891
12892/*
12893 * call-seq:
12894 * end_with?(*strings) -> true or false
12895 *
12896 *
12897 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12898 *
12899 */
12900
12901static VALUE
12902sym_end_with(int argc, VALUE *argv, VALUE sym)
12903{
12904 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12905}
12906
12907/*
12908 * call-seq:
12909 * encoding -> encoding
12910 *
12911 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12912 *
12913 */
12914
12915static VALUE
12916sym_encoding(VALUE sym)
12917{
12918 return rb_obj_encoding(rb_sym2str(sym));
12919}
12920
12921static VALUE
12922string_for_symbol(VALUE name)
12923{
12924 if (!RB_TYPE_P(name, T_STRING)) {
12925 VALUE tmp = rb_check_string_type(name);
12926 if (NIL_P(tmp)) {
12927 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12928 name);
12929 }
12930 name = tmp;
12931 }
12932 return name;
12933}
12934
12935ID
12937{
12938 if (SYMBOL_P(name)) {
12939 return SYM2ID(name);
12940 }
12941 name = string_for_symbol(name);
12942 return rb_intern_str(name);
12943}
12944
12945VALUE
12947{
12948 if (SYMBOL_P(name)) {
12949 return name;
12950 }
12951 name = string_for_symbol(name);
12952 return rb_str_intern(name);
12953}
12954
12955/*
12956 * call-seq:
12957 * Symbol.all_symbols -> array_of_symbols
12958 *
12959 * Returns an array of all symbols currently in Ruby's symbol table:
12960 *
12961 * Symbol.all_symbols.size # => 9334
12962 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12963 *
12964 */
12965
12966static VALUE
12967sym_all_symbols(VALUE _)
12968{
12969 return rb_sym_all_symbols();
12970}
12971
12972VALUE
12973rb_str_to_interned_str(VALUE str)
12974{
12975 return rb_fstring(str);
12976}
12977
12978VALUE
12979rb_interned_str(const char *ptr, long len)
12980{
12981 struct RString fake_str;
12982 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12983}
12984
12985VALUE
12987{
12988 return rb_interned_str(ptr, strlen(ptr));
12989}
12990
12991VALUE
12992rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12993{
12994 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12995 rb_enc_autoload(enc);
12996 }
12997
12998 struct RString fake_str;
12999 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
13000}
13001
13002VALUE
13003rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
13004{
13005 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
13006 rb_enc_autoload(enc);
13007 }
13008
13009 struct RString fake_str;
13010 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
13011}
13012
13013VALUE
13015{
13016 return rb_enc_interned_str(ptr, strlen(ptr), enc);
13017}
13018
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the value
 * fits in a single byte, the byte is appended directly; every other
 * case is delegated to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        /* Every byte value 0x00..0xFF, including 0xFF itself, takes the fast path. */
        if (RB_LIKELY(code >= 0 && code <= 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
13035
13036void
13037Init_String(void)
13038{
13039 rb_cString = rb_define_class("String", rb_cObject);
13040 struct fstring_table_struct *fstring_table = RTYPEDDATA_GET_DATA(fstring_table_obj);
13041 for (unsigned int i = 0; i < fstring_table->capacity; i++) {
13042 VALUE str = fstring_table->entries[i].str;
13043 if (!str) continue;
13044 RBASIC_SET_CLASS(str, rb_cString);
13045 }
13047 rb_define_alloc_func(rb_cString, empty_str_alloc);
13048 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
13049 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
13050 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
13051 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
13052 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
13053 rb_define_method(rb_cString, "==", rb_str_equal, 1);
13054 rb_define_method(rb_cString, "===", rb_str_equal, 1);
13055 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
13056 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
13057 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
13058 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
13059 rb_define_method(rb_cString, "+", rb_str_plus, 1);
13060 rb_define_method(rb_cString, "*", rb_str_times, 1);
13061 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
13062 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
13063 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
13064 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
13065 rb_define_method(rb_cString, "length", rb_str_length, 0);
13066 rb_define_method(rb_cString, "size", rb_str_length, 0);
13067 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
13068 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
13069 rb_define_method(rb_cString, "=~", rb_str_match, 1);
13070 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
13071 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
13072 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
13073 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
13074 rb_define_method(rb_cString, "next", rb_str_succ, 0);
13075 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
13076 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
13077 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
13078 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
13079 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
13080 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
13081 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
13082 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
13083 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
13084 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
13085 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
13086 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
13087 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
13088 rb_define_method(rb_cString, "scrub", str_scrub, -1);
13089 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
13090 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
13091 rb_define_method(rb_cString, "+@", str_uplus, 0);
13092 rb_define_method(rb_cString, "-@", str_uminus, 0);
13093 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
13094 rb_define_alias(rb_cString, "dedup", "-@");
13095
13096 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
13097 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
13098 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
13099 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
13100 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
13101 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
13102 rb_define_method(rb_cString, "undump", str_undump, 0);
13103
13104 sym_ascii = ID2SYM(rb_intern_const("ascii"));
13105 sym_turkic = ID2SYM(rb_intern_const("turkic"));
13106 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
13107 sym_fold = ID2SYM(rb_intern_const("fold"));
13108
13109 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
13110 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
13111 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
13112 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
13113
13114 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
13115 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
13116 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
13117 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
13118
13119 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
13120 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
13121 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
13122 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
13123 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
13124 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
13125 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
13126 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
13127 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
13128 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
13129 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
13130 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
13131 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
13132 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
13133 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
13134 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
13135 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
13136 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
13137
13138 rb_define_method(rb_cString, "include?", rb_str_include, 1);
13139 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
13140 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
13141
13142 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
13143
13144 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
13145 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
13146 rb_define_method(rb_cString, "center", rb_str_center, -1);
13147
13148 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
13149 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
13150 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
13151 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
13152 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
13153 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
13154 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
13155 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
13156 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
13157
13158 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
13159 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
13160 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
13161 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
13162 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
13163 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
13164 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
13165 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
13166 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
13167
13168 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
13169 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
13170 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
13171 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
13172 rb_define_method(rb_cString, "count", rb_str_count, -1);
13173
13174 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
13175 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
13176 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
13177 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
13178
13179 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
13180 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
13181 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
13182 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
13183 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
13184
13185 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
13186
13187 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
13188 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
13189
13190 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
13191 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
13192
13193 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
13194 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
13195 rb_define_method(rb_cString, "b", rb_str_b, 0);
13196 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
13197 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
13198
13199 /* define UnicodeNormalize module here so that we don't have to look it up */
13200 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
13201 id_normalize = rb_intern_const("normalize");
13202 id_normalized_p = rb_intern_const("normalized?");
13203
13204 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
13205 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
13206 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
13207
13208 rb_fs = Qnil;
13209 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
13210 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
13211 rb_gc_register_address(&rb_fs);
13212
13213 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
13217 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
13218
13219 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
13220 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
13221 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
13222 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13223 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13224 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13225
13226 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13227 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13228 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13229 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13230
13231 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13232 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13233 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13234 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13235 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13236 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13237 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13238
13239 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13240 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13241 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13242 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13243
13244 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13245 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13246
13247 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13248}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
#define RUBY_ATOMIC_VALUE_CAS(var, oldval, newval)
Identical to RUBY_ATOMIC_CAS, except it expects its arguments to be VALUE.
Definition atomic.h:381
#define RUBY_ATOMIC_VALUE_SET(var, val)
Identical to RUBY_ATOMIC_SET, except it expects its arguments to be VALUE.
Definition atomic.h:353
std::atomic< unsigned > rb_atomic_t
Type that is eligible for atomic operations.
Definition atomic.h:69
#define RUBY_ATOMIC_FETCH_ADD(var, val)
Atomically replaces the value pointed by var with the result of addition of val to the old value of var.
Definition atomic.h:93
#define RUBY_ATOMIC_VALUE_EXCHANGE(var, val)
Identical to RUBY_ATOMIC_EXCHANGE, except it expects its arguments to be VALUE.
Definition atomic.h:367
#define RUBY_ATOMIC_DEC(var)
Atomically decrements the value pointed by var.
Definition atomic.h:198
#define RUBY_ATOMIC_LOAD(var)
Atomic load.
Definition atomic.h:150
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:870
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:456
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:311
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1697
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1479
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1598
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2664
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:941
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:65
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:680
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:682
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2123
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2141
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1309
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3538
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:589
VALUE rb_cSymbol
Symbol class.
Definition string.c:83
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1297
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:82
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3222
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1665
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:1280
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1530
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3335
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1549
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12992
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2642
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:4037
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1478
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1770
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1671
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:1299
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:13014
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:1164
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:431
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() in that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:1049
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1865
VALUE rb_sym_all_symbols(void)
Collects every single bits of symbols that have ever interned in the entire history of the current pr...
Definition symbol.c:1059
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1871
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4219
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3716
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1926
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:2065
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1835
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2794
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:4102
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1746
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12622
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2867
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1722
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:2059
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3363
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5710
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4476
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3460
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11921
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:2101
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1512
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1334
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1841
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:2304
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4462
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3870
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2731
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:2322
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6948
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3468
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12986
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1752
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:4068
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3410
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4585
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3692
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7669
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:3097
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12979
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4532
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4349
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4507
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:4044
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3585
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:6220
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11979
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:2015
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:3257
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3557
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3675
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1524
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:3051
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7783
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1734
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:2031
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2745
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:6138
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9876
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1518
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:895
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:2163
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2125
st_index_t rb_ivar_count(VALUE obj)
Number of instance variables defined on an object.
Definition variable.c:2499
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2162
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3064
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1387
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:987
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12946
ID rb_to_id(VALUE str)
Definition string.c:12936
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1865
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4463
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1764
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a C string.
Definition string.c:3234
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:3116
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1758
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:3129
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:2092
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
#define TypedData_Make_Struct(klass, type, data_type, sval)
Identical to TypedData_Wrap_Struct, except it allocates a new data region internally instead of taking an existing one.
Definition rtypeddata.h:498
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1580
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
Definition string.c:541
This is the struct that holds the necessary info about a kind of wrapped struct.
Definition rtypeddata.h:203
const char * wrap_struct_name
Name of structs of this kind.
Definition rtypeddata.h:210
Definition string.c:8741
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:291
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113