Ruby 3.5.0dev (2025-04-24 revision 5dc155351a23465ace10f32e8775fc5e23909d6e)
string.c (5dc155351a23465ace10f32e8775fc5e23909d6e)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/thread.h"
46#include "ruby/util.h"
47#include "ruby_assert.h"
48#include "vm_sync.h"
49
50#if defined HAVE_CRYPT_R
51# if defined HAVE_CRYPT_H
52# include <crypt.h>
53# endif
54#elif !defined HAVE_CRYPT
55# include "missing/crypt.h"
56# define HAVE_CRYPT_R 1
57#endif
58
59#define BEG(no) (regs->beg[(no)])
60#define END(no) (regs->end[(no)])
61
62#undef rb_str_new
63#undef rb_usascii_str_new
64#undef rb_utf8_str_new
65#undef rb_enc_str_new
66#undef rb_str_new_cstr
67#undef rb_usascii_str_new_cstr
68#undef rb_utf8_str_new_cstr
69#undef rb_enc_str_new_cstr
70#undef rb_external_str_new_cstr
71#undef rb_locale_str_new_cstr
72#undef rb_str_dup_frozen
73#undef rb_str_buf_new_cstr
74#undef rb_str_buf_cat
75#undef rb_str_buf_cat2
76#undef rb_str_cat2
77#undef rb_str_cat_cstr
78#undef rb_fstring_cstr
79
82
83/* Flags of RString
84 *
85 * 0: STR_SHARED (equal to ELTS_SHARED)
86 * The string is shared. The buffer this string points to is owned by
87 * another string (the shared root).
88 * 1: RSTRING_NOEMBED
89 * The string is not embedded. When a string is embedded, the contents
89 * follow the header. When a string is not embedded, the contents are
90 * in a separately allocated buffer.
92 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
93 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
94 * It emits a deprecation warning when mutated for the first time.
95 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
96 * The string was allocated by the `Symbol#to_s` method.
97 * It emits a deprecation warning when mutated for the first time.
98 * 4: STR_PRECOMPUTED_HASH
99 * The string is embedded and has its precomputed hashcode stored
100 * after the terminator.
101 * 5: STR_SHARED_ROOT
102 * Other strings may point to the contents of this string. When this
103 * flag is set, STR_SHARED must not be set.
104 * 6: STR_BORROWED
105 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
106 * to be unshared by rb_str_tmp_frozen_release.
107 * 7: STR_TMPLOCK
108 * The pointer to the buffer is passed to a system call such as
109 * read(2). Any modification and realloc is prohibited.
110 * 8-9: ENC_CODERANGE
111 * Stores the coderange of the string.
112 * 10-16: ENCODING
113 * Stores the encoding of the string.
114 * 17: RSTRING_FSTR
115 * The string is a fstring. The string is deduplicated in the fstring
116 * table.
117 * 18: STR_NOFREE
118 * Do not free this string's buffer when the string is reclaimed
119 * by the garbage collector. Used for when the string buffer is a C
120 * string literal.
121 * 19: STR_FAKESTR
122 * The string is not allocated or managed by the garbage collector.
123 * Typically, the string object header (struct RString) is temporarily
124 * allocated on C stack.
125 */
126
127#define RUBY_MAX_CHAR_LEN 16
128#define STR_PRECOMPUTED_HASH FL_USER4
129#define STR_SHARED_ROOT FL_USER5
130#define STR_BORROWED FL_USER6
131#define STR_TMPLOCK FL_USER7
132#define STR_NOFREE FL_USER18
133#define STR_FAKESTR FL_USER19
134
135#define STR_SET_NOEMBED(str) do {\
136 FL_SET((str), STR_NOEMBED);\
137 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
138} while (0)
139#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
140
141#define STR_SET_LEN(str, n) do { \
142 RSTRING(str)->len = (n); \
143} while (0)
144
145static inline bool
146str_encindex_fastpath(int encindex)
147{
148 // The overwhelming majority of strings are in one of these 3 encodings.
149 switch (encindex) {
150 case ENCINDEX_ASCII_8BIT:
151 case ENCINDEX_UTF_8:
152 case ENCINDEX_US_ASCII:
153 return true;
154 default:
155 return false;
156 }
157}
158
159static inline bool
160str_enc_fastpath(VALUE str)
161{
162 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
163}
164
/*
 * TERM_LEN(str): terminator width in bytes for `str`. It is 1 for the
 * fast-path encodings (see str_enc_fastpath); otherwise it is the minimum
 * character length reported by rb_enc_mbminlen() for the string's encoding.
 */
165#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/*
 * TERM_FILL(ptr, termlen): write a terminator of `termlen` zero bytes at
 * `ptr`. The first byte is always stored directly; memset() is only used
 * for the (unlikely) multi-byte terminator. Both arguments are evaluated
 * exactly once.
 */
166#define TERM_FILL(ptr, termlen) do {\
167 char *const term_fill_ptr = (ptr);\
168 const int term_fill_len = (termlen);\
169 *term_fill_ptr = '\0';\
170 if (UNLIKELY(term_fill_len > 1))\
171 memset(term_fill_ptr, 0, term_fill_len);\
172} while (0)
173
/*
 * RESIZE_CAPA(str, capacity): resize the buffer of `str` so that it can
 * hold `capacity` bytes plus the terminator required by its encoding.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/*
 * RESIZE_CAPA_TERM(str, capacity, termlen): same as RESIZE_CAPA but with an
 * explicit terminator length.
 *
 * - Embedded string: nothing happens while `capacity + termlen` still fits
 *   in the embed area.  Otherwise a heap buffer of `capacity + termlen`
 *   bytes is allocated, the current contents (RSTRING_LEN bytes, without
 *   the terminator) are copied, and the string is switched to NOEMBED.
 * - Heap string: must not be shared; its buffer is reallocated to
 *   `capacity + termlen` bytes.
 *
 * In both heap cases aux.capa is set to `capacity` (terminator excluded).
 * Macro parameters are parenthesized wherever they take part in an
 * expression, so callers may pass compound expressions safely.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
197
/*
 * STR_SET_SHARED(str, shared_str): make `str` a shared string whose root is
 * `shared_str`. Nothing is done when `str` is a fake string (STR_FAKESTR).
 * Otherwise the root is stored in aux.shared through the write barrier,
 * STR_SHARED is set on `str` and STR_SHARED_ROOT on `shared_str`; a root
 * with class 0 is additionally flagged STR_BORROWED.
 * The asserts check that the buffer pointer of `str` lies within the
 * buffer of `shared_str`.
 */
198#define STR_SET_SHARED(str, shared_str) do { \
199 if (!FL_TEST(str, STR_FAKESTR)) { \
200 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
201 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
202 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
203 FL_SET((str), STR_SHARED); \
204 FL_SET((shared_str), STR_SHARED_ROOT); \
205 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
206 FL_SET_RAW((shared_str), STR_BORROWED); \
207 } \
208} while (0)
209
210#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
211#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
212/* TODO: include the terminator size in capa. */
213
214#define STR_ENC_GET(str) get_encoding(str)
215
216#if !defined SHARABLE_MIDDLE_SUBSTRING
217# define SHARABLE_MIDDLE_SUBSTRING 0
218#endif
219#if !SHARABLE_MIDDLE_SUBSTRING
220#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
221#else
222#define SHARABLE_SUBSTRING_P(beg, len, end) 1
223#endif
224
225
226static inline long
227str_embed_capa(VALUE str)
228{
229 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
230}
231
232bool
233rb_str_reembeddable_p(VALUE str)
234{
235 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
236}
237
238static inline size_t
239rb_str_embed_size(long capa)
240{
241 return offsetof(struct RString, as.embed.ary) + capa;
242}
243
244size_t
245rb_str_size_as_embedded(VALUE str)
246{
247 size_t real_size;
248 if (STR_EMBED_P(str)) {
249 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
250 }
251 /* if the string is not currently embedded, but it can be embedded, how
252 * much space would it require */
253 else if (rb_str_reembeddable_p(str)) {
254 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
255 }
256 else {
257 real_size = sizeof(struct RString);
258 }
259
260 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
261 real_size += sizeof(st_index_t);
262 }
263
264 return real_size;
265}
266
/*
 * Returns whether a string of `len` bytes plus a `termlen`-byte terminator
 * can be allocated as an embedded object, as decided by
 * rb_gc_size_allocatable_p().
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
272
273static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
274static VALUE str_new_frozen(VALUE klass, VALUE orig);
275static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
276static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
277static VALUE str_new(VALUE klass, const char *ptr, long len);
278static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
279static inline void str_modifiable(VALUE str);
280static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
281static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
282
283static inline void
284str_make_independent(VALUE str)
285{
286 long len = RSTRING_LEN(str);
287 int termlen = TERM_LEN(str);
288 str_make_independent_expand((str), len, 0L, termlen);
289}
290
291static inline int str_dependent_p(VALUE str);
292
293void
294rb_str_make_independent(VALUE str)
295{
296 if (str_dependent_p(str)) {
297 str_make_independent(str);
298 }
299}
300
301void
302rb_str_make_embedded(VALUE str)
303{
304 RUBY_ASSERT(rb_str_reembeddable_p(str));
305 RUBY_ASSERT(!STR_EMBED_P(str));
306
307 char *buf = RSTRING(str)->as.heap.ptr;
308 long len = RSTRING(str)->len;
309
310 STR_SET_EMBED(str);
311 STR_SET_LEN(str, len);
312
313 if (len > 0) {
314 memcpy(RSTRING_PTR(str), buf, len);
315 ruby_xfree(buf);
316 }
317
318 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
319}
320
/*
 * Debugging aid: prints a warning to stderr saying that `func` is about to
 * return a NULL string pointer.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
330
331/* symbols for [up|down|swap]case/capitalize options */
332static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
333
334static rb_encoding *
335get_encoding(VALUE str)
336{
337 return rb_enc_from_index(ENCODING_GET(str));
338}
339
340static void
341mustnot_broken(VALUE str)
342{
343 if (is_broken_string(str)) {
344 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
345 }
346}
347
348static void
349mustnot_wchar(VALUE str)
350{
351 rb_encoding *enc = STR_ENC_GET(str);
352 if (rb_enc_mbminlen(enc) > 1) {
353 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
354 }
355}
356
357static int fstring_cmp(VALUE a, VALUE b);
358
359static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
360
361#if SIZEOF_LONG == SIZEOF_VOIDP
362#define PRECOMPUTED_FAKESTR_HASH 1
363#else
364#endif
365
366#ifdef PRECOMPUTED_FAKESTR_HASH
367static st_index_t
368fstring_hash(VALUE str)
369{
370 st_index_t h;
371 if (FL_TEST_RAW(str, STR_FAKESTR)) {
372 // register_fstring precomputes the hash and stores it in capa for fake strings
373 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
374 }
375 else {
376 h = rb_str_hash(str);
377 }
378 // rb_str_hash doesn't include the encoding for ascii only strings, so
379 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
380 return rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
381}
382#else
383#define fstring_hash rb_str_hash
384#endif
385
386#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387
388static inline st_index_t
389str_do_hash(VALUE str)
390{
391 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
393 if (e && !is_ascii_string(str)) {
394 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
395 }
396 return h;
397}
398
399static VALUE
400str_store_precomputed_hash(VALUE str, st_index_t hash)
401{
402 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
403 RUBY_ASSERT(STR_EMBED_P(str));
404
405#if RUBY_DEBUG
406 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
407 size_t free_bytes = str_embed_capa(str) - used_bytes;
408 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
409#endif
410
411 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
412
413 FL_SET(str, STR_PRECOMPUTED_HASH);
414
415 return str;
416}
417
419 bool copy;
420 bool force_precompute_hash;
421};
422
/*
 * Builds the string object that is stored in the fstring table for `str`.
 *
 * Fake strings (STR_FAKESTR):
 *   - arg->copy set: a new String is allocated and the bytes are copied.
 *     With arg->force_precompute_hash, and when the bytes plus an
 *     st_index_t fit in an embedded object, the copy is embedded and its
 *     hash is stored after the terminator.  Otherwise a plain copy is made
 *     and, under PRECOMPUTED_FAKESTR_HASH, the hash cached in the fake
 *     string's aux.capa is stored when the copy has enough capacity.
 *   - arg->copy clear: a static string pointing at the same bytes is made.
 *   The result is frozen.
 * Real strings: replaced by a frozen String copy when not frozen (or
 *   chilled), made independent when shared, and replaced again when not a
 *   bare String (see BARE_STRING_P).
 *
 * The coderange read on entry is re-applied to the result and RSTRING_FSTR
 * is set on it.
 */
423static VALUE
424build_fstring(VALUE str, struct fstr_update_arg *arg)
425{
426 // Unless the string is empty or binary, its coderange has been precomputed.
427 int coderange = ENC_CODERANGE(str);
428
429 if (FL_TEST_RAW(str, STR_FAKESTR)) {
430 if (arg->copy) {
431 VALUE new_str;
432 long len = RSTRING_LEN(str);
// capa reserves room for the precomputed hash behind the contents
433 long capa = len + sizeof(st_index_t);
434 int term_len = TERM_LEN(str);
435
436 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
437 new_str = str_alloc_embed(rb_cString, capa + term_len);
438 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
439 STR_SET_LEN(new_str, RSTRING_LEN(str));
440 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
441 rb_enc_copy(new_str, str);
442 str_store_precomputed_hash(new_str, str_do_hash(str));
443 }
444 else {
445 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
446 rb_enc_copy(new_str, str);
447#ifdef PRECOMPUTED_FAKESTR_HASH
// aux.capa of a fake string holds the hash precomputed by register_fstring
448 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
449 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
450 }
451#endif
452 }
453 str = new_str;
454 }
455 else {
456 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
457 RSTRING(str)->len,
458 ENCODING_GET(str));
459 }
460 OBJ_FREEZE(str);
461 }
462 else {
463 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
464 str = str_new_frozen(rb_cString, str);
465 }
466 if (STR_SHARED_P(str)) { /* str should not be shared */
467 /* shared substring */
468 str_make_independent(str);
470 }
471 if (!BARE_STRING_P(str)) {
472 str = str_new_frozen(rb_cString, str);
473 }
474 }
475
// restore the coderange captured before any copy was made
476 ENC_CODERANGE_SET(str, coderange);
477 RBASIC(str)->flags |= RSTRING_FSTR;
478
481 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
484 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
485
486 return str;
487}
488
/*
 * Returns the interned (fstring) version of `str`.
 *
 * - Raises via Check_Type unless `str` is a T_STRING.
 * - A string already flagged RSTRING_FSTR is returned as is.
 * - A non-bare string (see BARE_STRING_P) is never entered into the table
 *   itself: an embedded one is simply frozen and returned; one that is a
 *   shared root (and not itself shared) is returned unchanged; otherwise it
 *   is made to share the registered fstring's buffer, frozen and returned.
 * - A bare string yields the table entry returned by register_fstring().
 */
489VALUE
490rb_fstring(VALUE str)
491{
492 VALUE fstr;
493 int bare;
494
495 Check_Type(str, T_STRING);
496
497 if (FL_TEST(str, RSTRING_FSTR))
498 return str;
499
500 bare = BARE_STRING_P(str);
501 if (!bare) {
502 if (STR_EMBED_P(str)) {
503 OBJ_FREEZE(str);
504 return str;
505 }
506
507 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
509 return str;
510 }
511 }
512
// Resize to the exact length unless the string is frozen, NOFREE or chilled.
513 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
514 rb_str_resize(str, RSTRING_LEN(str));
515
516 fstr = register_fstring(str, false, false);
517
518 if (!bare) {
519 str_replace_shared_without_enc(str, fstr);
520 OBJ_FREEZE(str);
521 return str;
522 }
523 return fstr;
524}
525
526#define FSTRING_TABLE_EMPTY Qfalse
527#define FSTRING_TABLE_TOMBSTONE Qtrue
528#define FSTRING_TABLE_MOVED Qundef
529
531 VALUE str;
532 VALUE hash;
533};
534
536 struct fstring_table_entry *entries;
537 unsigned int capacity;
538 unsigned int deleted_entries;
539 rb_atomic_t count; // TODO: pad to own cache line?
540};
541
542static void
543fstring_table_free(void *ptr)
544{
545 struct fstring_table_struct *table = ptr;
546 xfree(table->entries);
547}
548
549static size_t
550fstring_table_size(const void *ptr)
551{
552 const struct fstring_table_struct *table = ptr;
553 return sizeof(struct fstring_table_struct) + sizeof(struct fstring_table_entry) * table->capacity;
554}
555
556// We declare a type for the table so that we can lean on Ruby's GC for deferred reclamation.
// No mark function is registered (dmark is NULL): the table does not keep
// its strings alive through marking. Entries are freed by fstring_table_free
// and sized by fstring_table_size.
557static const rb_data_type_t fstring_table_type = {
558 .wrap_struct_name = "VM/fstring_table",
559 .function = {
560 .dmark = NULL,
561 .dfree = fstring_table_free,
562 .dsize = fstring_table_size,
563 },
564 .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE
565};
566
567
568static VALUE fstring_table_obj;
569
570static VALUE
571new_fstring_table(int capacity)
572{
573 VALUE obj;
574 struct fstring_table_struct *table;
575 obj = TypedData_Make_Struct(0, struct fstring_table_struct, &fstring_table_type, table);
576 table->capacity = capacity;
577 table->count = 0;
578 table->entries = ZALLOC_N(struct fstring_table_entry, capacity);
579 return obj;
580}
581
582void
583Init_fstring_table(void)
584{
585 fstring_table_obj = new_fstring_table(8192);
586 rb_gc_register_address(&fstring_table_obj);
587}
588
589#if 0
590
591// Linear probe
592struct fstring_table_probe {
593 int idx;
594 int mask;
595};
596
597static int
598fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
599{
600 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
601 probe->mask = table->capacity - 1;
602 probe->idx = hash_code & probe->mask;
603 return probe->idx;
604}
605
606static int
607fstring_table_probe_next(struct fstring_table_probe *probe)
608{
609 probe->idx = (probe->idx + 1) & probe->mask;
610 return probe->idx;
611}
612
613#else
614
615// Struct containing probe information. Intended that the compiler should always inline this
616// Quadratic probing
618 int idx;
619 int d;
620 int mask;
621};
622
623static int
624fstring_table_probe_start(struct fstring_table_probe *probe, struct fstring_table_struct *table, VALUE hash_code)
625{
626 RUBY_ASSERT((table->capacity & (table->capacity - 1)) == 0);
627 probe->d = 0;
628 probe->mask = table->capacity - 1;
629 probe->idx = hash_code & probe->mask;
630 return probe->idx;
631}
632
633static int
634fstring_table_probe_next(struct fstring_table_probe *probe)
635{
636 probe->d++;
637 probe->idx = (probe->idx + probe->d) & probe->mask;
638 return probe->idx;
639}
640#endif
641
642#define RUBY_ATOMIC_VALUE_LOAD(x) (VALUE)(RUBY_ATOMIC_PTR_LOAD(x))
643
644static void
645fstring_insert_on_resize(struct fstring_table_struct *table, VALUE hash_code, VALUE value)
646{
647 struct fstring_table_probe probe;
648 int idx = fstring_table_probe_start(&probe, table, hash_code);
649
650 for (;;) {
651 struct fstring_table_entry *entry = &table->entries[idx];
652 VALUE candidate = entry->str;
653
654 RUBY_ASSERT(candidate != FSTRING_TABLE_TOMBSTONE);
655 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
656
657 if (candidate == FSTRING_TABLE_EMPTY) {
658 table->count++;
659
660 RUBY_ASSERT(table->count < table->capacity / 2);
661 RUBY_ASSERT(entry->hash == 0);
662
663 entry->str = value;
664 entry->hash = hash_code;
665 return;
666 }
667
668 idx = fstring_table_probe_next(&probe);
669 }
670}
671
672// Rebuilds the table into a freshly allocated one and publishes it, all
// under the VM lock. `old_table_obj` is the table the caller observed; if
// the global table is no longer that object, another thread has already
// resized and nothing is done.
673static void
674fstring_try_resize(VALUE old_table_obj)
675{
676 RB_VM_LOCK_ENTER();
677
678 // Check if another thread has already resized
679 if (RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj) != old_table_obj) {
680 goto end;
681 }
682
683 struct fstring_table_struct *old_table = RTYPEDDATA_GET_DATA(old_table_obj);
684
685 // This may overcount by up to the number of threads concurrently attempting to insert
686 // GC may also happen between now and the table being rebuilt
687 int expected_count = RUBY_ATOMIC_LOAD(old_table->count) - old_table->deleted_entries;
688
// Sizing policy: double by default; halve when the doubled size would
// exceed 8x the expected entries; keep the size when it would exceed 4x.
689 struct fstring_table_entry *old_entries = old_table->entries;
690 int old_capacity = old_table->capacity;
691 int new_capacity = old_capacity * 2;
692 if (new_capacity > expected_count * 8) {
693 new_capacity = old_capacity / 2;
694 }
695 else if (new_capacity > expected_count * 4) {
696 new_capacity = old_capacity;
697 }
698
699 // May cause GC and therefore deletes, so must happen first
700 VALUE new_table_obj = new_fstring_table(new_capacity);
701 struct fstring_table_struct *new_table = RTYPEDDATA_GET_DATA(new_table_obj);
702
703 for (int i = 0; i < old_capacity; i++) {
704 struct fstring_table_entry *entry = &old_entries[i];
// Mark every old slot as MOVED while taking its value, so that concurrent
// inserters notice the resize (see fstring_find_or_insert).
705 VALUE val = RUBY_ATOMIC_VALUE_EXCHANGE(entry->str, FSTRING_TABLE_MOVED);
706 RUBY_ASSERT(val != FSTRING_TABLE_MOVED);
707 if (val == FSTRING_TABLE_EMPTY) continue;
708 if (val == FSTRING_TABLE_TOMBSTONE) continue;
709 if (rb_objspace_garbage_object_p(val)) continue;
710
711 VALUE hash_code = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
712 if (hash_code == 0) {
713 // Either in-progress insert or extremely unlikely 0 hash
714 // Re-calculate the hash ourselves
715 hash_code = fstring_hash(val);
716 }
717 RUBY_ASSERT(hash_code == fstring_hash(val));
718 fstring_insert_on_resize(new_table, hash_code, val);
719 }
720
721#if 0
722 fprintf(stderr, "resized: %p(%i) -> %p(%i) (count: %i->%i)\n", old_table, old_table->capacity, new_table, new_table->capacity, old_table->count, new_table->count);
723#endif
724
// Publish the new table.
725 RUBY_ATOMIC_VALUE_SET(fstring_table_obj, new_table_obj);
726
727end:
728 RB_GC_GUARD(old_table_obj);
729 RB_VM_LOCK_LEAVE();
730}
731
/*
 * Looks up a string equal to `value` (per fstring_cmp) in the fstring
 * table; if none is found, inserts one built by build_fstring() and returns
 * it.  `hash_code` must be fstring_hash(value).
 *
 * The lookup runs without taking the VM lock, using atomic loads and a CAS
 * on the slot.  The whole operation restarts (`retry`) on the current
 * global table when a resize is triggered or a MOVED slot is observed.
 */
732static VALUE
733fstring_find_or_insert(VALUE hash_code, VALUE value, struct fstr_update_arg *arg)
734{
735 struct fstring_table_probe probe;
736 bool inserting = false;
737 int idx;
738 VALUE table_obj;
739 struct fstring_table_struct *table;
740
741 retry:
742 table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
743 RUBY_ASSERT(table_obj);
744 table = RTYPEDDATA_GET_DATA(table_obj);
745 idx = fstring_table_probe_start(&probe, table, hash_code);
746
747 for (;;) {
748 struct fstring_table_entry *entry = &table->entries[idx];
749 VALUE candidate = RUBY_ATOMIC_VALUE_LOAD(entry->str);
750
751 if (candidate == FSTRING_TABLE_EMPTY) {
752 // Not in table
753 if (!inserting) {
754 // Prepare a string suitable for inserting into the table
// (done at most once, even if the insert is retried)
755 value = build_fstring(value, arg);
756 RUBY_ASSERT(hash_code == fstring_hash(value));
757 inserting = true;
758 }
759
// Reserve a slot in the count before attempting the CAS.
760 unsigned int prev_count = RUBY_ATOMIC_FETCH_ADD(table->count, 1);
761
// More than half full: rebuild the table, then start over on whichever
// table is current afterwards.
762 if (UNLIKELY(prev_count > table->capacity / 2)) {
763 fstring_try_resize(table_obj);
764 goto retry;
765 }
766
767 VALUE found = RUBY_ATOMIC_VALUE_CAS(entry->str, FSTRING_TABLE_EMPTY, value);
768 if (found == FSTRING_TABLE_EMPTY) {
769 // Success! Our value was inserted
770
771 // Also set the hash code
772 RUBY_ATOMIC_VALUE_SET(entry->hash, hash_code);
773
774 RB_GC_GUARD(table_obj);
775 return value;
776 }
777 else {
778 // Nothing was inserted
779 RUBY_ATOMIC_DEC(table->count); // we didn't end up inserting
780
781 // Another thread won the race, try again at the same location
782 continue;
783 }
784 }
785 else if (candidate == FSTRING_TABLE_TOMBSTONE) {
786 // Deleted entry, continue searching
787 }
788 else if (candidate == FSTRING_TABLE_MOVED) {
789 // Wait
// A resize holds the VM lock while it marks slots MOVED; taking and
// releasing the lock here blocks until that resize has finished.
790 RB_VM_LOCK_ENTER();
791 RB_VM_LOCK_LEAVE();
792
793 goto retry;
794 }
795 else {
796 VALUE candidate_hash = RUBY_ATOMIC_VALUE_LOAD(entry->hash);
// A stored hash of 0 may belong to an insert whose hash has not been
// written yet, so such entries are compared by content as well.
797 if ((candidate_hash == hash_code || candidate_hash == 0) && !fstring_cmp(candidate, value)) {
798 // We've found a match
799 if (UNLIKELY(rb_objspace_garbage_object_p(candidate))) {
800 // This is a weakref table, so after marking but before sweeping is complete we may find a matching garbage object.
801 // Skip it and mark it as a tombstone to help other threads out
802 RUBY_ATOMIC_VALUE_CAS(entry->str, candidate, FSTRING_TABLE_TOMBSTONE);
803
804 // Fall through and continue our search
805 }
806 else {
807 RB_GC_GUARD(table_obj);
808 return candidate;
809 }
810 }
811 }
812
813 idx = fstring_table_probe_next(&probe);
814 }
815}
816
817
818// Removes an fstring from the table. Compares by identity
819static void
820fstring_delete(VALUE hash_code, VALUE value)
821{
822 // Delete is never called concurrently, so atomic operations are unnecessary
823 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
824 RUBY_ASSERT_ALWAYS(table_obj);
825 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
826
827 struct fstring_table_probe probe;
828 int idx = fstring_table_probe_start(&probe, table, hash_code);
829
830 for (;;) {
831 struct fstring_table_entry *entry = &table->entries[idx];
832 VALUE candidate = entry->str;
833
834 // Allocations should only occur at the beginning of the resize
835 RUBY_ASSERT(candidate != FSTRING_TABLE_MOVED);
836
837 if (candidate == FSTRING_TABLE_EMPTY) {
838 // We didn't find our string to delete
839 return;
840 }
841 else if (candidate == value) {
842 // We found our string, replace it with a tombstone and increment the count
843 entry->str = FSTRING_TABLE_TOMBSTONE;
844 table->deleted_entries++;
845 return;
846 }
847
848 idx = fstring_table_probe_next(&probe);
849 }
850}
851
/*
 * Interns `str` in the fstring table and returns the table's entry for it.
 * `copy` and `force_precompute_hash` are passed on to build_fstring()
 * through a struct fstr_update_arg and only matter when a new entry has to
 * be built.
 */
852static VALUE
853register_fstring(VALUE str, bool copy, bool force_precompute_hash)
854{
855 struct fstr_update_arg args = {
856 .copy = copy,
857 .force_precompute_hash = force_precompute_hash
858 };
859
860#if SIZEOF_VOIDP == SIZEOF_LONG
861 if (FL_TEST_RAW(str, STR_FAKESTR)) {
862 // if the string hasn't been interned, we'll need the hash twice, so we
863 // compute it once and store it in capa
// (fstring_hash reads it back from aux.capa for fake strings)
864 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
865 }
866#endif
867
868 VALUE hash_code = fstring_hash(str);
869 VALUE result = fstring_find_or_insert(hash_code, str, &args);
870
871 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
873 RUBY_ASSERT(OBJ_FROZEN(result));
874 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
877
878 return result;
879}
880
881void
882rb_fstring_foreach_with_replace(st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg)
883{
884 // Assume locking and barrier (which there is no assert for)
885 ASSERT_vm_locking();
886
887 VALUE table_obj = RUBY_ATOMIC_VALUE_LOAD(fstring_table_obj);
888 if (!table_obj) {
889 // Table not yet initialized. Nothing to iterate over
890 return;
891 }
892 struct fstring_table_struct *table = RTYPEDDATA_GET_DATA(table_obj);
893
894 for (unsigned int i = 0; i < table->capacity; i++) {
895 VALUE key = table->entries[i].str;
896 if(key == FSTRING_TABLE_EMPTY) continue;
897 if(key == FSTRING_TABLE_TOMBSTONE) continue;
898
899 enum st_retval retval;
900 retval = (*func)(key, key, arg, 0);
901
902 if (retval == ST_REPLACE && replace) {
903 st_data_t value = key;
904 retval = (*replace)(&key, &value, arg, TRUE);
905 table->entries[i].str = key;
906 }
907 switch (retval) {
908 case ST_REPLACE:
909 case ST_CONTINUE:
910 break;
911 case ST_CHECK:
912 rb_bug("unsupported");
913 case ST_STOP:
914 return;
915 case ST_DELETE:
916 table->entries[i].str = FSTRING_TABLE_TOMBSTONE;
917 break;
918 }
919 }
920}
921
922bool
923rb_obj_is_fstring_table(VALUE obj)
924{
925 ASSERT_vm_locking();
926
927 return obj == fstring_table_obj;
928}
929
930void
931rb_gc_free_fstring(VALUE obj)
932{
933 // Assume locking and barrier (which there is no assert for)
934 ASSERT_vm_locking();
935
936 VALUE str_hash = fstring_hash(obj);
937 fstring_delete(str_hash, obj);
938
939 RB_DEBUG_COUNTER_INC(obj_str_fstr);
940
941 FL_UNSET(obj, RSTRING_FSTR);
942}
943
/*
 * Initializes the caller-provided `fake_str` header as a String that points
 * at `name` (length `len`) with encoding index `encidx`, and returns it as
 * a VALUE.  No memory is allocated; the flags mark the object as a
 * non-embedded, NOFREE fake string.  A NULL `name` is replaced by "".
 */
944static VALUE
945setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
946{
947 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
948
949 if (!name) {
951 name = "";
952 }
953
954 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
955
956 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
957 fake_str->len = len;
// the const qualifier is dropped here; the buffer belongs to the caller
958 fake_str->as.heap.ptr = (char *)name;
959 fake_str->as.heap.aux.capa = len;
960 return (VALUE)fake_str;
961}
962
963/*
964 * set up a fake string which refers a static string literal.
965 */
966VALUE
967rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
968{
969 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
970}
971
972/*
973 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
974 * shared string which refers a static string literal. `ptr` must
975 * point a constant string.
976 */
977VALUE
978rb_fstring_new(const char *ptr, long len)
979{
980 struct RString fake_str;
981 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
982}
983
984VALUE
985rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
986{
987 struct RString fake_str;
988 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
989}
990
991VALUE
992rb_fstring_cstr(const char *ptr)
993{
994 return rb_fstring_new(ptr, strlen(ptr));
995}
996
/*
 * Equality test used by the fstring table.  Returns 0 when `a` and `b` have
 * the same length, the same encoding index and identical bytes; returns
 * non-zero otherwise (it does not order the strings).
 */
997static int
998fstring_cmp(VALUE a, VALUE b)
999{
1000 long alen, blen;
1001 const char *aptr, *bptr;
1002
1005
1006 RSTRING_GETMEM(a, aptr, alen);
1007 RSTRING_GETMEM(b, bptr, blen);
// cheap checks first; memcmp only runs when length and encoding agree
1008 return (alen != blen ||
1009 ENCODING_GET(a) != ENCODING_GET(b) ||
1010 memcmp(aptr, bptr, alen) != 0);
1011}
1012
/*
 * Returns true when every character of `str` is known to occupy exactly
 * one byte, so byte offsets can be used as character offsets.
 * The answer is conservative: false may be returned for strings that
 * happen to contain only single-byte characters.
 */
1013static inline bool
1014single_byte_optimizable(VALUE str)
1015{
1016 int encindex = ENCODING_GET(str);
1017 switch (encindex) {
1018 case ENCINDEX_ASCII_8BIT:
1019 case ENCINDEX_US_ASCII:
1020 return true;
1021 case ENCINDEX_UTF_8:
1022 // For UTF-8 it's worth scanning the string coderange when unknown.
// NOTE(review): no statement for this case is visible in this listing;
// confirm the UTF-8 handling against the upstream file.
1024 }
1025 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
1026 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
1027 return true;
1028 }
1029
// Encodings whose maximum character length is one byte.
1030 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
1031 return true;
1032 }
1033
1034 /* Conservative. Possibly single byte.
1035 * "\xa1" in Shift_JIS for example. */
1036 return false;
1037}
1038
1040
/*
 * Scans the byte range [p, e) and returns a pointer to the first byte whose
 * high bit (0x80) is set, or NULL when there is none.
 *
 * The bulk of the range is examined one uintptr_t word at a time by masking
 * each word with NONASCII_MASK (0x80 repeated in every byte).  Bytes before
 * the first aligned word (only when unaligned word access is not allowed)
 * and the bytes left after the last full word are checked individually by
 * the two fall-through switches.
 */
1041static inline const char *
1042search_nonascii(const char *p, const char *e)
1043{
1044 const uintptr_t *s, *t;
1045
1046#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
1047# if SIZEOF_UINTPTR_T == 8
1048# define NONASCII_MASK UINT64_C(0x8080808080808080)
1049# elif SIZEOF_UINTPTR_T == 4
1050# define NONASCII_MASK UINT32_C(0x80808080)
1051# else
1052# error "don't know what to do."
1053# endif
1054#else
1055# if SIZEOF_UINTPTR_T == 8
1056# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
1057# elif SIZEOF_UINTPTR_T == 4
1058# define NONASCII_MASK 0x80808080UL /* or...? */
1059# else
1060# error "don't know what to do."
1061# endif
1062#endif
1063
// Word-wise scan: always taken when unaligned access is allowed, otherwise
// only when at least one pointer-sized chunk is available.
1064 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
1065#if !UNALIGNED_WORD_ACCESS
1066 if ((uintptr_t)p % SIZEOF_VOIDP) {
// `l` is the number of bytes up to the next aligned address; `p` is
// advanced first and the skipped bytes are checked through negative
// offsets, with the cases falling through in address order.
1067 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
1068 p += l;
1069 switch (l) {
1070 default: UNREACHABLE;
1071#if SIZEOF_VOIDP > 4
1072 case 7: if (p[-7]&0x80) return p-7;
1073 case 6: if (p[-6]&0x80) return p-6;
1074 case 5: if (p[-5]&0x80) return p-5;
1075 case 4: if (p[-4]&0x80) return p-4;
1076#endif
1077 case 3: if (p[-3]&0x80) return p-3;
1078 case 2: if (p[-2]&0x80) return p-2;
1079 case 1: if (p[-1]&0x80) return p-1;
1080 case 0: break;
1081 }
1082 }
1083#endif
1084#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
1085#define aligned_ptr(value) \
1086 __builtin_assume_aligned((value), sizeof(uintptr_t))
1087#else
1088#define aligned_ptr(value) (uintptr_t *)(value)
1089#endif
1090 s = aligned_ptr(p);
// `t` is the limit for word reads: a word starting at or after it would
// extend past `e`.
1091 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
1092#undef aligned_ptr
1093 for (;s < t; s++) {
1094 if (*s & NONASCII_MASK) {
// Locate the first flagged byte inside the word: count leading zero bits
// on big-endian hosts, trailing zero bits otherwise, then divide by 8.
1095#ifdef WORDS_BIGENDIAN
1096 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
1097#else
1098 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
1099#endif
1100 }
1101 }
1102 p = (const char *)s;
1103 }
1104
// Tail: fewer than SIZEOF_VOIDP bytes remain; check them in address order.
1105 switch (e - p) {
1106 default: UNREACHABLE;
1107#if SIZEOF_VOIDP > 4
1108 case 7: if (e[-7]&0x80) return e-7;
1109 case 6: if (e[-6]&0x80) return e-6;
1110 case 5: if (e[-5]&0x80) return e-5;
1111 case 4: if (e[-4]&0x80) return e-4;
1112#endif
1113 case 3: if (e[-3]&0x80) return e-3;
1114 case 2: if (e[-2]&0x80) return e-2;
1115 case 1: if (e[-1]&0x80) return e-1;
1116 case 0: return NULL;
1117 }
1118}
1119
1120static int
1121coderange_scan(const char *p, long len, rb_encoding *enc)
1122{
1123 const char *e = p + len;
1124
1125 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1126 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
1127 p = search_nonascii(p, e);
1129 }
1130
1131 if (rb_enc_asciicompat(enc)) {
1132 p = search_nonascii(p, e);
1133 if (!p) return ENC_CODERANGE_7BIT;
1134 for (;;) {
1135 int ret = rb_enc_precise_mbclen(p, e, enc);
1136 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1137 p += MBCLEN_CHARFOUND_LEN(ret);
1138 if (p == e) break;
1139 p = search_nonascii(p, e);
1140 if (!p) break;
1141 }
1142 }
1143 else {
1144 while (p < e) {
1145 int ret = rb_enc_precise_mbclen(p, e, enc);
1146 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
1147 p += MBCLEN_CHARFOUND_LEN(ret);
1148 }
1149 }
1150 return ENC_CODERANGE_VALID;
1151}
1152
1153long
1154rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
1155{
1156 const char *p = s;
1157
1158 if (*cr == ENC_CODERANGE_BROKEN)
1159 return e - s;
1160
1161 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
1162 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
1163 if (*cr == ENC_CODERANGE_VALID) return e - s;
1164 p = search_nonascii(p, e);
1166 return e - s;
1167 }
1168 else if (rb_enc_asciicompat(enc)) {
1169 p = search_nonascii(p, e);
1170 if (!p) {
1171 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
1172 return e - s;
1173 }
1174 for (;;) {
1175 int ret = rb_enc_precise_mbclen(p, e, enc);
1176 if (!MBCLEN_CHARFOUND_P(ret)) {
1178 return p - s;
1179 }
1180 p += MBCLEN_CHARFOUND_LEN(ret);
1181 if (p == e) break;
1182 p = search_nonascii(p, e);
1183 if (!p) break;
1184 }
1185 }
1186 else {
1187 while (p < e) {
1188 int ret = rb_enc_precise_mbclen(p, e, enc);
1189 if (!MBCLEN_CHARFOUND_P(ret)) {
1191 return p - s;
1192 }
1193 p += MBCLEN_CHARFOUND_LEN(ret);
1194 }
1195 }
1196 *cr = ENC_CODERANGE_VALID;
1197 return e - s;
1198}
1199
1200static inline void
1201str_enc_copy(VALUE str1, VALUE str2)
1202{
1203 rb_enc_set_index(str1, ENCODING_GET(str2));
1204}
1205
1206/* Like str_enc_copy, but does not check frozen status of str1.
1207 * You should use this only if you're certain that str1 is not frozen. */
1208static inline void
1209str_enc_copy_direct(VALUE str1, VALUE str2)
1210{
1211 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
1212 if (inlined_encoding == ENCODING_INLINE_MAX) {
1213 rb_enc_set_index(str1, rb_enc_get_index(str2));
1214 }
1215 else {
1216 ENCODING_SET_INLINED(str1, inlined_encoding);
1217 }
1218}
1219
1220static void
1221rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
1222{
1223 /* this function is designed for copying encoding and coderange
1224 * from src to new string "dest" which is made from the part of src.
1225 */
1226 str_enc_copy(dest, src);
1227 if (RSTRING_LEN(dest) == 0) {
1228 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
1230 else
1232 return;
1233 }
1234 switch (ENC_CODERANGE(src)) {
1235 case ENC_CODERANGE_7BIT:
1237 break;
1239 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
1240 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
1242 else
1244 break;
1245 default:
1246 break;
1247 }
1248}
1249
1250static void
1251rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
1252{
1253 str_enc_copy(dest, src);
1254 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
1255}
1256
1257static int
1258enc_coderange_scan(VALUE str, rb_encoding *enc)
1259{
1260 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
1261}
1262
1263int
1264rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
1265{
1266 return enc_coderange_scan(str, enc);
1267}
1268
1269int
1271{
1272 int cr = ENC_CODERANGE(str);
1273
1274 if (cr == ENC_CODERANGE_UNKNOWN) {
1275 cr = enc_coderange_scan(str, get_encoding(str));
1276 ENC_CODERANGE_SET(str, cr);
1277 }
1278 return cr;
1279}
1280
1281static inline bool
1282rb_enc_str_asciicompat(VALUE str)
1283{
1284 int encindex = ENCODING_GET_INLINED(str);
1285 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
1286}
1287
1288int
1290{
1291 switch(ENC_CODERANGE(str)) {
1293 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
1294 case ENC_CODERANGE_7BIT:
1295 return true;
1296 default:
1297 return false;
1298 }
1299}
1300
1301static inline void
1302str_mod_check(VALUE s, const char *p, long len)
1303{
1304 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
1305 rb_raise(rb_eRuntimeError, "string modified");
1306 }
1307}
1308
1309static size_t
1310str_capacity(VALUE str, const int termlen)
1311{
1312 if (STR_EMBED_P(str)) {
1313 return str_embed_capa(str) - termlen;
1314 }
1315 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1316 return RSTRING(str)->len;
1317 }
1318 else {
1319 return RSTRING(str)->as.heap.aux.capa;
1320 }
1321}
1322
1323size_t
1325{
1326 return str_capacity(str, TERM_LEN(str));
1327}
1328
1329static inline void
1330must_not_null(const char *ptr)
1331{
1332 if (!ptr) {
1333 rb_raise(rb_eArgError, "NULL pointer given");
1334 }
1335}
1336
1337static inline VALUE
1338str_alloc_embed(VALUE klass, size_t capa)
1339{
1340 size_t size = rb_str_embed_size(capa);
1341 RUBY_ASSERT(size > 0);
1342 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1343
1344 NEWOBJ_OF(str, struct RString, klass,
1346
1347 return (VALUE)str;
1348}
1349
1350static inline VALUE
1351str_alloc_heap(VALUE klass)
1352{
1353 NEWOBJ_OF(str, struct RString, klass,
1354 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1355
1356 return (VALUE)str;
1357}
1358
1359static inline VALUE
1360empty_str_alloc(VALUE klass)
1361{
1362 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1363 VALUE str = str_alloc_embed(klass, 0);
1364 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1366 return str;
1367}
1368
1369static VALUE
1370str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1371{
1372 VALUE str;
1373
1374 if (len < 0) {
1375 rb_raise(rb_eArgError, "negative string size (or size too big)");
1376 }
1377
1378 if (enc == NULL) {
1379 enc = rb_ascii8bit_encoding();
1380 }
1381
1382 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1383
1384 int termlen = rb_enc_mbminlen(enc);
1385
1386 if (STR_EMBEDDABLE_P(len, termlen)) {
1387 str = str_alloc_embed(klass, len + termlen);
1388 if (len == 0) {
1389 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1390 }
1391 }
1392 else {
1393 str = str_alloc_heap(klass);
1394 RSTRING(str)->as.heap.aux.capa = len;
1395 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1396 * integer overflow. If we can STATIC_ASSERT that, the following
1397 * mul_add_mul can be reverted to a simple ALLOC_N. */
1398 RSTRING(str)->as.heap.ptr =
1399 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1400 }
1401
1402 rb_enc_raw_set(str, enc);
1403
1404 if (ptr) {
1405 memcpy(RSTRING_PTR(str), ptr, len);
1406 }
1407
1408 STR_SET_LEN(str, len);
1409 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1410 return str;
1411}
1412
1413static VALUE
1414str_new(VALUE klass, const char *ptr, long len)
1415{
1416 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1417}
1418
1419VALUE
1420rb_str_new(const char *ptr, long len)
1421{
1422 return str_new(rb_cString, ptr, len);
1423}
1424
1425VALUE
1426rb_usascii_str_new(const char *ptr, long len)
1427{
1428 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1429}
1430
1431VALUE
1432rb_utf8_str_new(const char *ptr, long len)
1433{
1434 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1435}
1436
1437VALUE
1438rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1439{
1440 return str_enc_new(rb_cString, ptr, len, enc);
1441}
1442
1443VALUE
1445{
1446 must_not_null(ptr);
1447 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1448 * memory regions, and that cannot be detected by the MSAN. Just
1449 * trust the programmer that the argument passed here is a sane C
1450 * string. */
1451 __msan_unpoison_string(ptr);
1452 return rb_str_new(ptr, strlen(ptr));
1453}
1454
1455VALUE
1457{
1458 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1459}
1460
1461VALUE
1463{
1464 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1465}
1466
1467VALUE
1469{
1470 must_not_null(ptr);
1471 if (rb_enc_mbminlen(enc) != 1) {
1472 rb_raise(rb_eArgError, "wchar encoding given");
1473 }
1474 return rb_enc_str_new(ptr, strlen(ptr), enc);
1475}
1476
1477static VALUE
1478str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1479{
1480 VALUE str;
1481
1482 if (len < 0) {
1483 rb_raise(rb_eArgError, "negative string size (or size too big)");
1484 }
1485
1486 if (!ptr) {
1487 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1488 }
1489 else {
1490 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1491 str = str_alloc_heap(klass);
1492 RSTRING(str)->len = len;
1493 RSTRING(str)->as.heap.ptr = (char *)ptr;
1494 RSTRING(str)->as.heap.aux.capa = len;
1495 RBASIC(str)->flags |= STR_NOFREE;
1496 rb_enc_associate_index(str, encindex);
1497 }
1498 return str;
1499}
1500
1501VALUE
1502rb_str_new_static(const char *ptr, long len)
1503{
1504 return str_new_static(rb_cString, ptr, len, 0);
1505}
1506
1507VALUE
1509{
1510 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1511}
1512
1513VALUE
1515{
1516 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1517}
1518
1519VALUE
1521{
1522 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1523}
1524
/* Forward declaration: rb_str_conv_enc_opts and rb_str_cat_conv_enc_opts
 * call this helper before its definition appears. */
1525static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1526 rb_encoding *from, rb_encoding *to,
1527 int ecflags, VALUE ecopts);
1528
1529static inline bool
1530is_enc_ascii_string(VALUE str, rb_encoding *enc)
1531{
1532 int encidx = rb_enc_to_index(enc);
1533 if (rb_enc_get_index(str) == encidx)
1534 return is_ascii_string(str);
1535 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1536}
1537
1538VALUE
1539rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1540{
1541 long len;
1542 const char *ptr;
1543 VALUE newstr;
1544
1545 if (!to) return str;
1546 if (!from) from = rb_enc_get(str);
1547 if (from == to) return str;
1548 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1549 rb_is_ascii8bit_enc(to)) {
1550 if (STR_ENC_GET(str) != to) {
1551 str = rb_str_dup(str);
1552 rb_enc_associate(str, to);
1553 }
1554 return str;
1555 }
1556
1557 RSTRING_GETMEM(str, ptr, len);
1558 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1559 from, to, ecflags, ecopts);
1560 if (NIL_P(newstr)) {
1561 /* some error, return original */
1562 return str;
1563 }
1564 return newstr;
1565}
1566
1567VALUE
1568rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1569 rb_encoding *from, int ecflags, VALUE ecopts)
1570{
1571 long olen;
1572
1573 olen = RSTRING_LEN(newstr);
1574 if (ofs < -olen || olen < ofs)
1575 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1576 if (ofs < 0) ofs += olen;
1577 if (!from) {
1578 STR_SET_LEN(newstr, ofs);
1579 return rb_str_cat(newstr, ptr, len);
1580 }
1581
1582 rb_str_modify(newstr);
1583 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1584 rb_enc_get(newstr),
1585 ecflags, ecopts);
1586}
1587
1588VALUE
1589rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1590{
1591 STR_SET_LEN(str, 0);
1592 rb_enc_associate(str, enc);
1593 rb_str_cat(str, ptr, len);
1594 return str;
1595}
1596
1597static VALUE
1598str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1599 rb_encoding *from, rb_encoding *to,
1600 int ecflags, VALUE ecopts)
1601{
1602 rb_econv_t *ec;
1604 long olen;
1605 VALUE econv_wrapper;
1606 const unsigned char *start, *sp;
1607 unsigned char *dest, *dp;
1608 size_t converted_output = (size_t)ofs;
1609
1610 olen = rb_str_capacity(newstr);
1611
1612 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1613 RBASIC_CLEAR_CLASS(econv_wrapper);
1614 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1615 if (!ec) return Qnil;
1616 DATA_PTR(econv_wrapper) = ec;
1617
1618 sp = (unsigned char*)ptr;
1619 start = sp;
1620 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1621 (dp = dest + converted_output),
1622 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1624 /* destination buffer short */
1625 size_t converted_input = sp - start;
1626 size_t rest = len - converted_input;
1627 converted_output = dp - dest;
1628 rb_str_set_len(newstr, converted_output);
1629 if (converted_input && converted_output &&
1630 rest < (LONG_MAX / converted_output)) {
1631 rest = (rest * converted_output) / converted_input;
1632 }
1633 else {
1634 rest = olen;
1635 }
1636 olen += rest < 2 ? 2 : rest;
1637 rb_str_resize(newstr, olen);
1638 }
1639 DATA_PTR(econv_wrapper) = 0;
1640 RB_GC_GUARD(econv_wrapper);
1641 rb_econv_close(ec);
1642 switch (ret) {
1643 case econv_finished:
1644 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1645 rb_str_set_len(newstr, len);
1646 rb_enc_associate(newstr, to);
1647 return newstr;
1648
1649 default:
1650 return Qnil;
1651 }
1652}
1653
1654VALUE
1656{
1657 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1658}
1659
1660VALUE
1662{
1663 rb_encoding *ienc;
1664 VALUE str;
1665 const int eidx = rb_enc_to_index(eenc);
1666
1667 if (!ptr) {
1668 return rb_enc_str_new(ptr, len, eenc);
1669 }
1670
1671 /* ASCII-8BIT case, no conversion */
1672 if ((eidx == rb_ascii8bit_encindex()) ||
1673 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1674 return rb_str_new(ptr, len);
1675 }
1676 /* no default_internal or same encoding, no conversion */
1677 ienc = rb_default_internal_encoding();
1678 if (!ienc || eenc == ienc) {
1679 return rb_enc_str_new(ptr, len, eenc);
1680 }
1681 /* ASCII compatible, and ASCII only string, no conversion in
1682 * default_internal */
1683 if ((eidx == rb_ascii8bit_encindex()) ||
1684 (eidx == rb_usascii_encindex()) ||
1685 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1686 return rb_enc_str_new(ptr, len, ienc);
1687 }
1688 /* convert from the given encoding to default_internal */
1689 str = rb_enc_str_new(NULL, 0, ienc);
1690 /* when the conversion failed for some reason, just ignore the
1691 * default_internal and result in the given encoding as-is. */
1692 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1693 rb_str_initialize(str, ptr, len, eenc);
1694 }
1695 return str;
1696}
1697
1698VALUE
1699rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1700{
1701 int eidx = rb_enc_to_index(eenc);
1702 if (eidx == rb_usascii_encindex() &&
1703 !is_ascii_string(str)) {
1704 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1705 return str;
1706 }
1707 rb_enc_associate_index(str, eidx);
1708 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1709}
1710
1711VALUE
1712rb_external_str_new(const char *ptr, long len)
1713{
1714 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1715}
1716
1717VALUE
1719{
1720 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1721}
1722
1723VALUE
1724rb_locale_str_new(const char *ptr, long len)
1725{
1726 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1727}
1728
1729VALUE
1731{
1732 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1733}
1734
1735VALUE
1737{
1738 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1739}
1740
1741VALUE
1743{
1744 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1745}
1746
1747VALUE
1749{
1750 return rb_str_export_to_enc(str, rb_default_external_encoding());
1751}
1752
1753VALUE
1755{
1756 return rb_str_export_to_enc(str, rb_locale_encoding());
1757}
1758
1759VALUE
1761{
1762 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1763}
1764
1765static VALUE
1766str_replace_shared_without_enc(VALUE str2, VALUE str)
1767{
1768 const int termlen = TERM_LEN(str);
1769 char *ptr;
1770 long len;
1771
1772 RSTRING_GETMEM(str, ptr, len);
1773 if (str_embed_capa(str2) >= len + termlen) {
1774 char *ptr2 = RSTRING(str2)->as.embed.ary;
1775 STR_SET_EMBED(str2);
1776 memcpy(ptr2, RSTRING_PTR(str), len);
1777 TERM_FILL(ptr2+len, termlen);
1778 }
1779 else {
1780 VALUE root;
1781 if (STR_SHARED_P(str)) {
1782 root = RSTRING(str)->as.heap.aux.shared;
1783 RSTRING_GETMEM(str, ptr, len);
1784 }
1785 else {
1786 root = rb_str_new_frozen(str);
1787 RSTRING_GETMEM(root, ptr, len);
1788 }
1789 RUBY_ASSERT(OBJ_FROZEN(root));
1790
1791 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1792 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1793 rb_fatal("about to free a possible shared root");
1794 }
1795 char *ptr2 = STR_HEAP_PTR(str2);
1796 if (ptr2 != ptr) {
1797 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1798 }
1799 }
1800 FL_SET(str2, STR_NOEMBED);
1801 RSTRING(str2)->as.heap.ptr = ptr;
1802 STR_SET_SHARED(str2, root);
1803 }
1804
1805 STR_SET_LEN(str2, len);
1806
1807 return str2;
1808}
1809
1810static VALUE
1811str_replace_shared(VALUE str2, VALUE str)
1812{
1813 str_replace_shared_without_enc(str2, str);
1814 rb_enc_cr_str_exact_copy(str2, str);
1815 return str2;
1816}
1817
1818static VALUE
1819str_new_shared(VALUE klass, VALUE str)
1820{
1821 return str_replace_shared(str_alloc_heap(klass), str);
1822}
1823
1824VALUE
1826{
1827 return str_new_shared(rb_obj_class(str), str);
1828}
1829
1830VALUE
1832{
1833 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1834 return str_new_frozen(rb_obj_class(orig), orig);
1835}
1836
1837static VALUE
1838rb_str_new_frozen_String(VALUE orig)
1839{
1840 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1841 return str_new_frozen(rb_cString, orig);
1842}
1843
1844
1845VALUE
1846rb_str_frozen_bare_string(VALUE orig)
1847{
1848 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1849 return str_new_frozen(rb_cString, orig);
1850}
1851
1852VALUE
1853rb_str_tmp_frozen_acquire(VALUE orig)
1854{
1855 if (OBJ_FROZEN_RAW(orig)) return orig;
1856 return str_new_frozen_buffer(0, orig, FALSE);
1857}
1858
1859VALUE
1860rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1861{
1862 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1863 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1864
1865 VALUE str = str_alloc_heap(0);
1866 OBJ_FREEZE(str);
1867 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1868 FL_SET(str, STR_SHARED_ROOT);
1869
1870 size_t capa = str_capacity(orig, TERM_LEN(orig));
1871
1872 /* If the string is embedded then we want to create a copy that is heap
1873 * allocated. If the string is shared then the shared root must be
1874 * embedded, so we want to create a copy. If the string is a shared root
1875 * then it must be embedded, so we want to create a copy. */
1876 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1877 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1878 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1879 }
1880 else {
1881 /* orig must be heap allocated and not shared, so we can safely transfer
1882 * the pointer to str. */
1883 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1884 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1885 RBASIC(orig)->flags &= ~STR_NOFREE;
1886 STR_SET_SHARED(orig, str);
1887 }
1888
1889 RSTRING(str)->len = RSTRING(orig)->len;
1890 RSTRING(str)->as.heap.aux.capa = capa;
1891
1892 return str;
1893}
1894
1895void
1896rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1897{
1898 if (RBASIC_CLASS(tmp) != 0)
1899 return;
1900
1901 if (STR_EMBED_P(tmp)) {
1903 }
1904 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1905 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1906 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1907
1908 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1909 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1910 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1911
1912 /* Unshare orig since the root (tmp) only has this one child. */
1913 FL_UNSET_RAW(orig, STR_SHARED);
1914 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1915 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1917
1918 /* Make tmp embedded and empty so it is safe for sweeping. */
1919 STR_SET_EMBED(tmp);
1920 STR_SET_LEN(tmp, 0);
1921 }
1922 }
1923}
1924
1925static VALUE
1926str_new_frozen(VALUE klass, VALUE orig)
1927{
1928 return str_new_frozen_buffer(klass, orig, TRUE);
1929}
1930
1931static VALUE
1932heap_str_make_shared(VALUE klass, VALUE orig)
1933{
1934 RUBY_ASSERT(!STR_EMBED_P(orig));
1935 RUBY_ASSERT(!STR_SHARED_P(orig));
1936
1937 VALUE str = str_alloc_heap(klass);
1938 STR_SET_LEN(str, RSTRING_LEN(orig));
1939 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1940 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1941 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1942 RBASIC(orig)->flags &= ~STR_NOFREE;
1943 STR_SET_SHARED(orig, str);
1944 if (klass == 0)
1945 FL_UNSET_RAW(str, STR_BORROWED);
1946 return str;
1947}
1948
1949static VALUE
1950str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1951{
1952 VALUE str;
1953
1954 long len = RSTRING_LEN(orig);
1955 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1956 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1957
1958 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1959 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1960 RUBY_ASSERT(STR_EMBED_P(str));
1961 }
1962 else {
1963 if (FL_TEST_RAW(orig, STR_SHARED)) {
1964 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1965 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1966 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1967 RUBY_ASSERT(ofs >= 0);
1968 RUBY_ASSERT(rest >= 0);
1969 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1971
1972 if ((ofs > 0) || (rest > 0) ||
1973 (klass != RBASIC(shared)->klass) ||
1974 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1975 str = str_new_shared(klass, shared);
1976 RUBY_ASSERT(!STR_EMBED_P(str));
1977 RSTRING(str)->as.heap.ptr += ofs;
1978 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1979 }
1980 else {
1981 if (RBASIC_CLASS(shared) == 0)
1982 FL_SET_RAW(shared, STR_BORROWED);
1983 return shared;
1984 }
1985 }
1986 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1987 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1988 STR_SET_EMBED(str);
1989 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1990 STR_SET_LEN(str, RSTRING_LEN(orig));
1991 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1992 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1993 }
1994 else {
1995 str = heap_str_make_shared(klass, orig);
1996 }
1997 }
1998
1999 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
2000 OBJ_FREEZE(str);
2001 return str;
2002}
2003
2004VALUE
2005rb_str_new_with_class(VALUE obj, const char *ptr, long len)
2006{
2007 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
2008}
2009
2010static VALUE
2011str_new_empty_String(VALUE str)
2012{
2013 VALUE v = rb_str_new(0, 0);
2014 rb_enc_copy(v, str);
2015 return v;
2016}
2017
/* Lower bound for buffer capacities; rb_str_init raises any smaller
 * explicitly requested `capacity:` to this value. */
2018#define STR_BUF_MIN_SIZE 63
2019
2020VALUE
2022{
2023 if (STR_EMBEDDABLE_P(capa, 1)) {
2024 return str_alloc_embed(rb_cString, capa + 1);
2025 }
2026
2027 VALUE str = str_alloc_heap(rb_cString);
2028
2029 RSTRING(str)->as.heap.aux.capa = capa;
2030 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
2031 RSTRING(str)->as.heap.ptr[0] = '\0';
2032
2033 return str;
2034}
2035
2036VALUE
2038{
2039 VALUE str;
2040 long len = strlen(ptr);
2041
2042 str = rb_str_buf_new(len);
2043 rb_str_buf_cat(str, ptr, len);
2044
2045 return str;
2046}
2047
2048VALUE
2050{
2051 return str_new(0, 0, len);
2052}
2053
2054void
2056{
2057 if (STR_EMBED_P(str)) {
2058 RB_DEBUG_COUNTER_INC(obj_str_embed);
2059 }
2060 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
2061 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
2062 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
2063 }
2064 else {
2065 RB_DEBUG_COUNTER_INC(obj_str_ptr);
2066 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2067 }
2068}
2069
2070size_t
2071rb_str_memsize(VALUE str)
2072{
2073 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
2074 return STR_HEAP_SIZE(str);
2075 }
2076 else {
2077 return 0;
2078 }
2079}
2080
2081VALUE
2083{
2084 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
2085}
2086
/* Forward declarations: str_shared_replace is called before its definition
 * below, and it in turn calls str_discard. */
2087static inline void str_discard(VALUE str);
2088static void str_shared_replace(VALUE str, VALUE str2);
2089
2090void
2092{
2093 if (str != str2) str_shared_replace(str, str2);
2094}
2095
2096static void
2097str_shared_replace(VALUE str, VALUE str2)
2098{
2099 rb_encoding *enc;
2100 int cr;
2101 int termlen;
2102
2103 RUBY_ASSERT(str2 != str);
2104 enc = STR_ENC_GET(str2);
2105 cr = ENC_CODERANGE(str2);
2106 str_discard(str);
2107 termlen = rb_enc_mbminlen(enc);
2108
2109 STR_SET_LEN(str, RSTRING_LEN(str2));
2110
2111 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
2112 STR_SET_EMBED(str);
2113 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
2114 rb_enc_associate(str, enc);
2115 ENC_CODERANGE_SET(str, cr);
2116 }
2117 else {
2118 if (STR_EMBED_P(str2)) {
2119 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
2120 long len = RSTRING_LEN(str2);
2121 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
2122
2123 char *new_ptr = ALLOC_N(char, len + termlen);
2124 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
2125 RSTRING(str2)->as.heap.ptr = new_ptr;
2126 STR_SET_LEN(str2, len);
2127 RSTRING(str2)->as.heap.aux.capa = len;
2128 STR_SET_NOEMBED(str2);
2129 }
2130
2131 STR_SET_NOEMBED(str);
2132 FL_UNSET(str, STR_SHARED);
2133 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2134
2135 if (FL_TEST(str2, STR_SHARED)) {
2136 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2137 STR_SET_SHARED(str, shared);
2138 }
2139 else {
2140 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
2141 }
2142
2143 /* abandon str2 */
2144 STR_SET_EMBED(str2);
2145 RSTRING_PTR(str2)[0] = 0;
2146 STR_SET_LEN(str2, 0);
2147 rb_enc_associate(str, enc);
2148 ENC_CODERANGE_SET(str, cr);
2149 }
2150}
2151
2152VALUE
2154{
2155 VALUE str;
2156
2157 if (RB_TYPE_P(obj, T_STRING)) {
2158 return obj;
2159 }
2160 str = rb_funcall(obj, idTo_s, 0);
2161 return rb_obj_as_string_result(str, obj);
2162}
2163
2164VALUE
2165rb_obj_as_string_result(VALUE str, VALUE obj)
2166{
2167 if (!RB_TYPE_P(str, T_STRING))
2168 return rb_any_to_s(obj);
2169 return str;
2170}
2171
2172static VALUE
2173str_replace(VALUE str, VALUE str2)
2174{
2175 long len;
2176
2177 len = RSTRING_LEN(str2);
2178 if (STR_SHARED_P(str2)) {
2179 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
2181 STR_SET_NOEMBED(str);
2182 STR_SET_LEN(str, len);
2183 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
2184 STR_SET_SHARED(str, shared);
2185 rb_enc_cr_str_exact_copy(str, str2);
2186 }
2187 else {
2188 str_replace_shared(str, str2);
2189 }
2190
2191 return str;
2192}
2193
2194static inline VALUE
2195ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
2196{
2197 size_t size = rb_str_embed_size(capa);
2198 RUBY_ASSERT(size > 0);
2199 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
2200
2201 NEWOBJ_OF(str, struct RString, klass,
2203
2204 return (VALUE)str;
2205}
2206
2207static inline VALUE
2208ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
2209{
2210 NEWOBJ_OF(str, struct RString, klass,
2211 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
2212
2213 return (VALUE)str;
2214}
2215
2216static inline VALUE
2217str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
2218{
2219 int encidx = 0;
2220 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
2221 encidx = rb_enc_get_index(str);
2222 flags &= ~ENCODING_MASK;
2223 }
2224 FL_SET_RAW(dup, flags & ~FL_FREEZE);
2225 if (encidx) rb_enc_associate_index(dup, encidx);
2226 return dup;
2227}
2228
/* Flag bits the str_duplicate_setup_* helpers read from the source string
 * with FL_TEST_RAW: code range, encoding and frozen status. */
2229static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
2230
2231static inline VALUE
2232str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
2233{
2234 VALUE flags = FL_TEST_RAW(str, flag_mask);
2235 long len = RSTRING_LEN(str);
2236
2237 RUBY_ASSERT(STR_EMBED_P(dup));
2238 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
2239 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
2240 STR_SET_LEN(dup, RSTRING_LEN(str));
2241 return str_duplicate_setup_encoding(str, dup, flags);
2242}
2243
2244static inline VALUE
2245str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
2246{
2247 VALUE flags = FL_TEST_RAW(str, flag_mask);
2248 VALUE root = str;
2249 if (FL_TEST_RAW(str, STR_SHARED)) {
2250 root = RSTRING(str)->as.heap.aux.shared;
2251 }
2252 else if (UNLIKELY(!(flags & FL_FREEZE))) {
2253 root = str = str_new_frozen(klass, str);
2254 flags = FL_TEST_RAW(str, flag_mask);
2255 }
2256 RUBY_ASSERT(!STR_SHARED_P(root));
2258
2259 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
2260 FL_SET(root, STR_SHARED_ROOT);
2261 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
2262 flags |= RSTRING_NOEMBED | STR_SHARED;
2263
2264 STR_SET_LEN(dup, RSTRING_LEN(str));
2265 return str_duplicate_setup_encoding(str, dup, flags);
2266}
2267
2268static inline VALUE
2269str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
2270{
2271 if (STR_EMBED_P(str)) {
2272 return str_duplicate_setup_embed(klass, str, dup);
2273 }
2274 else {
2275 return str_duplicate_setup_heap(klass, str, dup);
2276 }
2277}
2278
2279static inline VALUE
2280str_duplicate(VALUE klass, VALUE str)
2281{
2282 VALUE dup;
2283 if (STR_EMBED_P(str)) {
2284 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
2285 }
2286 else {
2287 dup = str_alloc_heap(klass);
2288 }
2289
2290 return str_duplicate_setup(klass, str, dup);
2291}
2292
2293VALUE
2295{
2296 return str_duplicate(rb_obj_class(str), str);
2297}
2298
2299/* :nodoc: */
2300VALUE
2301rb_str_dup_m(VALUE str)
2302{
2303 if (LIKELY(BARE_STRING_P(str))) {
2304 return str_duplicate(rb_obj_class(str), str);
2305 }
2306 else {
2307 return rb_obj_dup(str);
2308 }
2309}
2310
2311VALUE
2313{
2314 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2315 return str_duplicate(rb_cString, str);
2316}
2317
2318VALUE
2319rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2320{
2321 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2322 VALUE new_str, klass = rb_cString;
2323
2324 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2325 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2326 str_duplicate_setup_embed(klass, str, new_str);
2327 }
2328 else {
2329 new_str = ec_str_alloc_heap(ec, klass);
2330 str_duplicate_setup_heap(klass, str, new_str);
2331 }
2332 if (chilled) {
2333 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2334 }
2335 return new_str;
2336}
2337
2338VALUE
2339rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2340{
2341 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2342 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2343 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2344 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2345 return rb_str_freeze(str);
2346}
2347
2348/*
2349 *
2350 * call-seq:
2351 * String.new(string = '', **opts) -> new_string
2352 *
2353 * :include: doc/string/new.rdoc
2354 *
2355 */
2356
2357static VALUE
2358rb_str_init(int argc, VALUE *argv, VALUE str)
2359{
2360 static ID keyword_ids[2];
2361 VALUE orig, opt, venc, vcapa;
2362 VALUE kwargs[2];
2363 rb_encoding *enc = 0;
2364 int n;
2365
2366 if (!keyword_ids[0]) {
2367 keyword_ids[0] = rb_id_encoding();
2368 CONST_ID(keyword_ids[1], "capacity");
2369 }
2370
2371 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2372 if (!NIL_P(opt)) {
2373 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2374 venc = kwargs[0];
2375 vcapa = kwargs[1];
2376 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2377 enc = rb_to_encoding(venc);
2378 }
2379 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2380 long capa = NUM2LONG(vcapa);
2381 long len = 0;
2382 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2383
2384 if (capa < STR_BUF_MIN_SIZE) {
2385 capa = STR_BUF_MIN_SIZE;
2386 }
2387 if (n == 1) {
2388 StringValue(orig);
2389 len = RSTRING_LEN(orig);
2390 if (capa < len) {
2391 capa = len;
2392 }
2393 if (orig == str) n = 0;
2394 }
2395 str_modifiable(str);
2396 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2397 /* make noembed always */
2398 const size_t size = (size_t)capa + termlen;
2399 const char *const old_ptr = RSTRING_PTR(str);
2400 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2401 char *new_ptr = ALLOC_N(char, size);
2402 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2403 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2404 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2405 RSTRING(str)->as.heap.ptr = new_ptr;
2406 }
2407 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2408 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2409 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2410 }
2411 STR_SET_LEN(str, len);
2412 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2413 if (n == 1) {
2414 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2415 rb_enc_cr_str_exact_copy(str, orig);
2416 }
2417 FL_SET(str, STR_NOEMBED);
2418 RSTRING(str)->as.heap.aux.capa = capa;
2419 }
2420 else if (n == 1) {
2421 rb_str_replace(str, orig);
2422 }
2423 if (enc) {
2424 rb_enc_associate(str, enc);
2426 }
2427 }
2428 else if (n == 1) {
2429 rb_str_replace(str, orig);
2430 }
2431 return str;
2432}
2433
2434/* :nodoc: */
2435static VALUE
2436rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2437{
2438 if (klass != rb_cString) {
2439 return rb_class_new_instance_pass_kw(argc, argv, klass);
2440 }
2441
2442 static ID keyword_ids[2];
2443 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2444 VALUE kwargs[2];
2445 rb_encoding *enc = NULL;
2446
2447 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2448 if (NIL_P(opt)) {
2449 return rb_class_new_instance_pass_kw(argc, argv, klass);
2450 }
2451
2452 keyword_ids[0] = rb_id_encoding();
2453 CONST_ID(keyword_ids[1], "capacity");
2454 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2455 encoding = kwargs[0];
2456 capacity = kwargs[1];
2457
2458 if (n == 1) {
2459 orig = StringValue(orig);
2460 }
2461 else {
2462 orig = Qnil;
2463 }
2464
2465 if (UNDEF_P(encoding)) {
2466 if (!NIL_P(orig)) {
2467 encoding = rb_obj_encoding(orig);
2468 }
2469 }
2470
2471 if (!UNDEF_P(encoding)) {
2472 enc = rb_to_encoding(encoding);
2473 }
2474
2475 // If capacity is nil, we're basically just duping `orig`.
2476 if (UNDEF_P(capacity)) {
2477 if (NIL_P(orig)) {
2478 VALUE empty_str = str_new(klass, "", 0);
2479 if (enc) {
2480 rb_enc_associate(empty_str, enc);
2481 }
2482 return empty_str;
2483 }
2484 VALUE copy = str_duplicate(klass, orig);
2485 rb_enc_associate(copy, enc);
2486 ENC_CODERANGE_CLEAR(copy);
2487 return copy;
2488 }
2489
2490 long capa = 0;
2491 capa = NUM2LONG(capacity);
2492 if (capa < 0) {
2493 capa = 0;
2494 }
2495
2496 if (!NIL_P(orig)) {
2497 long orig_capa = rb_str_capacity(orig);
2498 if (orig_capa > capa) {
2499 capa = orig_capa;
2500 }
2501 }
2502
2503 VALUE str = str_enc_new(klass, NULL, capa, enc);
2504 STR_SET_LEN(str, 0);
2505 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2506
2507 if (!NIL_P(orig)) {
2508 rb_str_buf_append(str, orig);
2509 }
2510
2511 return str;
2512}
2513
2514#ifdef NONASCII_MASK
2515#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2516
2517/*
2518 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2519 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2520 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2521 *
2522 * if (!(byte & 0x80))
2523 * byte |= 0x40; // turn on bit6
2524 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2525 *
2526 * This function calculates whether a byte is leading or not for all bytes
2527 * in the argument word by concurrently using the above logic, and then
2528 * adds up the number of leading bytes in the word.
2529 */
2530static inline uintptr_t
2531count_utf8_lead_bytes_with_word(const uintptr_t *s)
2532{
2533 uintptr_t d = *s;
2534
2535 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2536 d = (d>>6) | (~d>>7);
2537 d &= NONASCII_MASK >> 7;
2538
2539 /* Gather all bytes. */
2540#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2541 /* use only if it can use POPCNT */
2542 return rb_popcount_intptr(d);
2543#else
2544 d += (d>>8);
2545 d += (d>>16);
2546# if SIZEOF_VOIDP == 8
2547 d += (d>>32);
2548# endif
2549 return (d&0xF);
2550#endif
2551}
2552#endif
2553
2554static inline long
2555enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2556{
2557 long c;
2558 const char *q;
2559
2560 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2561 long diff = (long)(e - p);
2562 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2563 }
2564#ifdef NONASCII_MASK
2565 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2566 uintptr_t len = 0;
2567 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2568 const uintptr_t *s, *t;
2569 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2570 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2571 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2572 while (p < (const char *)s) {
2573 if (is_utf8_lead_byte(*p)) len++;
2574 p++;
2575 }
2576 while (s < t) {
2577 len += count_utf8_lead_bytes_with_word(s);
2578 s++;
2579 }
2580 p = (const char *)s;
2581 }
2582 while (p < e) {
2583 if (is_utf8_lead_byte(*p)) len++;
2584 p++;
2585 }
2586 return (long)len;
2587 }
2588#endif
2589 else if (rb_enc_asciicompat(enc)) {
2590 c = 0;
2591 if (ENC_CODERANGE_CLEAN_P(cr)) {
2592 while (p < e) {
2593 if (ISASCII(*p)) {
2594 q = search_nonascii(p, e);
2595 if (!q)
2596 return c + (e - p);
2597 c += q - p;
2598 p = q;
2599 }
2600 p += rb_enc_fast_mbclen(p, e, enc);
2601 c++;
2602 }
2603 }
2604 else {
2605 while (p < e) {
2606 if (ISASCII(*p)) {
2607 q = search_nonascii(p, e);
2608 if (!q)
2609 return c + (e - p);
2610 c += q - p;
2611 p = q;
2612 }
2613 p += rb_enc_mbclen(p, e, enc);
2614 c++;
2615 }
2616 }
2617 return c;
2618 }
2619
2620 for (c=0; p<e; c++) {
2621 p += rb_enc_mbclen(p, e, enc);
2622 }
2623 return c;
2624}
2625
2626long
2627rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2628{
2629 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2630}
2631
2632/* To get strlen with cr
2633 * Note that given cr is not used.
2634 */
2635long
2636rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2637{
2638 long c;
2639 const char *q;
2640 int ret;
2641
2642 *cr = 0;
2643 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2644 long diff = (long)(e - p);
2645 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2646 }
2647 else if (rb_enc_asciicompat(enc)) {
2648 c = 0;
2649 while (p < e) {
2650 if (ISASCII(*p)) {
2651 q = search_nonascii(p, e);
2652 if (!q) {
2653 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2654 return c + (e - p);
2655 }
2656 c += q - p;
2657 p = q;
2658 }
2659 ret = rb_enc_precise_mbclen(p, e, enc);
2660 if (MBCLEN_CHARFOUND_P(ret)) {
2661 *cr |= ENC_CODERANGE_VALID;
2662 p += MBCLEN_CHARFOUND_LEN(ret);
2663 }
2664 else {
2666 p++;
2667 }
2668 c++;
2669 }
2670 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2671 return c;
2672 }
2673
2674 for (c=0; p<e; c++) {
2675 ret = rb_enc_precise_mbclen(p, e, enc);
2676 if (MBCLEN_CHARFOUND_P(ret)) {
2677 *cr |= ENC_CODERANGE_VALID;
2678 p += MBCLEN_CHARFOUND_LEN(ret);
2679 }
2680 else {
2682 if (p + rb_enc_mbminlen(enc) <= e)
2683 p += rb_enc_mbminlen(enc);
2684 else
2685 p = e;
2686 }
2687 }
2688 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2689 return c;
2690}
2691
2692/* enc must be str's enc or rb_enc_check(str, str2) */
2693static long
2694str_strlen(VALUE str, rb_encoding *enc)
2695{
2696 const char *p, *e;
2697 int cr;
2698
2699 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2700 if (!enc) enc = STR_ENC_GET(str);
2701 p = RSTRING_PTR(str);
2702 e = RSTRING_END(str);
2703 cr = ENC_CODERANGE(str);
2704
2705 if (cr == ENC_CODERANGE_UNKNOWN) {
2706 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2707 if (cr) ENC_CODERANGE_SET(str, cr);
2708 return n;
2709 }
2710 else {
2711 return enc_strlen(p, e, enc, cr);
2712 }
2713}
2714
2715long
2717{
2718 return str_strlen(str, NULL);
2719}
2720
2721/*
2722 * call-seq:
2723 * length -> integer
2724 *
2725 * :include: doc/string/length.rdoc
2726 *
2727 */
2728
2729VALUE
2731{
2732 return LONG2NUM(str_strlen(str, NULL));
2733}
2734
2735/*
2736 * call-seq:
2737 * bytesize -> integer
2738 *
2739 * :include: doc/string/bytesize.rdoc
2740 *
2741 */
2742
2743VALUE
2744rb_str_bytesize(VALUE str)
2745{
2746 return LONG2NUM(RSTRING_LEN(str));
2747}
2748
2749/*
2750 * call-seq:
2751 * empty? -> true or false
2752 *
2753 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2754 *
2755 * "hello".empty? # => false
2756 * " ".empty? # => false
2757 * "".empty? # => true
2758 *
2759 */
2760
2761static VALUE
2762rb_str_empty(VALUE str)
2763{
2764 return RBOOL(RSTRING_LEN(str) == 0);
2765}
2766
2767/*
2768 * call-seq:
2769 * string + other_string -> new_string
2770 *
2771 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2772 *
2773 * "Hello from " + self.to_s # => "Hello from main"
2774 *
2775 */
2776
2777VALUE
2779{
2780 VALUE str3;
2781 rb_encoding *enc;
2782 char *ptr1, *ptr2, *ptr3;
2783 long len1, len2;
2784 int termlen;
2785
2786 StringValue(str2);
2787 enc = rb_enc_check_str(str1, str2);
2788 RSTRING_GETMEM(str1, ptr1, len1);
2789 RSTRING_GETMEM(str2, ptr2, len2);
2790 termlen = rb_enc_mbminlen(enc);
2791 if (len1 > LONG_MAX - len2) {
2792 rb_raise(rb_eArgError, "string size too big");
2793 }
2794 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2795 ptr3 = RSTRING_PTR(str3);
2796 memcpy(ptr3, ptr1, len1);
2797 memcpy(ptr3+len1, ptr2, len2);
2798 TERM_FILL(&ptr3[len1+len2], termlen);
2799
2800 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2802 RB_GC_GUARD(str1);
2803 RB_GC_GUARD(str2);
2804 return str3;
2805}
2806
2807/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2808VALUE
2809rb_str_opt_plus(VALUE str1, VALUE str2)
2810{
2813 long len1, len2;
2814 MAYBE_UNUSED(char) *ptr1, *ptr2;
2815 RSTRING_GETMEM(str1, ptr1, len1);
2816 RSTRING_GETMEM(str2, ptr2, len2);
2817 int enc1 = rb_enc_get_index(str1);
2818 int enc2 = rb_enc_get_index(str2);
2819
2820 if (enc1 < 0) {
2821 return Qundef;
2822 }
2823 else if (enc2 < 0) {
2824 return Qundef;
2825 }
2826 else if (enc1 != enc2) {
2827 return Qundef;
2828 }
2829 else if (len1 > LONG_MAX - len2) {
2830 return Qundef;
2831 }
2832 else {
2833 return rb_str_plus(str1, str2);
2834 }
2835
2836}
2837
2838/*
2839 * call-seq:
2840 * string * integer -> new_string
2841 *
2842 * Returns a new +String+ containing +integer+ copies of +self+:
2843 *
2844 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2845 * "Ho! " * 0 # => ""
2846 *
2847 */
2848
2849VALUE
2851{
2852 VALUE str2;
2853 long n, len;
2854 char *ptr2;
2855 int termlen;
2856
2857 if (times == INT2FIX(1)) {
2858 return str_duplicate(rb_cString, str);
2859 }
2860 if (times == INT2FIX(0)) {
2861 str2 = str_alloc_embed(rb_cString, 0);
2862 rb_enc_copy(str2, str);
2863 return str2;
2864 }
2865 len = NUM2LONG(times);
2866 if (len < 0) {
2867 rb_raise(rb_eArgError, "negative argument");
2868 }
2869 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2870 if (STR_EMBEDDABLE_P(len, 1)) {
2871 str2 = str_alloc_embed(rb_cString, len + 1);
2872 memset(RSTRING_PTR(str2), 0, len + 1);
2873 }
2874 else {
2875 str2 = str_alloc_heap(rb_cString);
2876 RSTRING(str2)->as.heap.aux.capa = len;
2877 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2878 }
2879 STR_SET_LEN(str2, len);
2880 rb_enc_copy(str2, str);
2881 return str2;
2882 }
2883 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2884 rb_raise(rb_eArgError, "argument too big");
2885 }
2886
2887 len *= RSTRING_LEN(str);
2888 termlen = TERM_LEN(str);
2889 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2890 ptr2 = RSTRING_PTR(str2);
2891 if (len) {
2892 n = RSTRING_LEN(str);
2893 memcpy(ptr2, RSTRING_PTR(str), n);
2894 while (n <= len/2) {
2895 memcpy(ptr2 + n, ptr2, n);
2896 n *= 2;
2897 }
2898 memcpy(ptr2 + n, ptr2, len-n);
2899 }
2900 STR_SET_LEN(str2, len);
2901 TERM_FILL(&ptr2[len], termlen);
2902 rb_enc_cr_str_copy_for_substr(str2, str);
2903
2904 return str2;
2905}
2906
2907/*
2908 * call-seq:
2909 * string % object -> new_string
2910 *
2911 * Returns the result of formatting +object+ into the format specification +self+
2912 * (see Kernel#sprintf for formatting details):
2913 *
2914 * "%05d" % 123 # => "00123"
2915 *
2916 * If +self+ contains multiple substitutions, +object+ must be
2917 * an Array or Hash containing the values to be substituted:
2918 *
2919 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2920 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2921 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2922 *
2923 */
2924
2925static VALUE
2926rb_str_format_m(VALUE str, VALUE arg)
2927{
2928 VALUE tmp = rb_check_array_type(arg);
2929
2930 if (!NIL_P(tmp)) {
2931 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2932 }
2933 return rb_str_format(1, &arg, str);
2934}
2935
2936static inline void
2937rb_check_lockedtmp(VALUE str)
2938{
2939 if (FL_TEST(str, STR_TMPLOCK)) {
2940 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2941 }
2942}
2943
2944// If none of these flags are set, we know we have an modifiable string.
2945// If any is set, we need to do more detailed checks.
2946#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2947static inline void
2948str_modifiable(VALUE str)
2949{
2950 RUBY_ASSERT(ruby_thread_has_gvl_p());
2951
2952 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2953 if (CHILLED_STRING_P(str)) {
2954 CHILLED_STRING_MUTATED(str);
2955 }
2956 rb_check_lockedtmp(str);
2957 rb_check_frozen(str);
2958 }
2959}
2960
2961static inline int
2962str_dependent_p(VALUE str)
2963{
2964 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2965 return FALSE;
2966 }
2967 else {
2968 return TRUE;
2969 }
2970}
2971
2972// If none of these flags are set, we know we have an independent string.
2973// If any is set, we need to do more detailed checks.
2974#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2975static inline int
2976str_independent(VALUE str)
2977{
2978 RUBY_ASSERT(ruby_thread_has_gvl_p());
2979
2980 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2981 str_modifiable(str);
2982 return !str_dependent_p(str);
2983 }
2984 return TRUE;
2985}
2986
2987static void
2988str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2989{
2990 RUBY_ASSERT(ruby_thread_has_gvl_p());
2991
2992 char *ptr;
2993 char *oldptr;
2994 long capa = len + expand;
2995
2996 if (len > capa) len = capa;
2997
2998 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2999 ptr = RSTRING(str)->as.heap.ptr;
3000 STR_SET_EMBED(str);
3001 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
3002 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3003 STR_SET_LEN(str, len);
3004 return;
3005 }
3006
3007 ptr = ALLOC_N(char, (size_t)capa + termlen);
3008 oldptr = RSTRING_PTR(str);
3009 if (oldptr) {
3010 memcpy(ptr, oldptr, len);
3011 }
3012 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
3013 xfree(oldptr);
3014 }
3015 STR_SET_NOEMBED(str);
3016 FL_UNSET(str, STR_SHARED|STR_NOFREE);
3017 TERM_FILL(ptr + len, termlen);
3018 RSTRING(str)->as.heap.ptr = ptr;
3019 STR_SET_LEN(str, len);
3020 RSTRING(str)->as.heap.aux.capa = capa;
3021}
3022
3023void
3024rb_str_modify(VALUE str)
3025{
3026 if (!str_independent(str))
3027 str_make_independent(str);
3029}
3030
3031void
3033{
3034 RUBY_ASSERT(ruby_thread_has_gvl_p());
3035
3036 int termlen = TERM_LEN(str);
3037 long len = RSTRING_LEN(str);
3038
3039 if (expand < 0) {
3040 rb_raise(rb_eArgError, "negative expanding string size");
3041 }
3042 if (expand >= LONG_MAX - len) {
3043 rb_raise(rb_eArgError, "string size too big");
3044 }
3045
3046 if (!str_independent(str)) {
3047 str_make_independent_expand(str, len, expand, termlen);
3048 }
3049 else if (expand > 0) {
3050 RESIZE_CAPA_TERM(str, len + expand, termlen);
3051 }
3053}
3054
3055/* As rb_str_modify(), but don't clear coderange */
3056static void
3057str_modify_keep_cr(VALUE str)
3058{
3059 if (!str_independent(str))
3060 str_make_independent(str);
3062 /* Force re-scan later */
3064}
3065
3066static inline void
3067str_discard(VALUE str)
3068{
3069 str_modifiable(str);
3070 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
3071 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
3072 RSTRING(str)->as.heap.ptr = 0;
3073 STR_SET_LEN(str, 0);
3074 }
3075}
3076
3077void
3079{
3080 int encindex = rb_enc_get_index(str);
3081
3082 if (RB_UNLIKELY(encindex == -1)) {
3083 rb_raise(rb_eTypeError, "not encoding capable object");
3084 }
3085
3086 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
3087 return;
3088 }
3089
3090 rb_encoding *enc = rb_enc_from_index(encindex);
3091 if (!rb_enc_asciicompat(enc)) {
3092 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
3093 }
3094}
3095
3096VALUE
3098{
3099 RUBY_ASSERT(ruby_thread_has_gvl_p());
3100
3101 VALUE s = *ptr;
3102 if (!RB_TYPE_P(s, T_STRING)) {
3103 s = rb_str_to_str(s);
3104 *ptr = s;
3105 }
3106 return s;
3107}
3108
3109char *
3111{
3112 VALUE str = rb_string_value(ptr);
3113 return RSTRING_PTR(str);
3114}
3115
/* Return 1 if the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n yields 1. */
static int
zero_filled(const char *s, int n)
{
    for (int i = 0; i < n; i++) {
        if (s[i] != 0) {
            return 0;
        }
    }
    return 1;
}
3124
3125static const char *
3126str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
3127{
3128 const char *e = s + len;
3129
3130 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
3131 if (zero_filled(s, minlen)) return s;
3132 }
3133 return 0;
3134}
3135
3136static char *
3137str_fill_term(VALUE str, char *s, long len, int termlen)
3138{
3139 /* This function assumes that (capa + termlen) bytes of memory
3140 * is allocated, like many other functions in this file.
3141 */
3142 if (str_dependent_p(str)) {
3143 if (!zero_filled(s + len, termlen))
3144 str_make_independent_expand(str, len, 0L, termlen);
3145 }
3146 else {
3147 TERM_FILL(s + len, termlen);
3148 return s;
3149 }
3150 return RSTRING_PTR(str);
3151}
3152
3153void
3154rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
3155{
3156 long capa = str_capacity(str, oldtermlen) + oldtermlen;
3157 long len = RSTRING_LEN(str);
3158
3159 RUBY_ASSERT(capa >= len);
3160 if (capa - len < termlen) {
3161 rb_check_lockedtmp(str);
3162 str_make_independent_expand(str, len, 0L, termlen);
3163 }
3164 else if (str_dependent_p(str)) {
3165 if (termlen > oldtermlen)
3166 str_make_independent_expand(str, len, 0L, termlen);
3167 }
3168 else {
3169 if (!STR_EMBED_P(str)) {
3170 /* modify capa instead of realloc */
3171 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
3172 RSTRING(str)->as.heap.aux.capa = capa - termlen;
3173 }
3174 if (termlen > oldtermlen) {
3175 TERM_FILL(RSTRING_PTR(str) + len, termlen);
3176 }
3177 }
3178
3179 return;
3180}
3181
3182static char *
3183str_null_check(VALUE str, int *w)
3184{
3185 char *s = RSTRING_PTR(str);
3186 long len = RSTRING_LEN(str);
3187 rb_encoding *enc = rb_enc_get(str);
3188 const int minlen = rb_enc_mbminlen(enc);
3189
3190 if (minlen > 1) {
3191 *w = 1;
3192 if (str_null_char(s, len, minlen, enc)) {
3193 return NULL;
3194 }
3195 return str_fill_term(str, s, len, minlen);
3196 }
3197 *w = 0;
3198 if (!s || memchr(s, 0, len)) {
3199 return NULL;
3200 }
3201 if (s[len]) {
3202 s = str_fill_term(str, s, len, minlen);
3203 }
3204 return s;
3205}
3206
3207char *
3208rb_str_to_cstr(VALUE str)
3209{
3210 int w;
3211 return str_null_check(str, &w);
3212}
3213
3214char *
3216{
3217 VALUE str = rb_string_value(ptr);
3218 int w;
3219 char *s = str_null_check(str, &w);
3220 if (!s) {
3221 if (w) {
3222 rb_raise(rb_eArgError, "string contains null char");
3223 }
3224 rb_raise(rb_eArgError, "string contains null byte");
3225 }
3226 return s;
3227}
3228
3229char *
3230rb_str_fill_terminator(VALUE str, const int newminlen)
3231{
3232 char *s = RSTRING_PTR(str);
3233 long len = RSTRING_LEN(str);
3234 return str_fill_term(str, s, len, newminlen);
3235}
3236
3237VALUE
3239{
3240 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
3241 return str;
3242}
3243
3244/*
3245 * call-seq:
3246 * String.try_convert(object) -> object, new_string, or nil
3247 *
3248 * Attempts to convert the given +object+ to a string.
3249 *
3250 * If +object+ is already a string, returns +object+, unmodified.
3251 *
3252 * Otherwise if +object+ responds to <tt>:to_str</tt>,
3253 * calls <tt>object.to_str</tt> and returns the result.
3254 *
3255 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
3256 *
3257 * Raises an exception unless <tt>object.to_str</tt> returns a string.
3258 */
3259static VALUE
3260rb_str_s_try_convert(VALUE dummy, VALUE str)
3261{
3262 return rb_check_string_type(str);
3263}
3264
3265static char*
3266str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3267{
3268 long nth = *nthp;
3269 if (rb_enc_mbmaxlen(enc) == 1) {
3270 p += nth;
3271 }
3272 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3273 p += nth * rb_enc_mbmaxlen(enc);
3274 }
3275 else if (rb_enc_asciicompat(enc)) {
3276 const char *p2, *e2;
3277 int n;
3278
3279 while (p < e && 0 < nth) {
3280 e2 = p + nth;
3281 if (e < e2) {
3282 *nthp = nth;
3283 return (char *)e;
3284 }
3285 if (ISASCII(*p)) {
3286 p2 = search_nonascii(p, e2);
3287 if (!p2) {
3288 nth -= e2 - p;
3289 *nthp = nth;
3290 return (char *)e2;
3291 }
3292 nth -= p2 - p;
3293 p = p2;
3294 }
3295 n = rb_enc_mbclen(p, e, enc);
3296 p += n;
3297 nth--;
3298 }
3299 *nthp = nth;
3300 if (nth != 0) {
3301 return (char *)e;
3302 }
3303 return (char *)p;
3304 }
3305 else {
3306 while (p < e && nth--) {
3307 p += rb_enc_mbclen(p, e, enc);
3308 }
3309 }
3310 if (p > e) p = e;
3311 *nthp = nth;
3312 return (char*)p;
3313}
3314
3315char*
3316rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3317{
3318 return str_nth_len(p, e, &nth, enc);
3319}
3320
3321static char*
3322str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3323{
3324 if (singlebyte)
3325 p += nth;
3326 else {
3327 p = str_nth_len(p, e, &nth, enc);
3328 }
3329 if (!p) return 0;
3330 if (p > e) p = e;
3331 return (char *)p;
3332}
3333
3334/* char offset to byte offset */
3335static long
3336str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3337{
3338 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3339 if (!pp) return e - p;
3340 return pp - p;
3341}
3342
3343long
3344rb_str_offset(VALUE str, long pos)
3345{
3346 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3347 STR_ENC_GET(str), single_byte_optimizable(str));
3348}
3349
3350#ifdef NONASCII_MASK
3351static char *
3352str_utf8_nth(const char *p, const char *e, long *nthp)
3353{
3354 long nth = *nthp;
3355 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3356 const uintptr_t *s, *t;
3357 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3358 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3359 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3360 while (p < (const char *)s) {
3361 if (is_utf8_lead_byte(*p)) nth--;
3362 p++;
3363 }
3364 do {
3365 nth -= count_utf8_lead_bytes_with_word(s);
3366 s++;
3367 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3368 p = (char *)s;
3369 }
3370 while (p < e) {
3371 if (is_utf8_lead_byte(*p)) {
3372 if (nth == 0) break;
3373 nth--;
3374 }
3375 p++;
3376 }
3377 *nthp = nth;
3378 return (char *)p;
3379}
3380
/* Byte distance from p to the position str_utf8_nth() reaches for nth
 * characters. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
3387#endif
3388
3389/* byte offset to char offset */
3390long
3391rb_str_sublen(VALUE str, long pos)
3392{
3393 if (single_byte_optimizable(str) || pos < 0)
3394 return pos;
3395 else {
3396 char *p = RSTRING_PTR(str);
3397 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3398 }
3399}
3400
3401static VALUE
3402str_subseq(VALUE str, long beg, long len)
3403{
3404 VALUE str2;
3405
3406 RUBY_ASSERT(beg >= 0);
3407 RUBY_ASSERT(len >= 0);
3408 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3409
3410 const int termlen = TERM_LEN(str);
3411 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3412 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3413 RB_GC_GUARD(str);
3414 return str2;
3415 }
3416
3417 str2 = str_alloc_heap(rb_cString);
3418 if (str_embed_capa(str2) >= len + termlen) {
3419 char *ptr2 = RSTRING(str2)->as.embed.ary;
3420 STR_SET_EMBED(str2);
3421 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3422 TERM_FILL(ptr2+len, termlen);
3423
3424 STR_SET_LEN(str2, len);
3425 RB_GC_GUARD(str);
3426 }
3427 else {
3428 str_replace_shared(str2, str);
3429 RUBY_ASSERT(!STR_EMBED_P(str2));
3430 ENC_CODERANGE_CLEAR(str2);
3431 RSTRING(str2)->as.heap.ptr += beg;
3432 if (RSTRING_LEN(str2) > len) {
3433 STR_SET_LEN(str2, len);
3434 }
3435 }
3436
3437 return str2;
3438}
3439
3440VALUE
3441rb_str_subseq(VALUE str, long beg, long len)
3442{
3443 VALUE str2 = str_subseq(str, beg, len);
3444 rb_enc_cr_str_copy_for_substr(str2, str);
3445 return str2;
3446}
3447
3448char *
3449rb_str_subpos(VALUE str, long beg, long *lenp)
3450{
3451 long len = *lenp;
3452 long slen = -1L;
3453 const long blen = RSTRING_LEN(str);
3454 rb_encoding *enc = STR_ENC_GET(str);
3455 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3456
3457 if (len < 0) return 0;
3458 if (beg < 0 && -beg < 0) return 0;
3459 if (!blen) {
3460 len = 0;
3461 }
3462 if (single_byte_optimizable(str)) {
3463 if (beg > blen) return 0;
3464 if (beg < 0) {
3465 beg += blen;
3466 if (beg < 0) return 0;
3467 }
3468 if (len > blen - beg)
3469 len = blen - beg;
3470 if (len < 0) return 0;
3471 p = s + beg;
3472 goto end;
3473 }
3474 if (beg < 0) {
3475 if (len > -beg) len = -beg;
3476 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3477 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3478 beg = -beg;
3479 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3480 p = e;
3481 if (!p) return 0;
3482 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3483 if (!p) return 0;
3484 len = e - p;
3485 goto end;
3486 }
3487 else {
3488 slen = str_strlen(str, enc);
3489 beg += slen;
3490 if (beg < 0) return 0;
3491 p = s + beg;
3492 if (len == 0) goto end;
3493 }
3494 }
3495 else if (beg > 0 && beg > blen) {
3496 return 0;
3497 }
3498 if (len == 0) {
3499 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3500 p = s + beg;
3501 }
3502#ifdef NONASCII_MASK
3503 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3504 enc == rb_utf8_encoding()) {
3505 p = str_utf8_nth(s, e, &beg);
3506 if (beg > 0) return 0;
3507 len = str_utf8_offset(p, e, len);
3508 }
3509#endif
3510 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3511 int char_sz = rb_enc_mbmaxlen(enc);
3512
3513 p = s + beg * char_sz;
3514 if (p > e) {
3515 return 0;
3516 }
3517 else if (len * char_sz > e - p)
3518 len = e - p;
3519 else
3520 len *= char_sz;
3521 }
3522 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3523 if (beg > 0) return 0;
3524 len = 0;
3525 }
3526 else {
3527 len = str_offset(p, e, len, enc, 0);
3528 }
3529 end:
3530 *lenp = len;
3531 RB_GC_GUARD(str);
3532 return p;
3533}
3534
3535static VALUE str_substr(VALUE str, long beg, long len, int empty);
3536
3537VALUE
3538rb_str_substr(VALUE str, long beg, long len)
3539{
3540 return str_substr(str, beg, len, TRUE);
3541}
3542
3543VALUE
3544rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3545{
3546 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3547}
3548
3549static VALUE
3550str_substr(VALUE str, long beg, long len, int empty)
3551{
3552 char *p = rb_str_subpos(str, beg, &len);
3553
3554 if (!p) return Qnil;
3555 if (!len && !empty) return Qnil;
3556
3557 beg = p - RSTRING_PTR(str);
3558
3559 VALUE str2 = str_subseq(str, beg, len);
3560 rb_enc_cr_str_copy_for_substr(str2, str);
3561 return str2;
3562}
3563
3564/* :nodoc: */
3565VALUE
3567{
3568 if (CHILLED_STRING_P(str)) {
3569 FL_UNSET_RAW(str, STR_CHILLED);
3570 }
3571
3572 if (OBJ_FROZEN(str)) return str;
3573 rb_str_resize(str, RSTRING_LEN(str));
3574 return rb_obj_freeze(str);
3575}
3576
3577/*
3578 * call-seq:
3579 * +string -> new_string or self
3580 *
3581 * Returns +self+ if +self+ is not frozen and can be mutated
3582 * without warning issuance.
3583 *
3584 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3585 */
3586static VALUE
3587str_uplus(VALUE str)
3588{
3589 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3590 return rb_str_dup(str);
3591 }
3592 else {
3593 return str;
3594 }
3595}
3596
3597/*
3598 * call-seq:
3599 * -string -> frozen_string
3600 * dedup -> frozen_string
3601 *
3602 * Returns a frozen, possibly pre-existing copy of the string.
3603 *
3604 * The returned +String+ will be deduplicated as long as it does not have
3605 * any instance variables set on it and is not a String subclass.
3606 *
3607 * Note that <tt>-string</tt> variant is more convenient for defining
3608 * constants:
3609 *
3610 * FILENAME = -'config/database.yml'
3611 *
3612 * while +dedup+ is better suitable for using the method in chains
3613 * of calculations:
3614 *
3615 * @url_list.concat(urls.map(&:dedup))
3616 *
3617 */
3618static VALUE
3619str_uminus(VALUE str)
3620{
3621 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3622 str = rb_str_dup(str);
3623 }
3624 return rb_fstring(str);
3625}
3626
3627RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3628#define rb_str_dup_frozen rb_str_new_frozen
3629
3630VALUE
3632{
3633 if (FL_TEST(str, STR_TMPLOCK)) {
3634 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3635 }
3636 FL_SET(str, STR_TMPLOCK);
3637 return str;
3638}
3639
3640VALUE
3642{
3643 if (!FL_TEST(str, STR_TMPLOCK)) {
3644 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3645 }
3646 FL_UNSET(str, STR_TMPLOCK);
3647 return str;
3648}
3649
3650VALUE
3651rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3652{
3653 rb_str_locktmp(str);
3654 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3655}
3656
3657void
3659{
3660 RUBY_ASSERT(ruby_thread_has_gvl_p());
3661
3662 long capa;
3663 const int termlen = TERM_LEN(str);
3664
3665 str_modifiable(str);
3666 if (STR_SHARED_P(str)) {
3667 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3668 }
3669 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3670 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3671 }
3672
3673 int cr = ENC_CODERANGE(str);
3674 if (len == 0) {
3675 /* Empty string does not contain non-ASCII */
3677 }
3678 else if (cr == ENC_CODERANGE_UNKNOWN) {
3679 /* Leave unknown. */
3680 }
3681 else if (len > RSTRING_LEN(str)) {
3682 if (ENC_CODERANGE_CLEAN_P(cr)) {
3683 /* Update the coderange regarding the extended part. */
3684 const char *const prev_end = RSTRING_END(str);
3685 const char *const new_end = RSTRING_PTR(str) + len;
3686 rb_encoding *enc = rb_enc_get(str);
3687 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3688 ENC_CODERANGE_SET(str, cr);
3689 }
3690 else if (cr == ENC_CODERANGE_BROKEN) {
3691 /* May be valid now, by appended part. */
3693 }
3694 }
3695 else if (len < RSTRING_LEN(str)) {
3696 if (cr != ENC_CODERANGE_7BIT) {
3697 /* ASCII-only string is keeping after truncated. Valid
3698 * and broken may be invalid or valid, leave unknown. */
3700 }
3701 }
3702
3703 STR_SET_LEN(str, len);
3704 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3705}
3706
3707VALUE
3708rb_str_resize(VALUE str, long len)
3709{
3710 if (len < 0) {
3711 rb_raise(rb_eArgError, "negative string size (or size too big)");
3712 }
3713
3714 int independent = str_independent(str);
3715 long slen = RSTRING_LEN(str);
3716 const int termlen = TERM_LEN(str);
3717
3718 if (slen > len || (termlen != 1 && slen < len)) {
3720 }
3721
3722 {
3723 long capa;
3724 if (STR_EMBED_P(str)) {
3725 if (len == slen) return str;
3726 if (str_embed_capa(str) >= len + termlen) {
3727 STR_SET_LEN(str, len);
3728 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3729 return str;
3730 }
3731 str_make_independent_expand(str, slen, len - slen, termlen);
3732 }
3733 else if (str_embed_capa(str) >= len + termlen) {
3734 char *ptr = STR_HEAP_PTR(str);
3735 STR_SET_EMBED(str);
3736 if (slen > len) slen = len;
3737 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3738 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3739 STR_SET_LEN(str, len);
3740 if (independent) ruby_xfree(ptr);
3741 return str;
3742 }
3743 else if (!independent) {
3744 if (len == slen) return str;
3745 str_make_independent_expand(str, slen, len - slen, termlen);
3746 }
3747 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3748 (capa - len) > (len < 1024 ? len : 1024)) {
3749 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3750 (size_t)len + termlen, STR_HEAP_SIZE(str));
3751 RSTRING(str)->as.heap.aux.capa = len;
3752 }
3753 else if (len == slen) return str;
3754 STR_SET_LEN(str, len);
3755 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3756 }
3757 return str;
3758}
3759
3760static void
3761str_ensure_available_capa(VALUE str, long len)
3762{
3763 str_modify_keep_cr(str);
3764
3765 const int termlen = TERM_LEN(str);
3766 long olen = RSTRING_LEN(str);
3767
3768 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3769 rb_raise(rb_eArgError, "string sizes too big");
3770 }
3771
3772 long total = olen + len;
3773 long capa = str_capacity(str, termlen);
3774
3775 if (capa < total) {
3776 if (total >= LONG_MAX / 2) {
3777 capa = total;
3778 }
3779 while (total > capa) {
3780 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3781 }
3782 RESIZE_CAPA_TERM(str, capa, termlen);
3783 }
3784}
3785
3786static VALUE
3787str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3788{
3789 if (keep_cr) {
3790 str_modify_keep_cr(str);
3791 }
3792 else {
3793 rb_str_modify(str);
3794 }
3795 if (len == 0) return 0;
3796
3797 long total, olen, off = -1;
3798 char *sptr;
3799 const int termlen = TERM_LEN(str);
3800
3801 RSTRING_GETMEM(str, sptr, olen);
3802 if (ptr >= sptr && ptr <= sptr + olen) {
3803 off = ptr - sptr;
3804 }
3805
3806 long capa = str_capacity(str, termlen);
3807
3808 if (olen > LONG_MAX - len) {
3809 rb_raise(rb_eArgError, "string sizes too big");
3810 }
3811 total = olen + len;
3812 if (capa < total) {
3813 if (total >= LONG_MAX / 2) {
3814 capa = total;
3815 }
3816 while (total > capa) {
3817 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3818 }
3819 RESIZE_CAPA_TERM(str, capa, termlen);
3820 sptr = RSTRING_PTR(str);
3821 }
3822 if (off != -1) {
3823 ptr = sptr + off;
3824 }
3825 memcpy(sptr + olen, ptr, len);
3826 STR_SET_LEN(str, total);
3827 TERM_FILL(sptr + total, termlen); /* sentinel */
3828
3829 return str;
3830}
3831
/* Shorthands for str_buf_cat4() that never keep the code range.
 * str_buf_cat2 takes its length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) \
    str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) \
    str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3834
3835VALUE
3836rb_str_cat(VALUE str, const char *ptr, long len)
3837{
3838 if (len == 0) return str;
3839 if (len < 0) {
3840 rb_raise(rb_eArgError, "negative string size (or size too big)");
3841 }
3842 return str_buf_cat(str, ptr, len);
3843}
3844
3845VALUE
3846rb_str_cat_cstr(VALUE str, const char *ptr)
3847{
3848 must_not_null(ptr);
3849 return rb_str_buf_cat(str, ptr, strlen(ptr));
3850}
3851
3852static void
3853rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3854{
3855 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3856
3857 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3858 if (UNLIKELY(!str_independent(str))) {
3859 str_make_independent(str);
3860 }
3861
3862 long string_length = -1;
3863 const int null_terminator_length = 1;
3864 char *sptr;
3865 RSTRING_GETMEM(str, sptr, string_length);
3866
3867 // Ensure the resulting string wouldn't be too long.
3868 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3869 rb_raise(rb_eArgError, "string sizes too big");
3870 }
3871
3872 long string_capacity = str_capacity(str, null_terminator_length);
3873
3874 // Get the code range before any modifications since those might clear the code range.
3875 int cr = ENC_CODERANGE(str);
3876
3877 // Check if the string has spare string_capacity to write the new byte.
3878 if (LIKELY(string_capacity >= string_length + 1)) {
3879 // In fast path we can write the new byte and note the string's new length.
3880 sptr[string_length] = byte;
3881 STR_SET_LEN(str, string_length + 1);
3882 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3883 }
3884 else {
3885 // If there's not enough string_capacity, make a call into the general string concatenation function.
3886 str_buf_cat(str, (char *)&byte, 1);
3887 }
3888
3889 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3890 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3891 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3892 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3893 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3894 if (ISASCII(byte)) {
3896 }
3897 else {
3899
3900 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3901 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3902 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3903 }
3904 }
3905 }
3906}
3907
3908RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3909RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3910RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3911
3912static VALUE
3913rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3914 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3915{
3916 int str_encindex = ENCODING_GET(str);
3917 int res_encindex;
3918 int str_cr, res_cr;
3919 rb_encoding *str_enc, *ptr_enc;
3920
3921 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3922
3923 if (str_encindex == ptr_encindex) {
3924 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3925 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3926 }
3927 }
3928 else {
3929 str_enc = rb_enc_from_index(str_encindex);
3930 ptr_enc = rb_enc_from_index(ptr_encindex);
3931 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3932 if (len == 0)
3933 return str;
3934 if (RSTRING_LEN(str) == 0) {
3935 rb_str_buf_cat(str, ptr, len);
3936 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3937 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3938 return str;
3939 }
3940 goto incompatible;
3941 }
3942 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3943 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3944 }
3945 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3946 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3947 str_cr = rb_enc_str_coderange(str);
3948 }
3949 }
3950 }
3951 if (ptr_cr_ret)
3952 *ptr_cr_ret = ptr_cr;
3953
3954 if (str_encindex != ptr_encindex &&
3955 str_cr != ENC_CODERANGE_7BIT &&
3956 ptr_cr != ENC_CODERANGE_7BIT) {
3957 str_enc = rb_enc_from_index(str_encindex);
3958 ptr_enc = rb_enc_from_index(ptr_encindex);
3959 goto incompatible;
3960 }
3961
3962 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3963 res_encindex = str_encindex;
3964 res_cr = ENC_CODERANGE_UNKNOWN;
3965 }
3966 else if (str_cr == ENC_CODERANGE_7BIT) {
3967 if (ptr_cr == ENC_CODERANGE_7BIT) {
3968 res_encindex = str_encindex;
3969 res_cr = ENC_CODERANGE_7BIT;
3970 }
3971 else {
3972 res_encindex = ptr_encindex;
3973 res_cr = ptr_cr;
3974 }
3975 }
3976 else if (str_cr == ENC_CODERANGE_VALID) {
3977 res_encindex = str_encindex;
3978 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3979 res_cr = str_cr;
3980 else
3981 res_cr = ptr_cr;
3982 }
3983 else { /* str_cr == ENC_CODERANGE_BROKEN */
3984 res_encindex = str_encindex;
3985 res_cr = str_cr;
3986 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3987 }
3988
3989 if (len < 0) {
3990 rb_raise(rb_eArgError, "negative string size (or size too big)");
3991 }
3992 str_buf_cat(str, ptr, len);
3993 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3994 return str;
3995
3996 incompatible:
3997 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3998 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
4000}
4001
4002VALUE
4003rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
4004{
4005 return rb_enc_cr_str_buf_cat(str, ptr, len,
4006 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
4007}
4008
4009VALUE
4011{
4012 /* ptr must reference NUL terminated ASCII string. */
4013 int encindex = ENCODING_GET(str);
4014 rb_encoding *enc = rb_enc_from_index(encindex);
4015 if (rb_enc_asciicompat(enc)) {
4016 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
4017 encindex, ENC_CODERANGE_7BIT, 0);
4018 }
4019 else {
4020 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
4021 while (*ptr) {
4022 unsigned int c = (unsigned char)*ptr;
4023 int len = rb_enc_codelen(c, enc);
4024 rb_enc_mbcput(c, buf, enc);
4025 rb_enc_cr_str_buf_cat(str, buf, len,
4026 encindex, ENC_CODERANGE_VALID, 0);
4027 ptr++;
4028 }
4029 return str;
4030 }
4031}
4032
4033VALUE
4035{
4036 int str2_cr = rb_enc_str_coderange(str2);
4037
4038 if (str_enc_fastpath(str)) {
4039 switch (str2_cr) {
4040 case ENC_CODERANGE_7BIT:
4041 // If RHS is 7bit we can do simple concatenation
4042 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
4043 RB_GC_GUARD(str2);
4044 return str;
4046 // If RHS is valid, we can do simple concatenation if encodings are the same
4047 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
4048 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
4049 int str_cr = ENC_CODERANGE(str);
4050 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
4051 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
4052 }
4053 RB_GC_GUARD(str2);
4054 return str;
4055 }
4056 }
4057 }
4058
4059 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
4060 ENCODING_GET(str2), str2_cr, &str2_cr);
4061
4062 ENC_CODERANGE_SET(str2, str2_cr);
4063
4064 return str;
4065}
4066
4067VALUE
4069{
4070 StringValue(str2);
4071 return rb_str_buf_append(str, str2);
4072}
4073
4074VALUE
4075rb_str_concat_literals(size_t num, const VALUE *strary)
4076{
4077 VALUE str;
4078 size_t i, s = 0;
4079 unsigned long len = 1;
4080
4081 if (UNLIKELY(!num)) return rb_str_new(0, 0);
4082 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
4083
4084 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
4085 str = rb_str_buf_new(len);
4086 str_enc_copy_direct(str, strary[0]);
4087
4088 for (i = s; i < num; ++i) {
4089 const VALUE v = strary[i];
4090 int encidx = ENCODING_GET(v);
4091
4092 rb_str_buf_append(str, v);
4093 if (encidx != ENCINDEX_US_ASCII) {
4094 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
4095 rb_enc_set_index(str, encidx);
4096 }
4097 }
4098 return str;
4099}
4100
4101/*
4102 * call-seq:
4103 * concat(*objects) -> string
4104 *
4105 * Concatenates each object in +objects+ to +self+ and returns +self+:
4106 *
4107 * s = 'foo'
4108 * s.concat('bar', 'baz') # => "foobarbaz"
4109 * s # => "foobarbaz"
4110 *
4111 * For each given object +object+ that is an Integer,
4112 * the value is considered a codepoint and converted to a character before concatenation:
4113 *
4114 * s = 'foo'
4115 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
4116 *
4117 * Related: String#<<, which takes a single argument.
4118 */
4119static VALUE
4120rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
4121{
4122 str_modifiable(str);
4123
4124 if (argc == 1) {
4125 return rb_str_concat(str, argv[0]);
4126 }
4127 else if (argc > 1) {
4128 int i;
4129 VALUE arg_str = rb_str_tmp_new(0);
4130 rb_enc_copy(arg_str, str);
4131 for (i = 0; i < argc; i++) {
4132 rb_str_concat(arg_str, argv[i]);
4133 }
4134 rb_str_buf_append(str, arg_str);
4135 }
4136
4137 return str;
4138}
4139
4140/*
4141 * call-seq:
4142 * append_as_bytes(*objects) -> string
4143 *
4144 * Concatenates each object in +objects+ into +self+ without any encoding
4145 * validation or conversion and returns +self+:
4146 *
4147 * s = 'foo'
4148 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
4149 * s.valid_encoding? # => false
4150 * s.append_as_bytes("\xAC 12")
4151 * s.valid_encoding? # => true
4152 *
4153 * For each given object +object+ that is an Integer,
4154 * the value is considered a Byte. If the Integer is bigger
4155 * than one byte, only the lower byte is considered, similar to String#setbyte:
4156 *
4157 * s = ""
4158 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
4159 *
4160 * Related: String#<<, String#concat, which do an encoding aware concatenation.
4161 */
4162
4163VALUE
4164rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
4165{
4166 long needed_capacity = 0;
4167 volatile VALUE t0;
4168 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
4169
4170 for (int index = 0; index < argc; index++) {
4171 VALUE obj = argv[index];
4172 enum ruby_value_type type = types[index] = rb_type(obj);
4173 switch (type) {
4174 case T_FIXNUM:
4175 case T_BIGNUM:
4176 needed_capacity++;
4177 break;
4178 case T_STRING:
4179 needed_capacity += RSTRING_LEN(obj);
4180 break;
4181 default:
4182 rb_raise(
4184 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
4185 rb_obj_class(obj)
4186 );
4187 break;
4188 }
4189 }
4190
4191 str_ensure_available_capa(str, needed_capacity);
4192 char *sptr = RSTRING_END(str);
4193
4194 for (int index = 0; index < argc; index++) {
4195 VALUE obj = argv[index];
4196 enum ruby_value_type type = types[index];
4197 switch (type) {
4198 case T_FIXNUM:
4199 case T_BIGNUM: {
4200 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
4201 char byte = (char)(NUM2INT(obj) & 0xFF);
4202 *sptr = byte;
4203 sptr++;
4204 break;
4205 }
4206 case T_STRING: {
4207 const char *ptr;
4208 long len;
4209 RSTRING_GETMEM(obj, ptr, len);
4210 memcpy(sptr, ptr, len);
4211 sptr += len;
4212 break;
4213 }
4214 default:
4215 rb_bug("append_as_bytes arguments should have been validated");
4216 }
4217 }
4218
4219 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
4220 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
4221
4222 int cr = ENC_CODERANGE(str);
4223 switch (cr) {
4224 case ENC_CODERANGE_7BIT: {
4225 for (int index = 0; index < argc; index++) {
4226 VALUE obj = argv[index];
4227 enum ruby_value_type type = types[index];
4228 switch (type) {
4229 case T_FIXNUM:
4230 case T_BIGNUM: {
4231 if (!ISASCII(NUM2INT(obj))) {
4232 goto clear_cr;
4233 }
4234 break;
4235 }
4236 case T_STRING: {
4237 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
4238 goto clear_cr;
4239 }
4240 break;
4241 }
4242 default:
4243 rb_bug("append_as_bytes arguments should have been validated");
4244 }
4245 }
4246 break;
4247 }
4249 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4250 goto keep_cr;
4251 }
4252 else {
4253 goto clear_cr;
4254 }
4255 break;
4256 default:
4257 goto clear_cr;
4258 break;
4259 }
4260
4261 RB_GC_GUARD(t0);
4262
4263 clear_cr:
4264 // If no fast path was hit, we clear the coderange.
4265 // append_as_bytes is predominently meant to be used in
4266 // buffering situation, hence it's likely the coderange
4267 // will never be scanned, so it's not worth spending time
4268 // precomputing the coderange except for simple and common
4269 // situations.
4271 keep_cr:
4272 return str;
4273}
4274
4275/*
4276 * call-seq:
4277 * string << object -> string
4278 *
4279 * Concatenates +object+ to +self+ and returns +self+:
4280 *
4281 * s = 'foo'
4282 * s << 'bar' # => "foobar"
4283 * s # => "foobar"
4284 *
4285 * If +object+ is an Integer,
4286 * the value is considered a codepoint and converted to a character before concatenation:
4287 *
4288 * s = 'foo'
4289 * s << 33 # => "foo!"
4290 *
4291 * If that codepoint is not representable in the encoding of
4292 * _string_, RangeError is raised.
4293 *
4294 * s = 'foo'
4295 * s.encoding # => <Encoding:UTF-8>
4296 * s << 0x00110000 # 1114112 out of char range (RangeError)
4297 * s = 'foo'.encode(Encoding::EUC_JP)
4298 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4299 *
4300 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
4301 * is automatically promoted to ASCII-8BIT.
4302 *
4303 * s = 'foo'.encode(Encoding::US_ASCII)
4304 * s << 0xff
4305 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4306 *
4307 * Related: String#concat, which takes multiple arguments.
4308 */
4309VALUE
4311{
4312 unsigned int code;
4313 rb_encoding *enc = STR_ENC_GET(str1);
4314 int encidx;
4315
4316 if (RB_INTEGER_TYPE_P(str2)) {
4317 if (rb_num_to_uint(str2, &code) == 0) {
4318 }
4319 else if (FIXNUM_P(str2)) {
4320 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4321 }
4322 else {
4323 rb_raise(rb_eRangeError, "bignum out of char range");
4324 }
4325 }
4326 else {
4327 return rb_str_append(str1, str2);
4328 }
4329
4330 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4331
4332 if (encidx >= 0) {
4333 rb_str_buf_cat_byte(str1, (unsigned char)code);
4334 }
4335 else {
4336 long pos = RSTRING_LEN(str1);
4337 int cr = ENC_CODERANGE(str1);
4338 int len;
4339 char *buf;
4340
4341 switch (len = rb_enc_codelen(code, enc)) {
4342 case ONIGERR_INVALID_CODE_POINT_VALUE:
4343 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4344 break;
4345 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4346 case 0:
4347 rb_raise(rb_eRangeError, "%u out of char range", code);
4348 break;
4349 }
4350 buf = ALLOCA_N(char, len + 1);
4351 rb_enc_mbcput(code, buf, enc);
4352 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4353 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4354 }
4355 rb_str_resize(str1, pos+len);
4356 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4357 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4359 }
4360 else if (cr == ENC_CODERANGE_BROKEN) {
4362 }
4363 ENC_CODERANGE_SET(str1, cr);
4364 }
4365 return str1;
4366}
4367
4368int
4369rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4370{
4371 int encidx = rb_enc_to_index(enc);
4372
4373 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4374 /* US-ASCII automatically extended to ASCII-8BIT */
4375 if (code > 0xFF) {
4376 rb_raise(rb_eRangeError, "%u out of char range", code);
4377 }
4378 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4379 return ENCINDEX_ASCII_8BIT;
4380 }
4381 return encidx;
4382 }
4383 else {
4384 return -1;
4385 }
4386}
4387
4388/*
4389 * call-seq:
4390 * prepend(*other_strings) -> string
4391 *
4392 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4393 *
4394 * s = 'foo'
4395 * s.prepend('bar', 'baz') # => "barbazfoo"
4396 * s # => "barbazfoo"
4397 *
4398 * Related: String#concat.
4399 */
4400
4401static VALUE
4402rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4403{
4404 str_modifiable(str);
4405
4406 if (argc == 1) {
4407 rb_str_update(str, 0L, 0L, argv[0]);
4408 }
4409 else if (argc > 1) {
4410 int i;
4411 VALUE arg_str = rb_str_tmp_new(0);
4412 rb_enc_copy(arg_str, str);
4413 for (i = 0; i < argc; i++) {
4414 rb_str_append(arg_str, argv[i]);
4415 }
4416 rb_str_update(str, 0L, 0L, arg_str);
4417 }
4418
4419 return str;
4420}
4421
4422st_index_t
4424{
4425 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4426 st_index_t precomputed_hash;
4427 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4428
4429 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4430 return precomputed_hash;
4431 }
4432
4433 return str_do_hash(str);
4434}
4435
4436int
4438{
4439 long len1, len2;
4440 const char *ptr1, *ptr2;
4441 RSTRING_GETMEM(str1, ptr1, len1);
4442 RSTRING_GETMEM(str2, ptr2, len2);
4443 return (len1 != len2 ||
4444 !rb_str_comparable(str1, str2) ||
4445 memcmp(ptr1, ptr2, len1) != 0);
4446}
4447
4448/*
4449 * call-seq:
4450 * hash -> integer
4451 *
4452 * Returns the integer hash value for +self+.
4453 * The value is based on the length, content and encoding of +self+.
4454 *
4455 * Related: Object#hash.
4456 */
4457
4458static VALUE
4459rb_str_hash_m(VALUE str)
4460{
4461 st_index_t hval = rb_str_hash(str);
4462 return ST2FIX(hval);
4463}
4464
/* Smaller of two values; note that each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
4466
4467int
4469{
4470 int idx1, idx2;
4471 int rc1, rc2;
4472
4473 if (RSTRING_LEN(str1) == 0) return TRUE;
4474 if (RSTRING_LEN(str2) == 0) return TRUE;
4475 idx1 = ENCODING_GET(str1);
4476 idx2 = ENCODING_GET(str2);
4477 if (idx1 == idx2) return TRUE;
4478 rc1 = rb_enc_str_coderange(str1);
4479 rc2 = rb_enc_str_coderange(str2);
4480 if (rc1 == ENC_CODERANGE_7BIT) {
4481 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4482 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4483 return TRUE;
4484 }
4485 if (rc2 == ENC_CODERANGE_7BIT) {
4486 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4487 return TRUE;
4488 }
4489 return FALSE;
4490}
4491
4492int
4494{
4495 long len1, len2;
4496 const char *ptr1, *ptr2;
4497 int retval;
4498
4499 if (str1 == str2) return 0;
4500 RSTRING_GETMEM(str1, ptr1, len1);
4501 RSTRING_GETMEM(str2, ptr2, len2);
4502 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4503 if (len1 == len2) {
4504 if (!rb_str_comparable(str1, str2)) {
4505 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4506 return 1;
4507 return -1;
4508 }
4509 return 0;
4510 }
4511 if (len1 > len2) return 1;
4512 return -1;
4513 }
4514 if (retval > 0) return 1;
4515 return -1;
4516}
4517
4518/*
4519 * call-seq:
4520 * string == object -> true or false
4521 * string === object -> true or false
4522 *
4523 * Returns +true+ if +object+ has the same length and content
4524 * as +self+; +false+ otherwise:
4525 *
4526 * s = 'foo'
4527 * s == 'foo' # => true
4528 * s == 'food' # => false
4529 * s == 'FOO' # => false
4530 *
4531 * Returns +false+ if the two strings' encodings are not compatible:
4532 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4533 *
4534 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4535 * two strings are compared using <code>object.==</code>.
4536 */
4537
4538VALUE
4540{
4541 if (str1 == str2) return Qtrue;
4542 if (!RB_TYPE_P(str2, T_STRING)) {
4543 if (!rb_respond_to(str2, idTo_str)) {
4544 return Qfalse;
4545 }
4546 return rb_equal(str2, str1);
4547 }
4548 return rb_str_eql_internal(str1, str2);
4549}
4550
4551/*
4552 * call-seq:
4553 * eql?(object) -> true or false
4554 *
4555 * Returns +true+ if +object+ has the same length and content
4556 * as +self+; +false+ otherwise:
4557 *
4558 * s = 'foo'
4559 * s.eql?('foo') # => true
4560 * s.eql?('food') # => false
4561 * s.eql?('FOO') # => false
4562 *
4563 * Returns +false+ if the two strings' encodings are not compatible:
4564 *
4565 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4566 *
4567 */
4568
4569VALUE
4570rb_str_eql(VALUE str1, VALUE str2)
4571{
4572 if (str1 == str2) return Qtrue;
4573 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4574 return rb_str_eql_internal(str1, str2);
4575}
4576
4577/*
4578 * call-seq:
4579 * string <=> other_string -> -1, 0, 1, or nil
4580 *
4581 * Compares +self+ and +other_string+, returning:
4582 *
4583 * - -1 if +other_string+ is larger.
4584 * - 0 if the two are equal.
4585 * - 1 if +other_string+ is smaller.
4586 * - +nil+ if the two are incomparable.
4587 *
4588 * Examples:
4589 *
4590 * 'foo' <=> 'foo' # => 0
4591 * 'foo' <=> 'food' # => -1
4592 * 'food' <=> 'foo' # => 1
4593 * 'FOO' <=> 'foo' # => -1
4594 * 'foo' <=> 'FOO' # => 1
4595 * 'foo' <=> 1 # => nil
4596 *
4597 */
4598
4599static VALUE
4600rb_str_cmp_m(VALUE str1, VALUE str2)
4601{
4602 int result;
4603 VALUE s = rb_check_string_type(str2);
4604 if (NIL_P(s)) {
4605 return rb_invcmp(str1, str2);
4606 }
4607 result = rb_str_cmp(str1, s);
4608 return INT2FIX(result);
4609}
4610
4611static VALUE str_casecmp(VALUE str1, VALUE str2);
4612static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4613
4614/*
4615 * call-seq:
4616 * casecmp(other_string) -> -1, 0, 1, or nil
4617 *
4618 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4619 *
4620 * - -1 if <tt>other_string.downcase</tt> is larger.
4621 * - 0 if the two are equal.
4622 * - 1 if <tt>other_string.downcase</tt> is smaller.
4623 * - +nil+ if the two are incomparable.
4624 *
4625 * Examples:
4626 *
4627 * 'foo'.casecmp('foo') # => 0
4628 * 'foo'.casecmp('food') # => -1
4629 * 'food'.casecmp('foo') # => 1
4630 * 'FOO'.casecmp('foo') # => 0
4631 * 'foo'.casecmp('FOO') # => 0
4632 * 'foo'.casecmp(1) # => nil
4633 *
4634 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4635 *
4636 * Related: String#casecmp?.
4637 *
4638 */
4639
4640static VALUE
4641rb_str_casecmp(VALUE str1, VALUE str2)
4642{
4643 VALUE s = rb_check_string_type(str2);
4644 if (NIL_P(s)) {
4645 return Qnil;
4646 }
4647 return str_casecmp(str1, s);
4648}
4649
4650static VALUE
4651str_casecmp(VALUE str1, VALUE str2)
4652{
4653 long len;
4654 rb_encoding *enc;
4655 const char *p1, *p1end, *p2, *p2end;
4656
4657 enc = rb_enc_compatible(str1, str2);
4658 if (!enc) {
4659 return Qnil;
4660 }
4661
4662 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4663 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4664 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4665 while (p1 < p1end && p2 < p2end) {
4666 if (*p1 != *p2) {
4667 unsigned int c1 = TOLOWER(*p1 & 0xff);
4668 unsigned int c2 = TOLOWER(*p2 & 0xff);
4669 if (c1 != c2)
4670 return INT2FIX(c1 < c2 ? -1 : 1);
4671 }
4672 p1++;
4673 p2++;
4674 }
4675 }
4676 else {
4677 while (p1 < p1end && p2 < p2end) {
4678 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4679 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4680
4681 if (0 <= c1 && 0 <= c2) {
4682 c1 = TOLOWER(c1);
4683 c2 = TOLOWER(c2);
4684 if (c1 != c2)
4685 return INT2FIX(c1 < c2 ? -1 : 1);
4686 }
4687 else {
4688 int r;
4689 l1 = rb_enc_mbclen(p1, p1end, enc);
4690 l2 = rb_enc_mbclen(p2, p2end, enc);
4691 len = l1 < l2 ? l1 : l2;
4692 r = memcmp(p1, p2, len);
4693 if (r != 0)
4694 return INT2FIX(r < 0 ? -1 : 1);
4695 if (l1 != l2)
4696 return INT2FIX(l1 < l2 ? -1 : 1);
4697 }
4698 p1 += l1;
4699 p2 += l2;
4700 }
4701 }
4702 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4703 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4704 return INT2FIX(-1);
4705}
4706
4707/*
4708 * call-seq:
4709 * casecmp?(other_string) -> true, false, or nil
4710 *
4711 * Returns +true+ if +self+ and +other_string+ are equal after
4712 * Unicode case folding, otherwise +false+:
4713 *
4714 * 'foo'.casecmp?('foo') # => true
4715 * 'foo'.casecmp?('food') # => false
4716 * 'food'.casecmp?('foo') # => false
4717 * 'FOO'.casecmp?('foo') # => true
4718 * 'foo'.casecmp?('FOO') # => true
4719 *
4720 * Returns +nil+ if the two values are incomparable:
4721 *
4722 * 'foo'.casecmp?(1) # => nil
4723 *
4724 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4725 *
4726 * Related: String#casecmp.
4727 *
4728 */
4729
4730static VALUE
4731rb_str_casecmp_p(VALUE str1, VALUE str2)
4732{
4733 VALUE s = rb_check_string_type(str2);
4734 if (NIL_P(s)) {
4735 return Qnil;
4736 }
4737 return str_casecmp_p(str1, s);
4738}
4739
4740static VALUE
4741str_casecmp_p(VALUE str1, VALUE str2)
4742{
4743 rb_encoding *enc;
4744 VALUE folded_str1, folded_str2;
4745 VALUE fold_opt = sym_fold;
4746
4747 enc = rb_enc_compatible(str1, str2);
4748 if (!enc) {
4749 return Qnil;
4750 }
4751
4752 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4753 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4754
4755 return rb_str_eql(folded_str1, folded_str2);
4756}
4757
4758static long
4759strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4760 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4761{
4762 const char *search_start = str_ptr;
4763 long pos, search_len = str_len - offset;
4764
4765 for (;;) {
4766 const char *t;
4767 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4768 if (pos < 0) return pos;
4769 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4770 if (t == search_start + pos) break;
4771 search_len -= t - search_start;
4772 if (search_len <= 0) return -1;
4773 offset += t - search_start;
4774 search_start = t;
4775 }
4776 return pos + offset;
4777}
4778
/* Both return the found index in bytes.  rb_str_index passes in_byte = 0,
 * rb_str_byteindex passes in_byte = 1. */
#define rb_str_index(str, sub, offset) \
    rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) \
    rb_strseq_index(str, sub, offset, 1)
4782
4783static long
4784rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4785{
4786 const char *str_ptr, *str_ptr_end, *sub_ptr;
4787 long str_len, sub_len;
4788 rb_encoding *enc;
4789
4790 enc = rb_enc_check(str, sub);
4791 if (is_broken_string(sub)) return -1;
4792
4793 str_ptr = RSTRING_PTR(str);
4794 str_ptr_end = RSTRING_END(str);
4795 str_len = RSTRING_LEN(str);
4796 sub_ptr = RSTRING_PTR(sub);
4797 sub_len = RSTRING_LEN(sub);
4798
4799 if (str_len < sub_len) return -1;
4800
4801 if (offset != 0) {
4802 long str_len_char, sub_len_char;
4803 int single_byte = single_byte_optimizable(str);
4804 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4805 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4806 if (offset < 0) {
4807 offset += str_len_char;
4808 if (offset < 0) return -1;
4809 }
4810 if (str_len_char - offset < sub_len_char) return -1;
4811 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4812 str_ptr += offset;
4813 }
4814 if (sub_len == 0) return offset;
4815
4816 /* need proceed one character at a time */
4817 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4818}
4819
4820
4821/*
4822 * call-seq:
4823 * index(substring, offset = 0) -> integer or nil
4824 * index(regexp, offset = 0) -> integer or nil
4825 *
4826 * :include: doc/string/index.rdoc
4827 *
4828 */
4829
4830static VALUE
4831rb_str_index_m(int argc, VALUE *argv, VALUE str)
4832{
4833 VALUE sub;
4834 VALUE initpos;
4835 rb_encoding *enc = STR_ENC_GET(str);
4836 long pos;
4837
4838 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4839 long slen = str_strlen(str, enc); /* str's enc */
4840 pos = NUM2LONG(initpos);
4841 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4842 if (RB_TYPE_P(sub, T_REGEXP)) {
4844 }
4845 return Qnil;
4846 }
4847 }
4848 else {
4849 pos = 0;
4850 }
4851
4852 if (RB_TYPE_P(sub, T_REGEXP)) {
4853 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4854 enc, single_byte_optimizable(str));
4855
4856 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4857 VALUE match = rb_backref_get();
4858 struct re_registers *regs = RMATCH_REGS(match);
4859 pos = rb_str_sublen(str, BEG(0));
4860 return LONG2NUM(pos);
4861 }
4862 }
4863 else {
4864 StringValue(sub);
4865 pos = rb_str_index(str, sub, pos);
4866 if (pos >= 0) {
4867 pos = rb_str_sublen(str, pos);
4868 return LONG2NUM(pos);
4869 }
4870 }
4871 return Qnil;
4872}
4873
4874/* Ensure that the given pos is a valid character boundary.
4875 * Note that in this function, "character" means a code point
4876 * (Unicode scalar value), not a grapheme cluster.
4877 */
4878static void
4879str_ensure_byte_pos(VALUE str, long pos)
4880{
4881 if (!single_byte_optimizable(str)) {
4882 const char *s = RSTRING_PTR(str);
4883 const char *e = RSTRING_END(str);
4884 const char *p = s + pos;
4885 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4886 rb_raise(rb_eIndexError,
4887 "offset %ld does not land on character boundary", pos);
4888 }
4889 }
4890}
4891
4892/*
4893 * call-seq:
4894 * byteindex(substring, offset = 0) -> integer or nil
4895 * byteindex(regexp, offset = 0) -> integer or nil
4896 *
4897 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4898 * or +nil+ if none found:
4899 *
4900 * 'foo'.byteindex('f') # => 0
4901 * 'foo'.byteindex('o') # => 1
4902 * 'foo'.byteindex('oo') # => 1
4903 * 'foo'.byteindex('ooo') # => nil
4904 *
4905 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4906 * or +nil+ if none found:
4907 *
4908 * 'foo'.byteindex(/f/) # => 0
4909 * 'foo'.byteindex(/o/) # => 1
4910 * 'foo'.byteindex(/oo/) # => 1
4911 * 'foo'.byteindex(/ooo/) # => nil
4912 *
4913 * Integer argument +offset+, if given, specifies the byte-based position in the
4914 * string to begin the search:
4915 *
4916 * 'foo'.byteindex('o', 1) # => 1
4917 * 'foo'.byteindex('o', 2) # => 2
4918 * 'foo'.byteindex('o', 3) # => nil
4919 *
4920 * If +offset+ is negative, counts backward from the end of +self+:
4921 *
4922 * 'foo'.byteindex('o', -1) # => 2
4923 * 'foo'.byteindex('o', -2) # => 1
4924 * 'foo'.byteindex('o', -3) # => 1
4925 * 'foo'.byteindex('o', -4) # => nil
4926 *
4927 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4928 * raised.
4929 *
4930 * Related: String#index, String#byterindex.
4931 */
4932
4933static VALUE
4934rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4935{
4936 VALUE sub;
4937 VALUE initpos;
4938 long pos;
4939
4940 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4941 long slen = RSTRING_LEN(str);
4942 pos = NUM2LONG(initpos);
4943 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4944 if (RB_TYPE_P(sub, T_REGEXP)) {
4946 }
4947 return Qnil;
4948 }
4949 }
4950 else {
4951 pos = 0;
4952 }
4953
4954 str_ensure_byte_pos(str, pos);
4955
4956 if (RB_TYPE_P(sub, T_REGEXP)) {
4957 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4958 VALUE match = rb_backref_get();
4959 struct re_registers *regs = RMATCH_REGS(match);
4960 pos = BEG(0);
4961 return LONG2NUM(pos);
4962 }
4963 }
4964 else {
4965 StringValue(sub);
4966 pos = rb_str_byteindex(str, sub, pos);
4967 if (pos >= 0) return LONG2NUM(pos);
4968 }
4969 return Qnil;
4970}
4971
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform does not provide memrchr().
 *
 * Scans the +search_len+ bytes starting at +search_str+ from the end
 * towards the beginning and returns a pointer to the last byte equal to
 * +chr+, or a null pointer when there is none (including when
 * +search_len+ is 0).
 *
 * As with the libc function, +chr+ is converted to unsigned char before
 * comparing, so a negative `char` promoted to int still matches its byte.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4984
4985static long
4986str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4987{
4988 char *hit, *adjusted;
4989 int c;
4990 long slen, searchlen;
4991 char *sbeg, *e, *t;
4992
4993 sbeg = RSTRING_PTR(str);
4994 slen = RSTRING_LEN(sub);
4995 if (slen == 0) return s - sbeg;
4996 e = RSTRING_END(str);
4997 t = RSTRING_PTR(sub);
4998 c = *t & 0xff;
4999 searchlen = s - sbeg + 1;
5000
5001 if (memcmp(s, t, slen) == 0) {
5002 return s - sbeg;
5003 }
5004
5005 do {
5006 hit = memrchr(sbeg, c, searchlen);
5007 if (!hit) break;
5008 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
5009 if (hit != adjusted) {
5010 searchlen = adjusted - sbeg;
5011 continue;
5012 }
5013 if (memcmp(hit, t, slen) == 0)
5014 return hit - sbeg;
5015 searchlen = adjusted - sbeg;
5016 } while (searchlen > 0);
5017
5018 return -1;
5019}
5020
5021/* found index in byte */
5022static long
5023rb_str_rindex(VALUE str, VALUE sub, long pos)
5024{
5025 long len, slen;
5026 char *sbeg, *s;
5027 rb_encoding *enc;
5028 int singlebyte;
5029
5030 enc = rb_enc_check(str, sub);
5031 if (is_broken_string(sub)) return -1;
5032 singlebyte = single_byte_optimizable(str);
5033 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
5034 slen = str_strlen(sub, enc); /* rb_enc_check */
5035
5036 /* substring longer than string */
5037 if (len < slen) return -1;
5038 if (len - pos < slen) pos = len - slen;
5039 if (len == 0) return pos;
5040
5041 sbeg = RSTRING_PTR(str);
5042
5043 if (pos == 0) {
5044 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5045 return 0;
5046 else
5047 return -1;
5048 }
5049
5050 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
5051 return str_rindex(str, sub, s, enc);
5052}
5053
5054/*
5055 * call-seq:
5056 * rindex(substring, offset = self.length) -> integer or nil
5057 * rindex(regexp, offset = self.length) -> integer or nil
5058 *
5059 * Returns the Integer index of the _last_ occurrence of the given +substring+,
5060 * or +nil+ if none found:
5061 *
5062 * 'foo'.rindex('f') # => 0
5063 * 'foo'.rindex('o') # => 2
5064 * 'foo'.rindex('oo') # => 1
5065 * 'foo'.rindex('ooo') # => nil
5066 *
5067 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
5068 * or +nil+ if none found:
5069 *
5070 * 'foo'.rindex(/f/) # => 0
5071 * 'foo'.rindex(/o/) # => 2
5072 * 'foo'.rindex(/oo/) # => 1
5073 * 'foo'.rindex(/ooo/) # => nil
5074 *
5075 * The _last_ match means starting at the possible last position, not
5076 * the last of longest matches.
5077 *
5078 * 'foo'.rindex(/o+/) # => 2
5079 * $~ #=> #<MatchData "o">
5080 *
5081 * To get the last longest match, combine the regexp with a negative
5082 * lookbehind.
5083 *
5084 * 'foo'.rindex(/(?<!o)o+/) # => 1
5085 * $~ #=> #<MatchData "oo">
5086 *
5087 * Or use String#index with a negative lookahead.
5088 *
5089 * 'foo'.index(/o+(?!.*o)/) # => 1
5090 * $~ #=> #<MatchData "oo">
5091 *
5092 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
5093 * string to _end_ the search:
5094 *
5095 * 'foo'.rindex('o', 0) # => nil
5096 * 'foo'.rindex('o', 1) # => 1
5097 * 'foo'.rindex('o', 2) # => 2
5098 * 'foo'.rindex('o', 3) # => 2
5099 *
5100 * If +offset+ is a negative Integer, the maximum starting position in the
5101 * string to _end_ the search is the sum of the string's length and +offset+:
5102 *
5103 * 'foo'.rindex('o', -1) # => 2
5104 * 'foo'.rindex('o', -2) # => 1
5105 * 'foo'.rindex('o', -3) # => nil
5106 * 'foo'.rindex('o', -4) # => nil
5107 *
5108 * Related: String#index.
5109 */
5110
5111static VALUE
5112rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
5113{
5114 VALUE sub;
5115 VALUE initpos;
5116 rb_encoding *enc = STR_ENC_GET(str);
5117 long pos, len = str_strlen(str, enc); /* str's enc */
5118
5119 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5120 pos = NUM2LONG(initpos);
5121 if (pos < 0 && (pos += len) < 0) {
5122 if (RB_TYPE_P(sub, T_REGEXP)) {
5124 }
5125 return Qnil;
5126 }
5127 if (pos > len) pos = len;
5128 }
5129 else {
5130 pos = len;
5131 }
5132
5133 if (RB_TYPE_P(sub, T_REGEXP)) {
5134 /* enc = rb_enc_check(str, sub); */
5135 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
5136 enc, single_byte_optimizable(str));
5137
5138 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5139 VALUE match = rb_backref_get();
5140 struct re_registers *regs = RMATCH_REGS(match);
5141 pos = rb_str_sublen(str, BEG(0));
5142 return LONG2NUM(pos);
5143 }
5144 }
5145 else {
5146 StringValue(sub);
5147 pos = rb_str_rindex(str, sub, pos);
5148 if (pos >= 0) {
5149 pos = rb_str_sublen(str, pos);
5150 return LONG2NUM(pos);
5151 }
5152 }
5153 return Qnil;
5154}
5155
5156static long
5157rb_str_byterindex(VALUE str, VALUE sub, long pos)
5158{
5159 long len, slen;
5160 char *sbeg, *s;
5161 rb_encoding *enc;
5162
5163 enc = rb_enc_check(str, sub);
5164 if (is_broken_string(sub)) return -1;
5165 len = RSTRING_LEN(str);
5166 slen = RSTRING_LEN(sub);
5167
5168 /* substring longer than string */
5169 if (len < slen) return -1;
5170 if (len - pos < slen) pos = len - slen;
5171 if (len == 0) return pos;
5172
5173 sbeg = RSTRING_PTR(str);
5174
5175 if (pos == 0) {
5176 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
5177 return 0;
5178 else
5179 return -1;
5180 }
5181
5182 s = sbeg + pos;
5183 return str_rindex(str, sub, s, enc);
5184}
5185
5186
5187/*
5188 * call-seq:
5189 * byterindex(substring, offset = self.bytesize) -> integer or nil
5190 * byterindex(regexp, offset = self.bytesize) -> integer or nil
5191 *
5192 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
5193 * or +nil+ if none found:
5194 *
5195 * 'foo'.byterindex('f') # => 0
5196 * 'foo'.byterindex('o') # => 2
5197 * 'foo'.byterindex('oo') # => 1
5198 * 'foo'.byterindex('ooo') # => nil
5199 *
5200 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
5201 * or +nil+ if none found:
5202 *
5203 * 'foo'.byterindex(/f/) # => 0
5204 * 'foo'.byterindex(/o/) # => 2
5205 * 'foo'.byterindex(/oo/) # => 1
5206 * 'foo'.byterindex(/ooo/) # => nil
5207 *
5208 * The _last_ match means starting at the possible last position, not
5209 * the last of longest matches.
5210 *
5211 * 'foo'.byterindex(/o+/) # => 2
5212 * $~ #=> #<MatchData "o">
5213 *
5214 * To get the last longest match, combine the regexp with a negative
5215 * lookbehind.
5216 *
5217 * 'foo'.byterindex(/(?<!o)o+/) # => 1
5218 * $~ #=> #<MatchData "oo">
5219 *
5220 * Or use String#byteindex with a negative lookahead.
5221 *
5222 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
5223 * $~ #=> #<MatchData "oo">
5224 *
5225 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
5226 * string to _end_ the search:
5227 *
5228 * 'foo'.byterindex('o', 0) # => nil
5229 * 'foo'.byterindex('o', 1) # => 1
5230 * 'foo'.byterindex('o', 2) # => 2
5231 * 'foo'.byterindex('o', 3) # => 2
5232 *
5233 * If +offset+ is a negative Integer, the maximum starting position in the
5234 * string to _end_ the search is the sum of the string's length and +offset+:
5235 *
5236 * 'foo'.byterindex('o', -1) # => 2
5237 * 'foo'.byterindex('o', -2) # => 1
5238 * 'foo'.byterindex('o', -3) # => nil
5239 * 'foo'.byterindex('o', -4) # => nil
5240 *
5241 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
5242 * raised.
5243 *
5244 * Related: String#byteindex.
5245 */
5246
5247static VALUE
5248rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5249{
5250 VALUE sub;
5251 VALUE initpos;
5252 long pos, len = RSTRING_LEN(str);
5253
5254 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5255 pos = NUM2LONG(initpos);
5256 if (pos < 0 && (pos += len) < 0) {
5257 if (RB_TYPE_P(sub, T_REGEXP)) {
5259 }
5260 return Qnil;
5261 }
5262 if (pos > len) pos = len;
5263 }
5264 else {
5265 pos = len;
5266 }
5267
5268 str_ensure_byte_pos(str, pos);
5269
5270 if (RB_TYPE_P(sub, T_REGEXP)) {
5271 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5272 VALUE match = rb_backref_get();
5273 struct re_registers *regs = RMATCH_REGS(match);
5274 pos = BEG(0);
5275 return LONG2NUM(pos);
5276 }
5277 }
5278 else {
5279 StringValue(sub);
5280 pos = rb_str_byterindex(str, sub, pos);
5281 if (pos >= 0) return LONG2NUM(pos);
5282 }
5283 return Qnil;
5284}
5285
5286/*
5287 * call-seq:
5288 * string =~ regexp -> integer or nil
5289 * string =~ object -> integer or nil
5290 *
5291 * Returns the Integer index of the first substring that matches
5292 * the given +regexp+, or +nil+ if no match found:
5293 *
5294 * 'foo' =~ /f/ # => 0
5295 * 'foo' =~ /o/ # => 1
5296 * 'foo' =~ /x/ # => nil
5297 *
5298 * Note: also updates Regexp@Global+Variables.
5299 *
5300 * If the given +object+ is not a Regexp, returns the value
5301 * returned by <tt>object =~ self</tt>.
5302 *
5303 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5304 * (see Regexp#=~):
5305 *
5306 * number= nil
5307 * "no. 9" =~ /(?<number>\d+)/
5308 * number # => nil (not assigned)
5309 * /(?<number>\d+)/ =~ "no. 9"
5310 * number #=> "9"
5311 *
5312 */
5313
5314static VALUE
5315rb_str_match(VALUE x, VALUE y)
5316{
5317 switch (OBJ_BUILTIN_TYPE(y)) {
5318 case T_STRING:
5319 rb_raise(rb_eTypeError, "type mismatch: String given");
5320
5321 case T_REGEXP:
5322 return rb_reg_match(y, x);
5323
5324 default:
5325 return rb_funcall(y, idEqTilde, 1, x);
5326 }
5327}
5328
5329
5330static VALUE get_pat(VALUE);
5331
5332
5333/*
5334 * call-seq:
5335 * match(pattern, offset = 0) -> matchdata or nil
5336 * match(pattern, offset = 0) {|matchdata| ... } -> object
5337 *
5338 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5339 *
5340 * Note: also updates Regexp@Global+Variables.
5341 *
5342 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5343 * regexp = Regexp.new(pattern)
5344 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5345 * (see Regexp#match):
5346 * matchdata = regexp.match(self)
5347 *
5348 * With no block given, returns the computed +matchdata+:
5349 *
5350 * 'foo'.match('f') # => #<MatchData "f">
5351 * 'foo'.match('o') # => #<MatchData "o">
5352 * 'foo'.match('x') # => nil
5353 *
5354 * If Integer argument +offset+ is given, the search begins at index +offset+:
5355 *
5356 * 'foo'.match('f', 1) # => nil
5357 * 'foo'.match('o', 1) # => #<MatchData "o">
5358 *
5359 * With a block given, calls the block with the computed +matchdata+
5360 * and returns the block's return value:
5361 *
5362 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5363 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5364 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5365 *
5366 */
5367
5368static VALUE
5369rb_str_match_m(int argc, VALUE *argv, VALUE str)
5370{
5371 VALUE re, result;
5372 if (argc < 1)
5373 rb_check_arity(argc, 1, 2);
5374 re = argv[0];
5375 argv[0] = str;
5376 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5377 if (!NIL_P(result) && rb_block_given_p()) {
5378 return rb_yield(result);
5379 }
5380 return result;
5381}
5382
5383/*
5384 * call-seq:
5385 * match?(pattern, offset = 0) -> true or false
5386 *
5387 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5388 *
5389 * Note: does not update Regexp@Global+Variables.
5390 *
5391 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5392 * regexp = Regexp.new(pattern)
5393 *
5394 * Returns +true+ if <tt>self+.match(regexp)</tt> returns a MatchData object,
5395 * +false+ otherwise:
5396 *
5397 * 'foo'.match?(/o/) # => true
5398 * 'foo'.match?('o') # => true
5399 * 'foo'.match?(/x/) # => false
5400 *
5401 * If Integer argument +offset+ is given, the search begins at index +offset+:
5402 * 'foo'.match?('f', 1) # => false
5403 * 'foo'.match?('o', 1) # => true
5404 *
5405 */
5406
5407static VALUE
5408rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5409{
5410 VALUE re;
5411 rb_check_arity(argc, 1, 2);
5412 re = get_pat(argv[0]);
5413 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5414}
5415
5416enum neighbor_char {
5417 NEIGHBOR_NOT_CHAR,
5418 NEIGHBOR_FOUND,
5419 NEIGHBOR_WRAPPED
5420};
5421
5422static enum neighbor_char
5423enc_succ_char(char *p, long len, rb_encoding *enc)
5424{
5425 long i;
5426 int l;
5427
5428 if (rb_enc_mbminlen(enc) > 1) {
5429 /* wchar, trivial case */
5430 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5431 if (!MBCLEN_CHARFOUND_P(r)) {
5432 return NEIGHBOR_NOT_CHAR;
5433 }
5434 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5435 l = rb_enc_code_to_mbclen(c, enc);
5436 if (!l) return NEIGHBOR_NOT_CHAR;
5437 if (l != len) return NEIGHBOR_WRAPPED;
5438 rb_enc_mbcput(c, p, enc);
5439 r = rb_enc_precise_mbclen(p, p + len, enc);
5440 if (!MBCLEN_CHARFOUND_P(r)) {
5441 return NEIGHBOR_NOT_CHAR;
5442 }
5443 return NEIGHBOR_FOUND;
5444 }
5445 while (1) {
5446 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5447 p[i] = '\0';
5448 if (i < 0)
5449 return NEIGHBOR_WRAPPED;
5450 ++((unsigned char*)p)[i];
5451 l = rb_enc_precise_mbclen(p, p+len, enc);
5452 if (MBCLEN_CHARFOUND_P(l)) {
5453 l = MBCLEN_CHARFOUND_LEN(l);
5454 if (l == len) {
5455 return NEIGHBOR_FOUND;
5456 }
5457 else {
5458 memset(p+l, 0xff, len-l);
5459 }
5460 }
5461 if (MBCLEN_INVALID_P(l) && i < len-1) {
5462 long len2;
5463 int l2;
5464 for (len2 = len-1; 0 < len2; len2--) {
5465 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5466 if (!MBCLEN_INVALID_P(l2))
5467 break;
5468 }
5469 memset(p+len2+1, 0xff, len-(len2+1));
5470 }
5471 }
5472}
5473
5474static enum neighbor_char
5475enc_pred_char(char *p, long len, rb_encoding *enc)
5476{
5477 long i;
5478 int l;
5479 if (rb_enc_mbminlen(enc) > 1) {
5480 /* wchar, trivial case */
5481 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5482 if (!MBCLEN_CHARFOUND_P(r)) {
5483 return NEIGHBOR_NOT_CHAR;
5484 }
5485 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5486 if (!c) return NEIGHBOR_NOT_CHAR;
5487 --c;
5488 l = rb_enc_code_to_mbclen(c, enc);
5489 if (!l) return NEIGHBOR_NOT_CHAR;
5490 if (l != len) return NEIGHBOR_WRAPPED;
5491 rb_enc_mbcput(c, p, enc);
5492 r = rb_enc_precise_mbclen(p, p + len, enc);
5493 if (!MBCLEN_CHARFOUND_P(r)) {
5494 return NEIGHBOR_NOT_CHAR;
5495 }
5496 return NEIGHBOR_FOUND;
5497 }
5498 while (1) {
5499 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5500 p[i] = '\xff';
5501 if (i < 0)
5502 return NEIGHBOR_WRAPPED;
5503 --((unsigned char*)p)[i];
5504 l = rb_enc_precise_mbclen(p, p+len, enc);
5505 if (MBCLEN_CHARFOUND_P(l)) {
5506 l = MBCLEN_CHARFOUND_LEN(l);
5507 if (l == len) {
5508 return NEIGHBOR_FOUND;
5509 }
5510 else {
5511 memset(p+l, 0, len-l);
5512 }
5513 }
5514 if (MBCLEN_INVALID_P(l) && i < len-1) {
5515 long len2;
5516 int l2;
5517 for (len2 = len-1; 0 < len2; len2--) {
5518 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5519 if (!MBCLEN_INVALID_P(l2))
5520 break;
5521 }
5522 memset(p+len2+1, 0, len-(len2+1));
5523 }
5524 }
5525}
5526
5527/*
5528 overwrite +p+ by succeeding letter in +enc+ and returns
5529 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5530 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5531 assuming each ranges are successive, and mbclen
5532 never change in each ranges.
5533 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5534 character.
5535 */
5536static enum neighbor_char
5537enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5538{
5539 enum neighbor_char ret;
5540 unsigned int c;
5541 int ctype;
5542 int range;
5543 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5544
5545 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5546 int try;
5547 const int max_gaps = 1;
5548
5549 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5550 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5551 ctype = ONIGENC_CTYPE_DIGIT;
5552 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5553 ctype = ONIGENC_CTYPE_ALPHA;
5554 else
5555 return NEIGHBOR_NOT_CHAR;
5556
5557 MEMCPY(save, p, char, len);
5558 for (try = 0; try <= max_gaps; ++try) {
5559 ret = enc_succ_char(p, len, enc);
5560 if (ret == NEIGHBOR_FOUND) {
5561 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5562 if (rb_enc_isctype(c, ctype, enc))
5563 return NEIGHBOR_FOUND;
5564 }
5565 }
5566 MEMCPY(p, save, char, len);
5567 range = 1;
5568 while (1) {
5569 MEMCPY(save, p, char, len);
5570 ret = enc_pred_char(p, len, enc);
5571 if (ret == NEIGHBOR_FOUND) {
5572 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5573 if (!rb_enc_isctype(c, ctype, enc)) {
5574 MEMCPY(p, save, char, len);
5575 break;
5576 }
5577 }
5578 else {
5579 MEMCPY(p, save, char, len);
5580 break;
5581 }
5582 range++;
5583 }
5584 if (range == 1) {
5585 return NEIGHBOR_NOT_CHAR;
5586 }
5587
5588 if (ctype != ONIGENC_CTYPE_DIGIT) {
5589 MEMCPY(carry, p, char, len);
5590 return NEIGHBOR_WRAPPED;
5591 }
5592
5593 MEMCPY(carry, p, char, len);
5594 enc_succ_char(carry, len, enc);
5595 return NEIGHBOR_WRAPPED;
5596}
5597
5598
5599static VALUE str_succ(VALUE str);
5600
5601/*
5602 * call-seq:
5603 * succ -> new_str
5604 *
5605 * Returns the successor to +self+. The successor is calculated by
5606 * incrementing characters.
5607 *
5608 * The first character to be incremented is the rightmost alphanumeric,
5609 * or, if there are no alphanumerics, the rightmost character:
5610 *
5611 * 'THX1138'.succ # => "THX1139"
5612 * '<<koala>>'.succ # => "<<koalb>>"
5613 * '***'.succ # => '**+'
5614 *
5615 * The successor to a digit is another digit, "carrying" to the next-left
5616 * character for a "rollover" from 9 to 0, and prepending another digit
5617 * if necessary:
5618 *
5619 * '00'.succ # => "01"
5620 * '09'.succ # => "10"
5621 * '99'.succ # => "100"
5622 *
5623 * The successor to a letter is another letter of the same case,
5624 * carrying to the next-left character for a rollover,
5625 * and prepending another same-case letter if necessary:
5626 *
5627 * 'aa'.succ # => "ab"
5628 * 'az'.succ # => "ba"
5629 * 'zz'.succ # => "aaa"
5630 * 'AA'.succ # => "AB"
5631 * 'AZ'.succ # => "BA"
5632 * 'ZZ'.succ # => "AAA"
5633 *
5634 * The successor to a non-alphanumeric character is the next character
5635 * in the underlying character set's collating sequence,
5636 * carrying to the next-left character for a rollover,
5637 * and prepending another character if necessary:
5638 *
5639 * s = 0.chr * 3
5640 * s # => "\x00\x00\x00"
5641 * s.succ # => "\x00\x00\x01"
5642 * s = 255.chr * 3
5643 * s # => "\xFF\xFF\xFF"
5644 * s.succ # => "\x01\x00\x00\x00"
5645 *
5646 * Carrying can occur between and among mixtures of alphanumeric characters:
5647 *
5648 * s = 'zz99zz99'
5649 * s.succ # => "aaa00aa00"
5650 * s = '99zz99zz'
5651 * s.succ # => "100aa00aa"
5652 *
5653 * The successor to an empty +String+ is a new empty +String+:
5654 *
5655 * ''.succ # => ""
5656 *
5657 */
5658
5659VALUE
5661{
5662 VALUE str;
5663 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5664 rb_enc_cr_str_copy_for_substr(str, orig);
5665 return str_succ(str);
5666}
5667
5668static VALUE
5669str_succ(VALUE str)
5670{
5671 rb_encoding *enc;
5672 char *sbeg, *s, *e, *last_alnum = 0;
5673 int found_alnum = 0;
5674 long l, slen;
5675 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5676 long carry_pos = 0, carry_len = 1;
5677 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5678
5679 slen = RSTRING_LEN(str);
5680 if (slen == 0) return str;
5681
5682 enc = STR_ENC_GET(str);
5683 sbeg = RSTRING_PTR(str);
5684 s = e = sbeg + slen;
5685
5686 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5687 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5688 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5689 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5690 break;
5691 }
5692 }
5693 l = rb_enc_precise_mbclen(s, e, enc);
5694 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5695 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5696 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5697 switch (neighbor) {
5698 case NEIGHBOR_NOT_CHAR:
5699 continue;
5700 case NEIGHBOR_FOUND:
5701 return str;
5702 case NEIGHBOR_WRAPPED:
5703 last_alnum = s;
5704 break;
5705 }
5706 found_alnum = 1;
5707 carry_pos = s - sbeg;
5708 carry_len = l;
5709 }
5710 if (!found_alnum) { /* str contains no alnum */
5711 s = e;
5712 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5713 enum neighbor_char neighbor;
5714 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5715 l = rb_enc_precise_mbclen(s, e, enc);
5716 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5717 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5718 MEMCPY(tmp, s, char, l);
5719 neighbor = enc_succ_char(tmp, l, enc);
5720 switch (neighbor) {
5721 case NEIGHBOR_FOUND:
5722 MEMCPY(s, tmp, char, l);
5723 return str;
5724 break;
5725 case NEIGHBOR_WRAPPED:
5726 MEMCPY(s, tmp, char, l);
5727 break;
5728 case NEIGHBOR_NOT_CHAR:
5729 break;
5730 }
5731 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5732 /* wrapped to \0...\0. search next valid char. */
5733 enc_succ_char(s, l, enc);
5734 }
5735 if (!rb_enc_asciicompat(enc)) {
5736 MEMCPY(carry, s, char, l);
5737 carry_len = l;
5738 }
5739 carry_pos = s - sbeg;
5740 }
5742 }
5743 RESIZE_CAPA(str, slen + carry_len);
5744 sbeg = RSTRING_PTR(str);
5745 s = sbeg + carry_pos;
5746 memmove(s + carry_len, s, slen - carry_pos);
5747 memmove(s, carry, carry_len);
5748 slen += carry_len;
5749 STR_SET_LEN(str, slen);
5750 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5752 return str;
5753}
5754
5755
5756/*
5757 * call-seq:
5758 * succ! -> self
5759 *
5760 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5761 */
5762
5763static VALUE
5764rb_str_succ_bang(VALUE str)
5765{
5766 rb_str_modify(str);
5767 str_succ(str);
5768 return str;
5769}
5770
/*
 * Return 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT(), 0 otherwise.  A length of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5780
5781static int
5782str_upto_i(VALUE str, VALUE arg)
5783{
5784 rb_yield(str);
5785 return 0;
5786}
5787
5788/*
5789 * call-seq:
5790 * upto(other_string, exclusive = false) {|string| ... } -> self
5791 * upto(other_string, exclusive = false) -> new_enumerator
5792 *
5793 * With a block given, calls the block with each +String+ value
5794 * returned by successive calls to String#succ;
5795 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5796 * the sequence terminates when value +other_string+ is reached;
5797 * returns +self+:
5798 *
5799 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5800 * Output:
5801 *
5802 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5803 *
5804 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5805 *
5806 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5807 *
5808 * Output:
5809 *
5810 * a8 a9 b0 b1 b2 b3 b4 b5
5811 *
5812 * If +other_string+ would not be reached, does not call the block:
5813 *
5814 * '25'.upto('5') {|s| fail s }
5815 * 'aa'.upto('a') {|s| fail s }
5816 *
5817 * With no block given, returns a new Enumerator:
5818 *
5819 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5820 *
5821 */
5822
5823static VALUE
5824rb_str_upto(int argc, VALUE *argv, VALUE beg)
5825{
5826 VALUE end, exclusive;
5827
5828 rb_scan_args(argc, argv, "11", &end, &exclusive);
5829 RETURN_ENUMERATOR(beg, argc, argv);
5830 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5831}
5832
5833VALUE
5834rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5835{
5836 VALUE current, after_end;
5837 ID succ;
5838 int n, ascii;
5839 rb_encoding *enc;
5840
5841 CONST_ID(succ, "succ");
5842 StringValue(end);
5843 enc = rb_enc_check(beg, end);
5844 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5845 /* single character */
5846 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5847 char c = RSTRING_PTR(beg)[0];
5848 char e = RSTRING_PTR(end)[0];
5849
5850 if (c > e || (excl && c == e)) return beg;
5851 for (;;) {
5852 VALUE str = rb_enc_str_new(&c, 1, enc);
5854 if ((*each)(str, arg)) break;
5855 if (!excl && c == e) break;
5856 c++;
5857 if (excl && c == e) break;
5858 }
5859 return beg;
5860 }
5861 /* both edges are all digits */
5862 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5863 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5864 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5865 VALUE b, e;
5866 int width;
5867
5868 width = RSTRING_LENINT(beg);
5869 b = rb_str_to_inum(beg, 10, FALSE);
5870 e = rb_str_to_inum(end, 10, FALSE);
5871 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5872 long bi = FIX2LONG(b);
5873 long ei = FIX2LONG(e);
5874 rb_encoding *usascii = rb_usascii_encoding();
5875
5876 while (bi <= ei) {
5877 if (excl && bi == ei) break;
5878 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5879 bi++;
5880 }
5881 }
5882 else {
5883 ID op = excl ? '<' : idLE;
5884 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5885
5886 args[0] = INT2FIX(width);
5887 while (rb_funcall(b, op, 1, e)) {
5888 args[1] = b;
5889 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5890 b = rb_funcallv(b, succ, 0, 0);
5891 }
5892 }
5893 return beg;
5894 }
5895 /* normal case */
5896 n = rb_str_cmp(beg, end);
5897 if (n > 0 || (excl && n == 0)) return beg;
5898
5899 after_end = rb_funcallv(end, succ, 0, 0);
5900 current = str_duplicate(rb_cString, beg);
5901 while (!rb_str_equal(current, after_end)) {
5902 VALUE next = Qnil;
5903 if (excl || !rb_str_equal(current, end))
5904 next = rb_funcallv(current, succ, 0, 0);
5905 if ((*each)(current, arg)) break;
5906 if (NIL_P(next)) break;
5907 current = next;
5908 StringValue(current);
5909 if (excl && rb_str_equal(current, end)) break;
5910 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5911 break;
5912 }
5913
5914 return beg;
5915}
5916
5917VALUE
5918rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5919{
5920 VALUE current;
5921 ID succ;
5922
5923 CONST_ID(succ, "succ");
5924 /* both edges are all digits */
5925 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5926 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5927 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5928 int width = RSTRING_LENINT(beg);
5929 b = rb_str_to_inum(beg, 10, FALSE);
5930 if (FIXNUM_P(b)) {
5931 long bi = FIX2LONG(b);
5932 rb_encoding *usascii = rb_usascii_encoding();
5933
5934 while (FIXABLE(bi)) {
5935 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5936 bi++;
5937 }
5938 b = LONG2NUM(bi);
5939 }
5940 args[0] = INT2FIX(width);
5941 while (1) {
5942 args[1] = b;
5943 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5944 b = rb_funcallv(b, succ, 0, 0);
5945 }
5946 }
5947 /* normal case */
5948 current = str_duplicate(rb_cString, beg);
5949 while (1) {
5950 VALUE next = rb_funcallv(current, succ, 0, 0);
5951 if ((*each)(current, arg)) break;
5952 current = next;
5953 StringValue(current);
5954 if (RSTRING_LEN(current) == 0)
5955 break;
5956 }
5957
5958 return beg;
5959}
5960
5961static int
5962include_range_i(VALUE str, VALUE arg)
5963{
5964 VALUE *argp = (VALUE *)arg;
5965 if (!rb_equal(str, *argp)) return 0;
5966 *argp = Qnil;
5967 return 1;
5968}
5969
5970VALUE
5971rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5972{
5973 beg = rb_str_new_frozen(beg);
5974 StringValue(end);
5975 end = rb_str_new_frozen(end);
5976 if (NIL_P(val)) return Qfalse;
5977 val = rb_check_string_type(val);
5978 if (NIL_P(val)) return Qfalse;
5979 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5980 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5981 rb_enc_asciicompat(STR_ENC_GET(val))) {
5982 const char *bp = RSTRING_PTR(beg);
5983 const char *ep = RSTRING_PTR(end);
5984 const char *vp = RSTRING_PTR(val);
5985 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5986 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5987 return Qfalse;
5988 else {
5989 char b = *bp;
5990 char e = *ep;
5991 char v = *vp;
5992
5993 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5994 if (b <= v && v < e) return Qtrue;
5995 return RBOOL(!RTEST(exclusive) && v == e);
5996 }
5997 }
5998 }
5999#if 0
6000 /* both edges are all digits */
6001 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
6002 all_digits_p(bp, RSTRING_LEN(beg)) &&
6003 all_digits_p(ep, RSTRING_LEN(end))) {
6004 /* TODO */
6005 }
6006#endif
6007 }
6008 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
6009
6010 return RBOOL(NIL_P(val));
6011}
6012
6013static VALUE
6014rb_str_subpat(VALUE str, VALUE re, VALUE backref)
6015{
6016 if (rb_reg_search(re, str, 0, 0) >= 0) {
6017 VALUE match = rb_backref_get();
6018 int nth = rb_reg_backref_number(match, backref);
6019 return rb_reg_nth_match(nth, match);
6020 }
6021 return Qnil;
6022}
6023
6024static VALUE
6025rb_str_aref(VALUE str, VALUE indx)
6026{
6027 long idx;
6028
6029 if (FIXNUM_P(indx)) {
6030 idx = FIX2LONG(indx);
6031 }
6032 else if (RB_TYPE_P(indx, T_REGEXP)) {
6033 return rb_str_subpat(str, indx, INT2FIX(0));
6034 }
6035 else if (RB_TYPE_P(indx, T_STRING)) {
6036 if (rb_str_index(str, indx, 0) != -1)
6037 return str_duplicate(rb_cString, indx);
6038 return Qnil;
6039 }
6040 else {
6041 /* check if indx is Range */
6042 long beg, len = str_strlen(str, NULL);
6043 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6044 case Qfalse:
6045 break;
6046 case Qnil:
6047 return Qnil;
6048 default:
6049 return rb_str_substr(str, beg, len);
6050 }
6051 idx = NUM2LONG(indx);
6052 }
6053
6054 return str_substr(str, idx, 1, FALSE);
6055}
6056
6057
6058/*
6059 * call-seq:
6060 * string[index] -> new_string or nil
6061 * string[start, length] -> new_string or nil
6062 * string[range] -> new_string or nil
6063 * string[regexp, capture = 0] -> new_string or nil
6064 * string[substring] -> new_string or nil
6065 *
6066 * Returns the substring of +self+ specified by the arguments.
6067 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
6068 *
6069 *
6070 */
6071
6072static VALUE
6073rb_str_aref_m(int argc, VALUE *argv, VALUE str)
6074{
6075 if (argc == 2) {
6076 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6077 return rb_str_subpat(str, argv[0], argv[1]);
6078 }
6079 else {
6080 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
6081 }
6082 }
6083 rb_check_arity(argc, 1, 2);
6084 return rb_str_aref(str, argv[0]);
6085}
6086
6087VALUE
6089{
6090 char *ptr = RSTRING_PTR(str);
6091 long olen = RSTRING_LEN(str), nlen;
6092
6093 str_modifiable(str);
6094 if (len > olen) len = olen;
6095 nlen = olen - len;
6096 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
6097 char *oldptr = ptr;
6098 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
6099 STR_SET_EMBED(str);
6100 ptr = RSTRING(str)->as.embed.ary;
6101 memmove(ptr, oldptr + len, nlen);
6102 if (fl == STR_NOEMBED) xfree(oldptr);
6103 }
6104 else {
6105 if (!STR_SHARED_P(str)) {
6106 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
6107 rb_enc_cr_str_exact_copy(shared, str);
6108 OBJ_FREEZE(shared);
6109 }
6110 ptr = RSTRING(str)->as.heap.ptr += len;
6111 }
6112 STR_SET_LEN(str, nlen);
6113
6114 if (!SHARABLE_MIDDLE_SUBSTRING) {
6115 TERM_FILL(ptr + nlen, TERM_LEN(str));
6116 }
6118 return str;
6119}
6120
6121static void
6122rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
6123{
6124 char *sptr;
6125 long slen;
6126 int cr;
6127
6128 if (beg == 0 && vlen == 0) {
6129 rb_str_drop_bytes(str, len);
6130 return;
6131 }
6132
6133 str_modify_keep_cr(str);
6134 RSTRING_GETMEM(str, sptr, slen);
6135 if (len < vlen) {
6136 /* expand string */
6137 RESIZE_CAPA(str, slen + vlen - len);
6138 sptr = RSTRING_PTR(str);
6139 }
6140
6142 cr = rb_enc_str_coderange(val);
6143 else
6145
6146 if (vlen != len) {
6147 memmove(sptr + beg + vlen,
6148 sptr + beg + len,
6149 slen - (beg + len));
6150 }
6151 if (vlen < beg && len < 0) {
6152 MEMZERO(sptr + slen, char, -len);
6153 }
6154 if (vlen > 0) {
6155 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
6156 }
6157 slen += vlen - len;
6158 STR_SET_LEN(str, slen);
6159 TERM_FILL(&sptr[slen], TERM_LEN(str));
6160 ENC_CODERANGE_SET(str, cr);
6161}
6162
6163static inline void
6164rb_str_update_0(VALUE str, long beg, long len, VALUE val)
6165{
6166 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
6167}
6168
6169void
6170rb_str_update(VALUE str, long beg, long len, VALUE val)
6171{
6172 long slen;
6173 char *p, *e;
6174 rb_encoding *enc;
6175 int singlebyte = single_byte_optimizable(str);
6176 int cr;
6177
6178 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6179
6180 StringValue(val);
6181 enc = rb_enc_check(str, val);
6182 slen = str_strlen(str, enc); /* rb_enc_check */
6183
6184 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6185 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6186 }
6187 if (beg < 0) {
6188 beg += slen;
6189 }
6190 RUBY_ASSERT(beg >= 0);
6191 RUBY_ASSERT(beg <= slen);
6192
6193 if (len > slen - beg) {
6194 len = slen - beg;
6195 }
6196 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
6197 if (!p) p = RSTRING_END(str);
6198 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
6199 if (!e) e = RSTRING_END(str);
6200 /* error check */
6201 beg = p - RSTRING_PTR(str); /* physical position */
6202 len = e - p; /* physical length */
6203 rb_str_update_0(str, beg, len, val);
6204 rb_enc_associate(str, enc);
6206 if (cr != ENC_CODERANGE_BROKEN)
6207 ENC_CODERANGE_SET(str, cr);
6208}
6209
6210static void
6211rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
6212{
6213 int nth;
6214 VALUE match;
6215 long start, end, len;
6216 rb_encoding *enc;
6217 struct re_registers *regs;
6218
6219 if (rb_reg_search(re, str, 0, 0) < 0) {
6220 rb_raise(rb_eIndexError, "regexp not matched");
6221 }
6222 match = rb_backref_get();
6223 nth = rb_reg_backref_number(match, backref);
6224 regs = RMATCH_REGS(match);
6225 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
6226 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
6227 }
6228 if (nth < 0) {
6229 nth += regs->num_regs;
6230 }
6231
6232 start = BEG(nth);
6233 if (start == -1) {
6234 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
6235 }
6236 end = END(nth);
6237 len = end - start;
6238 StringValue(val);
6239 enc = rb_enc_check_str(str, val);
6240 rb_str_update_0(str, start, len, val);
6241 rb_enc_associate(str, enc);
6242}
6243
6244static VALUE
6245rb_str_aset(VALUE str, VALUE indx, VALUE val)
6246{
6247 long idx, beg;
6248
6249 switch (TYPE(indx)) {
6250 case T_REGEXP:
6251 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6252 return val;
6253
6254 case T_STRING:
6255 beg = rb_str_index(str, indx, 0);
6256 if (beg < 0) {
6257 rb_raise(rb_eIndexError, "string not matched");
6258 }
6259 beg = rb_str_sublen(str, beg);
6260 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6261 return val;
6262
6263 default:
6264 /* check if indx is Range */
6265 {
6266 long beg, len;
6267 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6268 rb_str_update(str, beg, len, val);
6269 return val;
6270 }
6271 }
6272 /* FALLTHROUGH */
6273
6274 case T_FIXNUM:
6275 idx = NUM2LONG(indx);
6276 rb_str_update(str, idx, 1, val);
6277 return val;
6278 }
6279}
6280
6281/*
6282 * call-seq:
6283 * string[index] = new_string
6284 * string[start, length] = new_string
6285 * string[range] = new_string
6286 * string[regexp, capture = 0] = new_string
6287 * string[substring] = new_string
6288 *
6289 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6290 * See {String Slices}[rdoc-ref:String@String+Slices].
6291 *
6292 * A few examples:
6293 *
6294 * s = 'foo'
6295 * s[2] = 'rtune' # => "rtune"
6296 * s # => "fortune"
6297 * s[1, 5] = 'init' # => "init"
6298 * s # => "finite"
6299 * s[3..4] = 'al' # => "al"
6300 * s # => "finale"
6301 * s[/e$/] = 'ly' # => "ly"
6302 * s # => "finally"
6303 * s['lly'] = 'ncial' # => "ncial"
6304 * s # => "financial"
6305 *
6306 */
6307
6308static VALUE
6309rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6310{
6311 if (argc == 3) {
6312 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6313 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6314 }
6315 else {
6316 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6317 }
6318 return argv[2];
6319 }
6320 rb_check_arity(argc, 2, 3);
6321 return rb_str_aset(str, argv[0], argv[1]);
6322}
6323
6324/*
6325 * call-seq:
6326 * insert(index, other_string) -> self
6327 *
6328 * Inserts the given +other_string+ into +self+; returns +self+.
6329 *
6330 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
6331 *
6332 * 'foo'.insert(1, 'bar') # => "fbaroo"
6333 *
6334 * If the Integer +index+ is negative, counts backward from the end of +self+
6335 * and inserts +other_string+ at offset <tt>index+1</tt>
6336 * (that is, _after_ <tt>self[index]</tt>):
6337 *
6338 * 'foo'.insert(-2, 'bar') # => "fobaro"
6339 *
6340 */
6341
6342static VALUE
6343rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6344{
6345 long pos = NUM2LONG(idx);
6346
6347 if (pos == -1) {
6348 return rb_str_append(str, str2);
6349 }
6350 else if (pos < 0) {
6351 pos++;
6352 }
6353 rb_str_update(str, pos, 0, str2);
6354 return str;
6355}
6356
6357
6358/*
6359 * call-seq:
6360 * slice!(index) -> new_string or nil
6361 * slice!(start, length) -> new_string or nil
6362 * slice!(range) -> new_string or nil
6363 * slice!(regexp, capture = 0) -> new_string or nil
6364 * slice!(substring) -> new_string or nil
6365 *
6366 * Removes and returns the substring of +self+ specified by the arguments.
6367 * See {String Slices}[rdoc-ref:String@String+Slices].
6368 *
6369 * A few examples:
6370 *
6371 * string = "This is a string"
6372 * string.slice!(2) #=> "i"
6373 * string.slice!(3..6) #=> " is "
6374 * string.slice!(/s.*t/) #=> "sa st"
6375 * string.slice!("r") #=> "r"
6376 * string #=> "Thing"
6377 *
6378 */
6379
6380static VALUE
6381rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6382{
6383 VALUE result = Qnil;
6384 VALUE indx;
6385 long beg, len = 1;
6386 char *p;
6387
6388 rb_check_arity(argc, 1, 2);
6389 str_modify_keep_cr(str);
6390 indx = argv[0];
6391 if (RB_TYPE_P(indx, T_REGEXP)) {
6392 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6393 VALUE match = rb_backref_get();
6394 struct re_registers *regs = RMATCH_REGS(match);
6395 int nth = 0;
6396 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6397 if ((nth += regs->num_regs) <= 0) return Qnil;
6398 }
6399 else if (nth >= regs->num_regs) return Qnil;
6400 beg = BEG(nth);
6401 len = END(nth) - beg;
6402 goto subseq;
6403 }
6404 else if (argc == 2) {
6405 beg = NUM2LONG(indx);
6406 len = NUM2LONG(argv[1]);
6407 goto num_index;
6408 }
6409 else if (FIXNUM_P(indx)) {
6410 beg = FIX2LONG(indx);
6411 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6412 if (!len) return Qnil;
6413 beg = p - RSTRING_PTR(str);
6414 goto subseq;
6415 }
6416 else if (RB_TYPE_P(indx, T_STRING)) {
6417 beg = rb_str_index(str, indx, 0);
6418 if (beg == -1) return Qnil;
6419 len = RSTRING_LEN(indx);
6420 result = str_duplicate(rb_cString, indx);
6421 goto squash;
6422 }
6423 else {
6424 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6425 case Qnil:
6426 return Qnil;
6427 case Qfalse:
6428 beg = NUM2LONG(indx);
6429 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6430 if (!len) return Qnil;
6431 beg = p - RSTRING_PTR(str);
6432 goto subseq;
6433 default:
6434 goto num_index;
6435 }
6436 }
6437
6438 num_index:
6439 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6440 beg = p - RSTRING_PTR(str);
6441
6442 subseq:
6443 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6444 rb_enc_cr_str_copy_for_substr(result, str);
6445
6446 squash:
6447 if (len > 0) {
6448 if (beg == 0) {
6449 rb_str_drop_bytes(str, len);
6450 }
6451 else {
6452 char *sptr = RSTRING_PTR(str);
6453 long slen = RSTRING_LEN(str);
6454 if (beg + len > slen) /* pathological check */
6455 len = slen - beg;
6456 memmove(sptr + beg,
6457 sptr + beg + len,
6458 slen - (beg + len));
6459 slen -= len;
6460 STR_SET_LEN(str, slen);
6461 TERM_FILL(&sptr[slen], TERM_LEN(str));
6462 }
6463 }
6464 return result;
6465}
6466
6467static VALUE
6468get_pat(VALUE pat)
6469{
6470 VALUE val;
6471
6472 switch (OBJ_BUILTIN_TYPE(pat)) {
6473 case T_REGEXP:
6474 return pat;
6475
6476 case T_STRING:
6477 break;
6478
6479 default:
6480 val = rb_check_string_type(pat);
6481 if (NIL_P(val)) {
6482 Check_Type(pat, T_REGEXP);
6483 }
6484 pat = val;
6485 }
6486
6487 return rb_reg_regcomp(pat);
6488}
6489
6490static VALUE
6491get_pat_quoted(VALUE pat, int check)
6492{
6493 VALUE val;
6494
6495 switch (OBJ_BUILTIN_TYPE(pat)) {
6496 case T_REGEXP:
6497 return pat;
6498
6499 case T_STRING:
6500 break;
6501
6502 default:
6503 val = rb_check_string_type(pat);
6504 if (NIL_P(val)) {
6505 Check_Type(pat, T_REGEXP);
6506 }
6507 pat = val;
6508 }
6509 if (check && is_broken_string(pat)) {
6510 rb_exc_raise(rb_reg_check_preprocess(pat));
6511 }
6512 return pat;
6513}
6514
6515static long
6516rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6517{
6518 if (BUILTIN_TYPE(pat) == T_STRING) {
6519 pos = rb_str_byteindex(str, pat, pos);
6520 if (set_backref_str) {
6521 if (pos >= 0) {
6522 str = rb_str_new_frozen_String(str);
6523 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6524 if (match) {
6525 *match = match_data;
6526 }
6527 }
6528 else {
6530 }
6531 }
6532 return pos;
6533 }
6534 else {
6535 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6536 }
6537}
6538
6539static long
6540rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6541{
6542 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6543}
6544
6545
6546/*
6547 * call-seq:
6548 * sub!(pattern, replacement) -> self or nil
6549 * sub!(pattern) {|match| ... } -> self or nil
6550 *
6551 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6552 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6553 *
6554 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6555 *
6556 * Related: String#sub, String#gsub, String#gsub!.
6557 *
6558 */
6559
6560static VALUE
6561rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6562{
6563 VALUE pat, repl, hash = Qnil;
6564 int iter = 0;
6565 long plen;
6566 int min_arity = rb_block_given_p() ? 1 : 2;
6567 long beg;
6568
6569 rb_check_arity(argc, min_arity, 2);
6570 if (argc == 1) {
6571 iter = 1;
6572 }
6573 else {
6574 repl = argv[1];
6575 hash = rb_check_hash_type(argv[1]);
6576 if (NIL_P(hash)) {
6577 StringValue(repl);
6578 }
6579 }
6580
6581 pat = get_pat_quoted(argv[0], 1);
6582
6583 str_modifiable(str);
6584 beg = rb_pat_search(pat, str, 0, 1);
6585 if (beg >= 0) {
6586 rb_encoding *enc;
6587 int cr = ENC_CODERANGE(str);
6588 long beg0, end0;
6589 VALUE match, match0 = Qnil;
6590 struct re_registers *regs;
6591 char *p, *rp;
6592 long len, rlen;
6593
6594 match = rb_backref_get();
6595 regs = RMATCH_REGS(match);
6596 if (RB_TYPE_P(pat, T_STRING)) {
6597 beg0 = beg;
6598 end0 = beg0 + RSTRING_LEN(pat);
6599 match0 = pat;
6600 }
6601 else {
6602 beg0 = BEG(0);
6603 end0 = END(0);
6604 if (iter) match0 = rb_reg_nth_match(0, match);
6605 }
6606
6607 if (iter || !NIL_P(hash)) {
6608 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6609
6610 if (iter) {
6611 repl = rb_obj_as_string(rb_yield(match0));
6612 }
6613 else {
6614 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6615 repl = rb_obj_as_string(repl);
6616 }
6617 str_mod_check(str, p, len);
6618 rb_check_frozen(str);
6619 }
6620 else {
6621 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6622 }
6623
6624 enc = rb_enc_compatible(str, repl);
6625 if (!enc) {
6626 rb_encoding *str_enc = STR_ENC_GET(str);
6627 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6628 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6629 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6630 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6631 rb_enc_inspect_name(str_enc),
6632 rb_enc_inspect_name(STR_ENC_GET(repl)));
6633 }
6634 enc = STR_ENC_GET(repl);
6635 }
6636 rb_str_modify(str);
6637 rb_enc_associate(str, enc);
6639 int cr2 = ENC_CODERANGE(repl);
6640 if (cr2 == ENC_CODERANGE_BROKEN ||
6641 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6643 else
6644 cr = cr2;
6645 }
6646 plen = end0 - beg0;
6647 rlen = RSTRING_LEN(repl);
6648 len = RSTRING_LEN(str);
6649 if (rlen > plen) {
6650 RESIZE_CAPA(str, len + rlen - plen);
6651 }
6652 p = RSTRING_PTR(str);
6653 if (rlen != plen) {
6654 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6655 }
6656 rp = RSTRING_PTR(repl);
6657 memmove(p + beg0, rp, rlen);
6658 len += rlen - plen;
6659 STR_SET_LEN(str, len);
6660 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6661 ENC_CODERANGE_SET(str, cr);
6662
6663 RB_GC_GUARD(match);
6664
6665 return str;
6666 }
6667 return Qnil;
6668}
6669
6670
6671/*
6672 * call-seq:
6673 * sub(pattern, replacement) -> new_string
6674 * sub(pattern) {|match| ... } -> new_string
6675 *
6676 * Returns a copy of +self+ with only the first occurrence
6677 * (not all occurrences) of the given +pattern+ replaced.
6678 *
6679 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6680 *
6681 * Related: String#sub!, String#gsub, String#gsub!.
6682 *
6683 */
6684
6685static VALUE
6686rb_str_sub(int argc, VALUE *argv, VALUE str)
6687{
6688 str = str_duplicate(rb_cString, str);
6689 rb_str_sub_bang(argc, argv, str);
6690 return str;
6691}
6692
6693static VALUE
6694str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6695{
6696 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6697 long beg, beg0, end0;
6698 long offset, blen, slen, len, last;
6699 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6700 char *sp, *cp;
6701 int need_backref_str = -1;
6702 rb_encoding *str_enc;
6703
6704 switch (argc) {
6705 case 1:
6706 RETURN_ENUMERATOR(str, argc, argv);
6707 mode = ITER;
6708 break;
6709 case 2:
6710 repl = argv[1];
6711 hash = rb_check_hash_type(argv[1]);
6712 if (NIL_P(hash)) {
6713 StringValue(repl);
6714 }
6715 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6716 mode = FAST_MAP;
6717 }
6718 else {
6719 mode = MAP;
6720 }
6721 break;
6722 default:
6723 rb_error_arity(argc, 1, 2);
6724 }
6725
6726 pat = get_pat_quoted(argv[0], 1);
6727 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6728
6729 if (beg < 0) {
6730 if (bang) return Qnil; /* no match, no substitution */
6731 return str_duplicate(rb_cString, str);
6732 }
6733
6734 offset = 0;
6735 blen = RSTRING_LEN(str) + 30; /* len + margin */
6736 dest = rb_str_buf_new(blen);
6737 sp = RSTRING_PTR(str);
6738 slen = RSTRING_LEN(str);
6739 cp = sp;
6740 str_enc = STR_ENC_GET(str);
6741 rb_enc_associate(dest, str_enc);
6742 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6743
6744 do {
6745 struct re_registers *regs = RMATCH_REGS(match);
6746 if (RB_TYPE_P(pat, T_STRING)) {
6747 beg0 = beg;
6748 end0 = beg0 + RSTRING_LEN(pat);
6749 match0 = pat;
6750 }
6751 else {
6752 beg0 = BEG(0);
6753 end0 = END(0);
6754 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6755 }
6756
6757 if (mode != STR) {
6758 if (mode == ITER) {
6759 val = rb_obj_as_string(rb_yield(match0));
6760 }
6761 else {
6762 struct RString fake_str;
6763 VALUE key;
6764 if (mode == FAST_MAP) {
6765 // It is safe to use a fake_str here because we established that it won't escape,
6766 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6767 // default proc.
6768 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6769 }
6770 else {
6771 key = rb_str_subseq(str, beg0, end0 - beg0);
6772 }
6773 val = rb_hash_aref(hash, key);
6774 val = rb_obj_as_string(val);
6775 }
6776 str_mod_check(str, sp, slen);
6777 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6778 rb_raise(rb_eRuntimeError, "block should not cheat");
6779 }
6780 }
6781 else if (need_backref_str) {
6782 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6783 if (need_backref_str < 0) {
6784 need_backref_str = val != repl;
6785 }
6786 }
6787 else {
6788 val = repl;
6789 }
6790
6791 len = beg0 - offset; /* copy pre-match substr */
6792 if (len) {
6793 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6794 }
6795
6796 rb_str_buf_append(dest, val);
6797
6798 last = offset;
6799 offset = end0;
6800 if (beg0 == end0) {
6801 /*
6802 * Always consume at least one character of the input string
6803 * in order to prevent infinite loops.
6804 */
6805 if (RSTRING_LEN(str) <= end0) break;
6806 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6807 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6808 offset = end0 + len;
6809 }
6810 cp = RSTRING_PTR(str) + offset;
6811 if (offset > RSTRING_LEN(str)) break;
6812
6813 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6814 if (mode != FAST_MAP && mode != STR) {
6815 match = Qnil;
6816 }
6817 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6818
6819 RB_GC_GUARD(match);
6820 } while (beg >= 0);
6821
6822 if (RSTRING_LEN(str) > offset) {
6823 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6824 }
6825 rb_pat_search0(pat, str, last, 1, &match);
6826 if (bang) {
6827 str_shared_replace(str, dest);
6828 }
6829 else {
6830 str = dest;
6831 }
6832
6833 return str;
6834}
6835
6836
6837/*
6838 * call-seq:
6839 * gsub!(pattern, replacement) -> self or nil
6840 * gsub!(pattern) {|match| ... } -> self or nil
6841 * gsub!(pattern) -> an_enumerator
6842 *
6843 * Performs the specified substring replacement(s) on +self+;
6844 * returns +self+ if any replacement occurred, +nil+ otherwise.
6845 *
6846 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6847 *
6848 * Returns an Enumerator if no +replacement+ and no block given.
6849 *
6850 * Related: String#sub, String#gsub, String#sub!.
6851 *
6852 */
6853
6854static VALUE
6855rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6856{
6857 str_modify_keep_cr(str);
6858 return str_gsub(argc, argv, str, 1);
6859}
6860
6861
6862/*
6863 * call-seq:
6864 * gsub(pattern, replacement) -> new_string
6865 * gsub(pattern) {|match| ... } -> new_string
6866 * gsub(pattern) -> enumerator
6867 *
6868 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6869 *
6870 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6871 *
6872 * Returns an Enumerator if no +replacement+ and no block given.
6873 *
6874 * Related: String#sub, String#sub!, String#gsub!.
6875 *
6876 */
6877
6878static VALUE
6879rb_str_gsub(int argc, VALUE *argv, VALUE str)
6880{
6881 return str_gsub(argc, argv, str, 0);
6882}
6883
6884
6885/*
6886 * call-seq:
6887 * replace(other_string) -> self
6888 *
6889 * Replaces the contents of +self+ with the contents of +other_string+:
6890 *
6891 * s = 'foo' # => "foo"
6892 * s.replace('bar') # => "bar"
6893 *
6894 */
6895
6896VALUE
6898{
6899 str_modifiable(str);
6900 if (str == str2) return str;
6901
6902 StringValue(str2);
6903 str_discard(str);
6904 return str_replace(str, str2);
6905}
6906
6907/*
6908 * call-seq:
6909 * clear -> self
6910 *
6911 * Removes the contents of +self+:
6912 *
6913 * s = 'foo' # => "foo"
6914 * s.clear # => ""
6915 *
6916 */
6917
6918static VALUE
6919rb_str_clear(VALUE str)
6920{
6921 str_discard(str);
6922 STR_SET_EMBED(str);
6923 STR_SET_LEN(str, 0);
6924 RSTRING_PTR(str)[0] = 0;
6925 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6927 else
6929 return str;
6930}
6931
6932/*
6933 * call-seq:
6934 * chr -> string
6935 *
6936 * Returns a string containing the first character of +self+:
6937 *
6938 * s = 'foo' # => "foo"
6939 * s.chr # => "f"
6940 *
6941 */
6942
6943static VALUE
6944rb_str_chr(VALUE str)
6945{
6946 return rb_str_substr(str, 0, 1);
6947}
6948
6949/*
6950 * call-seq:
6951 * getbyte(index) -> integer or nil
6952 *
6953 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6954 *
6955 * s = 'abcde' # => "abcde"
6956 * s.getbyte(0) # => 97
6957 * s.getbyte(-1) # => 101
6958 * s.getbyte(5) # => nil
6959 *
6960 * Related: String#setbyte.
6961 */
6962VALUE
6963rb_str_getbyte(VALUE str, VALUE index)
6964{
6965 long pos = NUM2LONG(index);
6966
6967 if (pos < 0)
6968 pos += RSTRING_LEN(str);
6969 if (pos < 0 || RSTRING_LEN(str) <= pos)
6970 return Qnil;
6971
6972 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6973}
6974
6975/*
6976 * call-seq:
6977 * setbyte(index, integer) -> integer
6978 *
6979 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6980 *
6981 * s = 'abcde' # => "abcde"
6982 * s.setbyte(0, 98) # => 98
6983 * s # => "bbcde"
6984 *
6985 * Related: String#getbyte.
6986 */
6987VALUE
6988rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6989{
6990 long pos = NUM2LONG(index);
6991 long len = RSTRING_LEN(str);
6992 char *ptr, *head, *left = 0;
6993 rb_encoding *enc;
6994 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6995
6996 if (pos < -len || len <= pos)
6997 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6998 if (pos < 0)
6999 pos += len;
7000
7001 VALUE v = rb_to_int(value);
7002 VALUE w = rb_int_and(v, INT2FIX(0xff));
7003 char byte = (char)(NUM2INT(w) & 0xFF);
7004
7005 if (!str_independent(str))
7006 str_make_independent(str);
7007 enc = STR_ENC_GET(str);
7008 head = RSTRING_PTR(str);
7009 ptr = &head[pos];
7010 if (!STR_EMBED_P(str)) {
7011 cr = ENC_CODERANGE(str);
7012 switch (cr) {
7013 case ENC_CODERANGE_7BIT:
7014 left = ptr;
7015 *ptr = byte;
7016 if (ISASCII(byte)) goto end;
7017 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7018 if (!MBCLEN_CHARFOUND_P(nlen))
7020 else
7022 goto end;
7024 left = rb_enc_left_char_head(head, ptr, head+len, enc);
7025 width = rb_enc_precise_mbclen(left, head+len, enc);
7026 *ptr = byte;
7027 nlen = rb_enc_precise_mbclen(left, head+len, enc);
7028 if (!MBCLEN_CHARFOUND_P(nlen))
7030 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
7032 goto end;
7033 }
7034 }
7036 *ptr = byte;
7037
7038 end:
7039 return value;
7040}
7041
7042static VALUE
7043str_byte_substr(VALUE str, long beg, long len, int empty)
7044{
7045 long n = RSTRING_LEN(str);
7046
7047 if (beg > n || len < 0) return Qnil;
7048 if (beg < 0) {
7049 beg += n;
7050 if (beg < 0) return Qnil;
7051 }
7052 if (len > n - beg)
7053 len = n - beg;
7054 if (len <= 0) {
7055 if (!empty) return Qnil;
7056 len = 0;
7057 }
7058
7059 VALUE str2 = str_subseq(str, beg, len);
7060
7061 str_enc_copy_direct(str2, str);
7062
7063 if (RSTRING_LEN(str2) == 0) {
7064 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
7066 else
7068 }
7069 else {
7070 switch (ENC_CODERANGE(str)) {
7071 case ENC_CODERANGE_7BIT:
7073 break;
7074 default:
7076 break;
7077 }
7078 }
7079
7080 return str2;
7081}
7082
7083VALUE
7084rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
7085{
7086 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
7087}
7088
7089static VALUE
7090str_byte_aref(VALUE str, VALUE indx)
7091{
7092 long idx;
7093 if (FIXNUM_P(indx)) {
7094 idx = FIX2LONG(indx);
7095 }
7096 else {
7097 /* check if indx is Range */
7098 long beg, len = RSTRING_LEN(str);
7099
7100 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
7101 case Qfalse:
7102 break;
7103 case Qnil:
7104 return Qnil;
7105 default:
7106 return str_byte_substr(str, beg, len, TRUE);
7107 }
7108
7109 idx = NUM2LONG(indx);
7110 }
7111 return str_byte_substr(str, idx, 1, FALSE);
7112}
7113
7114/*
7115 * call-seq:
7116 * byteslice(index, length = 1) -> string or nil
7117 * byteslice(range) -> string or nil
7118 *
7119 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
7120 *
7121 * With integer arguments +index+ and +length+ given,
7122 * returns the substring beginning at the given +index+
7123 * of the given +length+ (if possible),
7124 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
7125 *
7126 * s = '0123456789' # => "0123456789"
7127 * s.byteslice(2) # => "2"
7128 * s.byteslice(200) # => nil
7129 * s.byteslice(4, 3) # => "456"
7130 * s.byteslice(4, 30) # => "456789"
7131 * s.byteslice(4, -1) # => nil
7132 * s.byteslice(40, 2) # => nil
7133 *
7134 * In either case above, counts backwards from the end of +self+
7135 * if +index+ is negative:
7136 *
7137 * s = '0123456789' # => "0123456789"
7138 * s.byteslice(-4) # => "6"
7139 * s.byteslice(-4, 3) # => "678"
7140 *
7141 * With Range argument +range+ given, returns
7142 * <tt>byteslice(range.begin, range.size)</tt>:
7143 *
7144 * s = '0123456789' # => "0123456789"
7145 * s.byteslice(4..6) # => "456"
7146 * s.byteslice(-6..-4) # => "456"
7147 * s.byteslice(5..2) # => "" # range.size is zero.
7148 * s.byteslice(40..42) # => nil
7149 *
7150 * In all cases, a returned string has the same encoding as +self+:
7151 *
7152 * s.encoding # => #<Encoding:UTF-8>
7153 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
7154 *
7155 */
7156
7157static VALUE
7158rb_str_byteslice(int argc, VALUE *argv, VALUE str)
7159{
7160 if (argc == 2) {
7161 long beg = NUM2LONG(argv[0]);
7162 long len = NUM2LONG(argv[1]);
7163 return str_byte_substr(str, beg, len, TRUE);
7164 }
7165 rb_check_arity(argc, 1, 2);
7166 return str_byte_aref(str, argv[0]);
7167}
7168
7169static void
7170str_check_beg_len(VALUE str, long *beg, long *len)
7171{
7172 long end, slen = RSTRING_LEN(str);
7173
7174 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
7175 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
7176 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
7177 }
7178 if (*beg < 0) {
7179 *beg += slen;
7180 }
7181 RUBY_ASSERT(*beg >= 0);
7182 RUBY_ASSERT(*beg <= slen);
7183
7184 if (*len > slen - *beg) {
7185 *len = slen - *beg;
7186 }
7187 end = *beg + *len;
7188 str_ensure_byte_pos(str, *beg);
7189 str_ensure_byte_pos(str, end);
7190}
7191
7192/*
7193 * call-seq:
7194 * bytesplice(index, length, str) -> string
7195 * bytesplice(index, length, str, str_index, str_length) -> string
7196 * bytesplice(range, str) -> string
7197 * bytesplice(range, str, str_range) -> string
7198 *
7199 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
7200 * The portion of the string affected is determined using
7201 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
7202 * If the replacement string is not the same length as the text it is replacing,
7203 * the string will be adjusted accordingly.
7204 *
7205 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
7206 *
7207 * The forms that take an Integer raise an IndexError if the value is out
7208 * of range; the forms that take a Range raise a RangeError.
7209 * If the beginning or ending offset does not land on character (codepoint)
7210 * boundary, an IndexError will be raised.
7211 */
7212
7213static VALUE
7214rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
7215{
7216 long beg, len, vbeg, vlen;
7217 VALUE val;
7218 int cr;
7219
7220 rb_check_arity(argc, 2, 5);
7221 if (!(argc == 2 || argc == 3 || argc == 5)) {
7222 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
7223 }
7224 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
7225 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
7226 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7227 rb_builtin_class_name(argv[0]));
7228 }
7229 val = argv[1];
7230 StringValue(val);
7231 if (argc == 2) {
7232 /* bytesplice(range, str) */
7233 vbeg = 0;
7234 vlen = RSTRING_LEN(val);
7235 }
7236 else {
7237 /* bytesplice(range, str, str_range) */
7238 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
7239 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
7240 rb_builtin_class_name(argv[2]));
7241 }
7242 }
7243 }
7244 else {
7245 beg = NUM2LONG(argv[0]);
7246 len = NUM2LONG(argv[1]);
7247 val = argv[2];
7248 StringValue(val);
7249 if (argc == 3) {
7250 /* bytesplice(index, length, str) */
7251 vbeg = 0;
7252 vlen = RSTRING_LEN(val);
7253 }
7254 else {
7255 /* bytesplice(index, length, str, str_index, str_length) */
7256 vbeg = NUM2LONG(argv[3]);
7257 vlen = NUM2LONG(argv[4]);
7258 }
7259 }
7260 str_check_beg_len(str, &beg, &len);
7261 str_check_beg_len(val, &vbeg, &vlen);
7262 str_modify_keep_cr(str);
7263
7264 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
7265 rb_enc_associate(str, rb_enc_check(str, val));
7266 }
7267
7268 rb_str_update_1(str, beg, len, val, vbeg, vlen);
7270 if (cr != ENC_CODERANGE_BROKEN)
7271 ENC_CODERANGE_SET(str, cr);
7272 return str;
7273}
7274
7275/*
7276 * call-seq:
7277 * reverse -> string
7278 *
7279 * Returns a new string with the characters from +self+ in reverse order.
7280 *
7281 * 'stressed'.reverse # => "desserts"
7282 *
7283 */
7284
7285static VALUE
7286rb_str_reverse(VALUE str)
7287{
7288 rb_encoding *enc;
7289 VALUE rev;
7290 char *s, *e, *p;
7291 int cr;
7292
7293 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7294 enc = STR_ENC_GET(str);
7295 rev = rb_str_new(0, RSTRING_LEN(str));
7296 s = RSTRING_PTR(str); e = RSTRING_END(str);
7297 p = RSTRING_END(rev);
7298 cr = ENC_CODERANGE(str);
7299
7300 if (RSTRING_LEN(str) > 1) {
7301 if (single_byte_optimizable(str)) {
7302 while (s < e) {
7303 *--p = *s++;
7304 }
7305 }
7306 else if (cr == ENC_CODERANGE_VALID) {
7307 while (s < e) {
7308 int clen = rb_enc_fast_mbclen(s, e, enc);
7309
7310 p -= clen;
7311 memcpy(p, s, clen);
7312 s += clen;
7313 }
7314 }
7315 else {
7316 cr = rb_enc_asciicompat(enc) ?
7318 while (s < e) {
7319 int clen = rb_enc_mbclen(s, e, enc);
7320
7321 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7322 p -= clen;
7323 memcpy(p, s, clen);
7324 s += clen;
7325 }
7326 }
7327 }
7328 STR_SET_LEN(rev, RSTRING_LEN(str));
7329 str_enc_copy_direct(rev, str);
7330 ENC_CODERANGE_SET(rev, cr);
7331
7332 return rev;
7333}
7334
7335
7336/*
7337 * call-seq:
7338 * reverse! -> self
7339 *
7340 * Returns +self+ with its characters reversed:
7341 *
7342 * s = 'stressed'
7343 * s.reverse! # => "desserts"
7344 * s # => "desserts"
7345 *
7346 */
7347
7348static VALUE
7349rb_str_reverse_bang(VALUE str)
7350{
7351 if (RSTRING_LEN(str) > 1) {
7352 if (single_byte_optimizable(str)) {
7353 char *s, *e, c;
7354
7355 str_modify_keep_cr(str);
7356 s = RSTRING_PTR(str);
7357 e = RSTRING_END(str) - 1;
7358 while (s < e) {
7359 c = *s;
7360 *s++ = *e;
7361 *e-- = c;
7362 }
7363 }
7364 else {
7365 str_shared_replace(str, rb_str_reverse(str));
7366 }
7367 }
7368 else {
7369 str_modify_keep_cr(str);
7370 }
7371 return str;
7372}
7373
7374
7375/*
7376 * call-seq:
7377 * include?(other_string) -> true or false
7378 *
7379 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7380 *
7381 * s = 'foo'
7382 * s.include?('f') # => true
7383 * s.include?('fo') # => true
7384 * s.include?('food') # => false
7385 *
7386 */
7387
7388VALUE
7389rb_str_include(VALUE str, VALUE arg)
7390{
7391 long i;
7392
7393 StringValue(arg);
7394 i = rb_str_index(str, arg, 0);
7395
7396 return RBOOL(i != -1);
7397}
7398
7399
7400/*
7401 * call-seq:
7402 * to_i(base = 10) -> integer
7403 *
7404 * Returns the result of interpreting leading characters in +self+
7405 * as an integer in the given +base+ (which must be in (0, 2..36)):
7406 *
7407 * '123456'.to_i # => 123456
7408 * '123def'.to_i(16) # => 1195503
7409 *
7410 * With +base+ zero, string +object+ may contain leading characters
7411 * to specify the actual base:
7412 *
7413 * '123def'.to_i(0) # => 123
7414 * '0123def'.to_i(0) # => 83
7415 * '0b123def'.to_i(0) # => 1
7416 * '0o123def'.to_i(0) # => 83
7417 * '0d123def'.to_i(0) # => 123
7418 * '0x123def'.to_i(0) # => 1195503
7419 *
7420 * Characters past a leading valid number (in the given +base+) are ignored:
7421 *
7422 * '12.345'.to_i # => 12
7423 * '12345'.to_i(2) # => 1
7424 *
7425 * Returns zero if there is no leading valid number:
7426 *
7427 * 'abcdef'.to_i # => 0
7428 * '2'.to_i(2) # => 0
7429 *
7430 */
7431
7432static VALUE
7433rb_str_to_i(int argc, VALUE *argv, VALUE str)
7434{
7435 int base = 10;
7436
7437 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7438 rb_raise(rb_eArgError, "invalid radix %d", base);
7439 }
7440 return rb_str_to_inum(str, base, FALSE);
7441}
7442
7443
7444/*
7445 * call-seq:
7446 * to_f -> float
7447 *
7448 * Returns the result of interpreting leading characters in +self+ as a Float:
7449 *
7450 * '3.14159'.to_f # => 3.14159
7451 * '1.234e-2'.to_f # => 0.01234
7452 *
 * Characters past a leading valid number are ignored:
7454 *
7455 * '3.14 (pi to two places)'.to_f # => 3.14
7456 *
7457 * Returns zero if there is no leading valid number:
7458 *
7459 * 'abcdef'.to_f # => 0.0
7460 *
7461 */
7462
7463static VALUE
7464rb_str_to_f(VALUE str)
7465{
7466 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7467}
7468
7469
7470/*
7471 * call-seq:
7472 * to_s -> self or string
7473 *
7474 * Returns +self+ if +self+ is a +String+,
7475 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7476 */
7477
7478static VALUE
7479rb_str_to_s(VALUE str)
7480{
7481 if (rb_obj_class(str) != rb_cString) {
7482 return str_duplicate(rb_cString, str);
7483 }
7484 return str;
7485}
7486
#if 0
/* Compiled out.  Encodes codepoint c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7498
7499#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7500
7501int
7502rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7503{
7504 char buf[CHAR_ESC_LEN + 1];
7505 int l;
7506
7507#if SIZEOF_INT > 4
7508 c &= 0xffffffff;
7509#endif
7510 if (unicode_p) {
7511 if (c < 0x7F && ISPRINT(c)) {
7512 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7513 }
7514 else if (c < 0x10000) {
7515 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7516 }
7517 else {
7518 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7519 }
7520 }
7521 else {
7522 if (c < 0x100) {
7523 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7524 }
7525 else {
7526 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7527 }
7528 }
7529 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7530 rb_str_buf_cat(result, buf, l);
7531 return l;
7532}
7533
/*
 * Returns the backslash escape used for character c (for example "\\n" for
 * a newline), or NULL when c has no such named escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *escape;
    } named_escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(named_escapes) / sizeof(named_escapes[0]); i++) {
        if (named_escapes[i].ch == c) return named_escapes[i].escape;
    }
    return NULL;
}
7551
7552VALUE
7553rb_str_escape(VALUE str)
7554{
7555 int encidx = ENCODING_GET(str);
7556 rb_encoding *enc = rb_enc_from_index(encidx);
7557 const char *p = RSTRING_PTR(str);
7558 const char *pend = RSTRING_END(str);
7559 const char *prev = p;
7560 char buf[CHAR_ESC_LEN + 1];
7561 VALUE result = rb_str_buf_new(0);
7562 int unicode_p = rb_enc_unicode_p(enc);
7563 int asciicompat = rb_enc_asciicompat(enc);
7564
7565 while (p < pend) {
7566 unsigned int c;
7567 const char *cc;
7568 int n = rb_enc_precise_mbclen(p, pend, enc);
7569 if (!MBCLEN_CHARFOUND_P(n)) {
7570 if (p > prev) str_buf_cat(result, prev, p - prev);
7571 n = rb_enc_mbminlen(enc);
7572 if (pend < p + n)
7573 n = (int)(pend - p);
7574 while (n--) {
7575 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7576 str_buf_cat(result, buf, strlen(buf));
7577 prev = ++p;
7578 }
7579 continue;
7580 }
7581 n = MBCLEN_CHARFOUND_LEN(n);
7582 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7583 p += n;
7584 cc = ruby_escaped_char(c);
7585 if (cc) {
7586 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7587 str_buf_cat(result, cc, strlen(cc));
7588 prev = p;
7589 }
7590 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7591 }
7592 else {
7593 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7594 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7595 prev = p;
7596 }
7597 }
7598 if (p > prev) str_buf_cat(result, prev, p - prev);
7599 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7600
7601 return result;
7602}
7603
7604/*
7605 * call-seq:
7606 * inspect -> string
7607 *
7608 * Returns a printable version of +self+, enclosed in double-quotes,
7609 * and with special characters escaped:
7610 *
7611 * s = "foo\tbar\tbaz\n"
7612 * s.inspect
7613 * # => "\"foo\\tbar\\tbaz\\n\""
7614 *
7615 */
7616
7617VALUE
7619{
7620 int encidx = ENCODING_GET(str);
7621 rb_encoding *enc = rb_enc_from_index(encidx);
7622 const char *p, *pend, *prev;
7623 char buf[CHAR_ESC_LEN + 1];
7624 VALUE result = rb_str_buf_new(0);
7625 rb_encoding *resenc = rb_default_internal_encoding();
7626 int unicode_p = rb_enc_unicode_p(enc);
7627 int asciicompat = rb_enc_asciicompat(enc);
7628
7629 if (resenc == NULL) resenc = rb_default_external_encoding();
7630 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7631 rb_enc_associate(result, resenc);
7632 str_buf_cat2(result, "\"");
7633
7634 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7635 prev = p;
7636 while (p < pend) {
7637 unsigned int c, cc;
7638 int n;
7639
7640 n = rb_enc_precise_mbclen(p, pend, enc);
7641 if (!MBCLEN_CHARFOUND_P(n)) {
7642 if (p > prev) str_buf_cat(result, prev, p - prev);
7643 n = rb_enc_mbminlen(enc);
7644 if (pend < p + n)
7645 n = (int)(pend - p);
7646 while (n--) {
7647 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7648 str_buf_cat(result, buf, strlen(buf));
7649 prev = ++p;
7650 }
7651 continue;
7652 }
7653 n = MBCLEN_CHARFOUND_LEN(n);
7654 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7655 p += n;
7656 if ((asciicompat || unicode_p) &&
7657 (c == '"'|| c == '\\' ||
7658 (c == '#' &&
7659 p < pend &&
7660 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7661 (cc = rb_enc_codepoint(p,pend,enc),
7662 (cc == '$' || cc == '@' || cc == '{'))))) {
7663 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7664 str_buf_cat2(result, "\\");
7665 if (asciicompat || enc == resenc) {
7666 prev = p - n;
7667 continue;
7668 }
7669 }
7670 switch (c) {
7671 case '\n': cc = 'n'; break;
7672 case '\r': cc = 'r'; break;
7673 case '\t': cc = 't'; break;
7674 case '\f': cc = 'f'; break;
7675 case '\013': cc = 'v'; break;
7676 case '\010': cc = 'b'; break;
7677 case '\007': cc = 'a'; break;
7678 case 033: cc = 'e'; break;
7679 default: cc = 0; break;
7680 }
7681 if (cc) {
7682 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7683 buf[0] = '\\';
7684 buf[1] = (char)cc;
7685 str_buf_cat(result, buf, 2);
7686 prev = p;
7687 continue;
7688 }
7689 /* The special casing of 0x85 (NEXT_LINE) here is because
7690 * Oniguruma historically treats it as printable, but it
7691 * doesn't match the print POSIX bracket class or character
7692 * property in regexps.
7693 *
7694 * See Ruby Bug #16842 for details:
7695 * https://bugs.ruby-lang.org/issues/16842
7696 */
7697 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7698 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7699 continue;
7700 }
7701 else {
7702 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7703 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7704 prev = p;
7705 continue;
7706 }
7707 }
7708 if (p > prev) str_buf_cat(result, prev, p - prev);
7709 str_buf_cat2(result, "\"");
7710
7711 return result;
7712}
7713
/* Non-zero when p lies before e and points at '$', '@' or '{'. */
#define IS_EVSTR(p,e) ((p) < (e) ? (*(p) == '$' || *(p) == '@' || *(p) == '{') : 0)
7715
7716/*
7717 * call-seq:
7718 * dump -> string
7719 *
7720 * Returns a printable version of +self+, enclosed in double-quotes,
7721 * with special characters escaped, and with non-printing characters
7722 * replaced by hexadecimal notation:
7723 *
7724 * "hello \n ''".dump # => "\"hello \\n ''\""
7725 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7726 *
7727 * Related: String#undump (inverse of String#dump).
7728 *
7729 */
7730
7731VALUE
7733{
7734 int encidx = rb_enc_get_index(str);
7735 rb_encoding *enc = rb_enc_from_index(encidx);
7736 long len;
7737 const char *p, *pend;
7738 char *q, *qend;
7739 VALUE result;
7740 int u8 = (encidx == rb_utf8_encindex());
7741 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7742
7743 len = 2; /* "" */
7744 if (!rb_enc_asciicompat(enc)) {
7745 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7746 len += strlen(enc->name);
7747 }
7748
7749 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7750 while (p < pend) {
7751 int clen;
7752 unsigned char c = *p++;
7753
7754 switch (c) {
7755 case '"': case '\\':
7756 case '\n': case '\r':
7757 case '\t': case '\f':
7758 case '\013': case '\010': case '\007': case '\033':
7759 clen = 2;
7760 break;
7761
7762 case '#':
7763 clen = IS_EVSTR(p, pend) ? 2 : 1;
7764 break;
7765
7766 default:
7767 if (ISPRINT(c)) {
7768 clen = 1;
7769 }
7770 else {
7771 if (u8 && c > 0x7F) { /* \u notation */
7772 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7773 if (MBCLEN_CHARFOUND_P(n)) {
7774 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7775 if (cc <= 0xFFFF)
7776 clen = 6; /* \uXXXX */
7777 else if (cc <= 0xFFFFF)
7778 clen = 9; /* \u{XXXXX} */
7779 else
7780 clen = 10; /* \u{XXXXXX} */
7781 p += MBCLEN_CHARFOUND_LEN(n)-1;
7782 break;
7783 }
7784 }
7785 clen = 4; /* \xNN */
7786 }
7787 break;
7788 }
7789
7790 if (clen > LONG_MAX - len) {
7791 rb_raise(rb_eRuntimeError, "string size too big");
7792 }
7793 len += clen;
7794 }
7795
7796 result = rb_str_new(0, len);
7797 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7798 q = RSTRING_PTR(result); qend = q + len + 1;
7799
7800 *q++ = '"';
7801 while (p < pend) {
7802 unsigned char c = *p++;
7803
7804 if (c == '"' || c == '\\') {
7805 *q++ = '\\';
7806 *q++ = c;
7807 }
7808 else if (c == '#') {
7809 if (IS_EVSTR(p, pend)) *q++ = '\\';
7810 *q++ = '#';
7811 }
7812 else if (c == '\n') {
7813 *q++ = '\\';
7814 *q++ = 'n';
7815 }
7816 else if (c == '\r') {
7817 *q++ = '\\';
7818 *q++ = 'r';
7819 }
7820 else if (c == '\t') {
7821 *q++ = '\\';
7822 *q++ = 't';
7823 }
7824 else if (c == '\f') {
7825 *q++ = '\\';
7826 *q++ = 'f';
7827 }
7828 else if (c == '\013') {
7829 *q++ = '\\';
7830 *q++ = 'v';
7831 }
7832 else if (c == '\010') {
7833 *q++ = '\\';
7834 *q++ = 'b';
7835 }
7836 else if (c == '\007') {
7837 *q++ = '\\';
7838 *q++ = 'a';
7839 }
7840 else if (c == '\033') {
7841 *q++ = '\\';
7842 *q++ = 'e';
7843 }
7844 else if (ISPRINT(c)) {
7845 *q++ = c;
7846 }
7847 else {
7848 *q++ = '\\';
7849 if (u8) {
7850 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7851 if (MBCLEN_CHARFOUND_P(n)) {
7852 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7853 p += n;
7854 if (cc <= 0xFFFF)
7855 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7856 else
7857 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7858 q += strlen(q);
7859 continue;
7860 }
7861 }
7862 snprintf(q, qend-q, "x%02X", c);
7863 q += 3;
7864 }
7865 }
7866 *q++ = '"';
7867 *q = '\0';
7868 if (!rb_enc_asciicompat(enc)) {
7869 snprintf(q, qend-q, nonascii_suffix, enc->name);
7870 encidx = rb_ascii8bit_encindex();
7871 }
7872 /* result from dump is ASCII */
7873 rb_enc_associate_index(result, encidx);
7875 return result;
7876}
7877
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it stands for ('n' => newline, 'e' => ESC, ...).
 * Returns -1 for any other value; the call in undump_after_backslash only
 * passes one of the eight letters handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* every path must return a value */
        return -1;
    }
}
7901
7902static void
7903undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7904{
7905 const char *s = *ss;
7906 unsigned int c;
7907 int codelen;
7908 size_t hexlen;
7909 unsigned char buf[6];
7910 static rb_encoding *enc_utf8 = NULL;
7911
7912 switch (*s) {
7913 case '\\':
7914 case '"':
7915 case '#':
7916 rb_str_cat(undumped, s, 1); /* cat itself */
7917 s++;
7918 break;
7919 case 'n':
7920 case 'r':
7921 case 't':
7922 case 'f':
7923 case 'v':
7924 case 'b':
7925 case 'a':
7926 case 'e':
7927 *buf = unescape_ascii(*s);
7928 rb_str_cat(undumped, (char *)buf, 1);
7929 s++;
7930 break;
7931 case 'u':
7932 if (*binary) {
7933 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7934 }
7935 *utf8 = true;
7936 if (++s >= s_end) {
7937 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7938 }
7939 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7940 if (*penc != enc_utf8) {
7941 *penc = enc_utf8;
7942 rb_enc_associate(undumped, enc_utf8);
7943 }
7944 if (*s == '{') { /* handle \u{...} form */
7945 s++;
7946 for (;;) {
7947 if (s >= s_end) {
7948 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7949 }
7950 if (*s == '}') {
7951 s++;
7952 break;
7953 }
7954 if (ISSPACE(*s)) {
7955 s++;
7956 continue;
7957 }
7958 c = scan_hex(s, s_end-s, &hexlen);
7959 if (hexlen == 0 || hexlen > 6) {
7960 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7961 }
7962 if (c > 0x10ffff) {
7963 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7964 }
7965 if (0xd800 <= c && c <= 0xdfff) {
7966 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7967 }
7968 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7969 rb_str_cat(undumped, (char *)buf, codelen);
7970 s += hexlen;
7971 }
7972 }
7973 else { /* handle \uXXXX form */
7974 c = scan_hex(s, 4, &hexlen);
7975 if (hexlen != 4) {
7976 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7977 }
7978 if (0xd800 <= c && c <= 0xdfff) {
7979 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7980 }
7981 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7982 rb_str_cat(undumped, (char *)buf, codelen);
7983 s += hexlen;
7984 }
7985 break;
7986 case 'x':
7987 if (*utf8) {
7988 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7989 }
7990 *binary = true;
7991 if (++s >= s_end) {
7992 rb_raise(rb_eRuntimeError, "invalid hex escape");
7993 }
7994 *buf = scan_hex(s, 2, &hexlen);
7995 if (hexlen != 2) {
7996 rb_raise(rb_eRuntimeError, "invalid hex escape");
7997 }
7998 rb_str_cat(undumped, (char *)buf, 1);
7999 s += hexlen;
8000 break;
8001 default:
8002 rb_str_cat(undumped, s-1, 2);
8003 s++;
8004 }
8005
8006 *ss = s;
8007}
8008
8009static VALUE rb_str_is_ascii_only_p(VALUE str);
8010
8011/*
8012 * call-seq:
8013 * undump -> string
8014 *
8015 * Returns an unescaped version of +self+:
8016 *
8017 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
8018 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
8019 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
8020 * s_undumped == s_orig # => true
8021 *
8022 * Related: String#dump (inverse of String#undump).
8023 *
8024 */
8025
8026static VALUE
8027str_undump(VALUE str)
8028{
8029 const char *s = RSTRING_PTR(str);
8030 const char *s_end = RSTRING_END(str);
8031 rb_encoding *enc = rb_enc_get(str);
8032 VALUE undumped = rb_enc_str_new(s, 0L, enc);
8033 bool utf8 = false;
8034 bool binary = false;
8035 int w;
8036
8038 if (rb_str_is_ascii_only_p(str) == Qfalse) {
8039 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
8040 }
8041 if (!str_null_check(str, &w)) {
8042 rb_raise(rb_eRuntimeError, "string contains null byte");
8043 }
8044 if (RSTRING_LEN(str) < 2) goto invalid_format;
8045 if (*s != '"') goto invalid_format;
8046
8047 /* strip '"' at the start */
8048 s++;
8049
8050 for (;;) {
8051 if (s >= s_end) {
8052 rb_raise(rb_eRuntimeError, "unterminated dumped string");
8053 }
8054
8055 if (*s == '"') {
8056 /* epilogue */
8057 s++;
8058 if (s == s_end) {
8059 /* ascii compatible dumped string */
8060 break;
8061 }
8062 else {
8063 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
8064 static const char dup_suffix[] = ".dup";
8065 const char *encname;
8066 int encidx;
8067 ptrdiff_t size;
8068
8069 /* check separately for strings dumped by older versions */
8070 size = sizeof(dup_suffix) - 1;
8071 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
8072
8073 size = sizeof(force_encoding_suffix) - 1;
8074 if (s_end - s <= size) goto invalid_format;
8075 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
8076 s += size;
8077
8078 if (utf8) {
8079 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
8080 }
8081
8082 encname = s;
8083 s = memchr(s, '"', s_end-s);
8084 size = s - encname;
8085 if (!s) goto invalid_format;
8086 if (s_end - s != 2) goto invalid_format;
8087 if (s[0] != '"' || s[1] != ')') goto invalid_format;
8088
8089 encidx = rb_enc_find_index2(encname, (long)size);
8090 if (encidx < 0) {
8091 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
8092 }
8093 rb_enc_associate_index(undumped, encidx);
8094 }
8095 break;
8096 }
8097
8098 if (*s == '\\') {
8099 s++;
8100 if (s >= s_end) {
8101 rb_raise(rb_eRuntimeError, "invalid escape");
8102 }
8103 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
8104 }
8105 else {
8106 rb_str_cat(undumped, s++, 1);
8107 }
8108 }
8109
8110 RB_GC_GUARD(str);
8111
8112 return undumped;
8113invalid_format:
8114 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
8115}
8116
8117static void
8118rb_str_check_dummy_enc(rb_encoding *enc)
8119{
8120 if (rb_enc_dummy_p(enc)) {
8121 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
8122 rb_enc_name(enc));
8123 }
8124}
8125
8126static rb_encoding *
8127str_true_enc(VALUE str)
8128{
8129 rb_encoding *enc = STR_ENC_GET(str);
8130 rb_str_check_dummy_enc(enc);
8131 return enc;
8132}
8133
8134static OnigCaseFoldType
8135check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
8136{
8137 if (argc==0)
8138 return flags;
8139 if (argc>2)
8140 rb_raise(rb_eArgError, "too many options");
8141 if (argv[0]==sym_turkic) {
8142 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8143 if (argc==2) {
8144 if (argv[1]==sym_lithuanian)
8145 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8146 else
8147 rb_raise(rb_eArgError, "invalid second option");
8148 }
8149 }
8150 else if (argv[0]==sym_lithuanian) {
8151 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
8152 if (argc==2) {
8153 if (argv[1]==sym_turkic)
8154 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
8155 else
8156 rb_raise(rb_eArgError, "invalid second option");
8157 }
8158 }
8159 else if (argc>1)
8160 rb_raise(rb_eArgError, "too many options");
8161 else if (argv[0]==sym_ascii)
8162 flags |= ONIGENC_CASE_ASCII_ONLY;
8163 else if (argv[0]==sym_fold) {
8164 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
8165 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
8166 else
8167 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
8168 }
8169 else
8170 rb_raise(rb_eArgError, "invalid option");
8171 return flags;
8172}
8173
8174static inline bool
8175case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
8176{
8177 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
8178 return true;
8179 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
8180}
8181
8182/* 16 should be long enough to absorb any kind of single character length increase */
8183#define CASE_MAPPING_ADDITIONAL_LENGTH 20
8184#ifndef CASEMAP_DEBUG
8185# define CASEMAP_DEBUG 0
8186#endif
8187
8188struct mapping_buffer;
8189typedef struct mapping_buffer {
8190 size_t capa;
8191 size_t used;
8192 struct mapping_buffer *next;
8193 OnigUChar space[FLEX_ARY_LEN];
8195
8196static void
8197mapping_buffer_free(void *p)
8198{
8199 mapping_buffer *previous_buffer;
8200 mapping_buffer *current_buffer = p;
8201 while (current_buffer) {
8202 previous_buffer = current_buffer;
8203 current_buffer = current_buffer->next;
8204 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
8205 }
8206}
8207
8208static const rb_data_type_t mapping_buffer_type = {
8209 "mapping_buffer",
8210 {0, mapping_buffer_free,},
8211 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
8212};
8213
8214static VALUE
8215rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
8216{
8217 VALUE target;
8218
8219 const OnigUChar *source_current, *source_end;
8220 int target_length = 0;
8221 VALUE buffer_anchor;
8222 mapping_buffer *current_buffer = 0;
8223 mapping_buffer **pre_buffer;
8224 size_t buffer_count = 0;
8225 int buffer_length_or_invalid;
8226
8227 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
8228
8229 source_current = (OnigUChar*)RSTRING_PTR(source);
8230 source_end = (OnigUChar*)RSTRING_END(source);
8231
8232 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
8233 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
8234 while (source_current < source_end) {
8235 /* increase multiplier using buffer count to converge quickly */
8236 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
8237 if (CASEMAP_DEBUG) {
8238 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
8239 }
8240 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
8241 *pre_buffer = current_buffer;
8242 pre_buffer = &current_buffer->next;
8243 current_buffer->next = NULL;
8244 current_buffer->capa = capa;
8245 buffer_length_or_invalid = enc->case_map(flags,
8246 &source_current, source_end,
8247 current_buffer->space,
8248 current_buffer->space+current_buffer->capa,
8249 enc);
8250 if (buffer_length_or_invalid < 0) {
8251 current_buffer = DATA_PTR(buffer_anchor);
8252 DATA_PTR(buffer_anchor) = 0;
8253 mapping_buffer_free(current_buffer);
8254 rb_raise(rb_eArgError, "input string invalid");
8255 }
8256 target_length += current_buffer->used = buffer_length_or_invalid;
8257 }
8258 if (CASEMAP_DEBUG) {
8259 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
8260 }
8261
8262 if (buffer_count==1) {
8263 target = rb_str_new((const char*)current_buffer->space, target_length);
8264 }
8265 else {
8266 char *target_current;
8267
8268 target = rb_str_new(0, target_length);
8269 target_current = RSTRING_PTR(target);
8270 current_buffer = DATA_PTR(buffer_anchor);
8271 while (current_buffer) {
8272 memcpy(target_current, current_buffer->space, current_buffer->used);
8273 target_current += current_buffer->used;
8274 current_buffer = current_buffer->next;
8275 }
8276 }
8277 current_buffer = DATA_PTR(buffer_anchor);
8278 DATA_PTR(buffer_anchor) = 0;
8279 mapping_buffer_free(current_buffer);
8280
8281 RB_GC_GUARD(buffer_anchor);
8282
8283 /* TODO: check about string terminator character */
8284 str_enc_copy_direct(target, source);
8285 /*ENC_CODERANGE_SET(mapped, cr);*/
8286
8287 return target;
8288}
8289
8290static VALUE
8291rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8292{
8293 const OnigUChar *source_current, *source_end;
8294 OnigUChar *target_current, *target_end;
8295 long old_length = RSTRING_LEN(source);
8296 int length_or_invalid;
8297
8298 if (old_length == 0) return Qnil;
8299
8300 source_current = (OnigUChar*)RSTRING_PTR(source);
8301 source_end = (OnigUChar*)RSTRING_END(source);
8302 if (source == target) {
8303 target_current = (OnigUChar*)source_current;
8304 target_end = (OnigUChar*)source_end;
8305 }
8306 else {
8307 target_current = (OnigUChar*)RSTRING_PTR(target);
8308 target_end = (OnigUChar*)RSTRING_END(target);
8309 }
8310
8311 length_or_invalid = onigenc_ascii_only_case_map(flags,
8312 &source_current, source_end,
8313 target_current, target_end, enc);
8314 if (length_or_invalid < 0)
8315 rb_raise(rb_eArgError, "input string invalid");
8316 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8317 fprintf(stderr, "problem with rb_str_ascii_casemap"
8318 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8319 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8320 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8321 }
8322
8323 str_enc_copy(target, source);
8324
8325 return target;
8326}
8327
8328static bool
8329upcase_single(VALUE str)
8330{
8331 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8332 bool modified = false;
8333
8334 while (s < send) {
8335 unsigned int c = *(unsigned char*)s;
8336
8337 if ('a' <= c && c <= 'z') {
8338 *s = 'A' + (c - 'a');
8339 modified = true;
8340 }
8341 s++;
8342 }
8343 return modified;
8344}
8345
8346/*
8347 * call-seq:
8348 * upcase!(*options) -> self or nil
8349 *
8350 * Upcases the characters in +self+;
8351 * returns +self+ if any changes were made, +nil+ otherwise:
8352 *
8353 * s = 'Hello World!' # => "Hello World!"
8354 * s.upcase! # => "HELLO WORLD!"
8355 * s # => "HELLO WORLD!"
8356 * s.upcase! # => nil
8357 *
8358 * The casing may be affected by the given +options+;
8359 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8360 *
8361 * Related: String#upcase, String#downcase, String#downcase!.
8362 *
8363 */
8364
8365static VALUE
8366rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8367{
8368 rb_encoding *enc;
8369 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8370
8371 flags = check_case_options(argc, argv, flags);
8372 str_modify_keep_cr(str);
8373 enc = str_true_enc(str);
8374 if (case_option_single_p(flags, enc, str)) {
8375 if (upcase_single(str))
8376 flags |= ONIGENC_CASE_MODIFIED;
8377 }
8378 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8379 rb_str_ascii_casemap(str, str, &flags, enc);
8380 else
8381 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8382
8383 if (ONIGENC_CASE_MODIFIED&flags) return str;
8384 return Qnil;
8385}
8386
8387
8388/*
8389 * call-seq:
8390 * upcase(*options) -> string
8391 *
8392 * Returns a string containing the upcased characters in +self+:
8393 *
8394 * s = 'Hello World!' # => "Hello World!"
8395 * s.upcase # => "HELLO WORLD!"
8396 *
8397 * The casing may be affected by the given +options+;
8398 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8399 *
8400 * Related: String#upcase!, String#downcase, String#downcase!.
8401 *
8402 */
8403
8404static VALUE
8405rb_str_upcase(int argc, VALUE *argv, VALUE str)
8406{
8407 rb_encoding *enc;
8408 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8409 VALUE ret;
8410
8411 flags = check_case_options(argc, argv, flags);
8412 enc = str_true_enc(str);
8413 if (case_option_single_p(flags, enc, str)) {
8414 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8415 str_enc_copy_direct(ret, str);
8416 upcase_single(ret);
8417 }
8418 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8419 ret = rb_str_new(0, RSTRING_LEN(str));
8420 rb_str_ascii_casemap(str, ret, &flags, enc);
8421 }
8422 else {
8423 ret = rb_str_casemap(str, &flags, enc);
8424 }
8425
8426 return ret;
8427}
8428
8429static bool
8430downcase_single(VALUE str)
8431{
8432 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8433 bool modified = false;
8434
8435 while (s < send) {
8436 unsigned int c = *(unsigned char*)s;
8437
8438 if ('A' <= c && c <= 'Z') {
8439 *s = 'a' + (c - 'A');
8440 modified = true;
8441 }
8442 s++;
8443 }
8444
8445 return modified;
8446}
8447
8448/*
8449 * call-seq:
8450 * downcase!(*options) -> self or nil
8451 *
8452 * Downcases the characters in +self+;
8453 * returns +self+ if any changes were made, +nil+ otherwise:
8454 *
8455 * s = 'Hello World!' # => "Hello World!"
8456 * s.downcase! # => "hello world!"
8457 * s # => "hello world!"
8458 * s.downcase! # => nil
8459 *
8460 * The casing may be affected by the given +options+;
8461 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8462 *
8463 * Related: String#downcase, String#upcase, String#upcase!.
8464 *
8465 */
8466
8467static VALUE
8468rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8469{
8470 rb_encoding *enc;
8471 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8472
8473 flags = check_case_options(argc, argv, flags);
8474 str_modify_keep_cr(str);
8475 enc = str_true_enc(str);
8476 if (case_option_single_p(flags, enc, str)) {
8477 if (downcase_single(str))
8478 flags |= ONIGENC_CASE_MODIFIED;
8479 }
8480 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8481 rb_str_ascii_casemap(str, str, &flags, enc);
8482 else
8483 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8484
8485 if (ONIGENC_CASE_MODIFIED&flags) return str;
8486 return Qnil;
8487}
8488
8489
8490/*
8491 * call-seq:
8492 * downcase(*options) -> string
8493 *
8494 * Returns a string containing the downcased characters in +self+:
8495 *
8496 * s = 'Hello World!' # => "Hello World!"
8497 * s.downcase # => "hello world!"
8498 *
8499 * The casing may be affected by the given +options+;
8500 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8501 *
8502 * Related: String#downcase!, String#upcase, String#upcase!.
8503 *
8504 */
8505
8506static VALUE
8507rb_str_downcase(int argc, VALUE *argv, VALUE str)
8508{
8509 rb_encoding *enc;
8510 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8511 VALUE ret;
8512
8513 flags = check_case_options(argc, argv, flags);
8514 enc = str_true_enc(str);
8515 if (case_option_single_p(flags, enc, str)) {
8516 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8517 str_enc_copy_direct(ret, str);
8518 downcase_single(ret);
8519 }
8520 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8521 ret = rb_str_new(0, RSTRING_LEN(str));
8522 rb_str_ascii_casemap(str, ret, &flags, enc);
8523 }
8524 else {
8525 ret = rb_str_casemap(str, &flags, enc);
8526 }
8527
8528 return ret;
8529}
8530
8531
8532/*
8533 * call-seq:
8534 * capitalize!(*options) -> self or nil
8535 *
8536 * Upcases the first character in +self+;
8537 * downcases the remaining characters;
8538 * returns +self+ if any changes were made, +nil+ otherwise:
8539 *
8540 * s = 'hello World!' # => "hello World!"
8541 * s.capitalize! # => "Hello world!"
8542 * s # => "Hello world!"
8543 * s.capitalize! # => nil
8544 *
8545 * The casing may be affected by the given +options+;
8546 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8547 *
8548 * Related: String#capitalize.
8549 *
8550 */
8551
8552static VALUE
8553rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8554{
8555 rb_encoding *enc;
8556 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8557
8558 flags = check_case_options(argc, argv, flags);
8559 str_modify_keep_cr(str);
8560 enc = str_true_enc(str);
8561 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8562 if (flags&ONIGENC_CASE_ASCII_ONLY)
8563 rb_str_ascii_casemap(str, str, &flags, enc);
8564 else
8565 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8566
8567 if (ONIGENC_CASE_MODIFIED&flags) return str;
8568 return Qnil;
8569}
8570
8571
8572/*
8573 * call-seq:
8574 * capitalize(*options) -> string
8575 *
8576 * Returns a string containing the characters in +self+;
8577 * the first character is upcased;
8578 * the remaining characters are downcased:
8579 *
8580 * s = 'hello World!' # => "hello World!"
8581 * s.capitalize # => "Hello world!"
8582 *
8583 * The casing may be affected by the given +options+;
8584 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8585 *
8586 * Related: String#capitalize!.
8587 *
8588 */
8589
8590static VALUE
8591rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8592{
8593 rb_encoding *enc;
8594 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8595 VALUE ret;
8596
8597 flags = check_case_options(argc, argv, flags);
8598 enc = str_true_enc(str);
8599 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8600 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8601 ret = rb_str_new(0, RSTRING_LEN(str));
8602 rb_str_ascii_casemap(str, ret, &flags, enc);
8603 }
8604 else {
8605 ret = rb_str_casemap(str, &flags, enc);
8606 }
8607 return ret;
8608}
8609
8610
8611/*
8612 * call-seq:
8613 * swapcase!(*options) -> self or nil
8614 *
8615 * Upcases each lowercase character in +self+;
8616 * downcases each uppercase character;
8617 * returns +self+ if any changes were made, +nil+ otherwise:
8618 *
8619 * s = 'Hello World!' # => "Hello World!"
8620 * s.swapcase! # => "hELLO wORLD!"
8621 * s # => "hELLO wORLD!"
8622 * ''.swapcase! # => nil
8623 *
8624 * The casing may be affected by the given +options+;
8625 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8626 *
8627 * Related: String#swapcase.
8628 *
8629 */
8630
8631static VALUE
8632rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8633{
8634 rb_encoding *enc;
8635 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8636
8637 flags = check_case_options(argc, argv, flags);
8638 str_modify_keep_cr(str);
8639 enc = str_true_enc(str);
8640 if (flags&ONIGENC_CASE_ASCII_ONLY)
8641 rb_str_ascii_casemap(str, str, &flags, enc);
8642 else
8643 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8644
8645 if (ONIGENC_CASE_MODIFIED&flags) return str;
8646 return Qnil;
8647}
8648
8649
8650/*
8651 * call-seq:
8652 * swapcase(*options) -> string
8653 *
8654 * Returns a string containing the characters in +self+, with cases reversed;
8655 * each uppercase character is downcased;
8656 * each lowercase character is upcased:
8657 *
8658 * s = 'Hello World!' # => "Hello World!"
8659 * s.swapcase # => "hELLO wORLD!"
8660 *
8661 * The casing may be affected by the given +options+;
8662 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8663 *
8664 * Related: String#swapcase!.
8665 *
8666 */
8667
8668static VALUE
8669rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8670{
8671 rb_encoding *enc;
8672 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8673 VALUE ret;
8674
8675 flags = check_case_options(argc, argv, flags);
8676 enc = str_true_enc(str);
8677 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8678 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8679 ret = rb_str_new(0, RSTRING_LEN(str));
8680 rb_str_ascii_casemap(str, ret, &flags, enc);
8681 }
8682 else {
8683 ret = rb_str_casemap(str, &flags, enc);
8684 }
8685 return ret;
8686}
8687
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a "lo-hi" range is being expanded */
    unsigned int now;   /* code most recently produced */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* one past the last byte of the selector */
};
8695
8696static unsigned int
8697trnext(struct tr *t, rb_encoding *enc)
8698{
8699 int n;
8700
8701 for (;;) {
8702 nextpart:
8703 if (!t->gen) {
8704 if (t->p == t->pend) return -1;
8705 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8706 t->p += n;
8707 }
8708 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8709 t->p += n;
8710 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8711 t->p += n;
8712 if (t->p < t->pend) {
8713 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8714 t->p += n;
8715 if (t->now > c) {
8716 if (t->now < 0x80 && c < 0x80) {
8717 rb_raise(rb_eArgError,
8718 "invalid range \"%c-%c\" in string transliteration",
8719 t->now, c);
8720 }
8721 else {
8722 rb_raise(rb_eArgError, "invalid range in string transliteration");
8723 }
8724 continue; /* not reached */
8725 }
8726 else if (t->now < c) {
8727 t->gen = 1;
8728 t->max = c;
8729 }
8730 }
8731 }
8732 return t->now;
8733 }
8734 else {
8735 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8736 if (t->now == t->max) {
8737 t->gen = 0;
8738 goto nextpart;
8739 }
8740 }
8741 if (t->now < t->max) {
8742 return t->now;
8743 }
8744 else {
8745 t->gen = 0;
8746 return t->max;
8747 }
8748 }
8749 }
8750}
8751
8752static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8753
8754static VALUE
8755tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8756{
8757 const unsigned int errc = -1;
8758 unsigned int trans[256];
8759 rb_encoding *enc, *e1, *e2;
8760 struct tr trsrc, trrepl;
8761 int cflag = 0;
8762 unsigned int c, c0, last = 0;
8763 int modify = 0, i, l;
8764 unsigned char *s, *send;
8765 VALUE hash = 0;
8766 int singlebyte = single_byte_optimizable(str);
8767 int termlen;
8768 int cr;
8769
8770#define CHECK_IF_ASCII(c) \
8771 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8772 (cr = ENC_CODERANGE_VALID) : 0)
8773
8774 StringValue(src);
8775 StringValue(repl);
8776 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8777 if (RSTRING_LEN(repl) == 0) {
8778 return rb_str_delete_bang(1, &src, str);
8779 }
8780
8781 cr = ENC_CODERANGE(str);
8782 e1 = rb_enc_check(str, src);
8783 e2 = rb_enc_check(str, repl);
8784 if (e1 == e2) {
8785 enc = e1;
8786 }
8787 else {
8788 enc = rb_enc_check(src, repl);
8789 }
8790 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8791 if (RSTRING_LEN(src) > 1 &&
8792 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8793 trsrc.p + l < trsrc.pend) {
8794 cflag = 1;
8795 trsrc.p += l;
8796 }
8797 trrepl.p = RSTRING_PTR(repl);
8798 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8799 trsrc.gen = trrepl.gen = 0;
8800 trsrc.now = trrepl.now = 0;
8801 trsrc.max = trrepl.max = 0;
8802
8803 if (cflag) {
8804 for (i=0; i<256; i++) {
8805 trans[i] = 1;
8806 }
8807 while ((c = trnext(&trsrc, enc)) != errc) {
8808 if (c < 256) {
8809 trans[c] = errc;
8810 }
8811 else {
8812 if (!hash) hash = rb_hash_new();
8813 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8814 }
8815 }
8816 while ((c = trnext(&trrepl, enc)) != errc)
8817 /* retrieve last replacer */;
8818 last = trrepl.now;
8819 for (i=0; i<256; i++) {
8820 if (trans[i] != errc) {
8821 trans[i] = last;
8822 }
8823 }
8824 }
8825 else {
8826 unsigned int r;
8827
8828 for (i=0; i<256; i++) {
8829 trans[i] = errc;
8830 }
8831 while ((c = trnext(&trsrc, enc)) != errc) {
8832 r = trnext(&trrepl, enc);
8833 if (r == errc) r = trrepl.now;
8834 if (c < 256) {
8835 trans[c] = r;
8836 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8837 }
8838 else {
8839 if (!hash) hash = rb_hash_new();
8840 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8841 }
8842 }
8843 }
8844
8845 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8846 cr = ENC_CODERANGE_7BIT;
8847 str_modify_keep_cr(str);
8848 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8849 termlen = rb_enc_mbminlen(enc);
8850 if (sflag) {
8851 int clen, tlen;
8852 long offset, max = RSTRING_LEN(str);
8853 unsigned int save = -1;
8854 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8855
8856 while (s < send) {
8857 int may_modify = 0;
8858
8859 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8860 if (!MBCLEN_CHARFOUND_P(r)) {
8861 xfree(buf);
8862 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8863 }
8864 clen = MBCLEN_CHARFOUND_LEN(r);
8865 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8866
8867 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8868
8869 s += clen;
8870 if (c < 256) {
8871 c = trans[c];
8872 }
8873 else if (hash) {
8874 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8875 if (NIL_P(tmp)) {
8876 if (cflag) c = last;
8877 else c = errc;
8878 }
8879 else if (cflag) c = errc;
8880 else c = NUM2INT(tmp);
8881 }
8882 else {
8883 c = errc;
8884 }
8885 if (c != (unsigned int)-1) {
8886 if (save == c) {
8887 CHECK_IF_ASCII(c);
8888 continue;
8889 }
8890 save = c;
8891 tlen = rb_enc_codelen(c, enc);
8892 modify = 1;
8893 }
8894 else {
8895 save = -1;
8896 c = c0;
8897 if (enc != e1) may_modify = 1;
8898 }
8899 if ((offset = t - buf) + tlen > max) {
8900 size_t MAYBE_UNUSED(old) = max + termlen;
8901 max = offset + tlen + (send - s);
8902 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8903 t = buf + offset;
8904 }
8905 rb_enc_mbcput(c, t, enc);
8906 if (may_modify && memcmp(s, t, tlen) != 0) {
8907 modify = 1;
8908 }
8909 CHECK_IF_ASCII(c);
8910 t += tlen;
8911 }
8912 if (!STR_EMBED_P(str)) {
8913 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8914 }
8915 TERM_FILL((char *)t, termlen);
8916 RSTRING(str)->as.heap.ptr = (char *)buf;
8917 STR_SET_LEN(str, t - buf);
8918 STR_SET_NOEMBED(str);
8919 RSTRING(str)->as.heap.aux.capa = max;
8920 }
8921 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8922 while (s < send) {
8923 c = (unsigned char)*s;
8924 if (trans[c] != errc) {
8925 if (!cflag) {
8926 c = trans[c];
8927 *s = c;
8928 modify = 1;
8929 }
8930 else {
8931 *s = last;
8932 modify = 1;
8933 }
8934 }
8935 CHECK_IF_ASCII(c);
8936 s++;
8937 }
8938 }
8939 else {
8940 int clen, tlen;
8941 long offset, max = (long)((send - s) * 1.2);
8942 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8943
8944 while (s < send) {
8945 int may_modify = 0;
8946
8947 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8948 if (!MBCLEN_CHARFOUND_P(r)) {
8949 xfree(buf);
8950 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8951 }
8952 clen = MBCLEN_CHARFOUND_LEN(r);
8953 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8954
8955 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8956
8957 if (c < 256) {
8958 c = trans[c];
8959 }
8960 else if (hash) {
8961 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8962 if (NIL_P(tmp)) {
8963 if (cflag) c = last;
8964 else c = errc;
8965 }
8966 else if (cflag) c = errc;
8967 else c = NUM2INT(tmp);
8968 }
8969 else {
8970 c = cflag ? last : errc;
8971 }
8972 if (c != errc) {
8973 tlen = rb_enc_codelen(c, enc);
8974 modify = 1;
8975 }
8976 else {
8977 c = c0;
8978 if (enc != e1) may_modify = 1;
8979 }
8980 if ((offset = t - buf) + tlen > max) {
8981 size_t MAYBE_UNUSED(old) = max + termlen;
8982 max = offset + tlen + (long)((send - s) * 1.2);
8983 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8984 t = buf + offset;
8985 }
8986 if (s != t) {
8987 rb_enc_mbcput(c, t, enc);
8988 if (may_modify && memcmp(s, t, tlen) != 0) {
8989 modify = 1;
8990 }
8991 }
8992 CHECK_IF_ASCII(c);
8993 s += clen;
8994 t += tlen;
8995 }
8996 if (!STR_EMBED_P(str)) {
8997 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8998 }
8999 TERM_FILL((char *)t, termlen);
9000 RSTRING(str)->as.heap.ptr = (char *)buf;
9001 STR_SET_LEN(str, t - buf);
9002 STR_SET_NOEMBED(str);
9003 RSTRING(str)->as.heap.aux.capa = max;
9004 }
9005
9006 if (modify) {
9007 if (cr != ENC_CODERANGE_BROKEN)
9008 ENC_CODERANGE_SET(str, cr);
9009 rb_enc_associate(str, enc);
9010 return str;
9011 }
9012 return Qnil;
9013}
9014
9015
9016/*
9017 * call-seq:
9018 * tr!(selector, replacements) -> self or nil
9019 *
9020 * Like String#tr, but modifies +self+ in place.
9021 * Returns +self+ if any changes were made, +nil+ otherwise.
9022 *
9023 */
9024
9025static VALUE
9026rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
9027{
9028 return tr_trans(str, src, repl, 0);
9029}
9030
9031
9032/*
9033 * call-seq:
9034 * tr(selector, replacements) -> new_string
9035 *
9036 * Returns a copy of +self+ with each character specified by string +selector+
9037 * translated to the corresponding character in string +replacements+.
9038 * The correspondence is _positional_:
9039 *
9040 * - Each occurrence of the first character specified by +selector+
9041 * is translated to the first character in +replacements+.
9042 * - Each occurrence of the second character specified by +selector+
9043 * is translated to the second character in +replacements+.
9044 * - And so on.
9045 *
9046 * Example:
9047 *
9048 * 'hello'.tr('el', 'ip') #=> "hippo"
9049 *
9050 * If +replacements+ is shorter than +selector+,
9051 * it is implicitly padded with its own last character:
9052 *
9053 * 'hello'.tr('aeiou', '-') # => "h-ll-"
9054 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
9055 *
9056 * Arguments +selector+ and +replacements+ must be valid character selectors
9057 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
9058 * and may use any of its valid forms, including negation, ranges, and escaping:
9059 *
9060 * # Negation.
9061 * 'hello'.tr('^aeiou', '-') # => "-e--o"
9062 * # Ranges.
9063 * 'ibm'.tr('b-z', 'a-z') # => "hal"
9064 * # Escapes.
9065 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
9066 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
9067 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
9068 *
9069 */
9070
9071static VALUE
9072rb_str_tr(VALUE str, VALUE src, VALUE repl)
9073{
9074 str = str_duplicate(rb_cString, str);
9075 tr_trans(str, src, repl, 0);
9076 return str;
9077}
9078
9079#define TR_TABLE_MAX (UCHAR_MAX+1)
9080#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
9081static void
9082tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
9083 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
9084{
9085 const unsigned int errc = -1;
9086 char buf[TR_TABLE_MAX];
9087 struct tr tr;
9088 unsigned int c;
9089 VALUE table = 0, ptable = 0;
9090 int i, l, cflag = 0;
9091
9092 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
9093 tr.gen = tr.now = tr.max = 0;
9094
9095 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
9096 cflag = 1;
9097 tr.p += l;
9098 }
9099 if (first) {
9100 for (i=0; i<TR_TABLE_MAX; i++) {
9101 stable[i] = 1;
9102 }
9103 stable[TR_TABLE_MAX] = cflag;
9104 }
9105 else if (stable[TR_TABLE_MAX] && !cflag) {
9106 stable[TR_TABLE_MAX] = 0;
9107 }
9108 for (i=0; i<TR_TABLE_MAX; i++) {
9109 buf[i] = cflag;
9110 }
9111
9112 while ((c = trnext(&tr, enc)) != errc) {
9113 if (c < TR_TABLE_MAX) {
9114 buf[(unsigned char)c] = !cflag;
9115 }
9116 else {
9117 VALUE key = UINT2NUM(c);
9118
9119 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
9120 if (cflag) {
9121 ptable = *ctablep;
9122 table = ptable ? ptable : rb_hash_new();
9123 *ctablep = table;
9124 }
9125 else {
9126 table = rb_hash_new();
9127 ptable = *tablep;
9128 *tablep = table;
9129 }
9130 }
9131 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
9132 rb_hash_aset(table, key, Qtrue);
9133 }
9134 }
9135 }
9136 for (i=0; i<TR_TABLE_MAX; i++) {
9137 stable[i] = stable[i] && buf[i];
9138 }
9139 if (!table && !cflag) {
9140 *tablep = 0;
9141 }
9142}
9143
9144
9145static int
9146tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
9147{
9148 if (c < TR_TABLE_MAX) {
9149 return table[c] != 0;
9150 }
9151 else {
9152 VALUE v = UINT2NUM(c);
9153
9154 if (del) {
9155 if (!NIL_P(rb_hash_lookup(del, v)) &&
9156 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
9157 return TRUE;
9158 }
9159 }
9160 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
9161 return FALSE;
9162 }
9163 return table[TR_TABLE_MAX] ? TRUE : FALSE;
9164 }
9165}
9166
9167/*
9168 * call-seq:
9169 * delete!(*selectors) -> self or nil
9170 *
9171 * Like String#delete, but modifies +self+ in place.
9172 * Returns +self+ if any changes were made, +nil+ otherwise.
9173 *
9174 */
9175
9176static VALUE
9177rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
9178{
9179 char squeez[TR_TABLE_SIZE];
9180 rb_encoding *enc = 0;
9181 char *s, *send, *t;
9182 VALUE del = 0, nodel = 0;
9183 int modify = 0;
9184 int i, ascompat, cr;
9185
9186 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
9188 for (i=0; i<argc; i++) {
9189 VALUE s = argv[i];
9190
9191 StringValue(s);
9192 enc = rb_enc_check(str, s);
9193 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9194 }
9195
9196 str_modify_keep_cr(str);
9197 ascompat = rb_enc_asciicompat(enc);
9198 s = t = RSTRING_PTR(str);
9199 send = RSTRING_END(str);
9200 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
9201 while (s < send) {
9202 unsigned int c;
9203 int clen;
9204
9205 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9206 if (squeez[c]) {
9207 modify = 1;
9208 }
9209 else {
9210 if (t != s) *t = c;
9211 t++;
9212 }
9213 s++;
9214 }
9215 else {
9216 c = rb_enc_codepoint_len(s, send, &clen, enc);
9217
9218 if (tr_find(c, squeez, del, nodel)) {
9219 modify = 1;
9220 }
9221 else {
9222 if (t != s) rb_enc_mbcput(c, t, enc);
9223 t += clen;
9225 }
9226 s += clen;
9227 }
9228 }
9229 TERM_FILL(t, TERM_LEN(str));
9230 STR_SET_LEN(str, t - RSTRING_PTR(str));
9231 ENC_CODERANGE_SET(str, cr);
9232
9233 if (modify) return str;
9234 return Qnil;
9235}
9236
9237
9238/*
9239 * call-seq:
9240 * delete(*selectors) -> new_string
9241 *
9242 * Returns a copy of +self+ with characters specified by +selectors+ removed
9243 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9244 *
9245 * "hello".delete "l","lo" #=> "heo"
9246 * "hello".delete "lo" #=> "he"
9247 * "hello".delete "aeiou", "^e" #=> "hell"
9248 * "hello".delete "ej-m" #=> "ho"
9249 *
9250 */
9251
9252static VALUE
9253rb_str_delete(int argc, VALUE *argv, VALUE str)
9254{
9255 str = str_duplicate(rb_cString, str);
9256 rb_str_delete_bang(argc, argv, str);
9257 return str;
9258}
9259
9260
9261/*
9262 * call-seq:
9263 * squeeze!(*selectors) -> self or nil
9264 *
9265 * Like String#squeeze, but modifies +self+ in place.
9266 * Returns +self+ if any changes were made, +nil+ otherwise.
9267 */
9268
9269static VALUE
9270rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
9271{
9272 char squeez[TR_TABLE_SIZE];
9273 rb_encoding *enc = 0;
9274 VALUE del = 0, nodel = 0;
9275 unsigned char *s, *send, *t;
9276 int i, modify = 0;
9277 int ascompat, singlebyte = single_byte_optimizable(str);
9278 unsigned int save;
9279
9280 if (argc == 0) {
9281 enc = STR_ENC_GET(str);
9282 }
9283 else {
9284 for (i=0; i<argc; i++) {
9285 VALUE s = argv[i];
9286
9287 StringValue(s);
9288 enc = rb_enc_check(str, s);
9289 if (singlebyte && !single_byte_optimizable(s))
9290 singlebyte = 0;
9291 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9292 }
9293 }
9294
9295 str_modify_keep_cr(str);
9296 s = t = (unsigned char *)RSTRING_PTR(str);
9297 if (!s || RSTRING_LEN(str) == 0) return Qnil;
9298 send = (unsigned char *)RSTRING_END(str);
9299 save = -1;
9300 ascompat = rb_enc_asciicompat(enc);
9301
9302 if (singlebyte) {
9303 while (s < send) {
9304 unsigned int c = *s++;
9305 if (c != save || (argc > 0 && !squeez[c])) {
9306 *t++ = save = c;
9307 }
9308 }
9309 }
9310 else {
9311 while (s < send) {
9312 unsigned int c;
9313 int clen;
9314
9315 if (ascompat && (c = *s) < 0x80) {
9316 if (c != save || (argc > 0 && !squeez[c])) {
9317 *t++ = save = c;
9318 }
9319 s++;
9320 }
9321 else {
9322 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9323
9324 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9325 if (t != s) rb_enc_mbcput(c, t, enc);
9326 save = c;
9327 t += clen;
9328 }
9329 s += clen;
9330 }
9331 }
9332 }
9333
9334 TERM_FILL((char *)t, TERM_LEN(str));
9335 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9336 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9337 modify = 1;
9338 }
9339
9340 if (modify) return str;
9341 return Qnil;
9342}
9343
9344
9345/*
9346 * call-seq:
9347 * squeeze(*selectors) -> new_string
9348 *
9349 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9350 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9351 *
9352 * "Squeezed" means that each multiple-character run of a selected character
9353 * is squeezed down to a single character;
9354 * with no arguments given, squeezes all characters:
9355 *
9356 * "yellow moon".squeeze #=> "yelow mon"
9357 * " now is the".squeeze(" ") #=> " now is the"
9358 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9359 *
9360 */
9361
9362static VALUE
9363rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9364{
9365 str = str_duplicate(rb_cString, str);
9366 rb_str_squeeze_bang(argc, argv, str);
9367 return str;
9368}
9369
9370
9371/*
9372 * call-seq:
9373 * tr_s!(selector, replacements) -> self or nil
9374 *
9375 * Like String#tr_s, but modifies +self+ in place.
9376 * Returns +self+ if any changes were made, +nil+ otherwise.
9377 *
9378 * Related: String#squeeze!.
9379 */
9380
9381static VALUE
9382rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9383{
9384 return tr_trans(str, src, repl, 1);
9385}
9386
9387
9388/*
9389 * call-seq:
9390 * tr_s(selector, replacements) -> string
9391 *
9392 * Like String#tr, but also squeezes the modified portions of the translated string;
9393 * returns a new string (translated and squeezed).
9394 *
9395 * 'hello'.tr_s('l', 'r') #=> "hero"
9396 * 'hello'.tr_s('el', '-') #=> "h-o"
9397 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9398 *
9399 * Related: String#squeeze.
9400 *
9401 */
9402
9403static VALUE
9404rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9405{
9406 str = str_duplicate(rb_cString, str);
9407 tr_trans(str, src, repl, 1);
9408 return str;
9409}
9410
9411
9412/*
9413 * call-seq:
9414 * count(*selectors) -> integer
9415 *
9416 * Returns the total number of characters in +self+
9417 * that are specified by the given +selectors+
9418 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9419 *
9420 * a = "hello world"
9421 * a.count "lo" #=> 5
9422 * a.count "lo", "o" #=> 2
9423 * a.count "hello", "^l" #=> 4
9424 * a.count "ej-m" #=> 4
9425 *
9426 * "hello^world".count "\\^aeiou" #=> 4
9427 * "hello-world".count "a\\-eo" #=> 4
9428 *
9429 * c = "hello world\\r\\n"
9430 * c.count "\\" #=> 2
9431 * c.count "\\A" #=> 0
9432 * c.count "X-\\w" #=> 3
9433 */
9434
9435static VALUE
9436rb_str_count(int argc, VALUE *argv, VALUE str)
9437{
9438 char table[TR_TABLE_SIZE];
9439 rb_encoding *enc = 0;
9440 VALUE del = 0, nodel = 0, tstr;
9441 char *s, *send;
9442 int i;
9443 int ascompat;
9444 size_t n = 0;
9445
9447
9448 tstr = argv[0];
9449 StringValue(tstr);
9450 enc = rb_enc_check(str, tstr);
9451 if (argc == 1) {
9452 const char *ptstr;
9453 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9454 (ptstr = RSTRING_PTR(tstr),
9455 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9456 !is_broken_string(str)) {
9457 int clen;
9458 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9459
9460 s = RSTRING_PTR(str);
9461 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9462 send = RSTRING_END(str);
9463 while (s < send) {
9464 if (*(unsigned char*)s++ == c) n++;
9465 }
9466 return SIZET2NUM(n);
9467 }
9468 }
9469
9470 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9471 for (i=1; i<argc; i++) {
9472 tstr = argv[i];
9473 StringValue(tstr);
9474 enc = rb_enc_check(str, tstr);
9475 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9476 }
9477
9478 s = RSTRING_PTR(str);
9479 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9480 send = RSTRING_END(str);
9481 ascompat = rb_enc_asciicompat(enc);
9482 while (s < send) {
9483 unsigned int c;
9484
9485 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9486 if (table[c]) {
9487 n++;
9488 }
9489 s++;
9490 }
9491 else {
9492 int clen;
9493 c = rb_enc_codepoint_len(s, send, &clen, enc);
9494 if (tr_find(c, table, del, nodel)) {
9495 n++;
9496 }
9497 s += clen;
9498 }
9499 }
9500
9501 return SIZET2NUM(n);
9502}
9503
9504static VALUE
9505rb_fs_check(VALUE val)
9506{
9507 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9508 val = rb_check_string_type(val);
9509 if (NIL_P(val)) return 0;
9510 }
9511 return val;
9512}
9513
/* Byte-indexed lookup: 1 for the six ASCII whitespace bytes, 0 for everything else. */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9534
9535static long
9536split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9537{
9538 if (empty_count >= 0 && len == 0) {
9539 return empty_count + 1;
9540 }
9541 if (empty_count > 0) {
9542 /* make different substrings */
9543 if (result) {
9544 do {
9545 rb_ary_push(result, str_new_empty_String(str));
9546 } while (--empty_count > 0);
9547 }
9548 else {
9549 do {
9550 rb_yield(str_new_empty_String(str));
9551 } while (--empty_count > 0);
9552 }
9553 }
9554 str = rb_str_subseq(str, beg, len);
9555 if (result) {
9556 rb_ary_push(result, str);
9557 }
9558 else {
9559 rb_yield(str);
9560 }
9561 return empty_count;
9562}
9563
/* Splitting strategy selected by rb_str_split_m. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on a regexp match */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9567
9568static split_type_t
9569literal_split_pattern(VALUE spat, split_type_t default_type)
9570{
9571 rb_encoding *enc = STR_ENC_GET(spat);
9572 const char *ptr;
9573 long len;
9574 RSTRING_GETMEM(spat, ptr, len);
9575 if (len == 0) {
9576 /* Special case - split into chars */
9577 return SPLIT_TYPE_CHARS;
9578 }
9579 else if (rb_enc_asciicompat(enc)) {
9580 if (len == 1 && ptr[0] == ' ') {
9581 return SPLIT_TYPE_AWK;
9582 }
9583 }
9584 else {
9585 int l;
9586 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9587 return SPLIT_TYPE_AWK;
9588 }
9589 }
9590 return default_type;
9591}
9592
9593/*
9594 * call-seq:
9595 * split(field_sep = $;, limit = 0) -> array
9596 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9597 *
9598 * :include: doc/string/split.rdoc
9599 *
9600 */
9601
9602static VALUE
9603rb_str_split_m(int argc, VALUE *argv, VALUE str)
9604{
9605 rb_encoding *enc;
9606 VALUE spat;
9607 VALUE limit;
9608 split_type_t split_type;
9609 long beg, end, i = 0, empty_count = -1;
9610 int lim = 0;
9611 VALUE result, tmp;
9612
9613 result = rb_block_given_p() ? Qfalse : Qnil;
9614 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9615 lim = NUM2INT(limit);
9616 if (lim <= 0) limit = Qnil;
9617 else if (lim == 1) {
9618 if (RSTRING_LEN(str) == 0)
9619 return result ? rb_ary_new2(0) : str;
9620 tmp = str_duplicate(rb_cString, str);
9621 if (!result) {
9622 rb_yield(tmp);
9623 return str;
9624 }
9625 return rb_ary_new3(1, tmp);
9626 }
9627 i = 1;
9628 }
9629 if (NIL_P(limit) && !lim) empty_count = 0;
9630
9631 enc = STR_ENC_GET(str);
9632 split_type = SPLIT_TYPE_REGEXP;
9633 if (!NIL_P(spat)) {
9634 spat = get_pat_quoted(spat, 0);
9635 }
9636 else if (NIL_P(spat = rb_fs)) {
9637 split_type = SPLIT_TYPE_AWK;
9638 }
9639 else if (!(spat = rb_fs_check(spat))) {
9640 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9641 }
9642 else {
9643 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9644 }
9645 if (split_type != SPLIT_TYPE_AWK) {
9646 switch (BUILTIN_TYPE(spat)) {
9647 case T_REGEXP:
9648 rb_reg_options(spat); /* check if uninitialized */
9649 tmp = RREGEXP_SRC(spat);
9650 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9651 if (split_type == SPLIT_TYPE_AWK) {
9652 spat = tmp;
9653 split_type = SPLIT_TYPE_STRING;
9654 }
9655 break;
9656
9657 case T_STRING:
9658 mustnot_broken(spat);
9659 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9660 break;
9661
9662 default:
9664 }
9665 }
9666
9667#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9668
9669 beg = 0;
9670 char *ptr = RSTRING_PTR(str);
9671 char *eptr = RSTRING_END(str);
9672 if (split_type == SPLIT_TYPE_AWK) {
9673 char *bptr = ptr;
9674 int skip = 1;
9675 unsigned int c;
9676
9677 if (result) result = rb_ary_new();
9678 end = beg;
9679 if (is_ascii_string(str)) {
9680 while (ptr < eptr) {
9681 c = (unsigned char)*ptr++;
9682 if (skip) {
9683 if (ascii_isspace(c)) {
9684 beg = ptr - bptr;
9685 }
9686 else {
9687 end = ptr - bptr;
9688 skip = 0;
9689 if (!NIL_P(limit) && lim <= i) break;
9690 }
9691 }
9692 else if (ascii_isspace(c)) {
9693 SPLIT_STR(beg, end-beg);
9694 skip = 1;
9695 beg = ptr - bptr;
9696 if (!NIL_P(limit)) ++i;
9697 }
9698 else {
9699 end = ptr - bptr;
9700 }
9701 }
9702 }
9703 else {
9704 while (ptr < eptr) {
9705 int n;
9706
9707 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9708 ptr += n;
9709 if (skip) {
9710 if (rb_isspace(c)) {
9711 beg = ptr - bptr;
9712 }
9713 else {
9714 end = ptr - bptr;
9715 skip = 0;
9716 if (!NIL_P(limit) && lim <= i) break;
9717 }
9718 }
9719 else if (rb_isspace(c)) {
9720 SPLIT_STR(beg, end-beg);
9721 skip = 1;
9722 beg = ptr - bptr;
9723 if (!NIL_P(limit)) ++i;
9724 }
9725 else {
9726 end = ptr - bptr;
9727 }
9728 }
9729 }
9730 }
9731 else if (split_type == SPLIT_TYPE_STRING) {
9732 char *str_start = ptr;
9733 char *substr_start = ptr;
9734 char *sptr = RSTRING_PTR(spat);
9735 long slen = RSTRING_LEN(spat);
9736
9737 if (result) result = rb_ary_new();
9738 mustnot_broken(str);
9739 enc = rb_enc_check(str, spat);
9740 while (ptr < eptr &&
9741 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9742 /* Check we are at the start of a char */
9743 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9744 if (t != ptr + end) {
9745 ptr = t;
9746 continue;
9747 }
9748 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9749 ptr += end + slen;
9750 substr_start = ptr;
9751 if (!NIL_P(limit) && lim <= ++i) break;
9752 }
9753 beg = ptr - str_start;
9754 }
9755 else if (split_type == SPLIT_TYPE_CHARS) {
9756 char *str_start = ptr;
9757 int n;
9758
9759 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9760 mustnot_broken(str);
9761 enc = rb_enc_get(str);
9762 while (ptr < eptr &&
9763 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9764 SPLIT_STR(ptr - str_start, n);
9765 ptr += n;
9766 if (!NIL_P(limit) && lim <= ++i) break;
9767 }
9768 beg = ptr - str_start;
9769 }
9770 else {
9771 if (result) result = rb_ary_new();
9772 long len = RSTRING_LEN(str);
9773 long start = beg;
9774 long idx;
9775 int last_null = 0;
9776 struct re_registers *regs;
9777 VALUE match = 0;
9778
9779 for (; rb_reg_search(spat, str, start, 0) >= 0;
9780 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9781 match = rb_backref_get();
9782 if (!result) rb_match_busy(match);
9783 regs = RMATCH_REGS(match);
9784 end = BEG(0);
9785 if (start == end && BEG(0) == END(0)) {
9786 if (!ptr) {
9787 SPLIT_STR(0, 0);
9788 break;
9789 }
9790 else if (last_null == 1) {
9791 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9792 beg = start;
9793 }
9794 else {
9795 if (start == len)
9796 start++;
9797 else
9798 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9799 last_null = 1;
9800 continue;
9801 }
9802 }
9803 else {
9804 SPLIT_STR(beg, end-beg);
9805 beg = start = END(0);
9806 }
9807 last_null = 0;
9808
9809 for (idx=1; idx < regs->num_regs; idx++) {
9810 if (BEG(idx) == -1) continue;
9811 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9812 }
9813 if (!NIL_P(limit) && lim <= ++i) break;
9814 }
9815 if (match) rb_match_unbusy(match);
9816 }
9817 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9818 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9819 }
9820
9821 return result ? result : str;
9822}
9823
9824VALUE
9825rb_str_split(VALUE str, const char *sep0)
9826{
9827 VALUE sep;
9828
9829 StringValue(str);
9830 sep = rb_str_new_cstr(sep0);
9831 return rb_str_split_m(1, &sep, str);
9832}
9833
9834#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9835
9836static inline int
9837enumerator_element(VALUE ary, VALUE e)
9838{
9839 if (ary) {
9840 rb_ary_push(ary, e);
9841 return 0;
9842 }
9843 else {
9844 rb_yield(e);
9845 return 1;
9846 }
9847}
9848
9849#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9850
9851static const char *
9852chomp_newline(const char *p, const char *e, rb_encoding *enc)
9853{
9854 const char *prev = rb_enc_prev_char(p, e, e, enc);
9855 if (rb_enc_is_newline(prev, e, enc)) {
9856 e = prev;
9857 prev = rb_enc_prev_char(p, e, e, enc);
9858 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9859 e = prev;
9860 }
9861 return e;
9862}
9863
9864static VALUE
9865get_rs(void)
9866{
9867 VALUE rs = rb_rs;
9868 if (!NIL_P(rs) &&
9869 (!RB_TYPE_P(rs, T_STRING) ||
9870 RSTRING_LEN(rs) != 1 ||
9871 RSTRING_PTR(rs)[0] != '\n')) {
9872 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9873 }
9874 return rs;
9875}
9876
9877#define rb_rs get_rs()
9878
9879static VALUE
9880rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9881{
9882 rb_encoding *enc;
9883 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9884 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9885 long pos, len, rslen;
9886 int rsnewline = 0;
9887
9888 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9889 rs = rb_rs;
9890 if (!NIL_P(opts)) {
9891 static ID keywords[1];
9892 if (!keywords[0]) {
9893 keywords[0] = rb_intern_const("chomp");
9894 }
9895 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9896 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9897 }
9898
9899 if (NIL_P(rs)) {
9900 if (!ENUM_ELEM(ary, str)) {
9901 return ary;
9902 }
9903 else {
9904 return orig;
9905 }
9906 }
9907
9908 if (!RSTRING_LEN(str)) goto end;
9909 str = rb_str_new_frozen(str);
9910 ptr = subptr = RSTRING_PTR(str);
9911 pend = RSTRING_END(str);
9912 len = RSTRING_LEN(str);
9913 StringValue(rs);
9914 rslen = RSTRING_LEN(rs);
9915
9916 if (rs == rb_default_rs)
9917 enc = rb_enc_get(str);
9918 else
9919 enc = rb_enc_check(str, rs);
9920
9921 if (rslen == 0) {
9922 /* paragraph mode */
9923 int n;
9924 const char *eol = NULL;
9925 subend = subptr;
9926 while (subend < pend) {
9927 long chomp_rslen = 0;
9928 do {
9929 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9930 n = 0;
9931 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9932 if (rb_enc_is_newline(subend + n, pend, enc)) {
9933 if (eol == subend) break;
9934 subend += rslen;
9935 if (subptr) {
9936 eol = subend;
9937 chomp_rslen = -rslen;
9938 }
9939 }
9940 else {
9941 if (!subptr) subptr = subend;
9942 subend += rslen;
9943 }
9944 rslen = 0;
9945 } while (subend < pend);
9946 if (!subptr) break;
9947 if (rslen == 0) chomp_rslen = 0;
9948 line = rb_str_subseq(str, subptr - ptr,
9949 subend - subptr + (chomp ? chomp_rslen : rslen));
9950 if (ENUM_ELEM(ary, line)) {
9951 str_mod_check(str, ptr, len);
9952 }
9953 subptr = eol = NULL;
9954 }
9955 goto end;
9956 }
9957 else {
9958 rsptr = RSTRING_PTR(rs);
9959 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9960 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9961 rsnewline = 1;
9962 }
9963 }
9964
9965 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9966 rs = rb_str_new(rsptr, rslen);
9967 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9968 rsptr = RSTRING_PTR(rs);
9969 rslen = RSTRING_LEN(rs);
9970 }
9971
9972 while (subptr < pend) {
9973 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9974 if (pos < 0) break;
9975 hit = subptr + pos;
9976 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9977 if (hit != adjusted) {
9978 subptr = adjusted;
9979 continue;
9980 }
9981 subend = hit += rslen;
9982 if (chomp) {
9983 if (rsnewline) {
9984 subend = chomp_newline(subptr, subend, enc);
9985 }
9986 else {
9987 subend -= rslen;
9988 }
9989 }
9990 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9991 if (ENUM_ELEM(ary, line)) {
9992 str_mod_check(str, ptr, len);
9993 }
9994 subptr = hit;
9995 }
9996
9997 if (subptr != pend) {
9998 if (chomp) {
9999 if (rsnewline) {
10000 pend = chomp_newline(subptr, pend, enc);
10001 }
10002 else if (pend - subptr >= rslen &&
10003 memcmp(pend - rslen, rsptr, rslen) == 0) {
10004 pend -= rslen;
10005 }
10006 }
10007 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
10008 ENUM_ELEM(ary, line);
10009 RB_GC_GUARD(str);
10010 }
10011
10012 end:
10013 if (ary)
10014 return ary;
10015 else
10016 return orig;
10017}
10018
10019/*
10020 * call-seq:
10021 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
10022 * each_line(line_sep = $/, chomp: false) -> enumerator
10023 *
10024 * :include: doc/string/each_line.rdoc
10025 *
10026 */
10027
10028static VALUE
10029rb_str_each_line(int argc, VALUE *argv, VALUE str)
10030{
10031 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
10032 return rb_str_enumerate_lines(argc, argv, str, 0);
10033}
10034
10035/*
10036 * call-seq:
10037 * lines(line_sep = $/, chomp: false) -> array_of_strings
10038 *
10039 * Forms substrings ("lines") of +self+ according to the given arguments
10040 * (see String#each_line for details); returns the lines in an array.
10041 *
10042 */
10043
10044static VALUE
10045rb_str_lines(int argc, VALUE *argv, VALUE str)
10046{
10047 VALUE ary = WANTARRAY("lines", 0);
10048 return rb_str_enumerate_lines(argc, argv, str, ary);
10049}
10050
10051static VALUE
10052rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
10053{
10054 return LONG2FIX(RSTRING_LEN(str));
10055}
10056
10057static VALUE
10058rb_str_enumerate_bytes(VALUE str, VALUE ary)
10059{
10060 long i;
10061
10062 for (i=0; i<RSTRING_LEN(str); i++) {
10063 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
10064 }
10065 if (ary)
10066 return ary;
10067 else
10068 return str;
10069}
10070
10071/*
10072 * call-seq:
10073 * each_byte {|byte| ... } -> self
10074 * each_byte -> enumerator
10075 *
10076 * :include: doc/string/each_byte.rdoc
10077 *
10078 */
10079
10080static VALUE
10081rb_str_each_byte(VALUE str)
10082{
10083 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
10084 return rb_str_enumerate_bytes(str, 0);
10085}
10086
10087/*
10088 * call-seq:
10089 * bytes -> array_of_bytes
10090 *
10091 * :include: doc/string/bytes.rdoc
10092 *
10093 */
10094
10095static VALUE
10096rb_str_bytes(VALUE str)
10097{
10098 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
10099 return rb_str_enumerate_bytes(str, ary);
10100}
10101
10102static VALUE
10103rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
10104{
10105 return rb_str_length(str);
10106}
10107
10108static VALUE
10109rb_str_enumerate_chars(VALUE str, VALUE ary)
10110{
10111 VALUE orig = str;
10112 long i, len, n;
10113 const char *ptr;
10114 rb_encoding *enc;
10115
10116 str = rb_str_new_frozen(str);
10117 ptr = RSTRING_PTR(str);
10118 len = RSTRING_LEN(str);
10119 enc = rb_enc_get(str);
10120
10122 for (i = 0; i < len; i += n) {
10123 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
10124 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10125 }
10126 }
10127 else {
10128 for (i = 0; i < len; i += n) {
10129 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
10130 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
10131 }
10132 }
10133 RB_GC_GUARD(str);
10134 if (ary)
10135 return ary;
10136 else
10137 return orig;
10138}
10139
10140/*
10141 * call-seq:
10142 * each_char {|c| ... } -> self
10143 * each_char -> enumerator
10144 *
10145 * :include: doc/string/each_char.rdoc
10146 *
10147 */
10148
10149static VALUE
10150rb_str_each_char(VALUE str)
10151{
10152 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10153 return rb_str_enumerate_chars(str, 0);
10154}
10155
10156/*
10157 * call-seq:
10158 * chars -> array_of_characters
10159 *
10160 * :include: doc/string/chars.rdoc
10161 *
10162 */
10163
10164static VALUE
10165rb_str_chars(VALUE str)
10166{
10167 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
10168 return rb_str_enumerate_chars(str, ary);
10169}
10170
10171static VALUE
10172rb_str_enumerate_codepoints(VALUE str, VALUE ary)
10173{
10174 VALUE orig = str;
10175 int n;
10176 unsigned int c;
10177 const char *ptr, *end;
10178 rb_encoding *enc;
10179
10180 if (single_byte_optimizable(str))
10181 return rb_str_enumerate_bytes(str, ary);
10182
10183 str = rb_str_new_frozen(str);
10184 ptr = RSTRING_PTR(str);
10185 end = RSTRING_END(str);
10186 enc = STR_ENC_GET(str);
10187
10188 while (ptr < end) {
10189 c = rb_enc_codepoint_len(ptr, end, &n, enc);
10190 ENUM_ELEM(ary, UINT2NUM(c));
10191 ptr += n;
10192 }
10193 RB_GC_GUARD(str);
10194 if (ary)
10195 return ary;
10196 else
10197 return orig;
10198}
10199
10200/*
10201 * call-seq:
10202 * each_codepoint {|integer| ... } -> self
10203 * each_codepoint -> enumerator
10204 *
10205 * :include: doc/string/each_codepoint.rdoc
10206 *
10207 */
10208
10209static VALUE
10210rb_str_each_codepoint(VALUE str)
10211{
10212 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
10213 return rb_str_enumerate_codepoints(str, 0);
10214}
10215
10216/*
10217 * call-seq:
10218 * codepoints -> array_of_integers
10219 *
10220 * :include: doc/string/codepoints.rdoc
10221 *
10222 */
10223
10224static VALUE
10225rb_str_codepoints(VALUE str)
10226{
10227 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
10228 return rb_str_enumerate_codepoints(str, ary);
10229}
10230
10231static regex_t *
10232get_reg_grapheme_cluster(rb_encoding *enc)
10233{
10234 int encidx = rb_enc_to_index(enc);
10235
10236 const OnigUChar source_ascii[] = "\\X";
10237 const OnigUChar *source = source_ascii;
10238 size_t source_len = sizeof(source_ascii) - 1;
10239
10240 switch (encidx) {
10241#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
10242#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
10243#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
10244#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
10245#define CASE_UTF(e) \
10246 case ENCINDEX_UTF_##e: { \
10247 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
10248 source = source_UTF_##e; \
10249 source_len = sizeof(source_UTF_##e); \
10250 break; \
10251 }
10252 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
10253#undef CASE_UTF
10254#undef CHARS_16BE
10255#undef CHARS_16LE
10256#undef CHARS_32BE
10257#undef CHARS_32LE
10258 }
10259
10260 regex_t *reg_grapheme_cluster;
10261 OnigErrorInfo einfo;
10262 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
10263 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
10264 if (r) {
10265 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
10266 onig_error_code_to_str(message, r, &einfo);
10267 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
10268 }
10269
10270 return reg_grapheme_cluster;
10271}
10272
10273static regex_t *
10274get_cached_reg_grapheme_cluster(rb_encoding *enc)
10275{
10276 int encidx = rb_enc_to_index(enc);
10277 static regex_t *reg_grapheme_cluster_utf8 = NULL;
10278
10279 if (encidx == rb_utf8_encindex()) {
10280 if (!reg_grapheme_cluster_utf8) {
10281 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
10282 }
10283
10284 return reg_grapheme_cluster_utf8;
10285 }
10286
10287 return NULL;
10288}
10289
10290static VALUE
10291rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10292{
10293 size_t grapheme_cluster_count = 0;
10294 rb_encoding *enc = get_encoding(str);
10295 const char *ptr, *end;
10296
10297 if (!rb_enc_unicode_p(enc)) {
10298 return rb_str_length(str);
10299 }
10300
10301 bool cached_reg_grapheme_cluster = true;
10302 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10303 if (!reg_grapheme_cluster) {
10304 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10305 cached_reg_grapheme_cluster = false;
10306 }
10307
10308 ptr = RSTRING_PTR(str);
10309 end = RSTRING_END(str);
10310
10311 while (ptr < end) {
10312 OnigPosition len = onig_match(reg_grapheme_cluster,
10313 (const OnigUChar *)ptr, (const OnigUChar *)end,
10314 (const OnigUChar *)ptr, NULL, 0);
10315 if (len <= 0) break;
10316 grapheme_cluster_count++;
10317 ptr += len;
10318 }
10319
10320 if (!cached_reg_grapheme_cluster) {
10321 onig_free(reg_grapheme_cluster);
10322 }
10323
10324 return SIZET2NUM(grapheme_cluster_count);
10325}
10326
10327static VALUE
10328rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10329{
10330 VALUE orig = str;
10331 rb_encoding *enc = get_encoding(str);
10332 const char *ptr0, *ptr, *end;
10333
10334 if (!rb_enc_unicode_p(enc)) {
10335 return rb_str_enumerate_chars(str, ary);
10336 }
10337
10338 if (!ary) str = rb_str_new_frozen(str);
10339
10340 bool cached_reg_grapheme_cluster = true;
10341 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10342 if (!reg_grapheme_cluster) {
10343 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10344 cached_reg_grapheme_cluster = false;
10345 }
10346
10347 ptr0 = ptr = RSTRING_PTR(str);
10348 end = RSTRING_END(str);
10349
10350 while (ptr < end) {
10351 OnigPosition len = onig_match(reg_grapheme_cluster,
10352 (const OnigUChar *)ptr, (const OnigUChar *)end,
10353 (const OnigUChar *)ptr, NULL, 0);
10354 if (len <= 0) break;
10355 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10356 ptr += len;
10357 }
10358
10359 if (!cached_reg_grapheme_cluster) {
10360 onig_free(reg_grapheme_cluster);
10361 }
10362
10363 RB_GC_GUARD(str);
10364 if (ary)
10365 return ary;
10366 else
10367 return orig;
10368}
10369
10370/*
10371 * call-seq:
10372 * each_grapheme_cluster {|gc| ... } -> self
10373 * each_grapheme_cluster -> enumerator
10374 *
10375 * :include: doc/string/each_grapheme_cluster.rdoc
10376 *
10377 */
10378
10379static VALUE
10380rb_str_each_grapheme_cluster(VALUE str)
10381{
10382 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10383 return rb_str_enumerate_grapheme_clusters(str, 0);
10384}
10385
10386/*
10387 * call-seq:
10388 * grapheme_clusters -> array_of_grapheme_clusters
10389 *
10390 * :include: doc/string/grapheme_clusters.rdoc
10391 *
10392 */
10393
10394static VALUE
10395rb_str_grapheme_clusters(VALUE str)
10396{
10397 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10398 return rb_str_enumerate_grapheme_clusters(str, ary);
10399}
10400
10401static long
10402chopped_length(VALUE str)
10403{
10404 rb_encoding *enc = STR_ENC_GET(str);
10405 const char *p, *p2, *beg, *end;
10406
10407 beg = RSTRING_PTR(str);
10408 end = beg + RSTRING_LEN(str);
10409 if (beg >= end) return 0;
10410 p = rb_enc_prev_char(beg, end, end, enc);
10411 if (!p) return 0;
10412 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10413 p2 = rb_enc_prev_char(beg, p, end, enc);
10414 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10415 }
10416 return p - beg;
10417}
10418
10419/*
10420 * call-seq:
10421 * chop! -> self or nil
10422 *
10423 * Like String#chop, but modifies +self+ in place;
10424 * returns +nil+ if +self+ is empty, +self+ otherwise.
10425 *
10426 * Related: String#chomp!.
10427 */
10428
10429static VALUE
10430rb_str_chop_bang(VALUE str)
10431{
10432 str_modify_keep_cr(str);
10433 if (RSTRING_LEN(str) > 0) {
10434 long len;
10435 len = chopped_length(str);
10436 STR_SET_LEN(str, len);
10437 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10438 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10440 }
10441 return str;
10442 }
10443 return Qnil;
10444}
10445
10446
10447/*
10448 * call-seq:
10449 * chop -> new_string
10450 *
10451 * :include: doc/string/chop.rdoc
10452 *
10453 */
10454
10455static VALUE
10456rb_str_chop(VALUE str)
10457{
10458 return rb_str_subseq(str, 0, chopped_length(str));
10459}
10460
10461static long
10462smart_chomp(VALUE str, const char *e, const char *p)
10463{
10464 rb_encoding *enc = rb_enc_get(str);
10465 if (rb_enc_mbminlen(enc) > 1) {
10466 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10467 if (rb_enc_is_newline(pp, e, enc)) {
10468 e = pp;
10469 }
10470 pp = e - rb_enc_mbminlen(enc);
10471 if (pp >= p) {
10472 pp = rb_enc_left_char_head(p, pp, e, enc);
10473 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10474 e = pp;
10475 }
10476 }
10477 }
10478 else {
10479 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10480 case '\n':
10481 if (--e > p && *(e-1) == '\r') {
10482 --e;
10483 }
10484 break;
10485 case '\r':
10486 --e;
10487 break;
10488 }
10489 }
10490 return e - p;
10491}
10492
10493static long
10494chompped_length(VALUE str, VALUE rs)
10495{
10496 rb_encoding *enc;
10497 int newline;
10498 char *pp, *e, *rsptr;
10499 long rslen;
10500 char *const p = RSTRING_PTR(str);
10501 long len = RSTRING_LEN(str);
10502
10503 if (len == 0) return 0;
10504 e = p + len;
10505 if (rs == rb_default_rs) {
10506 return smart_chomp(str, e, p);
10507 }
10508
10509 enc = rb_enc_get(str);
10510 RSTRING_GETMEM(rs, rsptr, rslen);
10511 if (rslen == 0) {
10512 if (rb_enc_mbminlen(enc) > 1) {
10513 while (e > p) {
10514 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10515 if (!rb_enc_is_newline(pp, e, enc)) break;
10516 e = pp;
10517 pp -= rb_enc_mbminlen(enc);
10518 if (pp >= p) {
10519 pp = rb_enc_left_char_head(p, pp, e, enc);
10520 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10521 e = pp;
10522 }
10523 }
10524 }
10525 }
10526 else {
10527 while (e > p && *(e-1) == '\n') {
10528 --e;
10529 if (e > p && *(e-1) == '\r')
10530 --e;
10531 }
10532 }
10533 return e - p;
10534 }
10535 if (rslen > len) return len;
10536
10537 enc = rb_enc_get(rs);
10538 newline = rsptr[rslen-1];
10539 if (rslen == rb_enc_mbminlen(enc)) {
10540 if (rslen == 1) {
10541 if (newline == '\n')
10542 return smart_chomp(str, e, p);
10543 }
10544 else {
10545 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10546 return smart_chomp(str, e, p);
10547 }
10548 }
10549
10550 enc = rb_enc_check(str, rs);
10551 if (is_broken_string(rs)) {
10552 return len;
10553 }
10554 pp = e - rslen;
10555 if (p[len-1] == newline &&
10556 (rslen <= 1 ||
10557 memcmp(rsptr, pp, rslen) == 0)) {
10558 if (at_char_boundary(p, pp, e, enc))
10559 return len - rslen;
10560 RB_GC_GUARD(rs);
10561 }
10562 return len;
10563}
10564
10570static VALUE
10571chomp_rs(int argc, const VALUE *argv)
10572{
10573 rb_check_arity(argc, 0, 1);
10574 if (argc > 0) {
10575 VALUE rs = argv[0];
10576 if (!NIL_P(rs)) StringValue(rs);
10577 return rs;
10578 }
10579 else {
10580 return rb_rs;
10581 }
10582}
10583
10584VALUE
10585rb_str_chomp_string(VALUE str, VALUE rs)
10586{
10587 long olen = RSTRING_LEN(str);
10588 long len = chompped_length(str, rs);
10589 if (len >= olen) return Qnil;
10590 str_modify_keep_cr(str);
10591 STR_SET_LEN(str, len);
10592 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10593 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10595 }
10596 return str;
10597}
10598
10599/*
10600 * call-seq:
10601 * chomp!(line_sep = $/) -> self or nil
10602 *
10603 * Like String#chomp, but modifies +self+ in place;
10604 * returns +nil+ if no modification made, +self+ otherwise.
10605 *
10606 */
10607
10608static VALUE
10609rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10610{
10611 VALUE rs;
10612 str_modifiable(str);
10613 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10614 rs = chomp_rs(argc, argv);
10615 if (NIL_P(rs)) return Qnil;
10616 return rb_str_chomp_string(str, rs);
10617}
10618
10619
10620/*
10621 * call-seq:
10622 * chomp(line_sep = $/) -> new_string
10623 *
10624 * :include: doc/string/chomp.rdoc
10625 *
10626 */
10627
10628static VALUE
10629rb_str_chomp(int argc, VALUE *argv, VALUE str)
10630{
10631 VALUE rs = chomp_rs(argc, argv);
10632 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10633 return rb_str_subseq(str, 0, chompped_length(str, rs));
10634}
10635
10636static long
10637lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10638{
10639 const char *const start = s;
10640
10641 if (!s || s >= e) return 0;
10642
10643 /* remove spaces at head */
10644 if (single_byte_optimizable(str)) {
10645 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10646 }
10647 else {
10648 while (s < e) {
10649 int n;
10650 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10651
10652 if (cc && !rb_isspace(cc)) break;
10653 s += n;
10654 }
10655 }
10656 return s - start;
10657}
10658
10659/*
10660 * call-seq:
10661 * lstrip! -> self or nil
10662 *
10663 * Like String#lstrip, except that any modifications are made in +self+;
10664 * returns +self+ if any modifications are made, +nil+ otherwise.
10665 *
10666 * Related: String#rstrip!, String#strip!.
10667 */
10668
10669static VALUE
10670rb_str_lstrip_bang(VALUE str)
10671{
10672 rb_encoding *enc;
10673 char *start, *s;
10674 long olen, loffset;
10675
10676 str_modify_keep_cr(str);
10677 enc = STR_ENC_GET(str);
10678 RSTRING_GETMEM(str, start, olen);
10679 loffset = lstrip_offset(str, start, start+olen, enc);
10680 if (loffset > 0) {
10681 long len = olen-loffset;
10682 s = start + loffset;
10683 memmove(start, s, len);
10684 STR_SET_LEN(str, len);
10685 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10686 return str;
10687 }
10688 return Qnil;
10689}
10690
10691
10692/*
10693 * call-seq:
10694 * lstrip -> new_string
10695 *
10696 * Returns a copy of +self+ with leading whitespace removed;
10697 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10698 *
10699 * whitespace = "\x00\t\n\v\f\r "
10700 * s = whitespace + 'abc' + whitespace
10701 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10702 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10703 *
10704 * Related: String#rstrip, String#strip.
10705 */
10706
10707static VALUE
10708rb_str_lstrip(VALUE str)
10709{
10710 char *start;
10711 long len, loffset;
10712 RSTRING_GETMEM(str, start, len);
10713 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10714 if (loffset <= 0) return str_duplicate(rb_cString, str);
10715 return rb_str_subseq(str, loffset, len - loffset);
10716}
10717
10718static long
10719rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10720{
10721 const char *t;
10722
10723 rb_str_check_dummy_enc(enc);
10725 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10726 }
10727 if (!s || s >= e) return 0;
10728 t = e;
10729
10730 /* remove trailing spaces or '\0's */
10731 if (single_byte_optimizable(str)) {
10732 unsigned char c;
10733 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10734 }
10735 else {
10736 char *tp;
10737
10738 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10739 unsigned int c = rb_enc_codepoint(tp, e, enc);
10740 if (c && !rb_isspace(c)) break;
10741 t = tp;
10742 }
10743 }
10744 return e - t;
10745}
10746
10747/*
10748 * call-seq:
10749 * rstrip! -> self or nil
10750 *
10751 * Like String#rstrip, except that any modifications are made in +self+;
10752 * returns +self+ if any modifications are made, +nil+ otherwise.
10753 *
10754 * Related: String#lstrip!, String#strip!.
10755 */
10756
10757static VALUE
10758rb_str_rstrip_bang(VALUE str)
10759{
10760 rb_encoding *enc;
10761 char *start;
10762 long olen, roffset;
10763
10764 str_modify_keep_cr(str);
10765 enc = STR_ENC_GET(str);
10766 RSTRING_GETMEM(str, start, olen);
10767 roffset = rstrip_offset(str, start, start+olen, enc);
10768 if (roffset > 0) {
10769 long len = olen - roffset;
10770
10771 STR_SET_LEN(str, len);
10772 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10773 return str;
10774 }
10775 return Qnil;
10776}
10777
10778
10779/*
10780 * call-seq:
10781 * rstrip -> new_string
10782 *
10783 * Returns a copy of the receiver with trailing whitespace removed;
10784 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10785 *
10786 * whitespace = "\x00\t\n\v\f\r "
10787 * s = whitespace + 'abc' + whitespace
10788 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10789 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10790 *
10791 * Related: String#lstrip, String#strip.
10792 */
10793
10794static VALUE
10795rb_str_rstrip(VALUE str)
10796{
10797 rb_encoding *enc;
10798 char *start;
10799 long olen, roffset;
10800
10801 enc = STR_ENC_GET(str);
10802 RSTRING_GETMEM(str, start, olen);
10803 roffset = rstrip_offset(str, start, start+olen, enc);
10804
10805 if (roffset <= 0) return str_duplicate(rb_cString, str);
10806 return rb_str_subseq(str, 0, olen-roffset);
10807}
10808
10809
10810/*
10811 * call-seq:
10812 * strip! -> self or nil
10813 *
10814 * Like String#strip, except that any modifications are made in +self+;
10815 * returns +self+ if any modifications are made, +nil+ otherwise.
10816 *
10817 * Related: String#lstrip!, String#rstrip!.
10818 */
10819
10820static VALUE
10821rb_str_strip_bang(VALUE str)
10822{
10823 char *start;
10824 long olen, loffset, roffset;
10825 rb_encoding *enc;
10826
10827 str_modify_keep_cr(str);
10828 enc = STR_ENC_GET(str);
10829 RSTRING_GETMEM(str, start, olen);
10830 loffset = lstrip_offset(str, start, start+olen, enc);
10831 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10832
10833 if (loffset > 0 || roffset > 0) {
10834 long len = olen-roffset;
10835 if (loffset > 0) {
10836 len -= loffset;
10837 memmove(start, start + loffset, len);
10838 }
10839 STR_SET_LEN(str, len);
10840 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10841 return str;
10842 }
10843 return Qnil;
10844}
10845
10846
10847/*
10848 * call-seq:
10849 * strip -> new_string
10850 *
10851 * Returns a copy of the receiver with leading and trailing whitespace removed;
10852 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10853 *
10854 * whitespace = "\x00\t\n\v\f\r "
10855 * s = whitespace + 'abc' + whitespace
10856 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10857 * s.strip # => "abc"
10858 *
10859 * Related: String#lstrip, String#rstrip.
10860 */
10861
10862static VALUE
10863rb_str_strip(VALUE str)
10864{
10865 char *start;
10866 long olen, loffset, roffset;
10867 rb_encoding *enc = STR_ENC_GET(str);
10868
10869 RSTRING_GETMEM(str, start, olen);
10870 loffset = lstrip_offset(str, start, start+olen, enc);
10871 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10872
10873 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10874 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10875}
10876
10877static VALUE
10878scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10879{
10880 VALUE result = Qnil;
10881 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10882 if (pos >= 0) {
10883 VALUE match;
10884 struct re_registers *regs;
10885 if (BUILTIN_TYPE(pat) == T_STRING) {
10886 regs = NULL;
10887 end = pos + RSTRING_LEN(pat);
10888 }
10889 else {
10890 match = rb_backref_get();
10891 regs = RMATCH_REGS(match);
10892 pos = BEG(0);
10893 end = END(0);
10894 }
10895
10896 if (pos == end) {
10897 rb_encoding *enc = STR_ENC_GET(str);
10898 /*
10899 * Always consume at least one character of the input string
10900 */
10901 if (RSTRING_LEN(str) > end)
10902 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10903 RSTRING_END(str), enc);
10904 else
10905 *start = end + 1;
10906 }
10907 else {
10908 *start = end;
10909 }
10910
10911 if (!regs || regs->num_regs == 1) {
10912 result = rb_str_subseq(str, pos, end - pos);
10913 return result;
10914 }
10915 else {
10916 result = rb_ary_new2(regs->num_regs);
10917 for (int i = 1; i < regs->num_regs; i++) {
10918 VALUE s = Qnil;
10919 if (BEG(i) >= 0) {
10920 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10921 }
10922
10923 rb_ary_push(result, s);
10924 }
10925 }
10926
10927 RB_GC_GUARD(match);
10928 }
10929
10930 return result;
10931}
10932
10933
10934/*
10935 * call-seq:
10936 * scan(string_or_regexp) -> array
10937 * scan(string_or_regexp) {|matches| ... } -> self
10938 *
10939 * Matches a pattern against +self+; the pattern is:
10940 *
10941 * - +string_or_regexp+ itself, if it is a Regexp.
10942 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10943 *
10944 * Iterates through +self+, generating a collection of matching results:
10945 *
10946 * - If the pattern contains no groups, each result is the
10947 * matched string, <code>$&</code>.
10948 * - If the pattern contains groups, each result is an array
10949 * containing one entry per group.
10950 *
10951 * With no block given, returns an array of the results:
10952 *
10953 * s = 'cruel world'
10954 * s.scan(/\w+/) # => ["cruel", "world"]
10955 * s.scan(/.../) # => ["cru", "el ", "wor"]
10956 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10957 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10958 *
10959 * With a block given, calls the block with each result; returns +self+:
10960 *
10961 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10962 * print "\n"
10963 * s.scan(/(.)(.)/) {|x,y| print y, x }
10964 * print "\n"
10965 *
10966 * Output:
10967 *
10968 * <<cruel>> <<world>>
10969 * rceu lowlr
10970 *
10971 */
10972
10973static VALUE
10974rb_str_scan(VALUE str, VALUE pat)
10975{
10976 VALUE result;
10977 long start = 0;
10978 long last = -1, prev = 0;
10979 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10980
10981 pat = get_pat_quoted(pat, 1);
10982 mustnot_broken(str);
10983 if (!rb_block_given_p()) {
10984 VALUE ary = rb_ary_new();
10985
10986 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10987 last = prev;
10988 prev = start;
10989 rb_ary_push(ary, result);
10990 }
10991 if (last >= 0) rb_pat_search(pat, str, last, 1);
10992 else rb_backref_set(Qnil);
10993 return ary;
10994 }
10995
10996 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10997 last = prev;
10998 prev = start;
10999 rb_yield(result);
11000 str_mod_check(str, p, len);
11001 }
11002 if (last >= 0) rb_pat_search(pat, str, last, 1);
11003 return str;
11004}
11005
11006
11007/*
11008 * call-seq:
11009 * hex -> integer
11010 *
11011 * Interprets the leading substring of +self+ as a string of hexadecimal digits
11012 * (with an optional sign and an optional <code>0x</code>) and returns the
11013 * corresponding number;
11014 * returns zero if there is no such leading substring:
11015 *
11016 * '0x0a'.hex # => 10
11017 * '-1234'.hex # => -4660
11018 * '0'.hex # => 0
11019 * 'non-numeric'.hex # => 0
11020 *
11021 * Related: String#oct.
11022 *
11023 */
11024
11025static VALUE
11026rb_str_hex(VALUE str)
11027{
11028 return rb_str_to_inum(str, 16, FALSE);
11029}
11030
11031
11032/*
11033 * call-seq:
11034 * oct -> integer
11035 *
11036 * Interprets the leading substring of +self+ as a string of octal digits
11037 * (with an optional sign) and returns the corresponding number;
11038 * returns zero if there is no such leading substring:
11039 *
11040 * '123'.oct # => 83
11041 * '-377'.oct # => -255
11042 * '0377non-numeric'.oct # => 255
11043 * 'non-numeric'.oct # => 0
11044 *
11045 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
11046 * see Kernel#Integer.
11047 *
11048 * Related: String#hex.
11049 *
11050 */
11051
11052static VALUE
11053rb_str_oct(VALUE str)
11054{
11055 return rb_str_to_inum(str, -8, FALSE);
11056}
11057
11058#ifndef HAVE_CRYPT_R
11059# include "ruby/thread_native.h"
11060# include "ruby/atomic.h"
11061
11062static struct {
11063 rb_nativethread_lock_t lock;
11064} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
11065
11066static void
11067crypt_mutex_initialize(void)
11068{
11069}
11070#endif
11071
11072/*
11073 * call-seq:
11074 * crypt(salt_str) -> new_string
11075 *
11076 * Returns the string generated by calling <code>crypt(3)</code>
11077 * standard library function with <code>str</code> and
11078 * <code>salt_str</code>, in this order, as its arguments. Please do
11079 * not use this method any longer. It is legacy; provided only for
11080 * backward compatibility with ruby scripts in earlier days. It is
11081 * bad to use in contemporary programs for several reasons:
11082 *
11083 * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
11084 * it is run. The generated string lacks data portability.
11085 *
11086 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
11087 * (i.e. silently ends up in unexpected results).
11088 *
11089 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
11090 * thread safe.
11091 *
11092 * * So-called "traditional" usage of <code>crypt(3)</code> is very
11093 * very very weak. According to its manpage, Linux's traditional
11094 * <code>crypt(3)</code> output has only 2**56 variations; too
11095 * easy to brute force today. And this is the default behaviour.
11096 *
11097 * * In order to make things robust some OSes implement so-called
11098 * "modular" usage. To go through, you have to do a complex
11099 * build-up of the <code>salt_str</code> parameter, by hand.
11100 * Failure in generation of a proper salt string tends not to
11101 * yield any errors; typos in parameters are normally not
11102 * detectable.
11103 *
11104 * * For instance, in the following example, the second invocation
11105 * of String#crypt is wrong; it has a typo in "round=" (lacks
11106 * "s"). However the call does not fail and something unexpected
11107 * is generated.
11108 *
11109 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
11110 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
11111 *
11112 * * Even in the "modular" mode, some hash functions are considered
11113 * archaic and no longer recommended at all; for instance module
11114 * <code>$1$</code> is officially abandoned by its author: see
11115 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
11116 * instance module <code>$3$</code> is considered completely
11117 * broken: see the manpage of FreeBSD.
11118 *
11119 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
11120 * written above, <code>crypt(3)</code> on Mac OS never fails.
11121 * This means even if you build up a proper salt string it
11122 * generates a traditional DES hash anyways, and there is no way
11123 * for you to be aware of.
11124 *
11125 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
11126 *
11127 * If for some reason you cannot migrate to other secure contemporary
11128 * password hashing algorithms, install the string-crypt gem and
11129 * <code>require 'string/crypt'</code> to continue using it.
11130 */
11131
11132static VALUE
11133rb_str_crypt(VALUE str, VALUE salt)
11134{
11135#ifdef HAVE_CRYPT_R
11136 VALUE databuf;
11137 struct crypt_data *data;
11138# define CRYPT_END() ALLOCV_END(databuf)
11139#else
11140 extern char *crypt(const char *, const char *);
11141# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
11142#endif
11143 VALUE result;
11144 const char *s, *saltp;
11145 char *res;
11146#ifdef BROKEN_CRYPT
11147 char salt_8bit_clean[3];
11148#endif
11149
11150 StringValue(salt);
11151 mustnot_wchar(str);
11152 mustnot_wchar(salt);
11153 s = StringValueCStr(str);
11154 saltp = RSTRING_PTR(salt);
11155 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
11156 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
11157 }
11158
11159#ifdef BROKEN_CRYPT
11160 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
11161 salt_8bit_clean[0] = saltp[0] & 0x7f;
11162 salt_8bit_clean[1] = saltp[1] & 0x7f;
11163 salt_8bit_clean[2] = '\0';
11164 saltp = salt_8bit_clean;
11165 }
11166#endif
11167#ifdef HAVE_CRYPT_R
11168 data = ALLOCV(databuf, sizeof(struct crypt_data));
11169# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
11170 data->initialized = 0;
11171# endif
11172 res = crypt_r(s, saltp, data);
11173#else
11174 crypt_mutex_initialize();
11175 rb_nativethread_lock_lock(&crypt_mutex.lock);
11176 res = crypt(s, saltp);
11177#endif
11178 if (!res) {
11179 int err = errno;
11180 CRYPT_END();
11181 rb_syserr_fail(err, "crypt");
11182 }
11183 result = rb_str_new_cstr(res);
11184 CRYPT_END();
11185 return result;
11186}
11187
11188
11189/*
11190 * call-seq:
11191 * ord -> integer
11192 *
11193 * :include: doc/string/ord.rdoc
11194 *
11195 */
11196
11197static VALUE
11198rb_str_ord(VALUE s)
11199{
11200 unsigned int c;
11201
11202 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11203 return UINT2NUM(c);
11204}
11205/*
11206 * call-seq:
11207 * sum(n = 16) -> integer
11208 *
11209 * :include: doc/string/sum.rdoc
11210 *
11211 */
11212
11213static VALUE
11214rb_str_sum(int argc, VALUE *argv, VALUE str)
11215{
11216 int bits = 16;
11217 char *ptr, *p, *pend;
11218 long len;
11219 VALUE sum = INT2FIX(0);
11220 unsigned long sum0 = 0;
11221
11222 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11223 bits = 0;
11224 }
11225 ptr = p = RSTRING_PTR(str);
11226 len = RSTRING_LEN(str);
11227 pend = p + len;
11228
11229 while (p < pend) {
11230 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11231 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11232 str_mod_check(str, ptr, len);
11233 sum0 = 0;
11234 }
11235 sum0 += (unsigned char)*p;
11236 p++;
11237 }
11238
11239 if (bits == 0) {
11240 if (sum0) {
11241 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11242 }
11243 }
11244 else {
11245 if (sum == INT2FIX(0)) {
11246 if (bits < (int)sizeof(long)*CHAR_BIT) {
11247 sum0 &= (((unsigned long)1)<<bits)-1;
11248 }
11249 sum = LONG2FIX(sum0);
11250 }
11251 else {
11252 VALUE mod;
11253
11254 if (sum0) {
11255 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11256 }
11257
11258 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11259 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11260 sum = rb_funcall(sum, '&', 1, mod);
11261 }
11262 }
11263 return sum;
11264}
11265
11266static VALUE
11267rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11268{
11269 rb_encoding *enc;
11270 VALUE w;
11271 long width, len, flen = 1, fclen = 1;
11272 VALUE res;
11273 char *p;
11274 const char *f = " ";
11275 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11276 VALUE pad;
11277 int singlebyte = 1, cr;
11278 int termlen;
11279
11280 rb_scan_args(argc, argv, "11", &w, &pad);
11281 enc = STR_ENC_GET(str);
11282 termlen = rb_enc_mbminlen(enc);
11283 width = NUM2LONG(w);
11284 if (argc == 2) {
11285 StringValue(pad);
11286 enc = rb_enc_check(str, pad);
11287 f = RSTRING_PTR(pad);
11288 flen = RSTRING_LEN(pad);
11289 fclen = str_strlen(pad, enc); /* rb_enc_check */
11290 singlebyte = single_byte_optimizable(pad);
11291 if (flen == 0 || fclen == 0) {
11292 rb_raise(rb_eArgError, "zero width padding");
11293 }
11294 }
11295 len = str_strlen(str, enc); /* rb_enc_check */
11296 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11297 n = width - len;
11298 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11299 rlen = n - llen;
11300 cr = ENC_CODERANGE(str);
11301 if (flen > 1) {
11302 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11303 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11304 }
11305 size = RSTRING_LEN(str);
11306 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11307 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11308 (len += llen2 + rlen2) >= LONG_MAX - size) {
11309 rb_raise(rb_eArgError, "argument too big");
11310 }
11311 len += size;
11312 res = str_enc_new(rb_cString, 0, len, enc);
11313 p = RSTRING_PTR(res);
11314 if (flen <= 1) {
11315 memset(p, *f, llen);
11316 p += llen;
11317 }
11318 else {
11319 while (llen >= fclen) {
11320 memcpy(p,f,flen);
11321 p += flen;
11322 llen -= fclen;
11323 }
11324 if (llen > 0) {
11325 memcpy(p, f, llen2);
11326 p += llen2;
11327 }
11328 }
11329 memcpy(p, RSTRING_PTR(str), size);
11330 p += size;
11331 if (flen <= 1) {
11332 memset(p, *f, rlen);
11333 p += rlen;
11334 }
11335 else {
11336 while (rlen >= fclen) {
11337 memcpy(p,f,flen);
11338 p += flen;
11339 rlen -= fclen;
11340 }
11341 if (rlen > 0) {
11342 memcpy(p, f, rlen2);
11343 p += rlen2;
11344 }
11345 }
11346 TERM_FILL(p, termlen);
11347 STR_SET_LEN(res, p-RSTRING_PTR(res));
11348
11349 if (argc == 2)
11350 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11351 if (cr != ENC_CODERANGE_BROKEN)
11352 ENC_CODERANGE_SET(res, cr);
11353
11354 RB_GC_GUARD(pad);
11355 return res;
11356}
11357
11358
11359/*
11360 * call-seq:
11361 * ljust(size, pad_string = ' ') -> new_string
11362 *
11363 * :include: doc/string/ljust.rdoc
11364 *
11365 * Related: String#rjust, String#center.
11366 *
11367 */
11368
11369static VALUE
11370rb_str_ljust(int argc, VALUE *argv, VALUE str)
11371{
11372 return rb_str_justify(argc, argv, str, 'l');
11373}
11374
11375/*
11376 * call-seq:
11377 * rjust(size, pad_string = ' ') -> new_string
11378 *
11379 * :include: doc/string/rjust.rdoc
11380 *
11381 * Related: String#ljust, String#center.
11382 *
11383 */
11384
11385static VALUE
11386rb_str_rjust(int argc, VALUE *argv, VALUE str)
11387{
11388 return rb_str_justify(argc, argv, str, 'r');
11389}
11390
11391
11392/*
11393 * call-seq:
11394 * center(size, pad_string = ' ') -> new_string
11395 *
11396 * :include: doc/string/center.rdoc
11397 *
11398 * Related: String#ljust, String#rjust.
11399 *
11400 */
11401
11402static VALUE
11403rb_str_center(int argc, VALUE *argv, VALUE str)
11404{
11405 return rb_str_justify(argc, argv, str, 'c');
11406}
11407
11408/*
11409 * call-seq:
11410 * partition(string_or_regexp) -> [head, match, tail]
11411 *
11412 * :include: doc/string/partition.rdoc
11413 *
11414 */
11415
11416static VALUE
11417rb_str_partition(VALUE str, VALUE sep)
11418{
11419 long pos;
11420
11421 sep = get_pat_quoted(sep, 0);
11422 if (RB_TYPE_P(sep, T_REGEXP)) {
11423 if (rb_reg_search(sep, str, 0, 0) < 0) {
11424 goto failed;
11425 }
11426 VALUE match = rb_backref_get();
11427 struct re_registers *regs = RMATCH_REGS(match);
11428
11429 pos = BEG(0);
11430 sep = rb_str_subseq(str, pos, END(0) - pos);
11431 }
11432 else {
11433 pos = rb_str_index(str, sep, 0);
11434 if (pos < 0) goto failed;
11435 }
11436 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11437 sep,
11438 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11439 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11440
11441 failed:
11442 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11443}
11444
11445/*
11446 * call-seq:
11447 * rpartition(sep) -> [head, match, tail]
11448 *
11449 * :include: doc/string/rpartition.rdoc
11450 *
11451 */
11452
11453static VALUE
11454rb_str_rpartition(VALUE str, VALUE sep)
11455{
11456 long pos = RSTRING_LEN(str);
11457
11458 sep = get_pat_quoted(sep, 0);
11459 if (RB_TYPE_P(sep, T_REGEXP)) {
11460 if (rb_reg_search(sep, str, pos, 1) < 0) {
11461 goto failed;
11462 }
11463 VALUE match = rb_backref_get();
11464 struct re_registers *regs = RMATCH_REGS(match);
11465
11466 pos = BEG(0);
11467 sep = rb_str_subseq(str, pos, END(0) - pos);
11468 }
11469 else {
11470 pos = rb_str_sublen(str, pos);
11471 pos = rb_str_rindex(str, sep, pos);
11472 if (pos < 0) {
11473 goto failed;
11474 }
11475 }
11476
11477 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11478 sep,
11479 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11480 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11481 failed:
11482 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11483}
11484
11485/*
11486 * call-seq:
11487 * start_with?(*string_or_regexp) -> true or false
11488 *
11489 * :include: doc/string/start_with_p.rdoc
11490 *
11491 */
11492
11493static VALUE
11494rb_str_start_with(int argc, VALUE *argv, VALUE str)
11495{
11496 int i;
11497
11498 for (i=0; i<argc; i++) {
11499 VALUE tmp = argv[i];
11500 if (RB_TYPE_P(tmp, T_REGEXP)) {
11501 if (rb_reg_start_with_p(tmp, str))
11502 return Qtrue;
11503 }
11504 else {
11505 const char *p, *s, *e;
11506 long slen, tlen;
11507 rb_encoding *enc;
11508
11509 StringValue(tmp);
11510 enc = rb_enc_check(str, tmp);
11511 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11512 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11513 p = RSTRING_PTR(str);
11514 e = p + slen;
11515 s = p + tlen;
11516 if (!at_char_right_boundary(p, s, e, enc))
11517 continue;
11518 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11519 return Qtrue;
11520 }
11521 }
11522 return Qfalse;
11523}
11524
11525/*
11526 * call-seq:
11527 * end_with?(*strings) -> true or false
11528 *
11529 * :include: doc/string/end_with_p.rdoc
11530 *
11531 */
11532
11533static VALUE
11534rb_str_end_with(int argc, VALUE *argv, VALUE str)
11535{
11536 int i;
11537
11538 for (i=0; i<argc; i++) {
11539 VALUE tmp = argv[i];
11540 const char *p, *s, *e;
11541 long slen, tlen;
11542 rb_encoding *enc;
11543
11544 StringValue(tmp);
11545 enc = rb_enc_check(str, tmp);
11546 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11547 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11548 p = RSTRING_PTR(str);
11549 e = p + slen;
11550 s = e - tlen;
11551 if (!at_char_boundary(p, s, e, enc))
11552 continue;
11553 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11554 return Qtrue;
11555 }
11556 return Qfalse;
11557}
11558
11568static long
11569deleted_prefix_length(VALUE str, VALUE prefix)
11570{
11571 const char *strptr, *prefixptr;
11572 long olen, prefixlen;
11573 rb_encoding *enc = rb_enc_get(str);
11574
11575 StringValue(prefix);
11576
11577 if (!is_broken_string(prefix) ||
11578 !rb_enc_asciicompat(enc) ||
11579 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11580 enc = rb_enc_check(str, prefix);
11581 }
11582
11583 /* return 0 if not start with prefix */
11584 prefixlen = RSTRING_LEN(prefix);
11585 if (prefixlen <= 0) return 0;
11586 olen = RSTRING_LEN(str);
11587 if (olen < prefixlen) return 0;
11588 strptr = RSTRING_PTR(str);
11589 prefixptr = RSTRING_PTR(prefix);
11590 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11591 if (is_broken_string(prefix)) {
11592 if (!is_broken_string(str)) {
11593 /* prefix in a valid string cannot be broken */
11594 return 0;
11595 }
11596 const char *strend = strptr + olen;
11597 const char *after_prefix = strptr + prefixlen;
11598 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11599 /* prefix does not end at char-boundary */
11600 return 0;
11601 }
11602 }
11603 /* prefix part in `str` also should be valid. */
11604
11605 return prefixlen;
11606}
11607
11608/*
11609 * call-seq:
11610 * delete_prefix!(prefix) -> self or nil
11611 *
11612 * Like String#delete_prefix, except that +self+ is modified in place.
11613 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11614 *
11615 */
11616
11617static VALUE
11618rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11619{
11620 long prefixlen;
11621 str_modify_keep_cr(str);
11622
11623 prefixlen = deleted_prefix_length(str, prefix);
11624 if (prefixlen <= 0) return Qnil;
11625
11626 return rb_str_drop_bytes(str, prefixlen);
11627}
11628
11629/*
11630 * call-seq:
11631 * delete_prefix(prefix) -> new_string
11632 *
11633 * :include: doc/string/delete_prefix.rdoc
11634 *
11635 */
11636
11637static VALUE
11638rb_str_delete_prefix(VALUE str, VALUE prefix)
11639{
11640 long prefixlen;
11641
11642 prefixlen = deleted_prefix_length(str, prefix);
11643 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11644
11645 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11646}
11647
11657static long
11658deleted_suffix_length(VALUE str, VALUE suffix)
11659{
11660 const char *strptr, *suffixptr;
11661 long olen, suffixlen;
11662 rb_encoding *enc;
11663
11664 StringValue(suffix);
11665 if (is_broken_string(suffix)) return 0;
11666 enc = rb_enc_check(str, suffix);
11667
11668 /* return 0 if not start with suffix */
11669 suffixlen = RSTRING_LEN(suffix);
11670 if (suffixlen <= 0) return 0;
11671 olen = RSTRING_LEN(str);
11672 if (olen < suffixlen) return 0;
11673 strptr = RSTRING_PTR(str);
11674 suffixptr = RSTRING_PTR(suffix);
11675 const char *strend = strptr + olen;
11676 const char *before_suffix = strend - suffixlen;
11677 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11678 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11679
11680 return suffixlen;
11681}
11682
11683/*
11684 * call-seq:
11685 * delete_suffix!(suffix) -> self or nil
11686 *
11687 * Like String#delete_suffix, except that +self+ is modified in place.
11688 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11689 *
11690 */
11691
11692static VALUE
11693rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11694{
11695 long olen, suffixlen, len;
11696 str_modifiable(str);
11697
11698 suffixlen = deleted_suffix_length(str, suffix);
11699 if (suffixlen <= 0) return Qnil;
11700
11701 olen = RSTRING_LEN(str);
11702 str_modify_keep_cr(str);
11703 len = olen - suffixlen;
11704 STR_SET_LEN(str, len);
11705 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11706 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11708 }
11709 return str;
11710}
11711
11712/*
11713 * call-seq:
11714 * delete_suffix(suffix) -> new_string
11715 *
11716 * :include: doc/string/delete_suffix.rdoc
11717 *
11718 */
11719
11720static VALUE
11721rb_str_delete_suffix(VALUE str, VALUE suffix)
11722{
11723 long suffixlen;
11724
11725 suffixlen = deleted_suffix_length(str, suffix);
11726 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11727
11728 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11729}
11730
11731void
11732rb_str_setter(VALUE val, ID id, VALUE *var)
11733{
11734 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11735 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11736 }
11737 *var = val;
11738}
11739
11740static void
11741rb_fs_setter(VALUE val, ID id, VALUE *var)
11742{
11743 val = rb_fs_check(val);
11744 if (!val) {
11745 rb_raise(rb_eTypeError,
11746 "value of %"PRIsVALUE" must be String or Regexp",
11747 rb_id2str(id));
11748 }
11749 if (!NIL_P(val)) {
11750 rb_warn_deprecated("'$;'", NULL);
11751 }
11752 *var = val;
11753}
11754
11755
11756/*
11757 * call-seq:
11758 * force_encoding(encoding) -> self
11759 *
11760 * :include: doc/string/force_encoding.rdoc
11761 *
11762 */
11763
11764static VALUE
11765rb_str_force_encoding(VALUE str, VALUE enc)
11766{
11767 str_modifiable(str);
11768
11769 rb_encoding *encoding = rb_to_encoding(enc);
11770 int idx = rb_enc_to_index(encoding);
11771
11772 // If the encoding is unchanged, we do nothing.
11773 if (ENCODING_GET(str) == idx) {
11774 return str;
11775 }
11776
11777 rb_enc_associate_index(str, idx);
11778
11779 // If the coderange was 7bit and the new encoding is ASCII-compatible
11780 // we can keep the coderange.
11781 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11782 return str;
11783 }
11784
11786 return str;
11787}
11788
11789/*
11790 * call-seq:
11791 * b -> string
11792 *
11793 * :include: doc/string/b.rdoc
11794 *
11795 */
11796
11797static VALUE
11798rb_str_b(VALUE str)
11799{
11800 VALUE str2;
11801 if (STR_EMBED_P(str)) {
11802 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11803 }
11804 else {
11805 str2 = str_alloc_heap(rb_cString);
11806 }
11807 str_replace_shared_without_enc(str2, str);
11808
11809 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11810 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11811 // If we know the receiver's code range then we know the result's code range.
11812 int cr = ENC_CODERANGE(str);
11813 switch (cr) {
11814 case ENC_CODERANGE_7BIT:
11816 break;
11820 break;
11821 default:
11822 ENC_CODERANGE_CLEAR(str2);
11823 break;
11824 }
11825 }
11826
11827 return str2;
11828}
11829
11830/*
11831 * call-seq:
11832 * valid_encoding? -> true or false
11833 *
11834 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11835 *
11836 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11837 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11838 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11839 */
11840
11841static VALUE
11842rb_str_valid_encoding_p(VALUE str)
11843{
11844 int cr = rb_enc_str_coderange(str);
11845
11846 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11847}
11848
11849/*
11850 * call-seq:
11851 * ascii_only? -> true or false
11852 *
11853 * Returns +true+ if +self+ contains only ASCII characters,
11854 * +false+ otherwise:
11855 *
11856 * 'abc'.ascii_only? # => true
11857 * "abc\u{6666}".ascii_only? # => false
11858 *
11859 */
11860
11861static VALUE
11862rb_str_is_ascii_only_p(VALUE str)
11863{
11864 int cr = rb_enc_str_coderange(str);
11865
11866 return RBOOL(cr == ENC_CODERANGE_7BIT);
11867}
11868
11869VALUE
11871{
11872 static const char ellipsis[] = "...";
11873 const long ellipsislen = sizeof(ellipsis) - 1;
11874 rb_encoding *const enc = rb_enc_get(str);
11875 const long blen = RSTRING_LEN(str);
11876 const char *const p = RSTRING_PTR(str), *e = p + blen;
11877 VALUE estr, ret = 0;
11878
11879 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11880 if (len * rb_enc_mbminlen(enc) >= blen ||
11881 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11882 ret = str;
11883 }
11884 else if (len <= ellipsislen ||
11885 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11886 if (rb_enc_asciicompat(enc)) {
11887 ret = rb_str_new(ellipsis, len);
11888 rb_enc_associate(ret, enc);
11889 }
11890 else {
11891 estr = rb_usascii_str_new(ellipsis, len);
11892 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11893 }
11894 }
11895 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11896 rb_str_cat(ret, ellipsis, ellipsislen);
11897 }
11898 else {
11899 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11900 rb_enc_from_encoding(enc), 0, Qnil);
11901 rb_str_append(ret, estr);
11902 }
11903 return ret;
11904}
11905
11906static VALUE
11907str_compat_and_valid(VALUE str, rb_encoding *enc)
11908{
11909 int cr;
11910 str = StringValue(str);
11911 cr = rb_enc_str_coderange(str);
11912 if (cr == ENC_CODERANGE_BROKEN) {
11913 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11914 }
11915 else {
11916 rb_encoding *e = STR_ENC_GET(str);
11917 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11918 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11919 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11920 }
11921 }
11922 return str;
11923}
11924
11925static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11926
11927VALUE
11929{
11930 rb_encoding *enc = STR_ENC_GET(str);
11931 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11932}
11933
11934VALUE
11935rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11936{
11937 int cr = ENC_CODERANGE_UNKNOWN;
11938 if (enc == STR_ENC_GET(str)) {
11939 /* cached coderange makes sense only when enc equals the
11940 * actual encoding of str */
11941 cr = ENC_CODERANGE(str);
11942 }
11943 return enc_str_scrub(enc, str, repl, cr);
11944}
11945
11946static VALUE
11947enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11948{
11949 int encidx;
11950 VALUE buf = Qnil;
11951 const char *rep, *p, *e, *p1, *sp;
11952 long replen = -1;
11953 long slen;
11954
11955 if (rb_block_given_p()) {
11956 if (!NIL_P(repl))
11957 rb_raise(rb_eArgError, "both of block and replacement given");
11958 replen = 0;
11959 }
11960
11961 if (ENC_CODERANGE_CLEAN_P(cr))
11962 return Qnil;
11963
11964 if (!NIL_P(repl)) {
11965 repl = str_compat_and_valid(repl, enc);
11966 }
11967
11968 if (rb_enc_dummy_p(enc)) {
11969 return Qnil;
11970 }
11971 encidx = rb_enc_to_index(enc);
11972
11973#define DEFAULT_REPLACE_CHAR(str) do { \
11974 static const char replace[sizeof(str)-1] = str; \
11975 rep = replace; replen = (int)sizeof(replace); \
11976 } while (0)
11977
11978 slen = RSTRING_LEN(str);
11979 p = RSTRING_PTR(str);
11980 e = RSTRING_END(str);
11981 p1 = p;
11982 sp = p;
11983
11984 if (rb_enc_asciicompat(enc)) {
11985 int rep7bit_p;
11986 if (!replen) {
11987 rep = NULL;
11988 rep7bit_p = FALSE;
11989 }
11990 else if (!NIL_P(repl)) {
11991 rep = RSTRING_PTR(repl);
11992 replen = RSTRING_LEN(repl);
11993 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11994 }
11995 else if (encidx == rb_utf8_encindex()) {
11996 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11997 rep7bit_p = FALSE;
11998 }
11999 else {
12000 DEFAULT_REPLACE_CHAR("?");
12001 rep7bit_p = TRUE;
12002 }
12003 cr = ENC_CODERANGE_7BIT;
12004
12005 p = search_nonascii(p, e);
12006 if (!p) {
12007 p = e;
12008 }
12009 while (p < e) {
12010 int ret = rb_enc_precise_mbclen(p, e, enc);
12011 if (MBCLEN_NEEDMORE_P(ret)) {
12012 break;
12013 }
12014 else if (MBCLEN_CHARFOUND_P(ret)) {
12016 p += MBCLEN_CHARFOUND_LEN(ret);
12017 }
12018 else if (MBCLEN_INVALID_P(ret)) {
12019 /*
12020 * p1~p: valid ascii/multibyte chars
12021 * p ~e: invalid bytes + unknown bytes
12022 */
12023 long clen = rb_enc_mbmaxlen(enc);
12024 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12025 if (p > p1) {
12026 rb_str_buf_cat(buf, p1, p - p1);
12027 }
12028
12029 if (e - p < clen) clen = e - p;
12030 if (clen <= 2) {
12031 clen = 1;
12032 }
12033 else {
12034 const char *q = p;
12035 clen--;
12036 for (; clen > 1; clen--) {
12037 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12038 if (MBCLEN_NEEDMORE_P(ret)) break;
12039 if (MBCLEN_INVALID_P(ret)) continue;
12041 }
12042 }
12043 if (rep) {
12044 rb_str_buf_cat(buf, rep, replen);
12045 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12046 }
12047 else {
12048 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12049 str_mod_check(str, sp, slen);
12050 repl = str_compat_and_valid(repl, enc);
12051 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12054 }
12055 p += clen;
12056 p1 = p;
12057 p = search_nonascii(p, e);
12058 if (!p) {
12059 p = e;
12060 break;
12061 }
12062 }
12063 else {
12065 }
12066 }
12067 if (NIL_P(buf)) {
12068 if (p == e) {
12069 ENC_CODERANGE_SET(str, cr);
12070 return Qnil;
12071 }
12072 buf = rb_str_buf_new(RSTRING_LEN(str));
12073 }
12074 if (p1 < p) {
12075 rb_str_buf_cat(buf, p1, p - p1);
12076 }
12077 if (p < e) {
12078 if (rep) {
12079 rb_str_buf_cat(buf, rep, replen);
12080 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
12081 }
12082 else {
12083 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12084 str_mod_check(str, sp, slen);
12085 repl = str_compat_and_valid(repl, enc);
12086 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12089 }
12090 }
12091 }
12092 else {
12093 /* ASCII incompatible */
12094 long mbminlen = rb_enc_mbminlen(enc);
12095 if (!replen) {
12096 rep = NULL;
12097 }
12098 else if (!NIL_P(repl)) {
12099 rep = RSTRING_PTR(repl);
12100 replen = RSTRING_LEN(repl);
12101 }
12102 else if (encidx == ENCINDEX_UTF_16BE) {
12103 DEFAULT_REPLACE_CHAR("\xFF\xFD");
12104 }
12105 else if (encidx == ENCINDEX_UTF_16LE) {
12106 DEFAULT_REPLACE_CHAR("\xFD\xFF");
12107 }
12108 else if (encidx == ENCINDEX_UTF_32BE) {
12109 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
12110 }
12111 else if (encidx == ENCINDEX_UTF_32LE) {
12112 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
12113 }
12114 else {
12115 DEFAULT_REPLACE_CHAR("?");
12116 }
12117
12118 while (p < e) {
12119 int ret = rb_enc_precise_mbclen(p, e, enc);
12120 if (MBCLEN_NEEDMORE_P(ret)) {
12121 break;
12122 }
12123 else if (MBCLEN_CHARFOUND_P(ret)) {
12124 p += MBCLEN_CHARFOUND_LEN(ret);
12125 }
12126 else if (MBCLEN_INVALID_P(ret)) {
12127 const char *q = p;
12128 long clen = rb_enc_mbmaxlen(enc);
12129 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
12130 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
12131
12132 if (e - p < clen) clen = e - p;
12133 if (clen <= mbminlen * 2) {
12134 clen = mbminlen;
12135 }
12136 else {
12137 clen -= mbminlen;
12138 for (; clen > mbminlen; clen-=mbminlen) {
12139 ret = rb_enc_precise_mbclen(q, q + clen, enc);
12140 if (MBCLEN_NEEDMORE_P(ret)) break;
12141 if (MBCLEN_INVALID_P(ret)) continue;
12143 }
12144 }
12145 if (rep) {
12146 rb_str_buf_cat(buf, rep, replen);
12147 }
12148 else {
12149 repl = rb_yield(rb_enc_str_new(p, clen, enc));
12150 str_mod_check(str, sp, slen);
12151 repl = str_compat_and_valid(repl, enc);
12152 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12153 }
12154 p += clen;
12155 p1 = p;
12156 }
12157 else {
12159 }
12160 }
12161 if (NIL_P(buf)) {
12162 if (p == e) {
12164 return Qnil;
12165 }
12166 buf = rb_str_buf_new(RSTRING_LEN(str));
12167 }
12168 if (p1 < p) {
12169 rb_str_buf_cat(buf, p1, p - p1);
12170 }
12171 if (p < e) {
12172 if (rep) {
12173 rb_str_buf_cat(buf, rep, replen);
12174 }
12175 else {
12176 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12177 str_mod_check(str, sp, slen);
12178 repl = str_compat_and_valid(repl, enc);
12179 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12180 }
12181 }
12183 }
12184 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12185 return buf;
12186}
12187
12188/*
12189 * call-seq:
12190 * scrub(replacement_string = default_replacement) -> new_string
12191 * scrub{|bytes| ... } -> new_string
12192 *
12193 * :include: doc/string/scrub.rdoc
12194 *
12195 */
12196static VALUE
12197str_scrub(int argc, VALUE *argv, VALUE str)
12198{
12199 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12200 VALUE new = rb_str_scrub(str, repl);
12201 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12202}
12203
12204/*
12205 * call-seq:
12206 * scrub! -> self
12207 * scrub!(replacement_string = default_replacement) -> self
12208 * scrub!{|bytes| ... } -> self
12209 *
12210 * Like String#scrub, except that any replacements are made in +self+.
12211 *
12212 */
12213static VALUE
12214str_scrub_bang(int argc, VALUE *argv, VALUE str)
12215{
12216 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12217 VALUE new = rb_str_scrub(str, repl);
12218 if (!NIL_P(new)) rb_str_replace(str, new);
12219 return str;
12220}
12221
12222static ID id_normalize;
12223static ID id_normalized_p;
12224static VALUE mUnicodeNormalize;
12225
12226static VALUE
12227unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12228{
12229 static int UnicodeNormalizeRequired = 0;
12230 VALUE argv2[2];
12231
12232 if (!UnicodeNormalizeRequired) {
12233 rb_require("unicode_normalize/normalize.rb");
12234 UnicodeNormalizeRequired = 1;
12235 }
12236 argv2[0] = str;
12237 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12238 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12239}
12240
12241/*
12242 * call-seq:
12243 * unicode_normalize(form = :nfc) -> string
12244 *
12245 * Returns a copy of +self+ with
12246 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
12247 *
12248 * Argument +form+ must be one of the following symbols
12249 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
12250 *
12251 * - +:nfc+: Canonical decomposition, followed by canonical composition.
12252 * - +:nfd+: Canonical decomposition.
12253 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
12254 * - +:nfkd+: Compatibility decomposition.
12255 *
12256 * The encoding of +self+ must be one of:
12257 *
12258 * - Encoding::UTF_8
12259 * - Encoding::UTF_16BE
12260 * - Encoding::UTF_16LE
12261 * - Encoding::UTF_32BE
12262 * - Encoding::UTF_32LE
12263 * - Encoding::GB18030
12264 * - Encoding::UCS_2BE
12265 * - Encoding::UCS_4BE
12266 *
12267 * Examples:
12268 *
12269 * "a\u0300".unicode_normalize # => "a"
12270 * "\u00E0".unicode_normalize(:nfd) # => "a "
12271 *
12272 * Related: String#unicode_normalize!, String#unicode_normalized?.
12273 */
12274static VALUE
12275rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12276{
12277 return unicode_normalize_common(argc, argv, str, id_normalize);
12278}
12279
12280/*
12281 * call-seq:
12282 * unicode_normalize!(form = :nfc) -> self
12283 *
12284 * Like String#unicode_normalize, except that the normalization
12285 * is performed on +self+.
12286 *
12287 * Related String#unicode_normalized?.
12288 *
12289 */
12290static VALUE
12291rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12292{
12293 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12294}
12295
12296/* call-seq:
12297 * unicode_normalized?(form = :nfc) -> true or false
12298 *
12299 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12300 * +false+ otherwise.
12301 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12302 *
12303 * Examples:
12304 *
12305 * "a\u0300".unicode_normalized? # => false
12306 * "a\u0300".unicode_normalized?(:nfd) # => true
12307 * "\u00E0".unicode_normalized? # => true
12308 * "\u00E0".unicode_normalized?(:nfd) # => false
12309 *
12310 *
12311 * Raises an exception if +self+ is not in a Unicode encoding:
12312 *
12313 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12314 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12315 *
12316 * Related: String#unicode_normalize, String#unicode_normalize!.
12317 *
12318 */
12319static VALUE
12320rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12321{
12322 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12323}
12324
12325/**********************************************************************
12326 * Document-class: Symbol
12327 *
12328 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12329 *
12330 * You can create a +Symbol+ object explicitly with:
12331 *
12332 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12333 *
12334 * The same +Symbol+ object will be
12335 * created for a given name or string for the duration of a program's
12336 * execution, regardless of the context or meaning of that name. Thus
12337 * if <code>Fred</code> is a constant in one context, a method in
12338 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12339 * will be the same object in all three contexts.
12340 *
12341 * module One
12342 * class Fred
12343 * end
12344 * $f1 = :Fred
12345 * end
12346 * module Two
12347 * Fred = 1
12348 * $f2 = :Fred
12349 * end
12350 * def Fred()
12351 * end
12352 * $f3 = :Fred
12353 * $f1.object_id #=> 2514190
12354 * $f2.object_id #=> 2514190
12355 * $f3.object_id #=> 2514190
12356 *
12357 * Constant, method, and variable names are returned as symbols:
12358 *
12359 * module One
12360 * Two = 2
12361 * def three; 3 end
12362 * @four = 4
12363 * @@five = 5
12364 * $six = 6
12365 * end
12366 * seven = 7
12367 *
12368 * One.constants
12369 * # => [:Two]
12370 * One.instance_methods(true)
12371 * # => [:three]
12372 * One.instance_variables
12373 * # => [:@four]
12374 * One.class_variables
12375 * # => [:@@five]
12376 * global_variables.grep(/six/)
12377 * # => [:$six]
12378 * local_variables
12379 * # => [:seven]
12380 *
12381 * A +Symbol+ object differs from a String object in that
12382 * a +Symbol+ object represents an identifier, while a String object
12383 * represents text or data.
12384 *
12385 * == What's Here
12386 *
12387 * First, what's elsewhere. Class +Symbol+:
12388 *
12389 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12390 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12391 *
12392 * Here, class +Symbol+ provides methods that are useful for:
12393 *
12394 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12395 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12396 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12397 *
12398 * === Methods for Querying
12399 *
12400 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12401 * - #=~: Returns the index of the first substring in symbol that matches a
12402 * given Regexp or other object; returns +nil+ if no match is found.
 *  - #[], #slice: Returns a substring of symbol
 *    determined by a given index, start/length, range, regexp, or string.
12405 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12406 * - #encoding: Returns the Encoding object that represents the encoding
12407 * of symbol.
12408 * - #end_with?: Returns +true+ if symbol ends with
12409 * any of the given strings.
12410 * - #match: Returns a MatchData object if symbol
12411 * matches a given Regexp; +nil+ otherwise.
12412 * - #match?: Returns +true+ if symbol
12413 * matches a given Regexp; +false+ otherwise.
12414 * - #length, #size: Returns the number of characters in symbol.
12415 * - #start_with?: Returns +true+ if symbol starts with
12416 * any of the given strings.
12417 *
12418 * === Methods for Comparing
12419 *
 *  - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
 *    or larger than a given symbol.
12422 * - #==, #===: Returns +true+ if a given symbol has the same content and
12423 * encoding.
 *  - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
 *    is smaller than, equal to, or larger than a given symbol.
12426 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12427 * after Unicode case folding; +false+ otherwise.
12428 *
12429 * === Methods for Converting
12430 *
12431 * - #capitalize: Returns symbol with the first character upcased
12432 * and all other characters downcased.
12433 * - #downcase: Returns symbol with all characters downcased.
12434 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12435 * - #name: Returns the frozen string corresponding to symbol.
12436 * - #succ, #next: Returns the symbol that is the successor to symbol.
12437 * - #swapcase: Returns symbol with all upcase characters downcased
12438 * and all downcase characters upcased.
12439 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12440 * - #to_s, #id2name: Returns the string corresponding to +self+.
12441 * - #to_sym, #intern: Returns +self+.
12442 * - #upcase: Returns symbol with all characters upcased.
12443 *
12444 */
12445
12446
/*
 *  call-seq:
 *    symbol == object -> true or false
 *
 *  Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
 */

/* Symbol#== and Symbol#=== reuse rb_obj_equal directly; this must stay an
 * object-like macro because the name is passed to rb_define_method(). */
#define sym_equal rb_obj_equal
12455
12456static int
12457sym_printable(const char *s, const char *send, rb_encoding *enc)
12458{
12459 while (s < send) {
12460 int n;
12461 int c = rb_enc_precise_mbclen(s, send, enc);
12462
12463 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12464 n = MBCLEN_CHARFOUND_LEN(c);
12465 c = rb_enc_mbc_to_codepoint(s, send, enc);
12466 if (!rb_enc_isprint(c, enc)) return FALSE;
12467 s += n;
12468 }
12469 return TRUE;
12470}
12471
12472int
12473rb_str_symname_p(VALUE sym)
12474{
12475 rb_encoding *enc;
12476 const char *ptr;
12477 long len;
12478 rb_encoding *resenc = rb_default_internal_encoding();
12479
12480 if (resenc == NULL) resenc = rb_default_external_encoding();
12481 enc = STR_ENC_GET(sym);
12482 ptr = RSTRING_PTR(sym);
12483 len = RSTRING_LEN(sym);
12484 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12485 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12486 return FALSE;
12487 }
12488 return TRUE;
12489}
12490
12491VALUE
12492rb_str_quote_unprintable(VALUE str)
12493{
12494 rb_encoding *enc;
12495 const char *ptr;
12496 long len;
12497 rb_encoding *resenc;
12498
12499 Check_Type(str, T_STRING);
12500 resenc = rb_default_internal_encoding();
12501 if (resenc == NULL) resenc = rb_default_external_encoding();
12502 enc = STR_ENC_GET(str);
12503 ptr = RSTRING_PTR(str);
12504 len = RSTRING_LEN(str);
12505 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12506 !sym_printable(ptr, ptr + len, enc)) {
12507 return rb_str_escape(str);
12508 }
12509 return str;
12510}
12511
12512VALUE
12513rb_id_quote_unprintable(ID id)
12514{
12515 VALUE str = rb_id2str(id);
12516 if (!rb_str_symname_p(str)) {
12517 return rb_str_escape(str);
12518 }
12519 return str;
12520}
12521
12522/*
12523 * call-seq:
12524 * inspect -> string
12525 *
12526 * Returns a string representation of +self+ (including the leading colon):
12527 *
12528 * :foo.inspect # => ":foo"
12529 *
12530 * Related: Symbol#to_s, Symbol#name.
12531 *
12532 */
12533
12534static VALUE
12535sym_inspect(VALUE sym)
12536{
12537 VALUE str = rb_sym2str(sym);
12538 const char *ptr;
12539 long len;
12540 char *dest;
12541
12542 if (!rb_str_symname_p(str)) {
12543 str = rb_str_inspect(str);
12544 len = RSTRING_LEN(str);
12545 rb_str_resize(str, len + 1);
12546 dest = RSTRING_PTR(str);
12547 memmove(dest + 1, dest, len);
12548 }
12549 else {
12550 rb_encoding *enc = STR_ENC_GET(str);
12551 VALUE orig_str = str;
12552
12553 len = RSTRING_LEN(orig_str);
12554 str = rb_enc_str_new(0, len + 1, enc);
12555
12556 // Get data pointer after allocation
12557 ptr = RSTRING_PTR(orig_str);
12558 dest = RSTRING_PTR(str);
12559 memcpy(dest + 1, ptr, len);
12560
12561 RB_GC_GUARD(orig_str);
12562 }
12563 dest[0] = ':';
12564
12566
12567 return str;
12568}
12569
12570VALUE
12572{
12573 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12574 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12575 return str;
12576}
12577
12578VALUE
12579rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12580{
12581 VALUE obj;
12582
12583 if (argc < 1) {
12584 rb_raise(rb_eArgError, "no receiver given");
12585 }
12586 obj = argv[0];
12587 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12588}
12589
12590/*
12591 * call-seq:
12592 * succ
12593 *
12594 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12595 *
12596 * :foo.succ # => :fop
12597 *
12598 * Related: String#succ.
12599 */
12600
12601static VALUE
12602sym_succ(VALUE sym)
12603{
12604 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12605}
12606
12607/*
12608 * call-seq:
12609 * symbol <=> object -> -1, 0, +1, or nil
12610 *
12611 * If +object+ is a symbol,
12612 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12613 *
12614 * :bar <=> :foo # => -1
12615 * :foo <=> :foo # => 0
12616 * :foo <=> :bar # => 1
12617 *
12618 * Otherwise, returns +nil+:
12619 *
12620 * :foo <=> 'bar' # => nil
12621 *
12622 * Related: String#<=>.
12623 */
12624
12625static VALUE
12626sym_cmp(VALUE sym, VALUE other)
12627{
12628 if (!SYMBOL_P(other)) {
12629 return Qnil;
12630 }
12631 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12632}
12633
12634/*
12635 * call-seq:
12636 * casecmp(object) -> -1, 0, 1, or nil
12637 *
12638 * :include: doc/symbol/casecmp.rdoc
12639 *
12640 */
12641
12642static VALUE
12643sym_casecmp(VALUE sym, VALUE other)
12644{
12645 if (!SYMBOL_P(other)) {
12646 return Qnil;
12647 }
12648 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12649}
12650
12651/*
12652 * call-seq:
12653 * casecmp?(object) -> true, false, or nil
12654 *
12655 * :include: doc/symbol/casecmp_p.rdoc
12656 *
12657 */
12658
12659static VALUE
12660sym_casecmp_p(VALUE sym, VALUE other)
12661{
12662 if (!SYMBOL_P(other)) {
12663 return Qnil;
12664 }
12665 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12666}
12667
12668/*
12669 * call-seq:
12670 * symbol =~ object -> integer or nil
12671 *
12672 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12673 * including possible updates to global variables;
12674 * see String#=~.
12675 *
12676 */
12677
12678static VALUE
12679sym_match(VALUE sym, VALUE other)
12680{
12681 return rb_str_match(rb_sym2str(sym), other);
12682}
12683
12684/*
12685 * call-seq:
12686 * match(pattern, offset = 0) -> matchdata or nil
12687 * match(pattern, offset = 0) {|matchdata| } -> object
12688 *
12689 * Equivalent to <tt>self.to_s.match</tt>,
12690 * including possible updates to global variables;
12691 * see String#match.
12692 *
12693 */
12694
12695static VALUE
12696sym_match_m(int argc, VALUE *argv, VALUE sym)
12697{
12698 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12699}
12700
12701/*
12702 * call-seq:
12703 * match?(pattern, offset) -> true or false
12704 *
12705 * Equivalent to <tt>sym.to_s.match?</tt>;
12706 * see String#match.
12707 *
12708 */
12709
12710static VALUE
12711sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12712{
12713 return rb_str_match_m_p(argc, argv, sym);
12714}
12715
12716/*
12717 * call-seq:
12718 * symbol[index] -> string or nil
12719 * symbol[start, length] -> string or nil
12720 * symbol[range] -> string or nil
12721 * symbol[regexp, capture = 0] -> string or nil
12722 * symbol[substring] -> string or nil
12723 *
12724 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12725 *
12726 */
12727
12728static VALUE
12729sym_aref(int argc, VALUE *argv, VALUE sym)
12730{
12731 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12732}
12733
12734/*
12735 * call-seq:
12736 * length -> integer
12737 *
12738 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12739 */
12740
12741static VALUE
12742sym_length(VALUE sym)
12743{
12744 return rb_str_length(rb_sym2str(sym));
12745}
12746
12747/*
12748 * call-seq:
12749 * empty? -> true or false
12750 *
12751 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12752 *
12753 */
12754
12755static VALUE
12756sym_empty(VALUE sym)
12757{
12758 return rb_str_empty(rb_sym2str(sym));
12759}
12760
12761/*
12762 * call-seq:
12763 * upcase(*options) -> symbol
12764 *
12765 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12766 *
12767 * See String#upcase.
12768 *
12769 */
12770
12771static VALUE
12772sym_upcase(int argc, VALUE *argv, VALUE sym)
12773{
12774 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12775}
12776
12777/*
12778 * call-seq:
12779 * downcase(*options) -> symbol
12780 *
12781 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12782 *
12783 * See String#downcase.
12784 *
12785 * Related: Symbol#upcase.
12786 *
12787 */
12788
12789static VALUE
12790sym_downcase(int argc, VALUE *argv, VALUE sym)
12791{
12792 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12793}
12794
12795/*
12796 * call-seq:
12797 * capitalize(*options) -> symbol
12798 *
12799 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12800 *
12801 * See String#capitalize.
12802 *
12803 */
12804
12805static VALUE
12806sym_capitalize(int argc, VALUE *argv, VALUE sym)
12807{
12808 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12809}
12810
12811/*
12812 * call-seq:
12813 * swapcase(*options) -> symbol
12814 *
12815 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12816 *
12817 * See String#swapcase.
12818 *
12819 */
12820
12821static VALUE
12822sym_swapcase(int argc, VALUE *argv, VALUE sym)
12823{
12824 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12825}
12826
12827/*
12828 * call-seq:
12829 * start_with?(*string_or_regexp) -> true or false
12830 *
12831 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12832 *
12833 */
12834
12835static VALUE
12836sym_start_with(int argc, VALUE *argv, VALUE sym)
12837{
12838 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12839}
12840
12841/*
12842 * call-seq:
12843 * end_with?(*strings) -> true or false
12844 *
12845 *
12846 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12847 *
12848 */
12849
12850static VALUE
12851sym_end_with(int argc, VALUE *argv, VALUE sym)
12852{
12853 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12854}
12855
12856/*
12857 * call-seq:
12858 * encoding -> encoding
12859 *
12860 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12861 *
12862 */
12863
12864static VALUE
12865sym_encoding(VALUE sym)
12866{
12867 return rb_obj_encoding(rb_sym2str(sym));
12868}
12869
12870static VALUE
12871string_for_symbol(VALUE name)
12872{
12873 if (!RB_TYPE_P(name, T_STRING)) {
12874 VALUE tmp = rb_check_string_type(name);
12875 if (NIL_P(tmp)) {
12876 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12877 name);
12878 }
12879 name = tmp;
12880 }
12881 return name;
12882}
12883
12884ID
12886{
12887 if (SYMBOL_P(name)) {
12888 return SYM2ID(name);
12889 }
12890 name = string_for_symbol(name);
12891 return rb_intern_str(name);
12892}
12893
12894VALUE
12896{
12897 if (SYMBOL_P(name)) {
12898 return name;
12899 }
12900 name = string_for_symbol(name);
12901 return rb_str_intern(name);
12902}
12903
12904/*
12905 * call-seq:
12906 * Symbol.all_symbols -> array_of_symbols
12907 *
12908 * Returns an array of all symbols currently in Ruby's symbol table:
12909 *
12910 * Symbol.all_symbols.size # => 9334
12911 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12912 *
12913 */
12914
12915static VALUE
12916sym_all_symbols(VALUE _)
12917{
12918 return rb_sym_all_symbols();
12919}
12920
12921VALUE
12922rb_str_to_interned_str(VALUE str)
12923{
12924 return rb_fstring(str);
12925}
12926
12927VALUE
12928rb_interned_str(const char *ptr, long len)
12929{
12930 struct RString fake_str;
12931 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12932}
12933
12934VALUE
12936{
12937 return rb_interned_str(ptr, strlen(ptr));
12938}
12939
12940VALUE
12941rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12942{
12943 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12944 rb_enc_autoload(enc);
12945 }
12946
12947 struct RString fake_str;
12948 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12949}
12950
12951VALUE
12952rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12953{
12954 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12955 rb_enc_autoload(enc);
12956 }
12957
12958 struct RString fake_str;
12959 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12960}
12961
12962VALUE
12964{
12965 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12966}
12967
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when +str+ has the ASCII-8BIT encoding index and the codepoint
 * converts to a value in 0...0xff, the single byte is appended directly.
 * Every other case (including the value 0xff itself, which the bound below
 * excludes) is handed to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary_p = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(binary_p)) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int fast_byte_p = (code >= 0 && code < 0xff);

        if (RB_LIKELY(fast_byte_p)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12984
12985void
12986Init_String(void)
12987{
12988 rb_cString = rb_define_class("String", rb_cObject);
12989 struct fstring_table_struct *fstring_table = RTYPEDDATA_GET_DATA(fstring_table_obj);
12990 for (unsigned int i = 0; i < fstring_table->capacity; i++) {
12991 VALUE str = fstring_table->entries[i].str;
12992 if (!str) continue;
12993 RBASIC_SET_CLASS(str, rb_cString);
12994 }
12996 rb_define_alloc_func(rb_cString, empty_str_alloc);
12997 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12998 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12999 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
13000 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
13001 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
13002 rb_define_method(rb_cString, "==", rb_str_equal, 1);
13003 rb_define_method(rb_cString, "===", rb_str_equal, 1);
13004 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
13005 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
13006 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
13007 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
13008 rb_define_method(rb_cString, "+", rb_str_plus, 1);
13009 rb_define_method(rb_cString, "*", rb_str_times, 1);
13010 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
13011 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
13012 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
13013 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
13014 rb_define_method(rb_cString, "length", rb_str_length, 0);
13015 rb_define_method(rb_cString, "size", rb_str_length, 0);
13016 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
13017 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
13018 rb_define_method(rb_cString, "=~", rb_str_match, 1);
13019 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
13020 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
13021 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
13022 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
13023 rb_define_method(rb_cString, "next", rb_str_succ, 0);
13024 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
13025 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
13026 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
13027 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
13028 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
13029 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
13030 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
13031 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
13032 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
13033 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
13034 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
13035 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
13036 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
13037 rb_define_method(rb_cString, "scrub", str_scrub, -1);
13038 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
13039 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
13040 rb_define_method(rb_cString, "+@", str_uplus, 0);
13041 rb_define_method(rb_cString, "-@", str_uminus, 0);
13042 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
13043 rb_define_alias(rb_cString, "dedup", "-@");
13044
13045 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
13046 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
13047 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
13048 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
13049 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
13050 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
13051 rb_define_method(rb_cString, "undump", str_undump, 0);
13052
13053 sym_ascii = ID2SYM(rb_intern_const("ascii"));
13054 sym_turkic = ID2SYM(rb_intern_const("turkic"));
13055 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
13056 sym_fold = ID2SYM(rb_intern_const("fold"));
13057
13058 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
13059 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
13060 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
13061 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
13062
13063 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
13064 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
13065 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
13066 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
13067
13068 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
13069 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
13070 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
13071 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
13072 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
13073 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
13074 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
13075 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
13076 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
13077 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
13078 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
13079 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
13080 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
13081 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
13082 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
13083 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
13084 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
13085 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
13086
13087 rb_define_method(rb_cString, "include?", rb_str_include, 1);
13088 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
13089 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
13090
13091 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
13092
13093 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
13094 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
13095 rb_define_method(rb_cString, "center", rb_str_center, -1);
13096
13097 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
13098 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
13099 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
13100 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
13101 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
13102 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
13103 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
13104 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
13105 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
13106
13107 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
13108 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
13109 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
13110 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
13111 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
13112 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
13113 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
13114 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
13115 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
13116
13117 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
13118 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
13119 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
13120 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
13121 rb_define_method(rb_cString, "count", rb_str_count, -1);
13122
13123 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
13124 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
13125 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
13126 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
13127
13128 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
13129 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
13130 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
13131 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
13132 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
13133
13134 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
13135
13136 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
13137 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
13138
13139 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
13140 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
13141
13142 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
13143 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
13144 rb_define_method(rb_cString, "b", rb_str_b, 0);
13145 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
13146 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
13147
13148 /* define UnicodeNormalize module here so that we don't have to look it up */
13149 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
13150 id_normalize = rb_intern_const("normalize");
13151 id_normalized_p = rb_intern_const("normalized?");
13152
13153 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
13154 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
13155 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
13156
13157 rb_fs = Qnil;
13158 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
13159 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
13160 rb_gc_register_address(&rb_fs);
13161
13162 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
13166 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
13167
13168 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
13169 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
13170 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
13171 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13172 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13173 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13174
13175 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13176 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13177 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13178 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13179
13180 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13181 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13182 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13183 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13184 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13185 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13186 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13187
13188 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13189 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13190 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13191 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13192
13193 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13194 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13195
13196 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13197}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
#define RUBY_ATOMIC_VALUE_CAS(var, oldval, newval)
Identical to RUBY_ATOMIC_CAS, except it expects its arguments are VALUE.
Definition atomic.h:356
#define RUBY_ATOMIC_VALUE_SET(var, val)
Identical to RUBY_ATOMIC_SET, except it expects its arguments are VALUE.
Definition atomic.h:328
std::atomic< unsigned > rb_atomic_t
Type that is eligible for atomic operations.
Definition atomic.h:69
#define RUBY_ATOMIC_FETCH_ADD(var, val)
Atomically replaces the value pointed by var with the result of addition of val to the old value of v...
Definition atomic.h:93
#define RUBY_ATOMIC_VALUE_EXCHANGE(var, val)
Identical to RUBY_ATOMIC_EXCHANGE, except it expects its arguments are VALUE.
Definition atomic.h:342
#define RUBY_ATOMIC_DEC(var)
Atomically decrements the value pointed by var.
Definition atomic.h:198
#define RUBY_ATOMIC_LOAD(var)
Atomic load.
Definition atomic.h:150
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition class.c:2635
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:937
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:676
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2121
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2139
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1296
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3532
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:81
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1284
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:80
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3216
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1655
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:1270
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1520
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3316
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1539
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12941
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2627
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:4003
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1468
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1760
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1661
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:1289
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12963
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:1154
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:430
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:1039
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1858
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1058
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1864
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4219
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3716
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1926
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:2055
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1825
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2778
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:4068
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1736
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12571
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2850
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1712
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:2049
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3344
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5660
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4437
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3441
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11870
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:2091
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1502
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1324
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1831
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:2294
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4423
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3836
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2716
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:2312
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6897
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3449
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12935
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1742
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:4034
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3391
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4539
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3658
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7618
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:3078
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12928
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4493
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4310
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4468
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:4010
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3566
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:6170
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11928
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:2005
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:3238
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3538
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3641
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1514
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:3032
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7732
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1724
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:2021
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2730
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:6088
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9825
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1508
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:894
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:2153
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1924
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1941
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2958
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1289
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:986
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12895
ID rb_to_id(VALUE str)
Definition string.c:12885
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1865
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4463
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1754
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:3215
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:3097
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encoding.
Definition string.c:1748
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:3110
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:2082
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C string.
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
#define TypedData_Make_Struct(klass, type, data_type, sval)
Identical to TypedData_Wrap_Struct, except it allocates a new data region internally instead of taking an existing one.
Definition rtypeddata.h:497
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
Definition string.c:530
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
const char * wrap_struct_name
Name of structs of this kind.
Definition rtypeddata.h:207
Definition string.c:8690
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions when the predicate fails.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113