Ruby 3.5.0dev (2025-04-03 revision 1dddc6c78b5f6dc6ae18ee04ebe44abfce3b0433)
string.c (1dddc6c78b5f6dc6ae18ee04ebe44abfce3b0433)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
/* Start / end offset of match group `no`.  Both macros expand to an access
 * through a variable named `regs`, which must be in scope at the use site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* Flags of RString
83 *
84 * 0: STR_SHARED (equal to ELTS_SHARED)
85 * The string is shared. The buffer this string points to is owned by
86 * another string (the shared root).
 * 1: RSTRING_NOEMBED
 *    The string is not embedded. When a string is embedded, the contents
 *    follow the header. When a string is not embedded, the contents are
 *    in a separately allocated buffer.
91 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
92 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
93 * It emits a deprecation warning when mutated for the first time.
94 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
95 * The string was allocated by the `Symbol#to_s` method.
96 * It emits a deprecation warning when mutated for the first time.
97 * 4: STR_PRECOMPUTED_HASH
98 * The string is embedded and has its precomputed hashcode stored
99 * after the terminator.
100 * 5: STR_SHARED_ROOT
101 * Other strings may point to the contents of this string. When this
102 * flag is set, STR_SHARED must not be set.
103 * 6: STR_BORROWED
104 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
105 * to be unshared by rb_str_tmp_frozen_release.
106 * 7: STR_TMPLOCK
107 * The pointer to the buffer is passed to a system call such as
108 * read(2). Any modification and realloc is prohibited.
109 * 8-9: ENC_CODERANGE
110 * Stores the coderange of the string.
111 * 10-16: ENCODING
112 * Stores the encoding of the string.
113 * 17: RSTRING_FSTR
114 * The string is a fstring. The string is deduplicated in the fstring
115 * table.
116 * 18: STR_NOFREE
117 * Do not free this string's buffer when the string is reclaimed
118 * by the garbage collector. Used for when the string buffer is a C
119 * string literal.
120 * 19: STR_FAKESTR
121 * The string is not allocated or managed by the garbage collector.
122 * Typically, the string object header (struct RString) is temporarily
123 * allocated on C stack.
124 */
125
#define RUBY_MAX_CHAR_LEN 16

/* String-private flag bits; FL_USERn corresponds to entry n of the
 * "Flags of RString" table. */
#define STR_PRECOMPUTED_HASH FL_USER4   /* hash code stored after the terminator */
#define STR_SHARED_ROOT      FL_USER5   /* other strings may point into this one */
#define STR_BORROWED         FL_USER6   /* unsafe to unshare in rb_str_tmp_frozen_release */
#define STR_TMPLOCK          FL_USER7   /* buffer handed to a system call */
#define STR_NOFREE           FL_USER18  /* buffer must not be freed by the GC */
#define STR_FAKESTR          FL_USER19  /* header is not managed by the GC */
133
/* Mark `str` as a plain heap string: set STR_NOEMBED and clear the
 * sharing-related flags. */
#define STR_SET_NOEMBED(str) do { \
    FL_SET((str), STR_NOEMBED); \
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED); \
} while (0)

/* Mark `str` as embedded by clearing the heap-only flags. */
#define STR_SET_EMBED(str) \
    FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139
/* Store `n` as the length of `str`; nothing else (buffer, terminator,
 * flags) is touched. */
#define STR_SET_LEN(str, n) do { RSTRING(str)->len = (n); } while (0)
143
144static inline bool
145str_encindex_fastpath(int encindex)
146{
147 // The overwhelming majority of strings are in one of these 3 encodings.
148 switch (encindex) {
149 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_UTF_8:
151 case ENCINDEX_US_ASCII:
152 return true;
153 default:
154 return false;
155 }
156}
157
158static inline bool
159str_enc_fastpath(VALUE str)
160{
161 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
162}
163
/* Terminator width of `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset is only used for terminators wider than one byte.
 * Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do { \
    char *const term_fill_ptr = (ptr); \
    const int term_fill_len = (termlen); \
    term_fill_ptr[0] = '\0'; \
    if (UNLIKELY(term_fill_len > 1)) { \
        memset(term_fill_ptr, 0, term_fill_len); \
    } \
} while (0)
172
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the current contents are copied into a freshly allocated heap
 *   buffer and the string is switched to non-embedded.
 * - Heap string: the buffer is reallocated in place.  The string must not be
 *   shared (asserted).
 *
 * The terminator itself is not written here.  `capacity` and `termlen` are
 * parenthesized at every use so that arbitrary expressions can be passed
 * safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
196
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Nothing is done for fake strings.  Otherwise the reference is stored
 * through RB_OBJ_WRITE, `str` gets STR_SHARED, `shared_str` gets
 * STR_SHARED_ROOT, and a root whose class is 0 is additionally flagged
 * STR_BORROWED.  The assertions require the pointer of `str` to lie inside
 * (or one past the end of) the root's contents. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
208
/* Buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Byte size of a non-embedded string's buffer: recorded capacity plus the
 * terminator length.
 * TODO: include the terminator size in capa. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)
214
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless the build defines it. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif

/* Whether the substring [beg, beg+len) of a buffer ending at `end` may share
 * that buffer: always when middle substrings are sharable, otherwise only
 * when the substring reaches the end of the buffer. */
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
223
224
225static inline long
226str_embed_capa(VALUE str)
227{
228 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
229}
230
231bool
232rb_str_reembeddable_p(VALUE str)
233{
234 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
235}
236
237static inline size_t
238rb_str_embed_size(long capa)
239{
240 return offsetof(struct RString, as.embed.ary) + capa;
241}
242
243size_t
244rb_str_size_as_embedded(VALUE str)
245{
246 size_t real_size;
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
249 }
250 /* if the string is not currently embedded, but it can be embedded, how
251 * much space would it require */
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
254 }
255 else {
256 real_size = sizeof(struct RString);
257 }
258
259 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
260 real_size += sizeof(st_index_t);
261 }
262
263 return real_size;
264}
265
/* Whether the GC can allocate an object large enough to embed `len` content
 * bytes followed by a `termlen`-byte terminator. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(object_size);
}
271
272static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
273static VALUE str_new_frozen(VALUE klass, VALUE orig);
274static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
275static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
276static VALUE str_new(VALUE klass, const char *ptr, long len);
277static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
278static inline void str_modifiable(VALUE str);
279static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
280static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
281
282static inline void
283str_make_independent(VALUE str)
284{
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str), len, 0L, termlen);
288}
289
290static inline int str_dependent_p(VALUE str);
291
292void
293rb_str_make_independent(VALUE str)
294{
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
297 }
298}
299
300void
301rb_str_make_embedded(VALUE str)
302{
303 RUBY_ASSERT(rb_str_reembeddable_p(str));
304 RUBY_ASSERT(!STR_EMBED_P(str));
305
306 char *buf = RSTRING(str)->as.heap.ptr;
307 long len = RSTRING(str)->len;
308
309 STR_SET_EMBED(str);
310 STR_SET_LEN(str, len);
311
312 if (len > 0) {
313 memcpy(RSTRING_PTR(str), buf, len);
314 ruby_xfree(buf);
315 }
316
317 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
318}
319
/* Print a diagnostic to stderr saying that `func` is about to return NULL
 * and asking the developer to attach a debugger. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
329
330/* symbols for [up|down|swap]case/capitalize options */
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
332
333static rb_encoding *
334get_encoding(VALUE str)
335{
336 return rb_enc_from_index(ENCODING_GET(str));
337}
338
339static void
340mustnot_broken(VALUE str)
341{
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
344 }
345}
346
347static void
348mustnot_wchar(VALUE str)
349{
350 rb_encoding *enc = STR_ENC_GET(str);
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
353 }
354}
355
356static int fstring_cmp(VALUE a, VALUE b);
357
358static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
359
360#if SIZEOF_LONG == SIZEOF_VOIDP
361#define PRECOMPUTED_FAKESTR_HASH 1
362#else
363#endif
364
365#ifdef PRECOMPUTED_FAKESTR_HASH
366static st_index_t
367fstring_hash(VALUE str)
368{
369 st_index_t h;
370 if (FL_TEST_RAW(str, STR_FAKESTR)) {
371 // register_fstring precomputes the hash and stores it in capa for fake strings
372 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
373 }
374 else {
375 h = rb_str_hash(str);
376 }
377 // rb_str_hash doesn't include the encoding for ascii only strings, so
378 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
379 return rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
380}
381#else
382#define fstring_hash rb_str_hash
383#endif
384
385const struct st_hash_type rb_fstring_hash_type = {
386 fstring_cmp,
387 fstring_hash,
388};
389
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * String. */
#define BARE_STRING_P(str) \
    (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
391
392static inline st_index_t
393str_do_hash(VALUE str)
394{
395 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
396 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
397 if (e && !is_ascii_string(str)) {
398 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
399 }
400 return h;
401}
402
403static VALUE
404str_store_precomputed_hash(VALUE str, st_index_t hash)
405{
406 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
407 RUBY_ASSERT(STR_EMBED_P(str));
408
409#if RUBY_DEBUG
410 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
411 size_t free_bytes = str_embed_capa(str) - used_bytes;
412 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
413#endif
414
415 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
416
417 FL_SET(str, STR_PRECOMPUTED_HASH);
418
419 return str;
420}
421
423 VALUE fstr;
424 bool copy;
425 bool force_precompute_hash;
426};
427
428static int
429fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
430{
431 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
432 VALUE str = (VALUE)*key;
433
434 if (existing) {
435 /* because of lazy sweep, str may be unmarked already and swept
436 * at next time */
437
438 if (rb_objspace_garbage_object_p(str)) {
439 arg->fstr = Qundef;
440 // When RSTRING_FSTR strings are swept, they call `st_delete`.
441 // To avoid a race condition if an equivalent string was inserted
442 // we must remove the flag immediately.
443 FL_UNSET_RAW(str, RSTRING_FSTR);
444 return ST_DELETE;
445 }
446
447 arg->fstr = str;
448 return ST_STOP;
449 }
450 else {
451 // Unless the string is empty or binary, its coderange has been precomputed.
452 int coderange = ENC_CODERANGE(str);
453
454 if (FL_TEST_RAW(str, STR_FAKESTR)) {
455 if (arg->copy) {
456 VALUE new_str;
457 long len = RSTRING_LEN(str);
458 long capa = len + sizeof(st_index_t);
459 int term_len = TERM_LEN(str);
460
461 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
462 new_str = str_alloc_embed(rb_cString, capa + term_len);
463 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
464 STR_SET_LEN(new_str, RSTRING_LEN(str));
465 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
466 rb_enc_copy(new_str, str);
467 str_store_precomputed_hash(new_str, str_do_hash(str));
468 }
469 else {
470 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
471 rb_enc_copy(new_str, str);
472#ifdef PRECOMPUTED_FAKESTR_HASH
473 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
474 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
475 }
476#endif
477 }
478 str = new_str;
479 }
480 else {
481 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
482 RSTRING(str)->len,
483 ENCODING_GET(str));
484 }
485 OBJ_FREEZE(str);
486 }
487 else {
488 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
489 str = str_new_frozen(rb_cString, str);
490 }
491 if (STR_SHARED_P(str)) { /* str should not be shared */
492 /* shared substring */
493 str_make_independent(str);
495 }
496 if (!BARE_STRING_P(str)) {
497 str = str_new_frozen(rb_cString, str);
498 }
499 }
500
501 ENC_CODERANGE_SET(str, coderange);
502 RBASIC(str)->flags |= RSTRING_FSTR;
503
504 *key = *value = arg->fstr = str;
505 return ST_CONTINUE;
506 }
507}
508
509VALUE
510rb_fstring(VALUE str)
511{
512 VALUE fstr;
513 int bare;
514
515 Check_Type(str, T_STRING);
516
517 if (FL_TEST(str, RSTRING_FSTR))
518 return str;
519
520 bare = BARE_STRING_P(str);
521 if (!bare) {
522 if (STR_EMBED_P(str)) {
523 OBJ_FREEZE(str);
524 return str;
525 }
526
527 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
529 return str;
530 }
531 }
532
533 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
534 rb_str_resize(str, RSTRING_LEN(str));
535
536 fstr = register_fstring(str, false, false);
537
538 if (!bare) {
539 str_replace_shared_without_enc(str, fstr);
540 OBJ_FREEZE(str);
541 return str;
542 }
543 return fstr;
544}
545
546static VALUE
547register_fstring(VALUE str, bool copy, bool force_precompute_hash)
548{
549 struct fstr_update_arg args = {
550 .copy = copy,
551 .force_precompute_hash = force_precompute_hash
552 };
553
554#if SIZEOF_VOIDP == SIZEOF_LONG
555 if (FL_TEST_RAW(str, STR_FAKESTR)) {
556 // if the string hasn't been interned, we'll need the hash twice, so we
557 // compute it once and store it in capa
558 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
559 }
560#endif
561
562 RB_VM_LOCK_ENTER();
563 {
564 st_table *frozen_strings = rb_vm_fstring_table();
565 do {
566 args.fstr = str;
567 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
568 } while (UNDEF_P(args.fstr));
569 }
570 RB_VM_LOCK_LEAVE();
571
572 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
573 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
574 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
575 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
576
577 return args.fstr;
578}
579
580static VALUE
581setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
582{
583 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
584
585 if (!name) {
587 name = "";
588 }
589
590 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
591
592 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
593 fake_str->len = len;
594 fake_str->as.heap.ptr = (char *)name;
595 fake_str->as.heap.aux.capa = len;
596 return (VALUE)fake_str;
597}
598
599/*
600 * set up a fake string which refers a static string literal.
601 */
602VALUE
603rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
604{
605 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
606}
607
608/*
609 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
610 * shared string which refers a static string literal. `ptr` must
611 * point a constant string.
612 */
613VALUE
614rb_fstring_new(const char *ptr, long len)
615{
616 struct RString fake_str;
617 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
618}
619
620VALUE
621rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
622{
623 struct RString fake_str;
624 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
625}
626
627VALUE
628rb_fstring_cstr(const char *ptr)
629{
630 return rb_fstring_new(ptr, strlen(ptr));
631}
632
633static int
634fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
635{
636 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
637 return ST_CONTINUE;
638}
639
640static int
641fstring_cmp(VALUE a, VALUE b)
642{
643 long alen, blen;
644 const char *aptr, *bptr;
645 RSTRING_GETMEM(a, aptr, alen);
646 RSTRING_GETMEM(b, bptr, blen);
647 return (alen != blen ||
648 ENCODING_GET(a) != ENCODING_GET(b) ||
649 memcmp(aptr, bptr, alen) != 0);
650}
651
652static inline bool
653single_byte_optimizable(VALUE str)
654{
655 int encindex = ENCODING_GET(str);
656 switch (encindex) {
657 case ENCINDEX_ASCII_8BIT:
658 case ENCINDEX_US_ASCII:
659 return true;
660 case ENCINDEX_UTF_8:
661 // For UTF-8 it's worth scanning the string coderange when unknown.
663 }
664 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
665 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
666 return true;
667 }
668
669 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
670 return true;
671 }
672
673 /* Conservative. Possibly single byte.
674 * "\xa1" in Shift_JIS for example. */
675 return false;
676}
677
679
680static inline const char *
681search_nonascii(const char *p, const char *e)
682{
683 const uintptr_t *s, *t;
684
685#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
686# if SIZEOF_UINTPTR_T == 8
687# define NONASCII_MASK UINT64_C(0x8080808080808080)
688# elif SIZEOF_UINTPTR_T == 4
689# define NONASCII_MASK UINT32_C(0x80808080)
690# else
691# error "don't know what to do."
692# endif
693#else
694# if SIZEOF_UINTPTR_T == 8
695# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
696# elif SIZEOF_UINTPTR_T == 4
697# define NONASCII_MASK 0x80808080UL /* or...? */
698# else
699# error "don't know what to do."
700# endif
701#endif
702
703 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
704#if !UNALIGNED_WORD_ACCESS
705 if ((uintptr_t)p % SIZEOF_VOIDP) {
706 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
707 p += l;
708 switch (l) {
709 default: UNREACHABLE;
710#if SIZEOF_VOIDP > 4
711 case 7: if (p[-7]&0x80) return p-7;
712 case 6: if (p[-6]&0x80) return p-6;
713 case 5: if (p[-5]&0x80) return p-5;
714 case 4: if (p[-4]&0x80) return p-4;
715#endif
716 case 3: if (p[-3]&0x80) return p-3;
717 case 2: if (p[-2]&0x80) return p-2;
718 case 1: if (p[-1]&0x80) return p-1;
719 case 0: break;
720 }
721 }
722#endif
723#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
724#define aligned_ptr(value) \
725 __builtin_assume_aligned((value), sizeof(uintptr_t))
726#else
727#define aligned_ptr(value) (uintptr_t *)(value)
728#endif
729 s = aligned_ptr(p);
730 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
731#undef aligned_ptr
732 for (;s < t; s++) {
733 if (*s & NONASCII_MASK) {
734#ifdef WORDS_BIGENDIAN
735 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
736#else
737 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
738#endif
739 }
740 }
741 p = (const char *)s;
742 }
743
744 switch (e - p) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (e[-7]&0x80) return e-7;
748 case 6: if (e[-6]&0x80) return e-6;
749 case 5: if (e[-5]&0x80) return e-5;
750 case 4: if (e[-4]&0x80) return e-4;
751#endif
752 case 3: if (e[-3]&0x80) return e-3;
753 case 2: if (e[-2]&0x80) return e-2;
754 case 1: if (e[-1]&0x80) return e-1;
755 case 0: return NULL;
756 }
757}
758
759static int
760coderange_scan(const char *p, long len, rb_encoding *enc)
761{
762 const char *e = p + len;
763
764 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
765 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
766 p = search_nonascii(p, e);
768 }
769
770 if (rb_enc_asciicompat(enc)) {
771 p = search_nonascii(p, e);
772 if (!p) return ENC_CODERANGE_7BIT;
773 for (;;) {
774 int ret = rb_enc_precise_mbclen(p, e, enc);
776 p += MBCLEN_CHARFOUND_LEN(ret);
777 if (p == e) break;
778 p = search_nonascii(p, e);
779 if (!p) break;
780 }
781 }
782 else {
783 while (p < e) {
784 int ret = rb_enc_precise_mbclen(p, e, enc);
786 p += MBCLEN_CHARFOUND_LEN(ret);
787 }
788 }
789 return ENC_CODERANGE_VALID;
790}
791
792long
793rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
794{
795 const char *p = s;
796
797 if (*cr == ENC_CODERANGE_BROKEN)
798 return e - s;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 if (*cr == ENC_CODERANGE_VALID) return e - s;
803 p = search_nonascii(p, e);
805 return e - s;
806 }
807 else if (rb_enc_asciicompat(enc)) {
808 p = search_nonascii(p, e);
809 if (!p) {
810 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
811 return e - s;
812 }
813 for (;;) {
814 int ret = rb_enc_precise_mbclen(p, e, enc);
815 if (!MBCLEN_CHARFOUND_P(ret)) {
817 return p - s;
818 }
819 p += MBCLEN_CHARFOUND_LEN(ret);
820 if (p == e) break;
821 p = search_nonascii(p, e);
822 if (!p) break;
823 }
824 }
825 else {
826 while (p < e) {
827 int ret = rb_enc_precise_mbclen(p, e, enc);
828 if (!MBCLEN_CHARFOUND_P(ret)) {
830 return p - s;
831 }
832 p += MBCLEN_CHARFOUND_LEN(ret);
833 }
834 }
836 return e - s;
837}
838
839static inline void
840str_enc_copy(VALUE str1, VALUE str2)
841{
842 rb_enc_set_index(str1, ENCODING_GET(str2));
843}
844
845/* Like str_enc_copy, but does not check frozen status of str1.
846 * You should use this only if you're certain that str1 is not frozen. */
847static inline void
848str_enc_copy_direct(VALUE str1, VALUE str2)
849{
850 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
851 if (inlined_encoding == ENCODING_INLINE_MAX) {
852 rb_enc_set_index(str1, rb_enc_get_index(str2));
853 }
854 else {
855 ENCODING_SET_INLINED(str1, inlined_encoding);
856 }
857}
858
859static void
860rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
861{
862 /* this function is designed for copying encoding and coderange
863 * from src to new string "dest" which is made from the part of src.
864 */
865 str_enc_copy(dest, src);
866 if (RSTRING_LEN(dest) == 0) {
867 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
869 else
871 return;
872 }
873 switch (ENC_CODERANGE(src)) {
876 break;
878 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
879 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
881 else
883 break;
884 default:
885 break;
886 }
887}
888
889static void
890rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
891{
892 str_enc_copy(dest, src);
894}
895
896static int
897enc_coderange_scan(VALUE str, rb_encoding *enc)
898{
899 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
900}
901
902int
903rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
904{
905 return enc_coderange_scan(str, enc);
906}
907
908int
910{
911 int cr = ENC_CODERANGE(str);
912
913 if (cr == ENC_CODERANGE_UNKNOWN) {
914 cr = enc_coderange_scan(str, get_encoding(str));
915 ENC_CODERANGE_SET(str, cr);
916 }
917 return cr;
918}
919
920static inline bool
921rb_enc_str_asciicompat(VALUE str)
922{
923 int encindex = ENCODING_GET_INLINED(str);
924 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
925}
926
927int
929{
930 switch(ENC_CODERANGE(str)) {
932 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
934 return true;
935 default:
936 return false;
937 }
938}
939
940static inline void
941str_mod_check(VALUE s, const char *p, long len)
942{
943 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
944 rb_raise(rb_eRuntimeError, "string modified");
945 }
946}
947
948static size_t
949str_capacity(VALUE str, const int termlen)
950{
951 if (STR_EMBED_P(str)) {
952 return str_embed_capa(str) - termlen;
953 }
954 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
955 return RSTRING(str)->len;
956 }
957 else {
958 return RSTRING(str)->as.heap.aux.capa;
959 }
960}
961
962size_t
964{
965 return str_capacity(str, TERM_LEN(str));
966}
967
968static inline void
969must_not_null(const char *ptr)
970{
971 if (!ptr) {
972 rb_raise(rb_eArgError, "NULL pointer given");
973 }
974}
975
976static inline VALUE
977str_alloc_embed(VALUE klass, size_t capa)
978{
979 size_t size = rb_str_embed_size(capa);
980 RUBY_ASSERT(size > 0);
981 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
982
983 NEWOBJ_OF(str, struct RString, klass,
985
986 return (VALUE)str;
987}
988
989static inline VALUE
990str_alloc_heap(VALUE klass)
991{
992 NEWOBJ_OF(str, struct RString, klass,
993 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
994
995 return (VALUE)str;
996}
997
998static inline VALUE
999empty_str_alloc(VALUE klass)
1000{
1001 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1002 VALUE str = str_alloc_embed(klass, 0);
1003 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1005 return str;
1006}
1007
1008static VALUE
1009str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1010{
1011 VALUE str;
1012
1013 if (len < 0) {
1014 rb_raise(rb_eArgError, "negative string size (or size too big)");
1015 }
1016
1017 if (enc == NULL) {
1018 enc = rb_ascii8bit_encoding();
1019 }
1020
1021 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1022
1023 int termlen = rb_enc_mbminlen(enc);
1024
1025 if (STR_EMBEDDABLE_P(len, termlen)) {
1026 str = str_alloc_embed(klass, len + termlen);
1027 if (len == 0) {
1028 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1029 }
1030 }
1031 else {
1032 str = str_alloc_heap(klass);
1033 RSTRING(str)->as.heap.aux.capa = len;
1034 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1035 * integer overflow. If we can STATIC_ASSERT that, the following
1036 * mul_add_mul can be reverted to a simple ALLOC_N. */
1037 RSTRING(str)->as.heap.ptr =
1038 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1039 }
1040
1041 rb_enc_raw_set(str, enc);
1042
1043 if (ptr) {
1044 memcpy(RSTRING_PTR(str), ptr, len);
1045 }
1046
1047 STR_SET_LEN(str, len);
1048 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1049 return str;
1050}
1051
1052static VALUE
1053str_new(VALUE klass, const char *ptr, long len)
1054{
1055 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1056}
1057
1058VALUE
1059rb_str_new(const char *ptr, long len)
1060{
1061 return str_new(rb_cString, ptr, len);
1062}
1063
1064VALUE
1065rb_usascii_str_new(const char *ptr, long len)
1066{
1067 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1068}
1069
1070VALUE
1071rb_utf8_str_new(const char *ptr, long len)
1072{
1073 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1074}
1075
1076VALUE
1077rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1078{
1079 return str_enc_new(rb_cString, ptr, len, enc);
1080}
1081
1082VALUE
1084{
1085 must_not_null(ptr);
1086 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1087 * memory regions, and that cannot be detected by the MSAN. Just
1088 * trust the programmer that the argument passed here is a sane C
1089 * string. */
1090 __msan_unpoison_string(ptr);
1091 return rb_str_new(ptr, strlen(ptr));
1092}
1093
1094VALUE
1096{
1097 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1098}
1099
1100VALUE
1102{
1103 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1104}
1105
1106VALUE
1108{
1109 must_not_null(ptr);
1110 if (rb_enc_mbminlen(enc) != 1) {
1111 rb_raise(rb_eArgError, "wchar encoding given");
1112 }
1113 return rb_enc_str_new(ptr, strlen(ptr), enc);
1114}
1115
1116static VALUE
1117str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1118{
1119 VALUE str;
1120
1121 if (len < 0) {
1122 rb_raise(rb_eArgError, "negative string size (or size too big)");
1123 }
1124
1125 if (!ptr) {
1126 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1127 }
1128 else {
1129 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1130 str = str_alloc_heap(klass);
1131 RSTRING(str)->len = len;
1132 RSTRING(str)->as.heap.ptr = (char *)ptr;
1133 RSTRING(str)->as.heap.aux.capa = len;
1134 RBASIC(str)->flags |= STR_NOFREE;
1135 rb_enc_associate_index(str, encindex);
1136 }
1137 return str;
1138}
1139
1140VALUE
1141rb_str_new_static(const char *ptr, long len)
1142{
1143 return str_new_static(rb_cString, ptr, len, 0);
1144}
1145
1146VALUE
1148{
1149 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1150}
1151
1152VALUE
1154{
1155 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1156}
1157
1158VALUE
1160{
1161 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1162}
1163
1164static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1165 rb_encoding *from, rb_encoding *to,
1166 int ecflags, VALUE ecopts);
1167
1168static inline bool
1169is_enc_ascii_string(VALUE str, rb_encoding *enc)
1170{
1171 int encidx = rb_enc_to_index(enc);
1172 if (rb_enc_get_index(str) == encidx)
1173 return is_ascii_string(str);
1174 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1175}
1176
1177VALUE
1178rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1179{
1180 long len;
1181 const char *ptr;
1182 VALUE newstr;
1183
1184 if (!to) return str;
1185 if (!from) from = rb_enc_get(str);
1186 if (from == to) return str;
1187 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1188 rb_is_ascii8bit_enc(to)) {
1189 if (STR_ENC_GET(str) != to) {
1190 str = rb_str_dup(str);
1191 rb_enc_associate(str, to);
1192 }
1193 return str;
1194 }
1195
1196 RSTRING_GETMEM(str, ptr, len);
1197 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1198 from, to, ecflags, ecopts);
1199 if (NIL_P(newstr)) {
1200 /* some error, return original */
1201 return str;
1202 }
1203 return newstr;
1204}
1205
1206VALUE
1207rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1208 rb_encoding *from, int ecflags, VALUE ecopts)
1209{
1210 long olen;
1211
1212 olen = RSTRING_LEN(newstr);
1213 if (ofs < -olen || olen < ofs)
1214 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1215 if (ofs < 0) ofs += olen;
1216 if (!from) {
1217 STR_SET_LEN(newstr, ofs);
1218 return rb_str_cat(newstr, ptr, len);
1219 }
1220
1221 rb_str_modify(newstr);
1222 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1223 rb_enc_get(newstr),
1224 ecflags, ecopts);
1225}
1226
1227VALUE
1228rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1229{
1230 STR_SET_LEN(str, 0);
1231 rb_enc_associate(str, enc);
1232 rb_str_cat(str, ptr, len);
1233 return str;
1234}
1235
1236static VALUE
1237str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1238 rb_encoding *from, rb_encoding *to,
1239 int ecflags, VALUE ecopts)
1240{
1241 rb_econv_t *ec;
1243 long olen;
1244 VALUE econv_wrapper;
1245 const unsigned char *start, *sp;
1246 unsigned char *dest, *dp;
1247 size_t converted_output = (size_t)ofs;
1248
1249 olen = rb_str_capacity(newstr);
1250
1251 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1252 RBASIC_CLEAR_CLASS(econv_wrapper);
1253 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1254 if (!ec) return Qnil;
1255 DATA_PTR(econv_wrapper) = ec;
1256
1257 sp = (unsigned char*)ptr;
1258 start = sp;
1259 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1260 (dp = dest + converted_output),
1261 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1263 /* destination buffer short */
1264 size_t converted_input = sp - start;
1265 size_t rest = len - converted_input;
1266 converted_output = dp - dest;
1267 rb_str_set_len(newstr, converted_output);
1268 if (converted_input && converted_output &&
1269 rest < (LONG_MAX / converted_output)) {
1270 rest = (rest * converted_output) / converted_input;
1271 }
1272 else {
1273 rest = olen;
1274 }
1275 olen += rest < 2 ? 2 : rest;
1276 rb_str_resize(newstr, olen);
1277 }
1278 DATA_PTR(econv_wrapper) = 0;
1279 RB_GC_GUARD(econv_wrapper);
1280 rb_econv_close(ec);
1281 switch (ret) {
1282 case econv_finished:
1283 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1284 rb_str_set_len(newstr, len);
1285 rb_enc_associate(newstr, to);
1286 return newstr;
1287
1288 default:
1289 return Qnil;
1290 }
1291}
1292
1293VALUE
1295{
1296 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1297}
1298
1299VALUE
1301{
1302 rb_encoding *ienc;
1303 VALUE str;
1304 const int eidx = rb_enc_to_index(eenc);
1305
1306 if (!ptr) {
1307 return rb_enc_str_new(ptr, len, eenc);
1308 }
1309
1310 /* ASCII-8BIT case, no conversion */
1311 if ((eidx == rb_ascii8bit_encindex()) ||
1312 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1313 return rb_str_new(ptr, len);
1314 }
1315 /* no default_internal or same encoding, no conversion */
1316 ienc = rb_default_internal_encoding();
1317 if (!ienc || eenc == ienc) {
1318 return rb_enc_str_new(ptr, len, eenc);
1319 }
1320 /* ASCII compatible, and ASCII only string, no conversion in
1321 * default_internal */
1322 if ((eidx == rb_ascii8bit_encindex()) ||
1323 (eidx == rb_usascii_encindex()) ||
1324 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1325 return rb_enc_str_new(ptr, len, ienc);
1326 }
1327 /* convert from the given encoding to default_internal */
1328 str = rb_enc_str_new(NULL, 0, ienc);
1329 /* when the conversion failed for some reason, just ignore the
1330 * default_internal and result in the given encoding as-is. */
1331 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1332 rb_str_initialize(str, ptr, len, eenc);
1333 }
1334 return str;
1335}
1336
1337VALUE
1338rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1339{
1340 int eidx = rb_enc_to_index(eenc);
1341 if (eidx == rb_usascii_encindex() &&
1342 !is_ascii_string(str)) {
1343 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1344 return str;
1345 }
1346 rb_enc_associate_index(str, eidx);
1347 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1348}
1349
1350VALUE
1351rb_external_str_new(const char *ptr, long len)
1352{
1353 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1354}
1355
1356VALUE
1358{
1359 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1360}
1361
1362VALUE
1363rb_locale_str_new(const char *ptr, long len)
1364{
1365 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1366}
1367
1368VALUE
1370{
1371 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1372}
1373
1374VALUE
1376{
1377 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1378}
1379
1380VALUE
1382{
1383 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1384}
1385
1386VALUE
1388{
1389 return rb_str_export_to_enc(str, rb_default_external_encoding());
1390}
1391
1392VALUE
1394{
1395 return rb_str_export_to_enc(str, rb_locale_encoding());
1396}
1397
1398VALUE
1400{
1401 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1402}
1403
/*
 * Makes str2 hold the same bytes as str without touching str2's encoding
 * or code range.
 *
 * When the content (plus terminator) fits str2's embedded capacity it is
 * copied there.  Otherwise str2 is pointed at the buffer of a frozen root
 * (str's existing shared root, or a fresh frozen version of str) and
 * marked as sharing that root.  Returns str2.
 */
1404static VALUE
1405str_replace_shared_without_enc(VALUE str2, VALUE str)
1406{
1407 const int termlen = TERM_LEN(str);
1408 char *ptr;
1409 long len;
1410
1411 RSTRING_GETMEM(str, ptr, len);
1412 if (str_embed_capa(str2) >= len + termlen) {
 /* small enough: copy into str2's inline storage */
1413 char *ptr2 = RSTRING(str2)->as.embed.ary;
1414 STR_SET_EMBED(str2);
1415 memcpy(ptr2, RSTRING_PTR(str), len);
1416 TERM_FILL(ptr2+len, termlen);
1417 }
1418 else {
1419 VALUE root;
1420 if (STR_SHARED_P(str)) {
 /* reuse str's root; ptr/len still come from str itself */
1421 root = RSTRING(str)->as.heap.aux.shared;
1422 RSTRING_GETMEM(str, ptr, len);
1423 }
1424 else {
 /* no root yet: obtain a frozen string to share from and use
  * its pointer and length */
1425 root = rb_str_new_frozen(str);
1426 RSTRING_GETMEM(root, ptr, len);
1427 }
1428 RUBY_ASSERT(OBJ_FROZEN(root));
1429
 /* str2 owns a private heap buffer: release it, unless it is the
  * very buffer we are about to point at */
1430 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1431 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1432 rb_fatal("about to free a possible shared root");
1433 }
1434 char *ptr2 = STR_HEAP_PTR(str2);
1435 if (ptr2 != ptr) {
1436 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1437 }
1438 }
1439 FL_SET(str2, STR_NOEMBED);
1440 RSTRING(str2)->as.heap.ptr = ptr;
1441 STR_SET_SHARED(str2, root);
1442 }
1443
1444 STR_SET_LEN(str2, len);
1445
1446 return str2;
1447}
1448
1449static VALUE
1450str_replace_shared(VALUE str2, VALUE str)
1451{
1452 str_replace_shared_without_enc(str2, str);
1453 rb_enc_cr_str_exact_copy(str2, str);
1454 return str2;
1455}
1456
1457static VALUE
1458str_new_shared(VALUE klass, VALUE str)
1459{
1460 return str_replace_shared(str_alloc_heap(klass), str);
1461}
1462
1463VALUE
1465{
1466 return str_new_shared(rb_obj_class(str), str);
1467}
1468
1469VALUE
1471{
1472 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1473 return str_new_frozen(rb_obj_class(orig), orig);
1474}
1475
1476static VALUE
1477rb_str_new_frozen_String(VALUE orig)
1478{
1479 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1480 return str_new_frozen(rb_cString, orig);
1481}
1482
1483
1484VALUE
1485rb_str_frozen_bare_string(VALUE orig)
1486{
1487 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1488 return str_new_frozen(rb_cString, orig);
1489}
1490
1491VALUE
1492rb_str_tmp_frozen_acquire(VALUE orig)
1493{
1494 if (OBJ_FROZEN_RAW(orig)) return orig;
1495 return str_new_frozen_buffer(0, orig, FALSE);
1496}
1497
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned frozen string is
 * never embedded.  orig is returned unchanged only when it is frozen,
 * heap allocated and not re-embeddable; a shared orig whose root is heap
 * allocated is delegated to rb_str_tmp_frozen_acquire().  In every other
 * case a new class-less frozen heap string is built, either by copying
 * orig's bytes or by taking over orig's buffer and making orig share it.
 */
1498VALUE
1499rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1500{
1501 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1502 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1503
 /* class 0: the result is an internal object */
1504 VALUE str = str_alloc_heap(0);
1505 OBJ_FREEZE(str);
1506 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1507 FL_SET(str, STR_SHARED_ROOT);
1508
1509 size_t capa = str_capacity(orig, TERM_LEN(orig));
1510
1511 /* If the string is embedded then we want to create a copy that is heap
1512 * allocated. If the string is shared then the shared root must be
1513 * embedded, so we want to create a copy. If the string is a shared root
1514 * then it must be embedded, so we want to create a copy. */
1515 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
 /* allocates capa + TERM_LEN(orig) bytes; only the first capa bytes
  * are copied from orig */
1516 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1517 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1518 }
1519 else {
1520 /* orig must be heap allocated and not shared, so we can safely transfer
1521 * the pointer to str. */
1522 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
 /* STR_NOFREE moves to str together with the buffer */
1523 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1524 RBASIC(orig)->flags &= ~STR_NOFREE;
1525 STR_SET_SHARED(orig, str);
1526 }
1527
1528 RSTRING(str)->len = RSTRING(orig)->len;
1529 RSTRING(str)->as.heap.aux.capa = capa;
1530
1531 return str;
1532}
1533
1534void
1535rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1536{
1537 if (RBASIC_CLASS(tmp) != 0)
1538 return;
1539
1540 if (STR_EMBED_P(tmp)) {
1542 }
1543 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1544 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1545 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1546
1547 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1548 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1549 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1550
1551 /* Unshare orig since the root (tmp) only has this one child. */
1552 FL_UNSET_RAW(orig, STR_SHARED);
1553 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1554 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1556
1557 /* Make tmp embedded and empty so it is safe for sweeping. */
1558 STR_SET_EMBED(tmp);
1559 STR_SET_LEN(tmp, 0);
1560 }
1561 }
1562}
1563
1564static VALUE
1565str_new_frozen(VALUE klass, VALUE orig)
1566{
1567 return str_new_frozen_buffer(klass, orig, TRUE);
1568}
1569
1570static VALUE
1571heap_str_make_shared(VALUE klass, VALUE orig)
1572{
1573 RUBY_ASSERT(!STR_EMBED_P(orig));
1574 RUBY_ASSERT(!STR_SHARED_P(orig));
1575
1576 VALUE str = str_alloc_heap(klass);
1577 STR_SET_LEN(str, RSTRING_LEN(orig));
1578 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1579 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1580 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1581 RBASIC(orig)->flags &= ~STR_NOFREE;
1582 STR_SET_SHARED(orig, str);
1583 if (klass == 0)
1584 FL_UNSET_RAW(str, STR_BORROWED);
1585 return str;
1586}
1587
1588static VALUE
1589str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1590{
1591 VALUE str;
1592
1593 long len = RSTRING_LEN(orig);
1594 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1595 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1596
1597 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1598 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1599 RUBY_ASSERT(STR_EMBED_P(str));
1600 }
1601 else {
1602 if (FL_TEST_RAW(orig, STR_SHARED)) {
1603 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1604 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1605 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1606 RUBY_ASSERT(ofs >= 0);
1607 RUBY_ASSERT(rest >= 0);
1608 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1610
1611 if ((ofs > 0) || (rest > 0) ||
1612 (klass != RBASIC(shared)->klass) ||
1613 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1614 str = str_new_shared(klass, shared);
1615 RUBY_ASSERT(!STR_EMBED_P(str));
1616 RSTRING(str)->as.heap.ptr += ofs;
1617 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1618 }
1619 else {
1620 if (RBASIC_CLASS(shared) == 0)
1621 FL_SET_RAW(shared, STR_BORROWED);
1622 return shared;
1623 }
1624 }
1625 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1626 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1627 STR_SET_EMBED(str);
1628 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1629 STR_SET_LEN(str, RSTRING_LEN(orig));
1630 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1631 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1632 }
1633 else {
1634 str = heap_str_make_shared(klass, orig);
1635 }
1636 }
1637
1638 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1639 OBJ_FREEZE(str);
1640 return str;
1641}
1642
1643VALUE
1644rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1645{
1646 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1647}
1648
1649static VALUE
1650str_new_empty_String(VALUE str)
1651{
1652 VALUE v = rb_str_new(0, 0);
1653 rb_enc_copy(v, str);
1654 return v;
1655}
1656
1657#define STR_BUF_MIN_SIZE 63
1658
1659VALUE
1661{
1662 if (STR_EMBEDDABLE_P(capa, 1)) {
1663 return str_alloc_embed(rb_cString, capa + 1);
1664 }
1665
1666 VALUE str = str_alloc_heap(rb_cString);
1667
1668 RSTRING(str)->as.heap.aux.capa = capa;
1669 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1670 RSTRING(str)->as.heap.ptr[0] = '\0';
1671
1672 return str;
1673}
1674
1675VALUE
1677{
1678 VALUE str;
1679 long len = strlen(ptr);
1680
1681 str = rb_str_buf_new(len);
1682 rb_str_buf_cat(str, ptr, len);
1683
1684 return str;
1685}
1686
1687VALUE
1689{
1690 return str_new(0, 0, len);
1691}
1692
1693void
1695{
1696 if (STR_EMBED_P(str)) {
1697 RB_DEBUG_COUNTER_INC(obj_str_embed);
1698 }
1699 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1700 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1701 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1702 }
1703 else {
1704 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1705 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1706 }
1707}
1708
1709size_t
1710rb_str_memsize(VALUE str)
1711{
1712 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1713 return STR_HEAP_SIZE(str);
1714 }
1715 else {
1716 return 0;
1717 }
1718}
1719
1720VALUE
1722{
1723 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1724}
1725
1726static inline void str_discard(VALUE str);
1727static void str_shared_replace(VALUE str, VALUE str2);
1728
1729void
1731{
1732 if (str != str2) str_shared_replace(str, str2);
1733}
1734
/*
 * Moves the content, encoding and code range of str2 into str.  str's old
 * content is discarded first.  Content small enough for str's embedded
 * capacity is copied and str2 is left as it was; otherwise str takes over
 * str2's buffer (or its shared root) and str2 is reset to an empty
 * embedded string.  str2 must not be str.
 */
1735static void
1736str_shared_replace(VALUE str, VALUE str2)
1737{
1738 rb_encoding *enc;
1739 int cr;
1740 int termlen;
1741
1742 RUBY_ASSERT(str2 != str);
 /* read str2's encoding and code range before anything is modified */
1743 enc = STR_ENC_GET(str2);
1744 cr = ENC_CODERANGE(str2);
1745 str_discard(str);
1746 termlen = rb_enc_mbminlen(enc);
1747
1748 STR_SET_LEN(str, RSTRING_LEN(str2));
1749
1750 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
 /* fits inline: copy the bytes together with the terminator */
1751 STR_SET_EMBED(str);
1752 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1753 rb_enc_associate(str, enc);
1754 ENC_CODERANGE_SET(str, cr);
1755 }
1756 else {
1757 if (STR_EMBED_P(str2)) {
 /* str2 is embedded but too big for str's slot: move its bytes
  * to a heap buffer first so that the pointer can be handed
  * over below */
1758 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1759 long len = RSTRING_LEN(str2);
1760 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1761
1762 char *new_ptr = ALLOC_N(char, len + termlen);
1763 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1764 RSTRING(str2)->as.heap.ptr = new_ptr;
1765 STR_SET_LEN(str2, len);
1766 RSTRING(str2)->as.heap.aux.capa = len;
1767 STR_SET_NOEMBED(str2);
1768 }
1769
1770 STR_SET_NOEMBED(str);
1771 FL_UNSET(str, STR_SHARED);
1772 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1773
 /* aux is either the shared root or the capacity; carry over
  * whichever one str2 was using */
1774 if (FL_TEST(str2, STR_SHARED)) {
1775 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1776 STR_SET_SHARED(str, shared);
1777 }
1778 else {
1779 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1780 }
1781
1782 /* abandon str2 */
1783 STR_SET_EMBED(str2);
1784 RSTRING_PTR(str2)[0] = 0;
1785 STR_SET_LEN(str2, 0);
1786 rb_enc_associate(str, enc);
1787 ENC_CODERANGE_SET(str, cr);
1788 }
1789}
1790
1791VALUE
1793{
1794 VALUE str;
1795
1796 if (RB_TYPE_P(obj, T_STRING)) {
1797 return obj;
1798 }
1799 str = rb_funcall(obj, idTo_s, 0);
1800 return rb_obj_as_string_result(str, obj);
1801}
1802
1803VALUE
1804rb_obj_as_string_result(VALUE str, VALUE obj)
1805{
1806 if (!RB_TYPE_P(str, T_STRING))
1807 return rb_any_to_s(obj);
1808 return str;
1809}
1810
1811static VALUE
1812str_replace(VALUE str, VALUE str2)
1813{
1814 long len;
1815
1816 len = RSTRING_LEN(str2);
1817 if (STR_SHARED_P(str2)) {
1818 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1820 STR_SET_NOEMBED(str);
1821 STR_SET_LEN(str, len);
1822 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1823 STR_SET_SHARED(str, shared);
1824 rb_enc_cr_str_exact_copy(str, str2);
1825 }
1826 else {
1827 str_replace_shared(str, str2);
1828 }
1829
1830 return str;
1831}
1832
1833static inline VALUE
1834ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1835{
1836 size_t size = rb_str_embed_size(capa);
1837 RUBY_ASSERT(size > 0);
1838 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1839
1840 NEWOBJ_OF(str, struct RString, klass,
1842
1843 return (VALUE)str;
1844}
1845
1846static inline VALUE
1847ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1848{
1849 NEWOBJ_OF(str, struct RString, klass,
1850 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1851
1852 return (VALUE)str;
1853}
1854
1855static inline VALUE
1856str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1857{
1858 int encidx = 0;
1859 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1860 encidx = rb_enc_get_index(str);
1861 flags &= ~ENCODING_MASK;
1862 }
1863 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1864 if (encidx) rb_enc_associate_index(dup, encidx);
1865 return dup;
1866}
1867
1868static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1869
1870static inline VALUE
1871str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1872{
1873 VALUE flags = FL_TEST_RAW(str, flag_mask);
1874 long len = RSTRING_LEN(str);
1875
1876 RUBY_ASSERT(STR_EMBED_P(dup));
1877 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1878 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1879 STR_SET_LEN(dup, RSTRING_LEN(str));
1880 return str_duplicate_setup_encoding(str, dup, flags);
1881}
1882
1883static inline VALUE
1884str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1885{
1886 VALUE flags = FL_TEST_RAW(str, flag_mask);
1887 VALUE root = str;
1888 if (FL_TEST_RAW(str, STR_SHARED)) {
1889 root = RSTRING(str)->as.heap.aux.shared;
1890 }
1891 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1892 root = str = str_new_frozen(klass, str);
1893 flags = FL_TEST_RAW(str, flag_mask);
1894 }
1895 RUBY_ASSERT(!STR_SHARED_P(root));
1897
1898 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1899 FL_SET(root, STR_SHARED_ROOT);
1900 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1901 flags |= RSTRING_NOEMBED | STR_SHARED;
1902
1903 STR_SET_LEN(dup, RSTRING_LEN(str));
1904 return str_duplicate_setup_encoding(str, dup, flags);
1905}
1906
1907static inline VALUE
1908str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1909{
1910 if (STR_EMBED_P(str)) {
1911 return str_duplicate_setup_embed(klass, str, dup);
1912 }
1913 else {
1914 return str_duplicate_setup_heap(klass, str, dup);
1915 }
1916}
1917
1918static inline VALUE
1919str_duplicate(VALUE klass, VALUE str)
1920{
1921 VALUE dup;
1922 if (STR_EMBED_P(str)) {
1923 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1924 }
1925 else {
1926 dup = str_alloc_heap(klass);
1927 }
1928
1929 return str_duplicate_setup(klass, str, dup);
1930}
1931
1932VALUE
1934{
1935 return str_duplicate(rb_obj_class(str), str);
1936}
1937
1938/* :nodoc: */
1939VALUE
1940rb_str_dup_m(VALUE str)
1941{
1942 if (LIKELY(BARE_STRING_P(str))) {
1943 return str_duplicate(rb_obj_class(str), str);
1944 }
1945 else {
1946 return rb_obj_dup(str);
1947 }
1948}
1949
1950VALUE
1952{
1953 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1954 return str_duplicate(rb_cString, str);
1955}
1956
1957VALUE
1958rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1959{
1960 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1961 VALUE new_str, klass = rb_cString;
1962
1963 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1964 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1965 str_duplicate_setup_embed(klass, str, new_str);
1966 }
1967 else {
1968 new_str = ec_str_alloc_heap(ec, klass);
1969 str_duplicate_setup_heap(klass, str, new_str);
1970 }
1971 if (chilled) {
1972 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1973 }
1974 return new_str;
1975}
1976
1977VALUE
1978rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1979{
1980 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1981 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1982 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1983 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1984 return rb_str_freeze(str);
1985}
1986
1987/*
1988 *
1989 * call-seq:
1990 * String.new(string = '', **opts) -> new_string
1991 *
1992 * :include: doc/string/new.rdoc
1993 *
1994 */
1995
1996static VALUE
1997rb_str_init(int argc, VALUE *argv, VALUE str)
1998{
1999 static ID keyword_ids[2];
2000 VALUE orig, opt, venc, vcapa;
2001 VALUE kwargs[2];
2002 rb_encoding *enc = 0;
2003 int n;
2004
2005 if (!keyword_ids[0]) {
2006 keyword_ids[0] = rb_id_encoding();
2007 CONST_ID(keyword_ids[1], "capacity");
2008 }
2009
2010 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2011 if (!NIL_P(opt)) {
2012 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2013 venc = kwargs[0];
2014 vcapa = kwargs[1];
2015 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2016 enc = rb_to_encoding(venc);
2017 }
2018 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2019 long capa = NUM2LONG(vcapa);
2020 long len = 0;
2021 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2022
2023 if (capa < STR_BUF_MIN_SIZE) {
2024 capa = STR_BUF_MIN_SIZE;
2025 }
2026 if (n == 1) {
2027 StringValue(orig);
2028 len = RSTRING_LEN(orig);
2029 if (capa < len) {
2030 capa = len;
2031 }
2032 if (orig == str) n = 0;
2033 }
2034 str_modifiable(str);
2035 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2036 /* make noembed always */
2037 const size_t size = (size_t)capa + termlen;
2038 const char *const old_ptr = RSTRING_PTR(str);
2039 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2040 char *new_ptr = ALLOC_N(char, size);
2041 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2042 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2043 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2044 RSTRING(str)->as.heap.ptr = new_ptr;
2045 }
2046 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2047 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2048 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2049 }
2050 STR_SET_LEN(str, len);
2051 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2052 if (n == 1) {
2053 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2054 rb_enc_cr_str_exact_copy(str, orig);
2055 }
2056 FL_SET(str, STR_NOEMBED);
2057 RSTRING(str)->as.heap.aux.capa = capa;
2058 }
2059 else if (n == 1) {
2060 rb_str_replace(str, orig);
2061 }
2062 if (enc) {
2063 rb_enc_associate(str, enc);
2065 }
2066 }
2067 else if (n == 1) {
2068 rb_str_replace(str, orig);
2069 }
2070 return str;
2071}
2072
2073/* :nodoc: */
/*
 * Implements String.new for the String class itself; subclasses and calls
 * without keyword arguments go through the generic instance creation
 * path.  With keywords, the result is built directly: an empty string or
 * a duplicate of the source when no capacity is given, otherwise a buffer
 * of the requested capacity (at least the source's capacity) into which
 * the source is appended.
 */
2074static VALUE
2075rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2076{
2077 if (klass != rb_cString) {
2078 return rb_class_new_instance_pass_kw(argc, argv, klass);
2079 }
2080
2081 static ID keyword_ids[2];
2082 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2083 VALUE kwargs[2];
2084 rb_encoding *enc = NULL;
2085
2086 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2087 if (NIL_P(opt)) {
2088 return rb_class_new_instance_pass_kw(argc, argv, klass);
2089 }
2090
2091 keyword_ids[0] = rb_id_encoding();
2092 CONST_ID(keyword_ids[1], "capacity");
2093 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2094 encoding = kwargs[0];
2095 capacity = kwargs[1];
2096
2097 if (n == 1) {
2098 orig = StringValue(orig);
2099 }
2100 else {
2101 orig = Qnil;
2102 }
2103
 /* no encoding: keyword given: inherit the source string's encoding */
2104 if (UNDEF_P(encoding)) {
2105 if (!NIL_P(orig)) {
2106 encoding = rb_obj_encoding(orig);
2107 }
2108 }
2109
 /* NOTE(review): unlike rb_str_init, an explicit nil is not filtered
  * out before rb_to_encoding() here -- confirm this is intended. */
2110 if (!UNDEF_P(encoding)) {
2111 enc = rb_to_encoding(encoding);
2112 }
2113
2114 // If capacity was not given, we're basically just duping `orig`.
2115 if (UNDEF_P(capacity)) {
2116 if (NIL_P(orig)) {
2117 VALUE empty_str = str_new(klass, "", 0);
2118 if (enc) {
2119 rb_enc_associate(empty_str, enc);
2120 }
2121 return empty_str;
2122 }
2123 VALUE copy = str_duplicate(klass, orig);
2124 rb_enc_associate(copy, enc);
2125 ENC_CODERANGE_CLEAR(copy);
2126 return copy;
2127 }
2128
2129 long capa = 0;
2130 capa = NUM2LONG(capacity);
 /* a negative capacity is treated as zero */
2131 if (capa < 0) {
2132 capa = 0;
2133 }
2134
2135 if (!NIL_P(orig)) {
2136 long orig_capa = rb_str_capacity(orig);
2137 if (orig_capa > capa) {
2138 capa = orig_capa;
2139 }
2140 }
2141
 /* allocate capa bytes, then reset the length so the buffer starts
  * out empty */
2142 VALUE str = str_enc_new(klass, NULL, capa, enc);
2143 STR_SET_LEN(str, 0);
 /* NOTE(review): the terminator width used here is mbmaxlen, while
  * rb_str_init uses mbminlen for the same purpose -- confirm. */
2144 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2145
2146 if (!NIL_P(orig)) {
2147 rb_str_buf_append(str, orig);
2148 }
2149
2150 return str;
2151}
2152
2153#ifdef NONASCII_MASK
2154#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2155
2156/*
2157 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2158 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2159 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2160 *
2161 * if (!(byte & 0x80))
2162 * byte |= 0x40; // turn on bit6
2163 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2164 *
2165 * This function calculates whether a byte is leading or not for all bytes
2166 * in the argument word by concurrently using the above logic, and then
2167 * adds up the number of leading bytes in the word.
2168 */
/*
 * Returns how many of the bytes in the machine word at s are UTF-8 lead
 * bytes, i.e. are not of the form 10xxxxxx.
 */
2169static inline uintptr_t
2170count_utf8_lead_bytes_with_word(const uintptr_t *s)
2171{
2172 uintptr_t d = *s;
2173
2174 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
 /* per byte: bit0 = bit6 | ~bit7, which is 0 only for 10xxxxxx */
2175 d = (d>>6) | (~d>>7);
2176 d &= NONASCII_MASK >> 7;
2177
2178 /* Gather all bytes. */
2179#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2180 /* use only if it can use POPCNT */
2181 return rb_popcount_intptr(d);
2182#else
 /* fold the per-byte 0/1 flags down into the lowest byte */
2183 d += (d>>8);
2184 d += (d>>16);
2185# if SIZEOF_VOIDP == 8
2186 d += (d>>32);
2187# endif
 /* the total is at most the word size in bytes, so 4 bits suffice */
2188 return (d&0xF);
2189#endif
2190}
2191#endif
2192
/*
 * Counts the characters in the byte range [p, e) under encoding enc.
 * cr is the caller's knowledge of the code range and only selects which
 * counting strategy is used:
 *  - fixed-width encodings: byte length divided by the width, rounded up;
 *  - valid UTF-8 (when NONASCII_MASK is available): count lead bytes,
 *    a word at a time over the aligned middle part;
 *  - other ASCII-compatible encodings: skip ASCII runs with
 *    search_nonascii() and step over the remaining characters one by one;
 *  - anything else: step character by character.
 */
2193static inline long
2194enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2195{
2196 long c;
2197 const char *q;
2198
2199 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2200 long diff = (long)(e - p);
 /* a trailing partial character still counts as one */
2201 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2202 }
2203#ifdef NONASCII_MASK
2204 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2205 uintptr_t len = 0;
 /* the word loop is only used on inputs longer than two words */
2206 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2207 const uintptr_t *s, *t;
2208 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
 /* s: p rounded up, t: e rounded down, to word alignment */
2209 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2210 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
 /* unaligned head, byte by byte */
2211 while (p < (const char *)s) {
2212 if (is_utf8_lead_byte(*p)) len++;
2213 p++;
2214 }
 /* aligned middle, one word at a time */
2215 while (s < t) {
2216 len += count_utf8_lead_bytes_with_word(s);
2217 s++;
2218 }
2219 p = (const char *)s;
2220 }
 /* remaining tail (or the whole input when it was short) */
2221 while (p < e) {
2222 if (is_utf8_lead_byte(*p)) len++;
2223 p++;
2224 }
2225 return (long)len;
2226 }
2227#endif
2228 else if (rb_enc_asciicompat(enc)) {
2229 c = 0;
 /* the two loops differ only in the length function used:
  * rb_enc_fast_mbclen when the code range is clean, and
  * rb_enc_mbclen otherwise */
2230 if (ENC_CODERANGE_CLEAN_P(cr)) {
2231 while (p < e) {
2232 if (ISASCII(*p)) {
2233 q = search_nonascii(p, e);
2234 if (!q)
2235 return c + (e - p);
2236 c += q - p;
2237 p = q;
2238 }
2239 p += rb_enc_fast_mbclen(p, e, enc);
2240 c++;
2241 }
2242 }
2243 else {
2244 while (p < e) {
2245 if (ISASCII(*p)) {
2246 q = search_nonascii(p, e);
2247 if (!q)
2248 return c + (e - p);
2249 c += q - p;
2250 p = q;
2251 }
2252 p += rb_enc_mbclen(p, e, enc);
2253 c++;
2254 }
2255 }
2256 return c;
2257 }
2258
2259 for (c=0; p<e; c++) {
2260 p += rb_enc_mbclen(p, e, enc);
2261 }
2262 return c;
2263}
2264
2265long
2266rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2267{
2268 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2269}
2270
2271/* To get strlen with cr
2272 * Note that given cr is not used.
2273 */
2274long
2275rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2276{
2277 long c;
2278 const char *q;
2279 int ret;
2280
2281 *cr = 0;
2282 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2283 long diff = (long)(e - p);
2284 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2285 }
2286 else if (rb_enc_asciicompat(enc)) {
2287 c = 0;
2288 while (p < e) {
2289 if (ISASCII(*p)) {
2290 q = search_nonascii(p, e);
2291 if (!q) {
2292 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2293 return c + (e - p);
2294 }
2295 c += q - p;
2296 p = q;
2297 }
2298 ret = rb_enc_precise_mbclen(p, e, enc);
2299 if (MBCLEN_CHARFOUND_P(ret)) {
2300 *cr |= ENC_CODERANGE_VALID;
2301 p += MBCLEN_CHARFOUND_LEN(ret);
2302 }
2303 else {
2305 p++;
2306 }
2307 c++;
2308 }
2309 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2310 return c;
2311 }
2312
2313 for (c=0; p<e; c++) {
2314 ret = rb_enc_precise_mbclen(p, e, enc);
2315 if (MBCLEN_CHARFOUND_P(ret)) {
2316 *cr |= ENC_CODERANGE_VALID;
2317 p += MBCLEN_CHARFOUND_LEN(ret);
2318 }
2319 else {
2321 if (p + rb_enc_mbminlen(enc) <= e)
2322 p += rb_enc_mbminlen(enc);
2323 else
2324 p = e;
2325 }
2326 }
2327 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2328 return c;
2329}
2330
2331/* enc must be str's enc or rb_enc_check(str, str2) */
2332static long
2333str_strlen(VALUE str, rb_encoding *enc)
2334{
2335 const char *p, *e;
2336 int cr;
2337
2338 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2339 if (!enc) enc = STR_ENC_GET(str);
2340 p = RSTRING_PTR(str);
2341 e = RSTRING_END(str);
2342 cr = ENC_CODERANGE(str);
2343
2344 if (cr == ENC_CODERANGE_UNKNOWN) {
2345 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2346 if (cr) ENC_CODERANGE_SET(str, cr);
2347 return n;
2348 }
2349 else {
2350 return enc_strlen(p, e, enc, cr);
2351 }
2352}
2353
2354long
2356{
2357 return str_strlen(str, NULL);
2358}
2359
2360/*
2361 * call-seq:
2362 * length -> integer
2363 *
2364 * :include: doc/string/length.rdoc
2365 *
2366 */
2367
2368VALUE
2370{
2371 return LONG2NUM(str_strlen(str, NULL));
2372}
2373
2374/*
2375 * call-seq:
2376 * bytesize -> integer
2377 *
2378 * :include: doc/string/bytesize.rdoc
2379 *
2380 */
2381
2382VALUE
2383rb_str_bytesize(VALUE str)
2384{
2385 return LONG2NUM(RSTRING_LEN(str));
2386}
2387
2388/*
2389 * call-seq:
2390 * empty? -> true or false
2391 *
2392 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2393 *
2394 * "hello".empty? # => false
2395 * " ".empty? # => false
2396 * "".empty? # => true
2397 *
2398 */
2399
2400static VALUE
2401rb_str_empty(VALUE str)
2402{
2403 return RBOOL(RSTRING_LEN(str) == 0);
2404}
2405
2406/*
2407 * call-seq:
2408 * string + other_string -> new_string
2409 *
2410 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2411 *
2412 * "Hello from " + self.to_s # => "Hello from main"
2413 *
2414 */
2415
2416VALUE
2418{
2419 VALUE str3;
2420 rb_encoding *enc;
2421 char *ptr1, *ptr2, *ptr3;
2422 long len1, len2;
2423 int termlen;
2424
2425 StringValue(str2);
2426 enc = rb_enc_check_str(str1, str2);
2427 RSTRING_GETMEM(str1, ptr1, len1);
2428 RSTRING_GETMEM(str2, ptr2, len2);
2429 termlen = rb_enc_mbminlen(enc);
2430 if (len1 > LONG_MAX - len2) {
2431 rb_raise(rb_eArgError, "string size too big");
2432 }
2433 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2434 ptr3 = RSTRING_PTR(str3);
2435 memcpy(ptr3, ptr1, len1);
2436 memcpy(ptr3+len1, ptr2, len2);
2437 TERM_FILL(&ptr3[len1+len2], termlen);
2438
2439 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2441 RB_GC_GUARD(str1);
2442 RB_GC_GUARD(str2);
2443 return str3;
2444}
2445
/* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2447VALUE
2448rb_str_opt_plus(VALUE str1, VALUE str2)
2449{
2452 long len1, len2;
2453 MAYBE_UNUSED(char) *ptr1, *ptr2;
2454 RSTRING_GETMEM(str1, ptr1, len1);
2455 RSTRING_GETMEM(str2, ptr2, len2);
2456 int enc1 = rb_enc_get_index(str1);
2457 int enc2 = rb_enc_get_index(str2);
2458
2459 if (enc1 < 0) {
2460 return Qundef;
2461 }
2462 else if (enc2 < 0) {
2463 return Qundef;
2464 }
2465 else if (enc1 != enc2) {
2466 return Qundef;
2467 }
2468 else if (len1 > LONG_MAX - len2) {
2469 return Qundef;
2470 }
2471 else {
2472 return rb_str_plus(str1, str2);
2473 }
2474
2475}
2476
2477/*
2478 * call-seq:
2479 * string * integer -> new_string
2480 *
2481 * Returns a new +String+ containing +integer+ copies of +self+:
2482 *
2483 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2484 * "Ho! " * 0 # => ""
2485 *
2486 */
2487
2488VALUE
2490{
2491 VALUE str2;
2492 long n, len;
2493 char *ptr2;
2494 int termlen;
2495
2496 if (times == INT2FIX(1)) {
2497 return str_duplicate(rb_cString, str);
2498 }
2499 if (times == INT2FIX(0)) {
2500 str2 = str_alloc_embed(rb_cString, 0);
2501 rb_enc_copy(str2, str);
2502 return str2;
2503 }
2504 len = NUM2LONG(times);
2505 if (len < 0) {
2506 rb_raise(rb_eArgError, "negative argument");
2507 }
2508 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2509 if (STR_EMBEDDABLE_P(len, 1)) {
2510 str2 = str_alloc_embed(rb_cString, len + 1);
2511 memset(RSTRING_PTR(str2), 0, len + 1);
2512 }
2513 else {
2514 str2 = str_alloc_heap(rb_cString);
2515 RSTRING(str2)->as.heap.aux.capa = len;
2516 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2517 }
2518 STR_SET_LEN(str2, len);
2519 rb_enc_copy(str2, str);
2520 return str2;
2521 }
2522 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2523 rb_raise(rb_eArgError, "argument too big");
2524 }
2525
2526 len *= RSTRING_LEN(str);
2527 termlen = TERM_LEN(str);
2528 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2529 ptr2 = RSTRING_PTR(str2);
2530 if (len) {
2531 n = RSTRING_LEN(str);
2532 memcpy(ptr2, RSTRING_PTR(str), n);
2533 while (n <= len/2) {
2534 memcpy(ptr2 + n, ptr2, n);
2535 n *= 2;
2536 }
2537 memcpy(ptr2 + n, ptr2, len-n);
2538 }
2539 STR_SET_LEN(str2, len);
2540 TERM_FILL(&ptr2[len], termlen);
2541 rb_enc_cr_str_copy_for_substr(str2, str);
2542
2543 return str2;
2544}
2545
2546/*
2547 * call-seq:
2548 * string % object -> new_string
2549 *
2550 * Returns the result of formatting +object+ into the format specification +self+
2551 * (see Kernel#sprintf for formatting details):
2552 *
2553 * "%05d" % 123 # => "00123"
2554 *
2555 * If +self+ contains multiple substitutions, +object+ must be
2556 * an Array or Hash containing the values to be substituted:
2557 *
2558 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2559 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2560 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2561 *
2562 */
2563
2564static VALUE
2565rb_str_format_m(VALUE str, VALUE arg)
2566{
2567 VALUE tmp = rb_check_array_type(arg);
2568
2569 if (!NIL_P(tmp)) {
2570 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2571 }
2572 return rb_str_format(1, &arg, str);
2573}
2574
2575static inline void
2576rb_check_lockedtmp(VALUE str)
2577{
2578 if (FL_TEST(str, STR_TMPLOCK)) {
2579 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2580 }
2581}
2582
// If none of these flags are set, we know we have a modifiable string.
2584// If any is set, we need to do more detailed checks.
2585#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2586static inline void
2587str_modifiable(VALUE str)
2588{
2589 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2590 if (CHILLED_STRING_P(str)) {
2591 CHILLED_STRING_MUTATED(str);
2592 }
2593 rb_check_lockedtmp(str);
2594 rb_check_frozen(str);
2595 }
2596}
2597
2598static inline int
2599str_dependent_p(VALUE str)
2600{
2601 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2602 return FALSE;
2603 }
2604 else {
2605 return TRUE;
2606 }
2607}
2608
2609// If none of these flags are set, we know we have an independent string.
2610// If any is set, we need to do more detailed checks.
2611#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2612static inline int
2613str_independent(VALUE str)
2614{
2615 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2616 str_modifiable(str);
2617 return !str_dependent_p(str);
2618 }
2619 return TRUE;
2620}
2621
2622static void
2623str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2624{
2625 char *ptr;
2626 char *oldptr;
2627 long capa = len + expand;
2628
2629 if (len > capa) len = capa;
2630
2631 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2632 ptr = RSTRING(str)->as.heap.ptr;
2633 STR_SET_EMBED(str);
2634 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2635 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2636 STR_SET_LEN(str, len);
2637 return;
2638 }
2639
2640 ptr = ALLOC_N(char, (size_t)capa + termlen);
2641 oldptr = RSTRING_PTR(str);
2642 if (oldptr) {
2643 memcpy(ptr, oldptr, len);
2644 }
2645 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2646 xfree(oldptr);
2647 }
2648 STR_SET_NOEMBED(str);
2649 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2650 TERM_FILL(ptr + len, termlen);
2651 RSTRING(str)->as.heap.ptr = ptr;
2652 STR_SET_LEN(str, len);
2653 RSTRING(str)->as.heap.aux.capa = capa;
2654}
2655
2656void
2657rb_str_modify(VALUE str)
2658{
2659 if (!str_independent(str))
2660 str_make_independent(str);
2662}
2663
2664void
2666{
2667 int termlen = TERM_LEN(str);
2668 long len = RSTRING_LEN(str);
2669
2670 if (expand < 0) {
2671 rb_raise(rb_eArgError, "negative expanding string size");
2672 }
2673 if (expand >= LONG_MAX - len) {
2674 rb_raise(rb_eArgError, "string size too big");
2675 }
2676
2677 if (!str_independent(str)) {
2678 str_make_independent_expand(str, len, expand, termlen);
2679 }
2680 else if (expand > 0) {
2681 RESIZE_CAPA_TERM(str, len + expand, termlen);
2682 }
2684}
2685
2686/* As rb_str_modify(), but don't clear coderange */
2687static void
2688str_modify_keep_cr(VALUE str)
2689{
2690 if (!str_independent(str))
2691 str_make_independent(str);
2693 /* Force re-scan later */
2695}
2696
2697static inline void
2698str_discard(VALUE str)
2699{
2700 str_modifiable(str);
2701 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2702 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2703 RSTRING(str)->as.heap.ptr = 0;
2704 STR_SET_LEN(str, 0);
2705 }
2706}
2707
2708void
2710{
2711 int encindex = rb_enc_get_index(str);
2712
2713 if (RB_UNLIKELY(encindex == -1)) {
2714 rb_raise(rb_eTypeError, "not encoding capable object");
2715 }
2716
2717 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2718 return;
2719 }
2720
2721 rb_encoding *enc = rb_enc_from_index(encindex);
2722 if (!rb_enc_asciicompat(enc)) {
2723 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2724 }
2725}
2726
2727VALUE
2729{
2730 VALUE s = *ptr;
2731 if (!RB_TYPE_P(s, T_STRING)) {
2732 s = rb_str_to_str(s);
2733 *ptr = s;
2734 }
2735 return s;
2736}
2737
2738char *
2740{
2741 VALUE str = rb_string_value(ptr);
2742 return RSTRING_PTR(str);
2743}
2744
/* Returns 1 when the first n bytes at s are all zero (vacuously so for
 * n <= 0), 0 as soon as a non-zero byte is seen. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2753
2754static const char *
2755str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2756{
2757 const char *e = s + len;
2758
2759 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2760 if (zero_filled(s, minlen)) return s;
2761 }
2762 return 0;
2763}
2764
2765static char *
2766str_fill_term(VALUE str, char *s, long len, int termlen)
2767{
2768 /* This function assumes that (capa + termlen) bytes of memory
2769 * is allocated, like many other functions in this file.
2770 */
2771 if (str_dependent_p(str)) {
2772 if (!zero_filled(s + len, termlen))
2773 str_make_independent_expand(str, len, 0L, termlen);
2774 }
2775 else {
2776 TERM_FILL(s + len, termlen);
2777 return s;
2778 }
2779 return RSTRING_PTR(str);
2780}
2781
2782void
2783rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2784{
2785 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2786 long len = RSTRING_LEN(str);
2787
2788 RUBY_ASSERT(capa >= len);
2789 if (capa - len < termlen) {
2790 rb_check_lockedtmp(str);
2791 str_make_independent_expand(str, len, 0L, termlen);
2792 }
2793 else if (str_dependent_p(str)) {
2794 if (termlen > oldtermlen)
2795 str_make_independent_expand(str, len, 0L, termlen);
2796 }
2797 else {
2798 if (!STR_EMBED_P(str)) {
2799 /* modify capa instead of realloc */
2800 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2801 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2802 }
2803 if (termlen > oldtermlen) {
2804 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2805 }
2806 }
2807
2808 return;
2809}
2810
2811static char *
2812str_null_check(VALUE str, int *w)
2813{
2814 char *s = RSTRING_PTR(str);
2815 long len = RSTRING_LEN(str);
2816 rb_encoding *enc = rb_enc_get(str);
2817 const int minlen = rb_enc_mbminlen(enc);
2818
2819 if (minlen > 1) {
2820 *w = 1;
2821 if (str_null_char(s, len, minlen, enc)) {
2822 return NULL;
2823 }
2824 return str_fill_term(str, s, len, minlen);
2825 }
2826 *w = 0;
2827 if (!s || memchr(s, 0, len)) {
2828 return NULL;
2829 }
2830 if (s[len]) {
2831 s = str_fill_term(str, s, len, minlen);
2832 }
2833 return s;
2834}
2835
2836char *
2837rb_str_to_cstr(VALUE str)
2838{
2839 int w;
2840 return str_null_check(str, &w);
2841}
2842
2843char *
2845{
2846 VALUE str = rb_string_value(ptr);
2847 int w;
2848 char *s = str_null_check(str, &w);
2849 if (!s) {
2850 if (w) {
2851 rb_raise(rb_eArgError, "string contains null char");
2852 }
2853 rb_raise(rb_eArgError, "string contains null byte");
2854 }
2855 return s;
2856}
2857
2858char *
2859rb_str_fill_terminator(VALUE str, const int newminlen)
2860{
2861 char *s = RSTRING_PTR(str);
2862 long len = RSTRING_LEN(str);
2863 return str_fill_term(str, s, len, newminlen);
2864}
2865
2866VALUE
2868{
2869 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2870 return str;
2871}
2872
2873/*
2874 * call-seq:
2875 * String.try_convert(object) -> object, new_string, or nil
2876 *
2877 * Attempts to convert the given +object+ to a string.
2878 *
2879 * If +object+ is already a string, returns +object+, unmodified.
2880 *
2881 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2882 * calls <tt>object.to_str</tt> and returns the result.
2883 *
2884 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2885 *
2886 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2887 */
2888static VALUE
2889rb_str_s_try_convert(VALUE dummy, VALUE str)
2890{
2891 return rb_check_string_type(str);
2892}
2893
2894static char*
2895str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2896{
2897 long nth = *nthp;
2898 if (rb_enc_mbmaxlen(enc) == 1) {
2899 p += nth;
2900 }
2901 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2902 p += nth * rb_enc_mbmaxlen(enc);
2903 }
2904 else if (rb_enc_asciicompat(enc)) {
2905 const char *p2, *e2;
2906 int n;
2907
2908 while (p < e && 0 < nth) {
2909 e2 = p + nth;
2910 if (e < e2) {
2911 *nthp = nth;
2912 return (char *)e;
2913 }
2914 if (ISASCII(*p)) {
2915 p2 = search_nonascii(p, e2);
2916 if (!p2) {
2917 nth -= e2 - p;
2918 *nthp = nth;
2919 return (char *)e2;
2920 }
2921 nth -= p2 - p;
2922 p = p2;
2923 }
2924 n = rb_enc_mbclen(p, e, enc);
2925 p += n;
2926 nth--;
2927 }
2928 *nthp = nth;
2929 if (nth != 0) {
2930 return (char *)e;
2931 }
2932 return (char *)p;
2933 }
2934 else {
2935 while (p < e && nth--) {
2936 p += rb_enc_mbclen(p, e, enc);
2937 }
2938 }
2939 if (p > e) p = e;
2940 *nthp = nth;
2941 return (char*)p;
2942}
2943
2944char*
2945rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2946{
2947 return str_nth_len(p, e, &nth, enc);
2948}
2949
2950static char*
2951str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2952{
2953 if (singlebyte)
2954 p += nth;
2955 else {
2956 p = str_nth_len(p, e, &nth, enc);
2957 }
2958 if (!p) return 0;
2959 if (p > e) p = e;
2960 return (char *)p;
2961}
2962
2963/* char offset to byte offset */
2964static long
2965str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2966{
2967 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2968 if (!pp) return e - p;
2969 return pp - p;
2970}
2971
2972long
2973rb_str_offset(VALUE str, long pos)
2974{
2975 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2976 STR_ENC_GET(str), single_byte_optimizable(str));
2977}
2978
2979#ifdef NONASCII_MASK
2980static char *
2981str_utf8_nth(const char *p, const char *e, long *nthp)
2982{
2983 long nth = *nthp;
2984 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2985 const uintptr_t *s, *t;
2986 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2987 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2988 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2989 while (p < (const char *)s) {
2990 if (is_utf8_lead_byte(*p)) nth--;
2991 p++;
2992 }
2993 do {
2994 nth -= count_utf8_lead_bytes_with_word(s);
2995 s++;
2996 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2997 p = (char *)s;
2998 }
2999 while (p < e) {
3000 if (is_utf8_lead_byte(*p)) {
3001 if (nth == 0) break;
3002 nth--;
3003 }
3004 p++;
3005 }
3006 *nthp = nth;
3007 return (char *)p;
3008}
3009
3010static long
3011str_utf8_offset(const char *p, const char *e, long nth)
3012{
3013 const char *pp = str_utf8_nth(p, e, &nth);
3014 return pp - p;
3015}
3016#endif
3017
3018/* byte offset to char offset */
3019long
3020rb_str_sublen(VALUE str, long pos)
3021{
3022 if (single_byte_optimizable(str) || pos < 0)
3023 return pos;
3024 else {
3025 char *p = RSTRING_PTR(str);
3026 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3027 }
3028}
3029
3030static VALUE
3031str_subseq(VALUE str, long beg, long len)
3032{
3033 VALUE str2;
3034
3035 RUBY_ASSERT(beg >= 0);
3036 RUBY_ASSERT(len >= 0);
3037 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3038
3039 const int termlen = TERM_LEN(str);
3040 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3041 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3042 RB_GC_GUARD(str);
3043 return str2;
3044 }
3045
3046 str2 = str_alloc_heap(rb_cString);
3047 if (str_embed_capa(str2) >= len + termlen) {
3048 char *ptr2 = RSTRING(str2)->as.embed.ary;
3049 STR_SET_EMBED(str2);
3050 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3051 TERM_FILL(ptr2+len, termlen);
3052
3053 STR_SET_LEN(str2, len);
3054 RB_GC_GUARD(str);
3055 }
3056 else {
3057 str_replace_shared(str2, str);
3058 RUBY_ASSERT(!STR_EMBED_P(str2));
3059 ENC_CODERANGE_CLEAR(str2);
3060 RSTRING(str2)->as.heap.ptr += beg;
3061 if (RSTRING_LEN(str2) > len) {
3062 STR_SET_LEN(str2, len);
3063 }
3064 }
3065
3066 return str2;
3067}
3068
3069VALUE
3070rb_str_subseq(VALUE str, long beg, long len)
3071{
3072 VALUE str2 = str_subseq(str, beg, len);
3073 rb_enc_cr_str_copy_for_substr(str2, str);
3074 return str2;
3075}
3076
3077char *
3078rb_str_subpos(VALUE str, long beg, long *lenp)
3079{
3080 long len = *lenp;
3081 long slen = -1L;
3082 const long blen = RSTRING_LEN(str);
3083 rb_encoding *enc = STR_ENC_GET(str);
3084 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3085
3086 if (len < 0) return 0;
3087 if (beg < 0 && -beg < 0) return 0;
3088 if (!blen) {
3089 len = 0;
3090 }
3091 if (single_byte_optimizable(str)) {
3092 if (beg > blen) return 0;
3093 if (beg < 0) {
3094 beg += blen;
3095 if (beg < 0) return 0;
3096 }
3097 if (len > blen - beg)
3098 len = blen - beg;
3099 if (len < 0) return 0;
3100 p = s + beg;
3101 goto end;
3102 }
3103 if (beg < 0) {
3104 if (len > -beg) len = -beg;
3105 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3106 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3107 beg = -beg;
3108 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3109 p = e;
3110 if (!p) return 0;
3111 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3112 if (!p) return 0;
3113 len = e - p;
3114 goto end;
3115 }
3116 else {
3117 slen = str_strlen(str, enc);
3118 beg += slen;
3119 if (beg < 0) return 0;
3120 p = s + beg;
3121 if (len == 0) goto end;
3122 }
3123 }
3124 else if (beg > 0 && beg > blen) {
3125 return 0;
3126 }
3127 if (len == 0) {
3128 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3129 p = s + beg;
3130 }
3131#ifdef NONASCII_MASK
3132 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3133 enc == rb_utf8_encoding()) {
3134 p = str_utf8_nth(s, e, &beg);
3135 if (beg > 0) return 0;
3136 len = str_utf8_offset(p, e, len);
3137 }
3138#endif
3139 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3140 int char_sz = rb_enc_mbmaxlen(enc);
3141
3142 p = s + beg * char_sz;
3143 if (p > e) {
3144 return 0;
3145 }
3146 else if (len * char_sz > e - p)
3147 len = e - p;
3148 else
3149 len *= char_sz;
3150 }
3151 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3152 if (beg > 0) return 0;
3153 len = 0;
3154 }
3155 else {
3156 len = str_offset(p, e, len, enc, 0);
3157 }
3158 end:
3159 *lenp = len;
3160 RB_GC_GUARD(str);
3161 return p;
3162}
3163
3164static VALUE str_substr(VALUE str, long beg, long len, int empty);
3165
3166VALUE
3167rb_str_substr(VALUE str, long beg, long len)
3168{
3169 return str_substr(str, beg, len, TRUE);
3170}
3171
3172VALUE
3173rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3174{
3175 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3176}
3177
3178static VALUE
3179str_substr(VALUE str, long beg, long len, int empty)
3180{
3181 char *p = rb_str_subpos(str, beg, &len);
3182
3183 if (!p) return Qnil;
3184 if (!len && !empty) return Qnil;
3185
3186 beg = p - RSTRING_PTR(str);
3187
3188 VALUE str2 = str_subseq(str, beg, len);
3189 rb_enc_cr_str_copy_for_substr(str2, str);
3190 return str2;
3191}
3192
3193/* :nodoc: */
3194VALUE
3196{
3197 if (CHILLED_STRING_P(str)) {
3198 FL_UNSET_RAW(str, STR_CHILLED);
3199 }
3200
3201 if (OBJ_FROZEN(str)) return str;
3202 rb_str_resize(str, RSTRING_LEN(str));
3203 return rb_obj_freeze(str);
3204}
3205
3206/*
3207 * call-seq:
3208 * +string -> new_string or self
3209 *
3210 * Returns +self+ if +self+ is not frozen and can be mutated
3211 * without warning issuance.
3212 *
3213 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3214 */
3215static VALUE
3216str_uplus(VALUE str)
3217{
3218 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3219 return rb_str_dup(str);
3220 }
3221 else {
3222 return str;
3223 }
3224}
3225
3226/*
3227 * call-seq:
3228 * -string -> frozen_string
3229 * dedup -> frozen_string
3230 *
3231 * Returns a frozen, possibly pre-existing copy of the string.
3232 *
3233 * The returned +String+ will be deduplicated as long as it does not have
3234 * any instance variables set on it and is not a String subclass.
3235 *
3236 * Note that <tt>-string</tt> variant is more convenient for defining
3237 * constants:
3238 *
3239 * FILENAME = -'config/database.yml'
3240 *
 *  while +dedup+ is better suited to using the method in chains
 *  of calculations:
3243 *
3244 * @url_list.concat(urls.map(&:dedup))
3245 *
3246 */
3247static VALUE
3248str_uminus(VALUE str)
3249{
3250 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3251 str = rb_str_dup(str);
3252 }
3253 return rb_fstring(str);
3254}
3255
3256RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3257#define rb_str_dup_frozen rb_str_new_frozen
3258
3259VALUE
3261{
3262 if (FL_TEST(str, STR_TMPLOCK)) {
3263 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3264 }
3265 FL_SET(str, STR_TMPLOCK);
3266 return str;
3267}
3268
3269VALUE
3271{
3272 if (!FL_TEST(str, STR_TMPLOCK)) {
3273 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3274 }
3275 FL_UNSET(str, STR_TMPLOCK);
3276 return str;
3277}
3278
3279VALUE
3280rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3281{
3282 rb_str_locktmp(str);
3283 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3284}
3285
3286void
3288{
3289 long capa;
3290 const int termlen = TERM_LEN(str);
3291
3292 str_modifiable(str);
3293 if (STR_SHARED_P(str)) {
3294 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3295 }
3296 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3297 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3298 }
3299
3300 int cr = ENC_CODERANGE(str);
3301 if (len == 0) {
3302 /* Empty string does not contain non-ASCII */
3304 }
3305 else if (cr == ENC_CODERANGE_UNKNOWN) {
3306 /* Leave unknown. */
3307 }
3308 else if (len > RSTRING_LEN(str)) {
3309 if (ENC_CODERANGE_CLEAN_P(cr)) {
3310 /* Update the coderange regarding the extended part. */
3311 const char *const prev_end = RSTRING_END(str);
3312 const char *const new_end = RSTRING_PTR(str) + len;
3313 rb_encoding *enc = rb_enc_get(str);
3314 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3315 ENC_CODERANGE_SET(str, cr);
3316 }
3317 else if (cr == ENC_CODERANGE_BROKEN) {
3318 /* May be valid now, by appended part. */
3320 }
3321 }
3322 else if (len < RSTRING_LEN(str)) {
3323 if (cr != ENC_CODERANGE_7BIT) {
3324 /* ASCII-only string is keeping after truncated. Valid
3325 * and broken may be invalid or valid, leave unknown. */
3327 }
3328 }
3329
3330 STR_SET_LEN(str, len);
3331 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3332}
3333
3334VALUE
3335rb_str_resize(VALUE str, long len)
3336{
3337 if (len < 0) {
3338 rb_raise(rb_eArgError, "negative string size (or size too big)");
3339 }
3340
3341 int independent = str_independent(str);
3342 long slen = RSTRING_LEN(str);
3343 const int termlen = TERM_LEN(str);
3344
3345 if (slen > len || (termlen != 1 && slen < len)) {
3347 }
3348
3349 {
3350 long capa;
3351 if (STR_EMBED_P(str)) {
3352 if (len == slen) return str;
3353 if (str_embed_capa(str) >= len + termlen) {
3354 STR_SET_LEN(str, len);
3355 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3356 return str;
3357 }
3358 str_make_independent_expand(str, slen, len - slen, termlen);
3359 }
3360 else if (str_embed_capa(str) >= len + termlen) {
3361 char *ptr = STR_HEAP_PTR(str);
3362 STR_SET_EMBED(str);
3363 if (slen > len) slen = len;
3364 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3365 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3366 STR_SET_LEN(str, len);
3367 if (independent) ruby_xfree(ptr);
3368 return str;
3369 }
3370 else if (!independent) {
3371 if (len == slen) return str;
3372 str_make_independent_expand(str, slen, len - slen, termlen);
3373 }
3374 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3375 (capa - len) > (len < 1024 ? len : 1024)) {
3376 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3377 (size_t)len + termlen, STR_HEAP_SIZE(str));
3378 RSTRING(str)->as.heap.aux.capa = len;
3379 }
3380 else if (len == slen) return str;
3381 STR_SET_LEN(str, len);
3382 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3383 }
3384 return str;
3385}
3386
3387static void
3388str_ensure_available_capa(VALUE str, long len)
3389{
3390 str_modify_keep_cr(str);
3391
3392 const int termlen = TERM_LEN(str);
3393 long olen = RSTRING_LEN(str);
3394
3395 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3396 rb_raise(rb_eArgError, "string sizes too big");
3397 }
3398
3399 long total = olen + len;
3400 long capa = str_capacity(str, termlen);
3401
3402 if (capa < total) {
3403 if (total >= LONG_MAX / 2) {
3404 capa = total;
3405 }
3406 while (total > capa) {
3407 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3408 }
3409 RESIZE_CAPA_TERM(str, capa, termlen);
3410 }
3411}
3412
3413static VALUE
3414str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3415{
3416 if (keep_cr) {
3417 str_modify_keep_cr(str);
3418 }
3419 else {
3420 rb_str_modify(str);
3421 }
3422 if (len == 0) return 0;
3423
3424 long total, olen, off = -1;
3425 char *sptr;
3426 const int termlen = TERM_LEN(str);
3427
3428 RSTRING_GETMEM(str, sptr, olen);
3429 if (ptr >= sptr && ptr <= sptr + olen) {
3430 off = ptr - sptr;
3431 }
3432
3433 long capa = str_capacity(str, termlen);
3434
3435 if (olen > LONG_MAX - len) {
3436 rb_raise(rb_eArgError, "string sizes too big");
3437 }
3438 total = olen + len;
3439 if (capa < total) {
3440 if (total >= LONG_MAX / 2) {
3441 capa = total;
3442 }
3443 while (total > capa) {
3444 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3445 }
3446 RESIZE_CAPA_TERM(str, capa, termlen);
3447 sptr = RSTRING_PTR(str);
3448 }
3449 if (off != -1) {
3450 ptr = sptr + off;
3451 }
3452 memcpy(sptr + olen, ptr, len);
3453 STR_SET_LEN(str, total);
3454 TERM_FILL(sptr + total, termlen); /* sentinel */
3455
3456 return str;
3457}
3458
/* Coderange-clearing append helpers; str_buf_cat2 is for string literals.
 * Every argument is parenthesised so expression arguments expand safely. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3461
3462VALUE
3463rb_str_cat(VALUE str, const char *ptr, long len)
3464{
3465 if (len == 0) return str;
3466 if (len < 0) {
3467 rb_raise(rb_eArgError, "negative string size (or size too big)");
3468 }
3469 return str_buf_cat(str, ptr, len);
3470}
3471
3472VALUE
3473rb_str_cat_cstr(VALUE str, const char *ptr)
3474{
3475 must_not_null(ptr);
3476 return rb_str_buf_cat(str, ptr, strlen(ptr));
3477}
3478
3479static void
3480rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3481{
3482 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3483
3484 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3485 if (UNLIKELY(!str_independent(str))) {
3486 str_make_independent(str);
3487 }
3488
3489 long string_length = -1;
3490 const int null_terminator_length = 1;
3491 char *sptr;
3492 RSTRING_GETMEM(str, sptr, string_length);
3493
3494 // Ensure the resulting string wouldn't be too long.
3495 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498
3499 long string_capacity = str_capacity(str, null_terminator_length);
3500
3501 // Get the code range before any modifications since those might clear the code range.
3502 int cr = ENC_CODERANGE(str);
3503
3504 // Check if the string has spare string_capacity to write the new byte.
3505 if (LIKELY(string_capacity >= string_length + 1)) {
3506 // In fast path we can write the new byte and note the string's new length.
3507 sptr[string_length] = byte;
3508 STR_SET_LEN(str, string_length + 1);
3509 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3510 }
3511 else {
3512 // If there's not enough string_capacity, make a call into the general string concatenation function.
3513 str_buf_cat(str, (char *)&byte, 1);
3514 }
3515
3516 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3517 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3518 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3519 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3520 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3521 if (ISASCII(byte)) {
3523 }
3524 else {
3526
3527 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3528 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3529 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3530 }
3531 }
3532 }
3533}
3534
3535RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3536RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3537RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3538
3539static VALUE
3540rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3541 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3542{
3543 int str_encindex = ENCODING_GET(str);
3544 int res_encindex;
3545 int str_cr, res_cr;
3546 rb_encoding *str_enc, *ptr_enc;
3547
3548 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3549
3550 if (str_encindex == ptr_encindex) {
3551 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3552 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3553 }
3554 }
3555 else {
3556 str_enc = rb_enc_from_index(str_encindex);
3557 ptr_enc = rb_enc_from_index(ptr_encindex);
3558 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3559 if (len == 0)
3560 return str;
3561 if (RSTRING_LEN(str) == 0) {
3562 rb_str_buf_cat(str, ptr, len);
3563 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3564 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3565 return str;
3566 }
3567 goto incompatible;
3568 }
3569 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3570 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3571 }
3572 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3573 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3574 str_cr = rb_enc_str_coderange(str);
3575 }
3576 }
3577 }
3578 if (ptr_cr_ret)
3579 *ptr_cr_ret = ptr_cr;
3580
3581 if (str_encindex != ptr_encindex &&
3582 str_cr != ENC_CODERANGE_7BIT &&
3583 ptr_cr != ENC_CODERANGE_7BIT) {
3584 str_enc = rb_enc_from_index(str_encindex);
3585 ptr_enc = rb_enc_from_index(ptr_encindex);
3586 goto incompatible;
3587 }
3588
3589 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3590 res_encindex = str_encindex;
3591 res_cr = ENC_CODERANGE_UNKNOWN;
3592 }
3593 else if (str_cr == ENC_CODERANGE_7BIT) {
3594 if (ptr_cr == ENC_CODERANGE_7BIT) {
3595 res_encindex = str_encindex;
3596 res_cr = ENC_CODERANGE_7BIT;
3597 }
3598 else {
3599 res_encindex = ptr_encindex;
3600 res_cr = ptr_cr;
3601 }
3602 }
3603 else if (str_cr == ENC_CODERANGE_VALID) {
3604 res_encindex = str_encindex;
3605 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3606 res_cr = str_cr;
3607 else
3608 res_cr = ptr_cr;
3609 }
3610 else { /* str_cr == ENC_CODERANGE_BROKEN */
3611 res_encindex = str_encindex;
3612 res_cr = str_cr;
3613 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3614 }
3615
3616 if (len < 0) {
3617 rb_raise(rb_eArgError, "negative string size (or size too big)");
3618 }
3619 str_buf_cat(str, ptr, len);
3620 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3621 return str;
3622
3623 incompatible:
3624 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3625 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3627}
3628
3629VALUE
3630rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3631{
3632 return rb_enc_cr_str_buf_cat(str, ptr, len,
3633 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3634}
3635
3636VALUE
3638{
3639 /* ptr must reference NUL terminated ASCII string. */
3640 int encindex = ENCODING_GET(str);
3641 rb_encoding *enc = rb_enc_from_index(encindex);
3642 if (rb_enc_asciicompat(enc)) {
3643 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3644 encindex, ENC_CODERANGE_7BIT, 0);
3645 }
3646 else {
3647 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3648 while (*ptr) {
3649 unsigned int c = (unsigned char)*ptr;
3650 int len = rb_enc_codelen(c, enc);
3651 rb_enc_mbcput(c, buf, enc);
3652 rb_enc_cr_str_buf_cat(str, buf, len,
3653 encindex, ENC_CODERANGE_VALID, 0);
3654 ptr++;
3655 }
3656 return str;
3657 }
3658}
3659
3660VALUE
3662{
3663 int str2_cr = rb_enc_str_coderange(str2);
3664
3665 if (str_enc_fastpath(str)) {
3666 switch (str2_cr) {
3667 case ENC_CODERANGE_7BIT:
3668 // If RHS is 7bit we can do simple concatenation
3669 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3670 RB_GC_GUARD(str2);
3671 return str;
3673 // If RHS is valid, we can do simple concatenation if encodings are the same
3674 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3675 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3676 int str_cr = ENC_CODERANGE(str);
3677 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3678 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3679 }
3680 RB_GC_GUARD(str2);
3681 return str;
3682 }
3683 }
3684 }
3685
3686 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3687 ENCODING_GET(str2), str2_cr, &str2_cr);
3688
3689 ENC_CODERANGE_SET(str2, str2_cr);
3690
3691 return str;
3692}
3693
3694VALUE
3696{
3697 StringValue(str2);
3698 return rb_str_buf_append(str, str2);
3699}
3700
3701VALUE
3702rb_str_concat_literals(size_t num, const VALUE *strary)
3703{
3704 VALUE str;
3705 size_t i, s = 0;
3706 unsigned long len = 1;
3707
3708 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3709 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3710
3711 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3712 str = rb_str_buf_new(len);
3713 str_enc_copy_direct(str, strary[0]);
3714
3715 for (i = s; i < num; ++i) {
3716 const VALUE v = strary[i];
3717 int encidx = ENCODING_GET(v);
3718
3719 rb_str_buf_append(str, v);
3720 if (encidx != ENCINDEX_US_ASCII) {
3721 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3722 rb_enc_set_index(str, encidx);
3723 }
3724 }
3725 return str;
3726}
3727
3728/*
3729 * call-seq:
3730 * concat(*objects) -> string
3731 *
3732 * Concatenates each object in +objects+ to +self+ and returns +self+:
3733 *
3734 * s = 'foo'
3735 * s.concat('bar', 'baz') # => "foobarbaz"
3736 * s # => "foobarbaz"
3737 *
3738 * For each given object +object+ that is an Integer,
3739 * the value is considered a codepoint and converted to a character before concatenation:
3740 *
3741 * s = 'foo'
3742 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3743 *
3744 * Related: String#<<, which takes a single argument.
3745 */
3746static VALUE
3747rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3748{
3749 str_modifiable(str);
3750
3751 if (argc == 1) {
3752 return rb_str_concat(str, argv[0]);
3753 }
3754 else if (argc > 1) {
3755 int i;
3756 VALUE arg_str = rb_str_tmp_new(0);
3757 rb_enc_copy(arg_str, str);
3758 for (i = 0; i < argc; i++) {
3759 rb_str_concat(arg_str, argv[i]);
3760 }
3761 rb_str_buf_append(str, arg_str);
3762 }
3763
3764 return str;
3765}
3766
3767/*
3768 * call-seq:
3769 * append_as_bytes(*objects) -> string
3770 *
3771 * Concatenates each object in +objects+ into +self+ without any encoding
3772 * validation or conversion and returns +self+:
3773 *
3774 * s = 'foo'
3775 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3776 * s.valid_encoding? # => false
3777 * s.append_as_bytes("\xAC 12")
3778 * s.valid_encoding? # => true
3779 *
3780 * For each given object +object+ that is an Integer,
3781 * the value is considered a Byte. If the Integer is bigger
3782 * than one byte, only the lower byte is considered, similar to String#setbyte:
3783 *
3784 * s = ""
3785 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3786 *
3787 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3788 */
3789
3790VALUE
3791rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3792{
3793 long needed_capacity = 0;
3794 volatile VALUE t0;
3795 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3796
3797 for (int index = 0; index < argc; index++) {
3798 VALUE obj = argv[index];
3799 enum ruby_value_type type = types[index] = rb_type(obj);
3800 switch (type) {
3801 case T_FIXNUM:
3802 case T_BIGNUM:
3803 needed_capacity++;
3804 break;
3805 case T_STRING:
3806 needed_capacity += RSTRING_LEN(obj);
3807 break;
3808 default:
3809 rb_raise(
3811 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3812 rb_obj_class(obj)
3813 );
3814 break;
3815 }
3816 }
3817
3818 str_ensure_available_capa(str, needed_capacity);
3819 char *sptr = RSTRING_END(str);
3820
3821 for (int index = 0; index < argc; index++) {
3822 VALUE obj = argv[index];
3823 enum ruby_value_type type = types[index];
3824 switch (type) {
3825 case T_FIXNUM:
3826 case T_BIGNUM: {
3827 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3828 char byte = (char)(NUM2INT(obj) & 0xFF);
3829 *sptr = byte;
3830 sptr++;
3831 break;
3832 }
3833 case T_STRING: {
3834 const char *ptr;
3835 long len;
3836 RSTRING_GETMEM(obj, ptr, len);
3837 memcpy(sptr, ptr, len);
3838 sptr += len;
3839 break;
3840 }
3841 default:
3842 rb_bug("append_as_bytes arguments should have been validated");
3843 }
3844 }
3845
3846 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3847 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3848
3849 int cr = ENC_CODERANGE(str);
3850 switch (cr) {
3851 case ENC_CODERANGE_7BIT: {
3852 for (int index = 0; index < argc; index++) {
3853 VALUE obj = argv[index];
3854 enum ruby_value_type type = types[index];
3855 switch (type) {
3856 case T_FIXNUM:
3857 case T_BIGNUM: {
3858 if (!ISASCII(NUM2INT(obj))) {
3859 goto clear_cr;
3860 }
3861 break;
3862 }
3863 case T_STRING: {
3864 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3865 goto clear_cr;
3866 }
3867 break;
3868 }
3869 default:
3870 rb_bug("append_as_bytes arguments should have been validated");
3871 }
3872 }
3873 break;
3874 }
3876 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3877 goto keep_cr;
3878 }
3879 else {
3880 goto clear_cr;
3881 }
3882 break;
3883 default:
3884 goto clear_cr;
3885 break;
3886 }
3887
3888 RB_GC_GUARD(t0);
3889
3890 clear_cr:
3891 // If no fast path was hit, we clear the coderange.
3892 // append_as_bytes is predominently meant to be used in
3893 // buffering situation, hence it's likely the coderange
3894 // will never be scanned, so it's not worth spending time
3895 // precomputing the coderange except for simple and common
3896 // situations.
3898 keep_cr:
3899 return str;
3900}
3901
3902/*
3903 * call-seq:
3904 * string << object -> string
3905 *
3906 * Concatenates +object+ to +self+ and returns +self+:
3907 *
3908 * s = 'foo'
3909 * s << 'bar' # => "foobar"
3910 * s # => "foobar"
3911 *
3912 * If +object+ is an Integer,
3913 * the value is considered a codepoint and converted to a character before concatenation:
3914 *
3915 * s = 'foo'
3916 * s << 33 # => "foo!"
3917 *
3918 * If that codepoint is not representable in the encoding of
3919 * _string_, RangeError is raised.
3920 *
3921 * s = 'foo'
3922 * s.encoding # => <Encoding:UTF-8>
3923 * s << 0x00110000 # 1114112 out of char range (RangeError)
3924 * s = 'foo'.encode(Encoding::EUC_JP)
3925 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3926 *
3927 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3928 * is automatically promoted to ASCII-8BIT.
3929 *
3930 * s = 'foo'.encode(Encoding::US_ASCII)
3931 * s << 0xff
3932 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3933 *
3934 * Related: String#concat, which takes multiple arguments.
3935 */
3936VALUE
3938{
3939 unsigned int code;
3940 rb_encoding *enc = STR_ENC_GET(str1);
3941 int encidx;
3942
3943 if (RB_INTEGER_TYPE_P(str2)) {
3944 if (rb_num_to_uint(str2, &code) == 0) {
3945 }
3946 else if (FIXNUM_P(str2)) {
3947 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3948 }
3949 else {
3950 rb_raise(rb_eRangeError, "bignum out of char range");
3951 }
3952 }
3953 else {
3954 return rb_str_append(str1, str2);
3955 }
3956
3957 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3958
3959 if (encidx >= 0) {
3960 rb_str_buf_cat_byte(str1, (unsigned char)code);
3961 }
3962 else {
3963 long pos = RSTRING_LEN(str1);
3964 int cr = ENC_CODERANGE(str1);
3965 int len;
3966 char *buf;
3967
3968 switch (len = rb_enc_codelen(code, enc)) {
3969 case ONIGERR_INVALID_CODE_POINT_VALUE:
3970 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3971 break;
3972 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3973 case 0:
3974 rb_raise(rb_eRangeError, "%u out of char range", code);
3975 break;
3976 }
3977 buf = ALLOCA_N(char, len + 1);
3978 rb_enc_mbcput(code, buf, enc);
3979 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3980 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3981 }
3982 rb_str_resize(str1, pos+len);
3983 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3984 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3986 }
3987 else if (cr == ENC_CODERANGE_BROKEN) {
3989 }
3990 ENC_CODERANGE_SET(str1, cr);
3991 }
3992 return str1;
3993}
3994
3995int
3996rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3997{
3998 int encidx = rb_enc_to_index(enc);
3999
4000 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4001 /* US-ASCII automatically extended to ASCII-8BIT */
4002 if (code > 0xFF) {
4003 rb_raise(rb_eRangeError, "%u out of char range", code);
4004 }
4005 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4006 return ENCINDEX_ASCII_8BIT;
4007 }
4008 return encidx;
4009 }
4010 else {
4011 return -1;
4012 }
4013}
4014
4015/*
4016 * call-seq:
4017 * prepend(*other_strings) -> string
4018 *
4019 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4020 *
4021 * s = 'foo'
4022 * s.prepend('bar', 'baz') # => "barbazfoo"
4023 * s # => "barbazfoo"
4024 *
4025 * Related: String#concat.
4026 */
4027
4028static VALUE
4029rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4030{
4031 str_modifiable(str);
4032
4033 if (argc == 1) {
4034 rb_str_update(str, 0L, 0L, argv[0]);
4035 }
4036 else if (argc > 1) {
4037 int i;
4038 VALUE arg_str = rb_str_tmp_new(0);
4039 rb_enc_copy(arg_str, str);
4040 for (i = 0; i < argc; i++) {
4041 rb_str_append(arg_str, argv[i]);
4042 }
4043 rb_str_update(str, 0L, 0L, arg_str);
4044 }
4045
4046 return str;
4047}
4048
4049st_index_t
4051{
4052 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4053 st_index_t precomputed_hash;
4054 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4055
4056 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4057 return precomputed_hash;
4058 }
4059
4060 return str_do_hash(str);
4061}
4062
4063int
4065{
4066 long len1, len2;
4067 const char *ptr1, *ptr2;
4068 RSTRING_GETMEM(str1, ptr1, len1);
4069 RSTRING_GETMEM(str2, ptr2, len2);
4070 return (len1 != len2 ||
4071 !rb_str_comparable(str1, str2) ||
4072 memcmp(ptr1, ptr2, len1) != 0);
4073}
4074
4075/*
4076 * call-seq:
4077 * hash -> integer
4078 *
4079 * Returns the integer hash value for +self+.
4080 * The value is based on the length, content and encoding of +self+.
4081 *
4082 * Related: Object#hash.
4083 */
4084
4085static VALUE
4086rb_str_hash_m(VALUE str)
4087{
4088 st_index_t hval = rb_str_hash(str);
4089 return ST2FIX(hval);
4090}
4091
4092#define lesser(a,b) (((a)>(b))?(b):(a))
4093
4094int
4096{
4097 int idx1, idx2;
4098 int rc1, rc2;
4099
4100 if (RSTRING_LEN(str1) == 0) return TRUE;
4101 if (RSTRING_LEN(str2) == 0) return TRUE;
4102 idx1 = ENCODING_GET(str1);
4103 idx2 = ENCODING_GET(str2);
4104 if (idx1 == idx2) return TRUE;
4105 rc1 = rb_enc_str_coderange(str1);
4106 rc2 = rb_enc_str_coderange(str2);
4107 if (rc1 == ENC_CODERANGE_7BIT) {
4108 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4109 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4110 return TRUE;
4111 }
4112 if (rc2 == ENC_CODERANGE_7BIT) {
4113 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4114 return TRUE;
4115 }
4116 return FALSE;
4117}
4118
4119int
4121{
4122 long len1, len2;
4123 const char *ptr1, *ptr2;
4124 int retval;
4125
4126 if (str1 == str2) return 0;
4127 RSTRING_GETMEM(str1, ptr1, len1);
4128 RSTRING_GETMEM(str2, ptr2, len2);
4129 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4130 if (len1 == len2) {
4131 if (!rb_str_comparable(str1, str2)) {
4132 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4133 return 1;
4134 return -1;
4135 }
4136 return 0;
4137 }
4138 if (len1 > len2) return 1;
4139 return -1;
4140 }
4141 if (retval > 0) return 1;
4142 return -1;
4143}
4144
4145/*
4146 * call-seq:
4147 * string == object -> true or false
4148 * string === object -> true or false
4149 *
4150 * Returns +true+ if +object+ has the same length and content
4151 * as +self+; +false+ otherwise:
4152 *
4153 * s = 'foo'
4154 * s == 'foo' # => true
4155 * s == 'food' # => false
4156 * s == 'FOO' # => false
4157 *
4158 * Returns +false+ if the two strings' encodings are not compatible:
4159 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4160 *
4161 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4162 * two strings are compared using <code>object.==</code>.
4163 */
4164
4165VALUE
4167{
4168 if (str1 == str2) return Qtrue;
4169 if (!RB_TYPE_P(str2, T_STRING)) {
4170 if (!rb_respond_to(str2, idTo_str)) {
4171 return Qfalse;
4172 }
4173 return rb_equal(str2, str1);
4174 }
4175 return rb_str_eql_internal(str1, str2);
4176}
4177
4178/*
4179 * call-seq:
4180 * eql?(object) -> true or false
4181 *
4182 * Returns +true+ if +object+ has the same length and content
4183 * as +self+; +false+ otherwise:
4184 *
4185 * s = 'foo'
4186 * s.eql?('foo') # => true
4187 * s.eql?('food') # => false
4188 * s.eql?('FOO') # => false
4189 *
4190 * Returns +false+ if the two strings' encodings are not compatible:
4191 *
4192 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4193 *
4194 */
4195
4196VALUE
4197rb_str_eql(VALUE str1, VALUE str2)
4198{
4199 if (str1 == str2) return Qtrue;
4200 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4201 return rb_str_eql_internal(str1, str2);
4202}
4203
4204/*
4205 * call-seq:
4206 * string <=> other_string -> -1, 0, 1, or nil
4207 *
4208 * Compares +self+ and +other_string+, returning:
4209 *
4210 * - -1 if +other_string+ is larger.
4211 * - 0 if the two are equal.
4212 * - 1 if +other_string+ is smaller.
4213 * - +nil+ if the two are incomparable.
4214 *
4215 * Examples:
4216 *
4217 * 'foo' <=> 'foo' # => 0
4218 * 'foo' <=> 'food' # => -1
4219 * 'food' <=> 'foo' # => 1
4220 * 'FOO' <=> 'foo' # => -1
4221 * 'foo' <=> 'FOO' # => 1
4222 * 'foo' <=> 1 # => nil
4223 *
4224 */
4225
4226static VALUE
4227rb_str_cmp_m(VALUE str1, VALUE str2)
4228{
4229 int result;
4230 VALUE s = rb_check_string_type(str2);
4231 if (NIL_P(s)) {
4232 return rb_invcmp(str1, str2);
4233 }
4234 result = rb_str_cmp(str1, s);
4235 return INT2FIX(result);
4236}
4237
4238static VALUE str_casecmp(VALUE str1, VALUE str2);
4239static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4240
4241/*
4242 * call-seq:
4243 * casecmp(other_string) -> -1, 0, 1, or nil
4244 *
4245 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4246 *
4247 * - -1 if <tt>other_string.downcase</tt> is larger.
4248 * - 0 if the two are equal.
4249 * - 1 if <tt>other_string.downcase</tt> is smaller.
4250 * - +nil+ if the two are incomparable.
4251 *
4252 * Examples:
4253 *
4254 * 'foo'.casecmp('foo') # => 0
4255 * 'foo'.casecmp('food') # => -1
4256 * 'food'.casecmp('foo') # => 1
4257 * 'FOO'.casecmp('foo') # => 0
4258 * 'foo'.casecmp('FOO') # => 0
4259 * 'foo'.casecmp(1) # => nil
4260 *
4261 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4262 *
4263 * Related: String#casecmp?.
4264 *
4265 */
4266
4267static VALUE
4268rb_str_casecmp(VALUE str1, VALUE str2)
4269{
4270 VALUE s = rb_check_string_type(str2);
4271 if (NIL_P(s)) {
4272 return Qnil;
4273 }
4274 return str_casecmp(str1, s);
4275}
4276
4277static VALUE
4278str_casecmp(VALUE str1, VALUE str2)
4279{
4280 long len;
4281 rb_encoding *enc;
4282 const char *p1, *p1end, *p2, *p2end;
4283
4284 enc = rb_enc_compatible(str1, str2);
4285 if (!enc) {
4286 return Qnil;
4287 }
4288
4289 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4290 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4291 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4292 while (p1 < p1end && p2 < p2end) {
4293 if (*p1 != *p2) {
4294 unsigned int c1 = TOLOWER(*p1 & 0xff);
4295 unsigned int c2 = TOLOWER(*p2 & 0xff);
4296 if (c1 != c2)
4297 return INT2FIX(c1 < c2 ? -1 : 1);
4298 }
4299 p1++;
4300 p2++;
4301 }
4302 }
4303 else {
4304 while (p1 < p1end && p2 < p2end) {
4305 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4306 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4307
4308 if (0 <= c1 && 0 <= c2) {
4309 c1 = TOLOWER(c1);
4310 c2 = TOLOWER(c2);
4311 if (c1 != c2)
4312 return INT2FIX(c1 < c2 ? -1 : 1);
4313 }
4314 else {
4315 int r;
4316 l1 = rb_enc_mbclen(p1, p1end, enc);
4317 l2 = rb_enc_mbclen(p2, p2end, enc);
4318 len = l1 < l2 ? l1 : l2;
4319 r = memcmp(p1, p2, len);
4320 if (r != 0)
4321 return INT2FIX(r < 0 ? -1 : 1);
4322 if (l1 != l2)
4323 return INT2FIX(l1 < l2 ? -1 : 1);
4324 }
4325 p1 += l1;
4326 p2 += l2;
4327 }
4328 }
4329 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4330 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4331 return INT2FIX(-1);
4332}
4333
4334/*
4335 * call-seq:
4336 * casecmp?(other_string) -> true, false, or nil
4337 *
4338 * Returns +true+ if +self+ and +other_string+ are equal after
4339 * Unicode case folding, otherwise +false+:
4340 *
4341 * 'foo'.casecmp?('foo') # => true
4342 * 'foo'.casecmp?('food') # => false
4343 * 'food'.casecmp?('foo') # => false
4344 * 'FOO'.casecmp?('foo') # => true
4345 * 'foo'.casecmp?('FOO') # => true
4346 *
4347 * Returns +nil+ if the two values are incomparable:
4348 *
4349 * 'foo'.casecmp?(1) # => nil
4350 *
4351 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4352 *
4353 * Related: String#casecmp.
4354 *
4355 */
4356
4357static VALUE
4358rb_str_casecmp_p(VALUE str1, VALUE str2)
4359{
4360 VALUE s = rb_check_string_type(str2);
4361 if (NIL_P(s)) {
4362 return Qnil;
4363 }
4364 return str_casecmp_p(str1, s);
4365}
4366
4367static VALUE
4368str_casecmp_p(VALUE str1, VALUE str2)
4369{
4370 rb_encoding *enc;
4371 VALUE folded_str1, folded_str2;
4372 VALUE fold_opt = sym_fold;
4373
4374 enc = rb_enc_compatible(str1, str2);
4375 if (!enc) {
4376 return Qnil;
4377 }
4378
4379 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4380 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4381
4382 return rb_str_eql(folded_str1, folded_str2);
4383}
4384
4385static long
4386strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4387 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4388{
4389 const char *search_start = str_ptr;
4390 long pos, search_len = str_len - offset;
4391
4392 for (;;) {
4393 const char *t;
4394 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4395 if (pos < 0) return pos;
4396 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4397 if (t == search_start + pos) break;
4398 search_len -= t - search_start;
4399 if (search_len <= 0) return -1;
4400 offset += t - search_start;
4401 search_start = t;
4402 }
4403 return pos + offset;
4404}
4405
4406/* found index in byte */
4407#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4408#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4409
4410static long
4411rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4412{
4413 const char *str_ptr, *str_ptr_end, *sub_ptr;
4414 long str_len, sub_len;
4415 rb_encoding *enc;
4416
4417 enc = rb_enc_check(str, sub);
4418 if (is_broken_string(sub)) return -1;
4419
4420 str_ptr = RSTRING_PTR(str);
4421 str_ptr_end = RSTRING_END(str);
4422 str_len = RSTRING_LEN(str);
4423 sub_ptr = RSTRING_PTR(sub);
4424 sub_len = RSTRING_LEN(sub);
4425
4426 if (str_len < sub_len) return -1;
4427
4428 if (offset != 0) {
4429 long str_len_char, sub_len_char;
4430 int single_byte = single_byte_optimizable(str);
4431 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4432 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4433 if (offset < 0) {
4434 offset += str_len_char;
4435 if (offset < 0) return -1;
4436 }
4437 if (str_len_char - offset < sub_len_char) return -1;
4438 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4439 str_ptr += offset;
4440 }
4441 if (sub_len == 0) return offset;
4442
4443 /* need proceed one character at a time */
4444 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4445}
4446
4447
4448/*
4449 * call-seq:
4450 * index(substring, offset = 0) -> integer or nil
4451 * index(regexp, offset = 0) -> integer or nil
4452 *
4453 * :include: doc/string/index.rdoc
4454 *
4455 */
4456
4457static VALUE
4458rb_str_index_m(int argc, VALUE *argv, VALUE str)
4459{
4460 VALUE sub;
4461 VALUE initpos;
4462 rb_encoding *enc = STR_ENC_GET(str);
4463 long pos;
4464
4465 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4466 long slen = str_strlen(str, enc); /* str's enc */
4467 pos = NUM2LONG(initpos);
4468 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4469 if (RB_TYPE_P(sub, T_REGEXP)) {
4471 }
4472 return Qnil;
4473 }
4474 }
4475 else {
4476 pos = 0;
4477 }
4478
4479 if (RB_TYPE_P(sub, T_REGEXP)) {
4480 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4481 enc, single_byte_optimizable(str));
4482
4483 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4484 VALUE match = rb_backref_get();
4485 struct re_registers *regs = RMATCH_REGS(match);
4486 pos = rb_str_sublen(str, BEG(0));
4487 return LONG2NUM(pos);
4488 }
4489 }
4490 else {
4491 StringValue(sub);
4492 pos = rb_str_index(str, sub, pos);
4493 if (pos >= 0) {
4494 pos = rb_str_sublen(str, pos);
4495 return LONG2NUM(pos);
4496 }
4497 }
4498 return Qnil;
4499}
4500
4501/* Ensure that the given pos is a valid character boundary.
4502 * Note that in this function, "character" means a code point
4503 * (Unicode scalar value), not a grapheme cluster.
4504 */
4505static void
4506str_ensure_byte_pos(VALUE str, long pos)
4507{
4508 if (!single_byte_optimizable(str)) {
4509 const char *s = RSTRING_PTR(str);
4510 const char *e = RSTRING_END(str);
4511 const char *p = s + pos;
4512 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4513 rb_raise(rb_eIndexError,
4514 "offset %ld does not land on character boundary", pos);
4515 }
4516 }
4517}
4518
4519/*
4520 * call-seq:
4521 * byteindex(substring, offset = 0) -> integer or nil
4522 * byteindex(regexp, offset = 0) -> integer or nil
4523 *
4524 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4525 * or +nil+ if none found:
4526 *
4527 * 'foo'.byteindex('f') # => 0
4528 * 'foo'.byteindex('o') # => 1
4529 * 'foo'.byteindex('oo') # => 1
4530 * 'foo'.byteindex('ooo') # => nil
4531 *
4532 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4533 * or +nil+ if none found:
4534 *
4535 * 'foo'.byteindex(/f/) # => 0
4536 * 'foo'.byteindex(/o/) # => 1
4537 * 'foo'.byteindex(/oo/) # => 1
4538 * 'foo'.byteindex(/ooo/) # => nil
4539 *
4540 * Integer argument +offset+, if given, specifies the byte-based position in the
4541 * string to begin the search:
4542 *
4543 * 'foo'.byteindex('o', 1) # => 1
4544 * 'foo'.byteindex('o', 2) # => 2
4545 * 'foo'.byteindex('o', 3) # => nil
4546 *
4547 * If +offset+ is negative, counts backward from the end of +self+:
4548 *
4549 * 'foo'.byteindex('o', -1) # => 2
4550 * 'foo'.byteindex('o', -2) # => 1
4551 * 'foo'.byteindex('o', -3) # => 1
4552 * 'foo'.byteindex('o', -4) # => nil
4553 *
4554 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4555 * raised.
4556 *
4557 * Related: String#index, String#byterindex.
4558 */
4559
4560static VALUE
4561rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4562{
4563 VALUE sub;
4564 VALUE initpos;
4565 long pos;
4566
4567 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4568 long slen = RSTRING_LEN(str);
4569 pos = NUM2LONG(initpos);
4570 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4571 if (RB_TYPE_P(sub, T_REGEXP)) {
4573 }
4574 return Qnil;
4575 }
4576 }
4577 else {
4578 pos = 0;
4579 }
4580
4581 str_ensure_byte_pos(str, pos);
4582
4583 if (RB_TYPE_P(sub, T_REGEXP)) {
4584 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4585 VALUE match = rb_backref_get();
4586 struct re_registers *regs = RMATCH_REGS(match);
4587 pos = BEG(0);
4588 return LONG2NUM(pos);
4589 }
4590 }
4591 else {
4592 StringValue(sub);
4593 pos = rb_str_byteindex(str, sub, pos);
4594 if (pos >= 0) return LONG2NUM(pos);
4595 }
4596 return Qnil;
4597}
4598
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform has no memrchr(): scans the
 * search_len bytes at search_str backwards and returns a pointer to the
 * last byte equal to chr, or NULL when there is none.
 *
 * As with the libc function, chr is converted to unsigned char before
 * comparing, so a negative int such as a sign-extended char matches too.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char needle = (unsigned char)chr;
    const char *ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == needle) return (void *)ptr;
    }

    return NULL;
}
#endif
4611
4612static long
4613str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4614{
4615 char *hit, *adjusted;
4616 int c;
4617 long slen, searchlen;
4618 char *sbeg, *e, *t;
4619
4620 sbeg = RSTRING_PTR(str);
4621 slen = RSTRING_LEN(sub);
4622 if (slen == 0) return s - sbeg;
4623 e = RSTRING_END(str);
4624 t = RSTRING_PTR(sub);
4625 c = *t & 0xff;
4626 searchlen = s - sbeg + 1;
4627
4628 if (memcmp(s, t, slen) == 0) {
4629 return s - sbeg;
4630 }
4631
4632 do {
4633 hit = memrchr(sbeg, c, searchlen);
4634 if (!hit) break;
4635 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4636 if (hit != adjusted) {
4637 searchlen = adjusted - sbeg;
4638 continue;
4639 }
4640 if (memcmp(hit, t, slen) == 0)
4641 return hit - sbeg;
4642 searchlen = adjusted - sbeg;
4643 } while (searchlen > 0);
4644
4645 return -1;
4646}
4647
4648/* found index in byte */
4649static long
4650rb_str_rindex(VALUE str, VALUE sub, long pos)
4651{
4652 long len, slen;
4653 char *sbeg, *s;
4654 rb_encoding *enc;
4655 int singlebyte;
4656
4657 enc = rb_enc_check(str, sub);
4658 if (is_broken_string(sub)) return -1;
4659 singlebyte = single_byte_optimizable(str);
4660 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4661 slen = str_strlen(sub, enc); /* rb_enc_check */
4662
4663 /* substring longer than string */
4664 if (len < slen) return -1;
4665 if (len - pos < slen) pos = len - slen;
4666 if (len == 0) return pos;
4667
4668 sbeg = RSTRING_PTR(str);
4669
4670 if (pos == 0) {
4671 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4672 return 0;
4673 else
4674 return -1;
4675 }
4676
4677 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4678 return str_rindex(str, sub, s, enc);
4679}
4680
4681/*
4682 * call-seq:
4683 * rindex(substring, offset = self.length) -> integer or nil
4684 * rindex(regexp, offset = self.length) -> integer or nil
4685 *
4686 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4687 * or +nil+ if none found:
4688 *
4689 * 'foo'.rindex('f') # => 0
4690 * 'foo'.rindex('o') # => 2
4691 * 'foo'.rindex('oo') # => 1
4692 * 'foo'.rindex('ooo') # => nil
4693 *
4694 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4695 * or +nil+ if none found:
4696 *
4697 * 'foo'.rindex(/f/) # => 0
4698 * 'foo'.rindex(/o/) # => 2
4699 * 'foo'.rindex(/oo/) # => 1
4700 * 'foo'.rindex(/ooo/) # => nil
4701 *
4702 * The _last_ match means starting at the possible last position, not
4703 * the last of longest matches.
4704 *
4705 * 'foo'.rindex(/o+/) # => 2
4706 * $~ #=> #<MatchData "o">
4707 *
4708 * To get the last longest match, combine the pattern with a negative
4709 * lookbehind:
4710 *
4711 * 'foo'.rindex(/(?<!o)o+/) # => 1
4712 * $~ #=> #<MatchData "oo">
4713 *
4714 * Or use String#index with a negative lookahead:
4715 *
4716 * 'foo'.index(/o+(?!.*o)/) # => 1
4717 * $~ #=> #<MatchData "oo">
4718 *
4719 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4720 * string to _end_ the search:
4721 *
4722 * 'foo'.rindex('o', 0) # => nil
4723 * 'foo'.rindex('o', 1) # => 1
4724 * 'foo'.rindex('o', 2) # => 2
4725 * 'foo'.rindex('o', 3) # => 2
4726 *
4727 * If +offset+ is a negative Integer, the maximum starting position in the
4728 * string to _end_ the search is the sum of the string's length and +offset+:
4729 *
4730 * 'foo'.rindex('o', -1) # => 2
4731 * 'foo'.rindex('o', -2) # => 1
4732 * 'foo'.rindex('o', -3) # => nil
4733 * 'foo'.rindex('o', -4) # => nil
4734 *
4735 * Related: String#index.
4736 */
4737
4738static VALUE
4739rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4740{
4741 VALUE sub;
4742 VALUE initpos;
4743 rb_encoding *enc = STR_ENC_GET(str);
4744 long pos, len = str_strlen(str, enc); /* str's enc */
4745
4746 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4747 pos = NUM2LONG(initpos);
4748 if (pos < 0 && (pos += len) < 0) {
4749 if (RB_TYPE_P(sub, T_REGEXP)) {
4751 }
4752 return Qnil;
4753 }
4754 if (pos > len) pos = len;
4755 }
4756 else {
4757 pos = len;
4758 }
4759
4760 if (RB_TYPE_P(sub, T_REGEXP)) {
4761 /* enc = rb_enc_check(str, sub); */
4762 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4763 enc, single_byte_optimizable(str));
4764
4765 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4766 VALUE match = rb_backref_get();
4767 struct re_registers *regs = RMATCH_REGS(match);
4768 pos = rb_str_sublen(str, BEG(0));
4769 return LONG2NUM(pos);
4770 }
4771 }
4772 else {
4773 StringValue(sub);
4774 pos = rb_str_rindex(str, sub, pos);
4775 if (pos >= 0) {
4776 pos = rb_str_sublen(str, pos);
4777 return LONG2NUM(pos);
4778 }
4779 }
4780 return Qnil;
4781}
4782
4783static long
4784rb_str_byterindex(VALUE str, VALUE sub, long pos)
4785{
4786 long len, slen;
4787 char *sbeg, *s;
4788 rb_encoding *enc;
4789
4790 enc = rb_enc_check(str, sub);
4791 if (is_broken_string(sub)) return -1;
4792 len = RSTRING_LEN(str);
4793 slen = RSTRING_LEN(sub);
4794
4795 /* substring longer than string */
4796 if (len < slen) return -1;
4797 if (len - pos < slen) pos = len - slen;
4798 if (len == 0) return pos;
4799
4800 sbeg = RSTRING_PTR(str);
4801
4802 if (pos == 0) {
4803 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4804 return 0;
4805 else
4806 return -1;
4807 }
4808
4809 s = sbeg + pos;
4810 return str_rindex(str, sub, s, enc);
4811}
4812
4813
4814/*
4815 * call-seq:
4816 * byterindex(substring, offset = self.bytesize) -> integer or nil
4817 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4818 *
4819 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4820 * or +nil+ if none found:
4821 *
4822 * 'foo'.byterindex('f') # => 0
4823 * 'foo'.byterindex('o') # => 2
4824 * 'foo'.byterindex('oo') # => 1
4825 * 'foo'.byterindex('ooo') # => nil
4826 *
4827 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4828 * or +nil+ if none found:
4829 *
4830 * 'foo'.byterindex(/f/) # => 0
4831 * 'foo'.byterindex(/o/) # => 2
4832 * 'foo'.byterindex(/oo/) # => 1
4833 * 'foo'.byterindex(/ooo/) # => nil
4834 *
4835 * The _last_ match means starting at the possible last position, not
4836 * the last of longest matches.
4837 *
4838 * 'foo'.byterindex(/o+/) # => 2
4839 * $~ #=> #<MatchData "o">
4840 *
4841 * To get the last longest match, combine the pattern with a negative
4842 * lookbehind:
4843 *
4844 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4845 * $~ #=> #<MatchData "oo">
4846 *
4847 * Or use String#byteindex with a negative lookahead:
4848 *
4849 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4850 * $~ #=> #<MatchData "oo">
4851 *
4852 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4853 * string to _end_ the search:
4854 *
4855 * 'foo'.byterindex('o', 0) # => nil
4856 * 'foo'.byterindex('o', 1) # => 1
4857 * 'foo'.byterindex('o', 2) # => 2
4858 * 'foo'.byterindex('o', 3) # => 2
4859 *
4860 * If +offset+ is a negative Integer, the maximum starting position in the
4861 * string to _end_ the search is the sum of the string's length and +offset+:
4862 *
4863 * 'foo'.byterindex('o', -1) # => 2
4864 * 'foo'.byterindex('o', -2) # => 1
4865 * 'foo'.byterindex('o', -3) # => nil
4866 * 'foo'.byterindex('o', -4) # => nil
4867 *
4868 * If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
4869 * raised.
4870 *
4871 * Related: String#byteindex.
4872 */
4873
4874static VALUE
4875rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4876{
4877 VALUE sub;
4878 VALUE initpos;
4879 long pos, len = RSTRING_LEN(str);
4880
4881 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4882 pos = NUM2LONG(initpos);
4883 if (pos < 0 && (pos += len) < 0) {
4884 if (RB_TYPE_P(sub, T_REGEXP)) {
4886 }
4887 return Qnil;
4888 }
4889 if (pos > len) pos = len;
4890 }
4891 else {
4892 pos = len;
4893 }
4894
4895 str_ensure_byte_pos(str, pos);
4896
4897 if (RB_TYPE_P(sub, T_REGEXP)) {
4898 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4899 VALUE match = rb_backref_get();
4900 struct re_registers *regs = RMATCH_REGS(match);
4901 pos = BEG(0);
4902 return LONG2NUM(pos);
4903 }
4904 }
4905 else {
4906 StringValue(sub);
4907 pos = rb_str_byterindex(str, sub, pos);
4908 if (pos >= 0) return LONG2NUM(pos);
4909 }
4910 return Qnil;
4911}
4912
4913/*
4914 * call-seq:
4915 * string =~ regexp -> integer or nil
4916 * string =~ object -> integer or nil
4917 *
4918 * Returns the Integer index of the first substring that matches
4919 * the given +regexp+, or +nil+ if no match found:
4920 *
4921 * 'foo' =~ /f/ # => 0
4922 * 'foo' =~ /o/ # => 1
4923 * 'foo' =~ /x/ # => nil
4924 *
4925 * Note: also updates Regexp@Global+Variables.
4926 *
4927 * If the given +object+ is not a Regexp, returns the value
4928 * returned by <tt>object =~ self</tt>.
4929 *
4930 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4931 * (see Regexp#=~):
4932 *
4933 * number= nil
4934 * "no. 9" =~ /(?<number>\d+)/
4935 * number # => nil (not assigned)
4936 * /(?<number>\d+)/ =~ "no. 9"
4937 * number #=> "9"
4938 *
4939 */
4940
4941static VALUE
4942rb_str_match(VALUE x, VALUE y)
4943{
4944 switch (OBJ_BUILTIN_TYPE(y)) {
4945 case T_STRING:
4946 rb_raise(rb_eTypeError, "type mismatch: String given");
4947
4948 case T_REGEXP:
4949 return rb_reg_match(y, x);
4950
4951 default:
4952 return rb_funcall(y, idEqTilde, 1, x);
4953 }
4954}
4955
4956
4957static VALUE get_pat(VALUE);
4958
4959
4960/*
4961 * call-seq:
4962 * match(pattern, offset = 0) -> matchdata or nil
4963 * match(pattern, offset = 0) {|matchdata| ... } -> object
4964 *
4965 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4966 *
4967 * Note: also updates Regexp@Global+Variables.
4968 *
4969 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4970 * regexp = Regexp.new(pattern)
4971 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4972 * (see Regexp#match):
4973 * matchdata = regexp.match(self)
4974 *
4975 * With no block given, returns the computed +matchdata+:
4976 *
4977 * 'foo'.match('f') # => #<MatchData "f">
4978 * 'foo'.match('o') # => #<MatchData "o">
4979 * 'foo'.match('x') # => nil
4980 *
4981 * If Integer argument +offset+ is given, the search begins at index +offset+:
4982 *
4983 * 'foo'.match('f', 1) # => nil
4984 * 'foo'.match('o', 1) # => #<MatchData "o">
4985 *
4986 * With a block given, calls the block with the computed +matchdata+
4987 * and returns the block's return value:
4988 *
4989 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4990 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4991 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4992 *
4993 */
4994
4995static VALUE
4996rb_str_match_m(int argc, VALUE *argv, VALUE str)
4997{
4998 VALUE re, result;
4999 if (argc < 1)
5000 rb_check_arity(argc, 1, 2);
5001 re = argv[0];
5002 argv[0] = str;
5003 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5004 if (!NIL_P(result) && rb_block_given_p()) {
5005 return rb_yield(result);
5006 }
5007 return result;
5008}
5009
5010/*
5011 * call-seq:
5012 * match?(pattern, offset = 0) -> true or false
5013 *
5014 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5015 *
5016 * Note: does not update Regexp@Global+Variables.
5017 *
5018 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5019 * regexp = Regexp.new(pattern)
5020 *
5021 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5022 * +false+ otherwise:
5023 *
5024 * 'foo'.match?(/o/) # => true
5025 * 'foo'.match?('o') # => true
5026 * 'foo'.match?(/x/) # => false
5027 *
5028 * If Integer argument +offset+ is given, the search begins at index +offset+:
5029 * 'foo'.match?('f', 1) # => false
5030 * 'foo'.match?('o', 1) # => true
5031 *
5032 */
5033
5034static VALUE
5035rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5036{
5037 VALUE re;
5038 rb_check_arity(argc, 1, 2);
5039 re = get_pat(argv[0]);
5040 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5041}
5042
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor (e.g. bytes are not a valid character) */
    NEIGHBOR_FOUND    = 1,  /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED  = 2   /* stepped past the end of the range; the caller must carry */
};
5048
5049static enum neighbor_char
5050enc_succ_char(char *p, long len, rb_encoding *enc)
5051{
5052 long i;
5053 int l;
5054
5055 if (rb_enc_mbminlen(enc) > 1) {
5056 /* wchar, trivial case */
5057 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5058 if (!MBCLEN_CHARFOUND_P(r)) {
5059 return NEIGHBOR_NOT_CHAR;
5060 }
5061 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5062 l = rb_enc_code_to_mbclen(c, enc);
5063 if (!l) return NEIGHBOR_NOT_CHAR;
5064 if (l != len) return NEIGHBOR_WRAPPED;
5065 rb_enc_mbcput(c, p, enc);
5066 r = rb_enc_precise_mbclen(p, p + len, enc);
5067 if (!MBCLEN_CHARFOUND_P(r)) {
5068 return NEIGHBOR_NOT_CHAR;
5069 }
5070 return NEIGHBOR_FOUND;
5071 }
5072 while (1) {
5073 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5074 p[i] = '\0';
5075 if (i < 0)
5076 return NEIGHBOR_WRAPPED;
5077 ++((unsigned char*)p)[i];
5078 l = rb_enc_precise_mbclen(p, p+len, enc);
5079 if (MBCLEN_CHARFOUND_P(l)) {
5080 l = MBCLEN_CHARFOUND_LEN(l);
5081 if (l == len) {
5082 return NEIGHBOR_FOUND;
5083 }
5084 else {
5085 memset(p+l, 0xff, len-l);
5086 }
5087 }
5088 if (MBCLEN_INVALID_P(l) && i < len-1) {
5089 long len2;
5090 int l2;
5091 for (len2 = len-1; 0 < len2; len2--) {
5092 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5093 if (!MBCLEN_INVALID_P(l2))
5094 break;
5095 }
5096 memset(p+len2+1, 0xff, len-(len2+1));
5097 }
5098 }
5099}
5100
5101static enum neighbor_char
5102enc_pred_char(char *p, long len, rb_encoding *enc)
5103{
5104 long i;
5105 int l;
5106 if (rb_enc_mbminlen(enc) > 1) {
5107 /* wchar, trivial case */
5108 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5109 if (!MBCLEN_CHARFOUND_P(r)) {
5110 return NEIGHBOR_NOT_CHAR;
5111 }
5112 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5113 if (!c) return NEIGHBOR_NOT_CHAR;
5114 --c;
5115 l = rb_enc_code_to_mbclen(c, enc);
5116 if (!l) return NEIGHBOR_NOT_CHAR;
5117 if (l != len) return NEIGHBOR_WRAPPED;
5118 rb_enc_mbcput(c, p, enc);
5119 r = rb_enc_precise_mbclen(p, p + len, enc);
5120 if (!MBCLEN_CHARFOUND_P(r)) {
5121 return NEIGHBOR_NOT_CHAR;
5122 }
5123 return NEIGHBOR_FOUND;
5124 }
5125 while (1) {
5126 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5127 p[i] = '\xff';
5128 if (i < 0)
5129 return NEIGHBOR_WRAPPED;
5130 --((unsigned char*)p)[i];
5131 l = rb_enc_precise_mbclen(p, p+len, enc);
5132 if (MBCLEN_CHARFOUND_P(l)) {
5133 l = MBCLEN_CHARFOUND_LEN(l);
5134 if (l == len) {
5135 return NEIGHBOR_FOUND;
5136 }
5137 else {
5138 memset(p+l, 0, len-l);
5139 }
5140 }
5141 if (MBCLEN_INVALID_P(l) && i < len-1) {
5142 long len2;
5143 int l2;
5144 for (len2 = len-1; 0 < len2; len2--) {
5145 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5146 if (!MBCLEN_INVALID_P(l2))
5147 break;
5148 }
5149 memset(p+len2+1, 0, len-(len2+1));
5150 }
5151 }
5152}
5153
5154/*
5155 overwrite +p+ by succeeding letter in +enc+ and returns
5156 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5157 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5158 assuming each ranges are successive, and mbclen
5159 never change in each ranges.
5160 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5161 character.
5162 */
5163static enum neighbor_char
5164enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5165{
5166 enum neighbor_char ret;
5167 unsigned int c;
5168 int ctype;
5169 int range;
5170 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5171
5172 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5173 int try;
5174 const int max_gaps = 1;
5175
5176 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5177 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5178 ctype = ONIGENC_CTYPE_DIGIT;
5179 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5180 ctype = ONIGENC_CTYPE_ALPHA;
5181 else
5182 return NEIGHBOR_NOT_CHAR;
5183
5184 MEMCPY(save, p, char, len);
5185 for (try = 0; try <= max_gaps; ++try) {
5186 ret = enc_succ_char(p, len, enc);
5187 if (ret == NEIGHBOR_FOUND) {
5188 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5189 if (rb_enc_isctype(c, ctype, enc))
5190 return NEIGHBOR_FOUND;
5191 }
5192 }
5193 MEMCPY(p, save, char, len);
5194 range = 1;
5195 while (1) {
5196 MEMCPY(save, p, char, len);
5197 ret = enc_pred_char(p, len, enc);
5198 if (ret == NEIGHBOR_FOUND) {
5199 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5200 if (!rb_enc_isctype(c, ctype, enc)) {
5201 MEMCPY(p, save, char, len);
5202 break;
5203 }
5204 }
5205 else {
5206 MEMCPY(p, save, char, len);
5207 break;
5208 }
5209 range++;
5210 }
5211 if (range == 1) {
5212 return NEIGHBOR_NOT_CHAR;
5213 }
5214
5215 if (ctype != ONIGENC_CTYPE_DIGIT) {
5216 MEMCPY(carry, p, char, len);
5217 return NEIGHBOR_WRAPPED;
5218 }
5219
5220 MEMCPY(carry, p, char, len);
5221 enc_succ_char(carry, len, enc);
5222 return NEIGHBOR_WRAPPED;
5223}
5224
5225
5226static VALUE str_succ(VALUE str);
5227
5228/*
5229 * call-seq:
5230 * succ -> new_str
5231 *
5232 * Returns the successor to +self+. The successor is calculated by
5233 * incrementing characters.
5234 *
5235 * The first character to be incremented is the rightmost alphanumeric
5236 * or, if there are no alphanumerics, the rightmost character:
5237 *
5238 * 'THX1138'.succ # => "THX1139"
5239 * '<<koala>>'.succ # => "<<koalb>>"
5240 * '***'.succ # => '**+'
5241 *
5242 * The successor to a digit is another digit, "carrying" to the next-left
5243 * character for a "rollover" from 9 to 0, and prepending another digit
5244 * if necessary:
5245 *
5246 * '00'.succ # => "01"
5247 * '09'.succ # => "10"
5248 * '99'.succ # => "100"
5249 *
5250 * The successor to a letter is another letter of the same case,
5251 * carrying to the next-left character for a rollover,
5252 * and prepending another same-case letter if necessary:
5253 *
5254 * 'aa'.succ # => "ab"
5255 * 'az'.succ # => "ba"
5256 * 'zz'.succ # => "aaa"
5257 * 'AA'.succ # => "AB"
5258 * 'AZ'.succ # => "BA"
5259 * 'ZZ'.succ # => "AAA"
5260 *
5261 * The successor to a non-alphanumeric character is the next character
5262 * in the underlying character set's collating sequence,
5263 * carrying to the next-left character for a rollover,
5264 * and prepending another character if necessary:
5265 *
5266 * s = 0.chr * 3
5267 * s # => "\x00\x00\x00"
5268 * s.succ # => "\x00\x00\x01"
5269 * s = 255.chr * 3
5270 * s # => "\xFF\xFF\xFF"
5271 * s.succ # => "\x01\x00\x00\x00"
5272 *
5273 * Carrying can occur between and among mixtures of alphanumeric characters:
5274 *
5275 * s = 'zz99zz99'
5276 * s.succ # => "aaa00aa00"
5277 * s = '99zz99zz'
5278 * s.succ # => "100aa00aa"
5279 *
5280 * The successor to an empty +String+ is a new empty +String+:
5281 *
5282 * ''.succ # => ""
5283 *
5284 */
5285
5286VALUE
5288{
5289 VALUE str;
5290 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5291 rb_enc_cr_str_copy_for_substr(str, orig);
5292 return str_succ(str);
5293}
5294
5295static VALUE
5296str_succ(VALUE str)
5297{
5298 rb_encoding *enc;
5299 char *sbeg, *s, *e, *last_alnum = 0;
5300 int found_alnum = 0;
5301 long l, slen;
5302 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5303 long carry_pos = 0, carry_len = 1;
5304 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5305
5306 slen = RSTRING_LEN(str);
5307 if (slen == 0) return str;
5308
5309 enc = STR_ENC_GET(str);
5310 sbeg = RSTRING_PTR(str);
5311 s = e = sbeg + slen;
5312
5313 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5314 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5315 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5316 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5317 break;
5318 }
5319 }
5320 l = rb_enc_precise_mbclen(s, e, enc);
5321 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5322 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5323 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5324 switch (neighbor) {
5325 case NEIGHBOR_NOT_CHAR:
5326 continue;
5327 case NEIGHBOR_FOUND:
5328 return str;
5329 case NEIGHBOR_WRAPPED:
5330 last_alnum = s;
5331 break;
5332 }
5333 found_alnum = 1;
5334 carry_pos = s - sbeg;
5335 carry_len = l;
5336 }
5337 if (!found_alnum) { /* str contains no alnum */
5338 s = e;
5339 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5340 enum neighbor_char neighbor;
5341 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5342 l = rb_enc_precise_mbclen(s, e, enc);
5343 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5344 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5345 MEMCPY(tmp, s, char, l);
5346 neighbor = enc_succ_char(tmp, l, enc);
5347 switch (neighbor) {
5348 case NEIGHBOR_FOUND:
5349 MEMCPY(s, tmp, char, l);
5350 return str;
5351 break;
5352 case NEIGHBOR_WRAPPED:
5353 MEMCPY(s, tmp, char, l);
5354 break;
5355 case NEIGHBOR_NOT_CHAR:
5356 break;
5357 }
5358 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5359 /* wrapped to \0...\0. search next valid char. */
5360 enc_succ_char(s, l, enc);
5361 }
5362 if (!rb_enc_asciicompat(enc)) {
5363 MEMCPY(carry, s, char, l);
5364 carry_len = l;
5365 }
5366 carry_pos = s - sbeg;
5367 }
5369 }
5370 RESIZE_CAPA(str, slen + carry_len);
5371 sbeg = RSTRING_PTR(str);
5372 s = sbeg + carry_pos;
5373 memmove(s + carry_len, s, slen - carry_pos);
5374 memmove(s, carry, carry_len);
5375 slen += carry_len;
5376 STR_SET_LEN(str, slen);
5377 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5379 return str;
5380}
5381
5382
5383/*
5384 * call-seq:
5385 * succ! -> self
5386 *
5387 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5388 */
5389
5390static VALUE
5391rb_str_succ_bang(VALUE str)
5392{
5393 rb_str_modify(str);
5394 str_succ(str);
5395 return str;
5396}
5397
/*
 * Returns 1 if each of the first +len+ bytes of +s+ satisfies ISDIGIT,
 * 0 otherwise.  An empty range (len <= 0) yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5407
5408static int
5409str_upto_i(VALUE str, VALUE arg)
5410{
5411 rb_yield(str);
5412 return 0;
5413}
5414
5415/*
5416 * call-seq:
5417 * upto(other_string, exclusive = false) {|string| ... } -> self
5418 * upto(other_string, exclusive = false) -> new_enumerator
5419 *
5420 * With a block given, calls the block with each +String+ value
5421 * returned by successive calls to String#succ;
5422 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5423 * the sequence terminates when value +other_string+ is reached;
5424 * returns +self+:
5425 *
5426 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5427 * Output:
5428 *
5429 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5430 *
5431 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5432 *
5433 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5434 *
5435 * Output:
5436 *
5437 * a8 a9 b0 b1 b2 b3 b4 b5
5438 *
5439 * If +other_string+ would not be reached, does not call the block:
5440 *
5441 * '25'.upto('5') {|s| fail s }
5442 * 'aa'.upto('a') {|s| fail s }
5443 *
5444 * With no block given, returns a new Enumerator:
5445 *
5446 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5447 *
5448 */
5449
5450static VALUE
5451rb_str_upto(int argc, VALUE *argv, VALUE beg)
5452{
5453 VALUE end, exclusive;
5454
5455 rb_scan_args(argc, argv, "11", &end, &exclusive);
5456 RETURN_ENUMERATOR(beg, argc, argv);
5457 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5458}
5459
5460VALUE
5461rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5462{
5463 VALUE current, after_end;
5464 ID succ;
5465 int n, ascii;
5466 rb_encoding *enc;
5467
5468 CONST_ID(succ, "succ");
5469 StringValue(end);
5470 enc = rb_enc_check(beg, end);
5471 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5472 /* single character */
5473 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5474 char c = RSTRING_PTR(beg)[0];
5475 char e = RSTRING_PTR(end)[0];
5476
5477 if (c > e || (excl && c == e)) return beg;
5478 for (;;) {
5479 VALUE str = rb_enc_str_new(&c, 1, enc);
5481 if ((*each)(str, arg)) break;
5482 if (!excl && c == e) break;
5483 c++;
5484 if (excl && c == e) break;
5485 }
5486 return beg;
5487 }
5488 /* both edges are all digits */
5489 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5490 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5491 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5492 VALUE b, e;
5493 int width;
5494
5495 width = RSTRING_LENINT(beg);
5496 b = rb_str_to_inum(beg, 10, FALSE);
5497 e = rb_str_to_inum(end, 10, FALSE);
5498 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5499 long bi = FIX2LONG(b);
5500 long ei = FIX2LONG(e);
5501 rb_encoding *usascii = rb_usascii_encoding();
5502
5503 while (bi <= ei) {
5504 if (excl && bi == ei) break;
5505 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5506 bi++;
5507 }
5508 }
5509 else {
5510 ID op = excl ? '<' : idLE;
5511 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5512
5513 args[0] = INT2FIX(width);
5514 while (rb_funcall(b, op, 1, e)) {
5515 args[1] = b;
5516 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5517 b = rb_funcallv(b, succ, 0, 0);
5518 }
5519 }
5520 return beg;
5521 }
5522 /* normal case */
5523 n = rb_str_cmp(beg, end);
5524 if (n > 0 || (excl && n == 0)) return beg;
5525
5526 after_end = rb_funcallv(end, succ, 0, 0);
5527 current = str_duplicate(rb_cString, beg);
5528 while (!rb_str_equal(current, after_end)) {
5529 VALUE next = Qnil;
5530 if (excl || !rb_str_equal(current, end))
5531 next = rb_funcallv(current, succ, 0, 0);
5532 if ((*each)(current, arg)) break;
5533 if (NIL_P(next)) break;
5534 current = next;
5535 StringValue(current);
5536 if (excl && rb_str_equal(current, end)) break;
5537 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5538 break;
5539 }
5540
5541 return beg;
5542}
5543
5544VALUE
5545rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5546{
5547 VALUE current;
5548 ID succ;
5549
5550 CONST_ID(succ, "succ");
5551 /* both edges are all digits */
5552 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5553 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5554 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5555 int width = RSTRING_LENINT(beg);
5556 b = rb_str_to_inum(beg, 10, FALSE);
5557 if (FIXNUM_P(b)) {
5558 long bi = FIX2LONG(b);
5559 rb_encoding *usascii = rb_usascii_encoding();
5560
5561 while (FIXABLE(bi)) {
5562 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5563 bi++;
5564 }
5565 b = LONG2NUM(bi);
5566 }
5567 args[0] = INT2FIX(width);
5568 while (1) {
5569 args[1] = b;
5570 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5571 b = rb_funcallv(b, succ, 0, 0);
5572 }
5573 }
5574 /* normal case */
5575 current = str_duplicate(rb_cString, beg);
5576 while (1) {
5577 VALUE next = rb_funcallv(current, succ, 0, 0);
5578 if ((*each)(current, arg)) break;
5579 current = next;
5580 StringValue(current);
5581 if (RSTRING_LEN(current) == 0)
5582 break;
5583 }
5584
5585 return beg;
5586}
5587
5588static int
5589include_range_i(VALUE str, VALUE arg)
5590{
5591 VALUE *argp = (VALUE *)arg;
5592 if (!rb_equal(str, *argp)) return 0;
5593 *argp = Qnil;
5594 return 1;
5595}
5596
5597VALUE
5598rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5599{
5600 beg = rb_str_new_frozen(beg);
5601 StringValue(end);
5602 end = rb_str_new_frozen(end);
5603 if (NIL_P(val)) return Qfalse;
5604 val = rb_check_string_type(val);
5605 if (NIL_P(val)) return Qfalse;
5606 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5607 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5608 rb_enc_asciicompat(STR_ENC_GET(val))) {
5609 const char *bp = RSTRING_PTR(beg);
5610 const char *ep = RSTRING_PTR(end);
5611 const char *vp = RSTRING_PTR(val);
5612 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5613 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5614 return Qfalse;
5615 else {
5616 char b = *bp;
5617 char e = *ep;
5618 char v = *vp;
5619
5620 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5621 if (b <= v && v < e) return Qtrue;
5622 return RBOOL(!RTEST(exclusive) && v == e);
5623 }
5624 }
5625 }
5626#if 0
5627 /* both edges are all digits */
5628 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5629 all_digits_p(bp, RSTRING_LEN(beg)) &&
5630 all_digits_p(ep, RSTRING_LEN(end))) {
5631 /* TODO */
5632 }
5633#endif
5634 }
5635 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5636
5637 return RBOOL(NIL_P(val));
5638}
5639
5640static VALUE
5641rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5642{
5643 if (rb_reg_search(re, str, 0, 0) >= 0) {
5644 VALUE match = rb_backref_get();
5645 int nth = rb_reg_backref_number(match, backref);
5646 return rb_reg_nth_match(nth, match);
5647 }
5648 return Qnil;
5649}
5650
5651static VALUE
5652rb_str_aref(VALUE str, VALUE indx)
5653{
5654 long idx;
5655
5656 if (FIXNUM_P(indx)) {
5657 idx = FIX2LONG(indx);
5658 }
5659 else if (RB_TYPE_P(indx, T_REGEXP)) {
5660 return rb_str_subpat(str, indx, INT2FIX(0));
5661 }
5662 else if (RB_TYPE_P(indx, T_STRING)) {
5663 if (rb_str_index(str, indx, 0) != -1)
5664 return str_duplicate(rb_cString, indx);
5665 return Qnil;
5666 }
5667 else {
5668 /* check if indx is Range */
5669 long beg, len = str_strlen(str, NULL);
5670 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5671 case Qfalse:
5672 break;
5673 case Qnil:
5674 return Qnil;
5675 default:
5676 return rb_str_substr(str, beg, len);
5677 }
5678 idx = NUM2LONG(indx);
5679 }
5680
5681 return str_substr(str, idx, 1, FALSE);
5682}
5683
5684
5685/*
5686 * call-seq:
5687 * string[index] -> new_string or nil
5688 * string[start, length] -> new_string or nil
5689 * string[range] -> new_string or nil
5690 * string[regexp, capture = 0] -> new_string or nil
5691 * string[substring] -> new_string or nil
5692 *
5693 * Returns the substring of +self+ specified by the arguments.
5694 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5695 *
5696 *
5697 */
5698
5699static VALUE
5700rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5701{
5702 if (argc == 2) {
5703 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5704 return rb_str_subpat(str, argv[0], argv[1]);
5705 }
5706 else {
5707 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5708 }
5709 }
5710 rb_check_arity(argc, 1, 2);
5711 return rb_str_aref(str, argv[0]);
5712}
5713
5714VALUE
5716{
5717 char *ptr = RSTRING_PTR(str);
5718 long olen = RSTRING_LEN(str), nlen;
5719
5720 str_modifiable(str);
5721 if (len > olen) len = olen;
5722 nlen = olen - len;
5723 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5724 char *oldptr = ptr;
5725 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5726 STR_SET_EMBED(str);
5727 ptr = RSTRING(str)->as.embed.ary;
5728 memmove(ptr, oldptr + len, nlen);
5729 if (fl == STR_NOEMBED) xfree(oldptr);
5730 }
5731 else {
5732 if (!STR_SHARED_P(str)) {
5733 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5734 rb_enc_cr_str_exact_copy(shared, str);
5735 OBJ_FREEZE(shared);
5736 }
5737 ptr = RSTRING(str)->as.heap.ptr += len;
5738 }
5739 STR_SET_LEN(str, nlen);
5740
5741 if (!SHARABLE_MIDDLE_SUBSTRING) {
5742 TERM_FILL(ptr + nlen, TERM_LEN(str));
5743 }
5745 return str;
5746}
5747
5748static void
5749rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5750{
5751 char *sptr;
5752 long slen;
5753 int cr;
5754
5755 if (beg == 0 && vlen == 0) {
5756 rb_str_drop_bytes(str, len);
5757 return;
5758 }
5759
5760 str_modify_keep_cr(str);
5761 RSTRING_GETMEM(str, sptr, slen);
5762 if (len < vlen) {
5763 /* expand string */
5764 RESIZE_CAPA(str, slen + vlen - len);
5765 sptr = RSTRING_PTR(str);
5766 }
5767
5769 cr = rb_enc_str_coderange(val);
5770 else
5772
5773 if (vlen != len) {
5774 memmove(sptr + beg + vlen,
5775 sptr + beg + len,
5776 slen - (beg + len));
5777 }
5778 if (vlen < beg && len < 0) {
5779 MEMZERO(sptr + slen, char, -len);
5780 }
5781 if (vlen > 0) {
5782 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5783 }
5784 slen += vlen - len;
5785 STR_SET_LEN(str, slen);
5786 TERM_FILL(&sptr[slen], TERM_LEN(str));
5787 ENC_CODERANGE_SET(str, cr);
5788}
5789
5790static inline void
5791rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5792{
5793 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5794}
5795
5796void
5797rb_str_update(VALUE str, long beg, long len, VALUE val)
5798{
5799 long slen;
5800 char *p, *e;
5801 rb_encoding *enc;
5802 int singlebyte = single_byte_optimizable(str);
5803 int cr;
5804
5805 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5806
5807 StringValue(val);
5808 enc = rb_enc_check(str, val);
5809 slen = str_strlen(str, enc); /* rb_enc_check */
5810
5811 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5812 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5813 }
5814 if (beg < 0) {
5815 beg += slen;
5816 }
5817 RUBY_ASSERT(beg >= 0);
5818 RUBY_ASSERT(beg <= slen);
5819
5820 if (len > slen - beg) {
5821 len = slen - beg;
5822 }
5823 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5824 if (!p) p = RSTRING_END(str);
5825 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5826 if (!e) e = RSTRING_END(str);
5827 /* error check */
5828 beg = p - RSTRING_PTR(str); /* physical position */
5829 len = e - p; /* physical length */
5830 rb_str_update_0(str, beg, len, val);
5831 rb_enc_associate(str, enc);
5833 if (cr != ENC_CODERANGE_BROKEN)
5834 ENC_CODERANGE_SET(str, cr);
5835}
5836
5837static void
5838rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5839{
5840 int nth;
5841 VALUE match;
5842 long start, end, len;
5843 rb_encoding *enc;
5844 struct re_registers *regs;
5845
5846 if (rb_reg_search(re, str, 0, 0) < 0) {
5847 rb_raise(rb_eIndexError, "regexp not matched");
5848 }
5849 match = rb_backref_get();
5850 nth = rb_reg_backref_number(match, backref);
5851 regs = RMATCH_REGS(match);
5852 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5853 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5854 }
5855 if (nth < 0) {
5856 nth += regs->num_regs;
5857 }
5858
5859 start = BEG(nth);
5860 if (start == -1) {
5861 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5862 }
5863 end = END(nth);
5864 len = end - start;
5865 StringValue(val);
5866 enc = rb_enc_check_str(str, val);
5867 rb_str_update_0(str, start, len, val);
5868 rb_enc_associate(str, enc);
5869}
5870
5871static VALUE
5872rb_str_aset(VALUE str, VALUE indx, VALUE val)
5873{
5874 long idx, beg;
5875
5876 switch (TYPE(indx)) {
5877 case T_REGEXP:
5878 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5879 return val;
5880
5881 case T_STRING:
5882 beg = rb_str_index(str, indx, 0);
5883 if (beg < 0) {
5884 rb_raise(rb_eIndexError, "string not matched");
5885 }
5886 beg = rb_str_sublen(str, beg);
5887 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5888 return val;
5889
5890 default:
5891 /* check if indx is Range */
5892 {
5893 long beg, len;
5894 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5895 rb_str_update(str, beg, len, val);
5896 return val;
5897 }
5898 }
5899 /* FALLTHROUGH */
5900
5901 case T_FIXNUM:
5902 idx = NUM2LONG(indx);
5903 rb_str_update(str, idx, 1, val);
5904 return val;
5905 }
5906}
5907
5908/*
5909 * call-seq:
5910 * string[index] = new_string
5911 * string[start, length] = new_string
5912 * string[range] = new_string
5913 * string[regexp, capture = 0] = new_string
5914 * string[substring] = new_string
5915 *
5916 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5917 * See {String Slices}[rdoc-ref:String@String+Slices].
5918 *
5919 * A few examples:
5920 *
5921 * s = 'foo'
5922 * s[2] = 'rtune' # => "rtune"
5923 * s # => "fortune"
5924 * s[1, 5] = 'init' # => "init"
5925 * s # => "finite"
5926 * s[3..4] = 'al' # => "al"
5927 * s # => "finale"
5928 * s[/e$/] = 'ly' # => "ly"
5929 * s # => "finally"
5930 * s['lly'] = 'ncial' # => "ncial"
5931 * s # => "financial"
5932 *
5933 */
5934
5935static VALUE
5936rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5937{
5938 if (argc == 3) {
5939 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5940 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5941 }
5942 else {
5943 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5944 }
5945 return argv[2];
5946 }
5947 rb_check_arity(argc, 2, 3);
5948 return rb_str_aset(str, argv[0], argv[1]);
5949}
5950
5951/*
5952 * call-seq:
5953 * insert(index, other_string) -> self
5954 *
5955 * Inserts the given +other_string+ into +self+; returns +self+.
5956 *
5957 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5958 *
5959 * 'foo'.insert(1, 'bar') # => "fbaroo"
5960 *
5961 * If the Integer +index+ is negative, counts backward from the end of +self+
5962 * and inserts +other_string+ at offset <tt>index+1</tt>
5963 * (that is, _after_ <tt>self[index]</tt>):
5964 *
5965 * 'foo'.insert(-2, 'bar') # => "fobaro"
5966 *
5967 */
5968
5969static VALUE
5970rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5971{
5972 long pos = NUM2LONG(idx);
5973
5974 if (pos == -1) {
5975 return rb_str_append(str, str2);
5976 }
5977 else if (pos < 0) {
5978 pos++;
5979 }
5980 rb_str_update(str, pos, 0, str2);
5981 return str;
5982}
5983
5984
5985/*
5986 * call-seq:
5987 * slice!(index) -> new_string or nil
5988 * slice!(start, length) -> new_string or nil
5989 * slice!(range) -> new_string or nil
5990 * slice!(regexp, capture = 0) -> new_string or nil
5991 * slice!(substring) -> new_string or nil
5992 *
5993 * Removes and returns the substring of +self+ specified by the arguments.
5994 * See {String Slices}[rdoc-ref:String@String+Slices].
5995 *
5996 * A few examples:
5997 *
5998 * string = "This is a string"
5999 * string.slice!(2) #=> "i"
6000 * string.slice!(3..6) #=> " is "
6001 * string.slice!(/s.*t/) #=> "sa st"
6002 * string.slice!("r") #=> "r"
6003 * string #=> "Thing"
6004 *
6005 */
6006
6007static VALUE
6008rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6009{
6010 VALUE result = Qnil;
6011 VALUE indx;
6012 long beg, len = 1;
6013 char *p;
6014
6015 rb_check_arity(argc, 1, 2);
6016 str_modify_keep_cr(str);
6017 indx = argv[0];
6018 if (RB_TYPE_P(indx, T_REGEXP)) {
6019 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6020 VALUE match = rb_backref_get();
6021 struct re_registers *regs = RMATCH_REGS(match);
6022 int nth = 0;
6023 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6024 if ((nth += regs->num_regs) <= 0) return Qnil;
6025 }
6026 else if (nth >= regs->num_regs) return Qnil;
6027 beg = BEG(nth);
6028 len = END(nth) - beg;
6029 goto subseq;
6030 }
6031 else if (argc == 2) {
6032 beg = NUM2LONG(indx);
6033 len = NUM2LONG(argv[1]);
6034 goto num_index;
6035 }
6036 else if (FIXNUM_P(indx)) {
6037 beg = FIX2LONG(indx);
6038 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6039 if (!len) return Qnil;
6040 beg = p - RSTRING_PTR(str);
6041 goto subseq;
6042 }
6043 else if (RB_TYPE_P(indx, T_STRING)) {
6044 beg = rb_str_index(str, indx, 0);
6045 if (beg == -1) return Qnil;
6046 len = RSTRING_LEN(indx);
6047 result = str_duplicate(rb_cString, indx);
6048 goto squash;
6049 }
6050 else {
6051 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6052 case Qnil:
6053 return Qnil;
6054 case Qfalse:
6055 beg = NUM2LONG(indx);
6056 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6057 if (!len) return Qnil;
6058 beg = p - RSTRING_PTR(str);
6059 goto subseq;
6060 default:
6061 goto num_index;
6062 }
6063 }
6064
6065 num_index:
6066 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6067 beg = p - RSTRING_PTR(str);
6068
6069 subseq:
6070 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6071 rb_enc_cr_str_copy_for_substr(result, str);
6072
6073 squash:
6074 if (len > 0) {
6075 if (beg == 0) {
6076 rb_str_drop_bytes(str, len);
6077 }
6078 else {
6079 char *sptr = RSTRING_PTR(str);
6080 long slen = RSTRING_LEN(str);
6081 if (beg + len > slen) /* pathological check */
6082 len = slen - beg;
6083 memmove(sptr + beg,
6084 sptr + beg + len,
6085 slen - (beg + len));
6086 slen -= len;
6087 STR_SET_LEN(str, slen);
6088 TERM_FILL(&sptr[slen], TERM_LEN(str));
6089 }
6090 }
6091 return result;
6092}
6093
6094static VALUE
6095get_pat(VALUE pat)
6096{
6097 VALUE val;
6098
6099 switch (OBJ_BUILTIN_TYPE(pat)) {
6100 case T_REGEXP:
6101 return pat;
6102
6103 case T_STRING:
6104 break;
6105
6106 default:
6107 val = rb_check_string_type(pat);
6108 if (NIL_P(val)) {
6109 Check_Type(pat, T_REGEXP);
6110 }
6111 pat = val;
6112 }
6113
6114 return rb_reg_regcomp(pat);
6115}
6116
6117static VALUE
6118get_pat_quoted(VALUE pat, int check)
6119{
6120 VALUE val;
6121
6122 switch (OBJ_BUILTIN_TYPE(pat)) {
6123 case T_REGEXP:
6124 return pat;
6125
6126 case T_STRING:
6127 break;
6128
6129 default:
6130 val = rb_check_string_type(pat);
6131 if (NIL_P(val)) {
6132 Check_Type(pat, T_REGEXP);
6133 }
6134 pat = val;
6135 }
6136 if (check && is_broken_string(pat)) {
6137 rb_exc_raise(rb_reg_check_preprocess(pat));
6138 }
6139 return pat;
6140}
6141
6142static long
6143rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6144{
6145 if (BUILTIN_TYPE(pat) == T_STRING) {
6146 pos = rb_str_byteindex(str, pat, pos);
6147 if (set_backref_str) {
6148 if (pos >= 0) {
6149 str = rb_str_new_frozen_String(str);
6150 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6151 if (match) {
6152 *match = match_data;
6153 }
6154 }
6155 else {
6157 }
6158 }
6159 return pos;
6160 }
6161 else {
6162 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6163 }
6164}
6165
6166static long
6167rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6168{
6169 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6170}
6171
6172
6173/*
6174 * call-seq:
6175 * sub!(pattern, replacement) -> self or nil
6176 * sub!(pattern) {|match| ... } -> self or nil
6177 *
6178 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6179 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6180 *
6181 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6182 *
6183 * Related: String#sub, String#gsub, String#gsub!.
6184 *
6185 */
6186
6187static VALUE
6188rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6189{
6190 VALUE pat, repl, hash = Qnil;
6191 int iter = 0;
6192 long plen;
6193 int min_arity = rb_block_given_p() ? 1 : 2;
6194 long beg;
6195
6196 rb_check_arity(argc, min_arity, 2);
6197 if (argc == 1) {
6198 iter = 1;
6199 }
6200 else {
6201 repl = argv[1];
6202 hash = rb_check_hash_type(argv[1]);
6203 if (NIL_P(hash)) {
6204 StringValue(repl);
6205 }
6206 }
6207
6208 pat = get_pat_quoted(argv[0], 1);
6209
6210 str_modifiable(str);
6211 beg = rb_pat_search(pat, str, 0, 1);
6212 if (beg >= 0) {
6213 rb_encoding *enc;
6214 int cr = ENC_CODERANGE(str);
6215 long beg0, end0;
6216 VALUE match, match0 = Qnil;
6217 struct re_registers *regs;
6218 char *p, *rp;
6219 long len, rlen;
6220
6221 match = rb_backref_get();
6222 regs = RMATCH_REGS(match);
6223 if (RB_TYPE_P(pat, T_STRING)) {
6224 beg0 = beg;
6225 end0 = beg0 + RSTRING_LEN(pat);
6226 match0 = pat;
6227 }
6228 else {
6229 beg0 = BEG(0);
6230 end0 = END(0);
6231 if (iter) match0 = rb_reg_nth_match(0, match);
6232 }
6233
6234 if (iter || !NIL_P(hash)) {
6235 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6236
6237 if (iter) {
6238 repl = rb_obj_as_string(rb_yield(match0));
6239 }
6240 else {
6241 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6242 repl = rb_obj_as_string(repl);
6243 }
6244 str_mod_check(str, p, len);
6245 rb_check_frozen(str);
6246 }
6247 else {
6248 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6249 }
6250
6251 enc = rb_enc_compatible(str, repl);
6252 if (!enc) {
6253 rb_encoding *str_enc = STR_ENC_GET(str);
6254 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6255 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6256 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6257 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6258 rb_enc_inspect_name(str_enc),
6259 rb_enc_inspect_name(STR_ENC_GET(repl)));
6260 }
6261 enc = STR_ENC_GET(repl);
6262 }
6263 rb_str_modify(str);
6264 rb_enc_associate(str, enc);
6266 int cr2 = ENC_CODERANGE(repl);
6267 if (cr2 == ENC_CODERANGE_BROKEN ||
6268 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6270 else
6271 cr = cr2;
6272 }
6273 plen = end0 - beg0;
6274 rlen = RSTRING_LEN(repl);
6275 len = RSTRING_LEN(str);
6276 if (rlen > plen) {
6277 RESIZE_CAPA(str, len + rlen - plen);
6278 }
6279 p = RSTRING_PTR(str);
6280 if (rlen != plen) {
6281 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6282 }
6283 rp = RSTRING_PTR(repl);
6284 memmove(p + beg0, rp, rlen);
6285 len += rlen - plen;
6286 STR_SET_LEN(str, len);
6287 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6288 ENC_CODERANGE_SET(str, cr);
6289
6290 RB_GC_GUARD(match);
6291
6292 return str;
6293 }
6294 return Qnil;
6295}
6296
6297
6298/*
6299 * call-seq:
6300 * sub(pattern, replacement) -> new_string
6301 * sub(pattern) {|match| ... } -> new_string
6302 *
6303 * Returns a copy of +self+ with only the first occurrence
6304 * (not all occurrences) of the given +pattern+ replaced.
6305 *
6306 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6307 *
6308 * Related: String#sub!, String#gsub, String#gsub!.
6309 *
6310 */
6311
6312static VALUE
6313rb_str_sub(int argc, VALUE *argv, VALUE str)
6314{
6315 str = str_duplicate(rb_cString, str);
6316 rb_str_sub_bang(argc, argv, str);
6317 return str;
6318}
6319
6320static VALUE
6321str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6322{
6323 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6324 long beg, beg0, end0;
6325 long offset, blen, slen, len, last;
6326 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6327 char *sp, *cp;
6328 int need_backref_str = -1;
6329 rb_encoding *str_enc;
6330
6331 switch (argc) {
6332 case 1:
6333 RETURN_ENUMERATOR(str, argc, argv);
6334 mode = ITER;
6335 break;
6336 case 2:
6337 repl = argv[1];
6338 hash = rb_check_hash_type(argv[1]);
6339 if (NIL_P(hash)) {
6340 StringValue(repl);
6341 }
6342 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6343 mode = FAST_MAP;
6344 }
6345 else {
6346 mode = MAP;
6347 }
6348 break;
6349 default:
6350 rb_error_arity(argc, 1, 2);
6351 }
6352
6353 pat = get_pat_quoted(argv[0], 1);
6354 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6355
6356 if (beg < 0) {
6357 if (bang) return Qnil; /* no match, no substitution */
6358 return str_duplicate(rb_cString, str);
6359 }
6360
6361 offset = 0;
6362 blen = RSTRING_LEN(str) + 30; /* len + margin */
6363 dest = rb_str_buf_new(blen);
6364 sp = RSTRING_PTR(str);
6365 slen = RSTRING_LEN(str);
6366 cp = sp;
6367 str_enc = STR_ENC_GET(str);
6368 rb_enc_associate(dest, str_enc);
6369 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6370
6371 do {
6372 struct re_registers *regs = RMATCH_REGS(match);
6373 if (RB_TYPE_P(pat, T_STRING)) {
6374 beg0 = beg;
6375 end0 = beg0 + RSTRING_LEN(pat);
6376 match0 = pat;
6377 }
6378 else {
6379 beg0 = BEG(0);
6380 end0 = END(0);
6381 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6382 }
6383
6384 if (mode != STR) {
6385 if (mode == ITER) {
6386 val = rb_obj_as_string(rb_yield(match0));
6387 }
6388 else {
6389 struct RString fake_str;
6390 VALUE key;
6391 if (mode == FAST_MAP) {
6392 // It is safe to use a fake_str here because we established that it won't escape,
6393 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6394 // default proc.
6395 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6396 }
6397 else {
6398 key = rb_str_subseq(str, beg0, end0 - beg0);
6399 }
6400 val = rb_hash_aref(hash, key);
6401 val = rb_obj_as_string(val);
6402 }
6403 str_mod_check(str, sp, slen);
6404 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6405 rb_raise(rb_eRuntimeError, "block should not cheat");
6406 }
6407 }
6408 else if (need_backref_str) {
6409 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6410 if (need_backref_str < 0) {
6411 need_backref_str = val != repl;
6412 }
6413 }
6414 else {
6415 val = repl;
6416 }
6417
6418 len = beg0 - offset; /* copy pre-match substr */
6419 if (len) {
6420 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6421 }
6422
6423 rb_str_buf_append(dest, val);
6424
6425 last = offset;
6426 offset = end0;
6427 if (beg0 == end0) {
6428 /*
6429 * Always consume at least one character of the input string
6430 * in order to prevent infinite loops.
6431 */
6432 if (RSTRING_LEN(str) <= end0) break;
6433 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6434 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6435 offset = end0 + len;
6436 }
6437 cp = RSTRING_PTR(str) + offset;
6438 if (offset > RSTRING_LEN(str)) break;
6439
6440 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6441 if (mode != FAST_MAP && mode != STR) {
6442 match = Qnil;
6443 }
6444 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6445
6446 RB_GC_GUARD(match);
6447 } while (beg >= 0);
6448
6449 if (RSTRING_LEN(str) > offset) {
6450 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6451 }
6452 rb_pat_search0(pat, str, last, 1, &match);
6453 if (bang) {
6454 str_shared_replace(str, dest);
6455 }
6456 else {
6457 str = dest;
6458 }
6459
6460 return str;
6461}
6462
6463
6464/*
6465 * call-seq:
6466 * gsub!(pattern, replacement) -> self or nil
6467 * gsub!(pattern) {|match| ... } -> self or nil
6468 * gsub!(pattern) -> an_enumerator
6469 *
6470 * Performs the specified substring replacement(s) on +self+;
6471 * returns +self+ if any replacement occurred, +nil+ otherwise.
6472 *
6473 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6474 *
6475 * Returns an Enumerator if no +replacement+ and no block given.
6476 *
6477 * Related: String#sub, String#gsub, String#sub!.
6478 *
6479 */
6480
6481static VALUE
6482rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6483{
6484 str_modify_keep_cr(str);
6485 return str_gsub(argc, argv, str, 1);
6486}
6487
6488
6489/*
6490 * call-seq:
6491 * gsub(pattern, replacement) -> new_string
6492 * gsub(pattern) {|match| ... } -> new_string
6493 * gsub(pattern) -> enumerator
6494 *
6495 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6496 *
6497 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6498 *
6499 * Returns an Enumerator if no +replacement+ and no block given.
6500 *
6501 * Related: String#sub, String#sub!, String#gsub!.
6502 *
6503 */
6504
6505static VALUE
6506rb_str_gsub(int argc, VALUE *argv, VALUE str)
6507{
6508 return str_gsub(argc, argv, str, 0);
6509}
6510
6511
6512/*
6513 * call-seq:
6514 * replace(other_string) -> self
6515 *
6516 * Replaces the contents of +self+ with the contents of +other_string+:
6517 *
6518 * s = 'foo' # => "foo"
6519 * s.replace('bar') # => "bar"
6520 *
6521 */
6522
6523VALUE
6525{
6526 str_modifiable(str);
6527 if (str == str2) return str;
6528
6529 StringValue(str2);
6530 str_discard(str);
6531 return str_replace(str, str2);
6532}
6533
6534/*
6535 * call-seq:
6536 * clear -> self
6537 *
6538 * Removes the contents of +self+:
6539 *
6540 * s = 'foo' # => "foo"
6541 * s.clear # => ""
6542 *
6543 */
6544
6545static VALUE
6546rb_str_clear(VALUE str)
6547{
6548 str_discard(str);
6549 STR_SET_EMBED(str);
6550 STR_SET_LEN(str, 0);
6551 RSTRING_PTR(str)[0] = 0;
6552 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6554 else
6556 return str;
6557}
6558
6559/*
6560 * call-seq:
6561 * chr -> string
6562 *
6563 * Returns a string containing the first character of +self+:
6564 *
6565 * s = 'foo' # => "foo"
6566 * s.chr # => "f"
6567 *
6568 */
6569
6570static VALUE
6571rb_str_chr(VALUE str)
6572{
6573 return rb_str_substr(str, 0, 1);
6574}
6575
6576/*
6577 * call-seq:
6578 * getbyte(index) -> integer or nil
6579 *
6580 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6581 *
6582 * s = 'abcde' # => "abcde"
6583 * s.getbyte(0) # => 97
6584 * s.getbyte(-1) # => 101
6585 * s.getbyte(5) # => nil
6586 *
6587 * Related: String#setbyte.
6588 */
6589VALUE
6590rb_str_getbyte(VALUE str, VALUE index)
6591{
6592 long pos = NUM2LONG(index);
6593
6594 if (pos < 0)
6595 pos += RSTRING_LEN(str);
6596 if (pos < 0 || RSTRING_LEN(str) <= pos)
6597 return Qnil;
6598
6599 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6600}
6601
6602/*
6603 * call-seq:
6604 * setbyte(index, integer) -> integer
6605 *
6606 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6607 *
6608 * s = 'abcde' # => "abcde"
6609 * s.setbyte(0, 98) # => 98
6610 * s # => "bbcde"
6611 *
6612 * Related: String#getbyte.
6613 */
6614VALUE
6615rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6616{
6617 long pos = NUM2LONG(index);
6618 long len = RSTRING_LEN(str);
6619 char *ptr, *head, *left = 0;
6620 rb_encoding *enc;
6621 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6622
6623 if (pos < -len || len <= pos)
6624 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6625 if (pos < 0)
6626 pos += len;
6627
6628 VALUE v = rb_to_int(value);
6629 VALUE w = rb_int_and(v, INT2FIX(0xff));
6630 char byte = (char)(NUM2INT(w) & 0xFF);
6631
6632 if (!str_independent(str))
6633 str_make_independent(str);
6634 enc = STR_ENC_GET(str);
6635 head = RSTRING_PTR(str);
6636 ptr = &head[pos];
6637 if (!STR_EMBED_P(str)) {
6638 cr = ENC_CODERANGE(str);
6639 switch (cr) {
6640 case ENC_CODERANGE_7BIT:
6641 left = ptr;
6642 *ptr = byte;
6643 if (ISASCII(byte)) goto end;
6644 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6645 if (!MBCLEN_CHARFOUND_P(nlen))
6647 else
6649 goto end;
6651 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6652 width = rb_enc_precise_mbclen(left, head+len, enc);
6653 *ptr = byte;
6654 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6655 if (!MBCLEN_CHARFOUND_P(nlen))
6657 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6659 goto end;
6660 }
6661 }
6663 *ptr = byte;
6664
6665 end:
6666 return value;
6667}
6668
6669static VALUE
6670str_byte_substr(VALUE str, long beg, long len, int empty)
6671{
6672 long n = RSTRING_LEN(str);
6673
6674 if (beg > n || len < 0) return Qnil;
6675 if (beg < 0) {
6676 beg += n;
6677 if (beg < 0) return Qnil;
6678 }
6679 if (len > n - beg)
6680 len = n - beg;
6681 if (len <= 0) {
6682 if (!empty) return Qnil;
6683 len = 0;
6684 }
6685
6686 VALUE str2 = str_subseq(str, beg, len);
6687
6688 str_enc_copy_direct(str2, str);
6689
6690 if (RSTRING_LEN(str2) == 0) {
6691 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6693 else
6695 }
6696 else {
6697 switch (ENC_CODERANGE(str)) {
6698 case ENC_CODERANGE_7BIT:
6700 break;
6701 default:
6703 break;
6704 }
6705 }
6706
6707 return str2;
6708}
6709
6710VALUE
6711rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6712{
6713 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6714}
6715
6716static VALUE
6717str_byte_aref(VALUE str, VALUE indx)
6718{
6719 long idx;
6720 if (FIXNUM_P(indx)) {
6721 idx = FIX2LONG(indx);
6722 }
6723 else {
6724 /* check if indx is Range */
6725 long beg, len = RSTRING_LEN(str);
6726
6727 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6728 case Qfalse:
6729 break;
6730 case Qnil:
6731 return Qnil;
6732 default:
6733 return str_byte_substr(str, beg, len, TRUE);
6734 }
6735
6736 idx = NUM2LONG(indx);
6737 }
6738 return str_byte_substr(str, idx, 1, FALSE);
6739}
6740
6741/*
6742 * call-seq:
6743 * byteslice(index, length = 1) -> string or nil
6744 * byteslice(range) -> string or nil
6745 *
6746 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6747 *
6748 * With integer arguments +index+ and +length+ given,
6749 * returns the substring beginning at the given +index+
6750 * of the given +length+ (if possible),
6751 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6752 *
6753 * s = '0123456789' # => "0123456789"
6754 * s.byteslice(2) # => "2"
6755 * s.byteslice(200) # => nil
6756 * s.byteslice(4, 3) # => "456"
6757 * s.byteslice(4, 30) # => "456789"
6758 * s.byteslice(4, -1) # => nil
6759 * s.byteslice(40, 2) # => nil
6760 *
6761 * In either case above, counts backwards from the end of +self+
6762 * if +index+ is negative:
6763 *
6764 * s = '0123456789' # => "0123456789"
6765 * s.byteslice(-4) # => "6"
6766 * s.byteslice(-4, 3) # => "678"
6767 *
6768 * With Range argument +range+ given, returns
6769 * <tt>byteslice(range.begin, range.size)</tt>:
6770 *
6771 * s = '0123456789' # => "0123456789"
6772 * s.byteslice(4..6) # => "456"
6773 * s.byteslice(-6..-4) # => "456"
6774 * s.byteslice(5..2) # => "" # range.size is zero.
6775 * s.byteslice(40..42) # => nil
6776 *
6777 * In all cases, a returned string has the same encoding as +self+:
6778 *
6779 * s.encoding # => #<Encoding:UTF-8>
6780 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6781 *
6782 */
6783
6784static VALUE
6785rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6786{
6787 if (argc == 2) {
6788 long beg = NUM2LONG(argv[0]);
6789 long len = NUM2LONG(argv[1]);
6790 return str_byte_substr(str, beg, len, TRUE);
6791 }
6792 rb_check_arity(argc, 1, 2);
6793 return str_byte_aref(str, argv[0]);
6794}
6795
6796static void
6797str_check_beg_len(VALUE str, long *beg, long *len)
6798{
6799 long end, slen = RSTRING_LEN(str);
6800
6801 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6802 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6803 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6804 }
6805 if (*beg < 0) {
6806 *beg += slen;
6807 }
6808 RUBY_ASSERT(*beg >= 0);
6809 RUBY_ASSERT(*beg <= slen);
6810
6811 if (*len > slen - *beg) {
6812 *len = slen - *beg;
6813 }
6814 end = *beg + *len;
6815 str_ensure_byte_pos(str, *beg);
6816 str_ensure_byte_pos(str, end);
6817}
6818
6819/*
6820 * call-seq:
6821 * bytesplice(index, length, str) -> string
6822 * bytesplice(index, length, str, str_index, str_length) -> string
6823 * bytesplice(range, str) -> string
6824 * bytesplice(range, str, str_range) -> string
6825 *
6826 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6827 * The portion of the string affected is determined using
6828 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6829 * If the replacement string is not the same length as the text it is replacing,
6830 * the string will be adjusted accordingly.
6831 *
6832 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6833 *
 * The forms that take an Integer will raise an IndexError if the value is out
6835 * of range; the Range form will raise a RangeError.
6836 * If the beginning or ending offset does not land on character (codepoint)
6837 * boundary, an IndexError will be raised.
6838 */
6839
6840static VALUE
6841rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6842{
6843 long beg, len, vbeg, vlen;
6844 VALUE val;
6845 int cr;
6846
6847 rb_check_arity(argc, 2, 5);
6848 if (!(argc == 2 || argc == 3 || argc == 5)) {
6849 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6850 }
6851 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6852 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6853 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6854 rb_builtin_class_name(argv[0]));
6855 }
6856 val = argv[1];
6857 StringValue(val);
6858 if (argc == 2) {
6859 /* bytesplice(range, str) */
6860 vbeg = 0;
6861 vlen = RSTRING_LEN(val);
6862 }
6863 else {
6864 /* bytesplice(range, str, str_range) */
6865 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6866 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6867 rb_builtin_class_name(argv[2]));
6868 }
6869 }
6870 }
6871 else {
6872 beg = NUM2LONG(argv[0]);
6873 len = NUM2LONG(argv[1]);
6874 val = argv[2];
6875 StringValue(val);
6876 if (argc == 3) {
6877 /* bytesplice(index, length, str) */
6878 vbeg = 0;
6879 vlen = RSTRING_LEN(val);
6880 }
6881 else {
6882 /* bytesplice(index, length, str, str_index, str_length) */
6883 vbeg = NUM2LONG(argv[3]);
6884 vlen = NUM2LONG(argv[4]);
6885 }
6886 }
6887 str_check_beg_len(str, &beg, &len);
6888 str_check_beg_len(val, &vbeg, &vlen);
6889 str_modify_keep_cr(str);
6890
6891 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6892 rb_enc_associate(str, rb_enc_check(str, val));
6893 }
6894
6895 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6897 if (cr != ENC_CODERANGE_BROKEN)
6898 ENC_CODERANGE_SET(str, cr);
6899 return str;
6900}
6901
6902/*
6903 * call-seq:
6904 * reverse -> string
6905 *
6906 * Returns a new string with the characters from +self+ in reverse order.
6907 *
6908 * 'stressed'.reverse # => "desserts"
6909 *
6910 */
6911
6912static VALUE
6913rb_str_reverse(VALUE str)
6914{
6915 rb_encoding *enc;
6916 VALUE rev;
6917 char *s, *e, *p;
6918 int cr;
6919
6920 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6921 enc = STR_ENC_GET(str);
6922 rev = rb_str_new(0, RSTRING_LEN(str));
6923 s = RSTRING_PTR(str); e = RSTRING_END(str);
6924 p = RSTRING_END(rev);
6925 cr = ENC_CODERANGE(str);
6926
6927 if (RSTRING_LEN(str) > 1) {
6928 if (single_byte_optimizable(str)) {
6929 while (s < e) {
6930 *--p = *s++;
6931 }
6932 }
6933 else if (cr == ENC_CODERANGE_VALID) {
6934 while (s < e) {
6935 int clen = rb_enc_fast_mbclen(s, e, enc);
6936
6937 p -= clen;
6938 memcpy(p, s, clen);
6939 s += clen;
6940 }
6941 }
6942 else {
6943 cr = rb_enc_asciicompat(enc) ?
6945 while (s < e) {
6946 int clen = rb_enc_mbclen(s, e, enc);
6947
6948 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6949 p -= clen;
6950 memcpy(p, s, clen);
6951 s += clen;
6952 }
6953 }
6954 }
6955 STR_SET_LEN(rev, RSTRING_LEN(str));
6956 str_enc_copy_direct(rev, str);
6957 ENC_CODERANGE_SET(rev, cr);
6958
6959 return rev;
6960}
6961
6962
6963/*
6964 * call-seq:
6965 * reverse! -> self
6966 *
6967 * Returns +self+ with its characters reversed:
6968 *
6969 * s = 'stressed'
6970 * s.reverse! # => "desserts"
6971 * s # => "desserts"
6972 *
6973 */
6974
6975static VALUE
6976rb_str_reverse_bang(VALUE str)
6977{
6978 if (RSTRING_LEN(str) > 1) {
6979 if (single_byte_optimizable(str)) {
6980 char *s, *e, c;
6981
6982 str_modify_keep_cr(str);
6983 s = RSTRING_PTR(str);
6984 e = RSTRING_END(str) - 1;
6985 while (s < e) {
6986 c = *s;
6987 *s++ = *e;
6988 *e-- = c;
6989 }
6990 }
6991 else {
6992 str_shared_replace(str, rb_str_reverse(str));
6993 }
6994 }
6995 else {
6996 str_modify_keep_cr(str);
6997 }
6998 return str;
6999}
7000
7001
7002/*
7003 * call-seq:
7004 * include?(other_string) -> true or false
7005 *
7006 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7007 *
7008 * s = 'foo'
7009 * s.include?('f') # => true
7010 * s.include?('fo') # => true
7011 * s.include?('food') # => false
7012 *
7013 */
7014
7015VALUE
7016rb_str_include(VALUE str, VALUE arg)
7017{
7018 long i;
7019
7020 StringValue(arg);
7021 i = rb_str_index(str, arg, 0);
7022
7023 return RBOOL(i != -1);
7024}
7025
7026
7027/*
7028 * call-seq:
7029 * to_i(base = 10) -> integer
7030 *
7031 * Returns the result of interpreting leading characters in +self+
7032 * as an integer in the given +base+ (which must be in (0, 2..36)):
7033 *
7034 * '123456'.to_i # => 123456
7035 * '123def'.to_i(16) # => 1195503
7036 *
7037 * With +base+ zero, string +object+ may contain leading characters
7038 * to specify the actual base:
7039 *
7040 * '123def'.to_i(0) # => 123
7041 * '0123def'.to_i(0) # => 83
7042 * '0b123def'.to_i(0) # => 1
7043 * '0o123def'.to_i(0) # => 83
7044 * '0d123def'.to_i(0) # => 123
7045 * '0x123def'.to_i(0) # => 1195503
7046 *
7047 * Characters past a leading valid number (in the given +base+) are ignored:
7048 *
7049 * '12.345'.to_i # => 12
7050 * '12345'.to_i(2) # => 1
7051 *
7052 * Returns zero if there is no leading valid number:
7053 *
7054 * 'abcdef'.to_i # => 0
7055 * '2'.to_i(2) # => 0
7056 *
7057 */
7058
7059static VALUE
7060rb_str_to_i(int argc, VALUE *argv, VALUE str)
7061{
7062 int base = 10;
7063
7064 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7065 rb_raise(rb_eArgError, "invalid radix %d", base);
7066 }
7067 return rb_str_to_inum(str, base, FALSE);
7068}
7069
7070
7071/*
7072 * call-seq:
7073 * to_f -> float
7074 *
7075 * Returns the result of interpreting leading characters in +self+ as a Float:
7076 *
7077 * '3.14159'.to_f # => 3.14159
7078 * '1.234e-2'.to_f # => 0.01234
7079 *
 * Characters past a leading valid number are ignored:
7081 *
7082 * '3.14 (pi to two places)'.to_f # => 3.14
7083 *
7084 * Returns zero if there is no leading valid number:
7085 *
7086 * 'abcdef'.to_f # => 0.0
7087 *
7088 */
7089
7090static VALUE
7091rb_str_to_f(VALUE str)
7092{
7093 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7094}
7095
7096
7097/*
7098 * call-seq:
7099 * to_s -> self or string
7100 *
7101 * Returns +self+ if +self+ is a +String+,
7102 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7103 */
7104
7105static VALUE
7106rb_str_to_s(VALUE str)
7107{
7108 if (rb_obj_class(str) != rb_cString) {
7109 return str_duplicate(rb_cString, str);
7110 }
7111 return str;
7112}
7113
#if 0
/* Disabled helper: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7125
/* Size of the longest escape sequence written by the escaping helpers below. */
#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7127
7128int
7129rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7130{
7131 char buf[CHAR_ESC_LEN + 1];
7132 int l;
7133
7134#if SIZEOF_INT > 4
7135 c &= 0xffffffff;
7136#endif
7137 if (unicode_p) {
7138 if (c < 0x7F && ISPRINT(c)) {
7139 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7140 }
7141 else if (c < 0x10000) {
7142 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7143 }
7144 else {
7145 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7146 }
7147 }
7148 else {
7149 if (c < 0x100) {
7150 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7151 }
7152 else {
7153 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7154 }
7155 }
7156 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7157 rb_str_buf_cat(result, buf, l);
7158 return l;
7159}
7160
/*
 * Returns the backslash escape sequence (as a static string) for the
 * control character +c+, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
7178
7179VALUE
7180rb_str_escape(VALUE str)
7181{
7182 int encidx = ENCODING_GET(str);
7183 rb_encoding *enc = rb_enc_from_index(encidx);
7184 const char *p = RSTRING_PTR(str);
7185 const char *pend = RSTRING_END(str);
7186 const char *prev = p;
7187 char buf[CHAR_ESC_LEN + 1];
7188 VALUE result = rb_str_buf_new(0);
7189 int unicode_p = rb_enc_unicode_p(enc);
7190 int asciicompat = rb_enc_asciicompat(enc);
7191
7192 while (p < pend) {
7193 unsigned int c;
7194 const char *cc;
7195 int n = rb_enc_precise_mbclen(p, pend, enc);
7196 if (!MBCLEN_CHARFOUND_P(n)) {
7197 if (p > prev) str_buf_cat(result, prev, p - prev);
7198 n = rb_enc_mbminlen(enc);
7199 if (pend < p + n)
7200 n = (int)(pend - p);
7201 while (n--) {
7202 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7203 str_buf_cat(result, buf, strlen(buf));
7204 prev = ++p;
7205 }
7206 continue;
7207 }
7208 n = MBCLEN_CHARFOUND_LEN(n);
7209 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7210 p += n;
7211 cc = ruby_escaped_char(c);
7212 if (cc) {
7213 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7214 str_buf_cat(result, cc, strlen(cc));
7215 prev = p;
7216 }
7217 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7218 }
7219 else {
7220 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7221 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7222 prev = p;
7223 }
7224 }
7225 if (p > prev) str_buf_cat(result, prev, p - prev);
7226 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7227
7228 return result;
7229}
7230
7231/*
7232 * call-seq:
7233 * inspect -> string
7234 *
7235 * Returns a printable version of +self+, enclosed in double-quotes,
7236 * and with special characters escaped:
7237 *
7238 * s = "foo\tbar\tbaz\n"
7239 * s.inspect
7240 * # => "\"foo\\tbar\\tbaz\\n\""
7241 *
7242 */
7243
7244VALUE
7246{
7247 int encidx = ENCODING_GET(str);
7248 rb_encoding *enc = rb_enc_from_index(encidx);
7249 const char *p, *pend, *prev;
7250 char buf[CHAR_ESC_LEN + 1];
7251 VALUE result = rb_str_buf_new(0);
7252 rb_encoding *resenc = rb_default_internal_encoding();
7253 int unicode_p = rb_enc_unicode_p(enc);
7254 int asciicompat = rb_enc_asciicompat(enc);
7255
7256 if (resenc == NULL) resenc = rb_default_external_encoding();
7257 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7258 rb_enc_associate(result, resenc);
7259 str_buf_cat2(result, "\"");
7260
7261 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7262 prev = p;
7263 while (p < pend) {
7264 unsigned int c, cc;
7265 int n;
7266
7267 n = rb_enc_precise_mbclen(p, pend, enc);
7268 if (!MBCLEN_CHARFOUND_P(n)) {
7269 if (p > prev) str_buf_cat(result, prev, p - prev);
7270 n = rb_enc_mbminlen(enc);
7271 if (pend < p + n)
7272 n = (int)(pend - p);
7273 while (n--) {
7274 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7275 str_buf_cat(result, buf, strlen(buf));
7276 prev = ++p;
7277 }
7278 continue;
7279 }
7280 n = MBCLEN_CHARFOUND_LEN(n);
7281 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7282 p += n;
7283 if ((asciicompat || unicode_p) &&
7284 (c == '"'|| c == '\\' ||
7285 (c == '#' &&
7286 p < pend &&
7287 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7288 (cc = rb_enc_codepoint(p,pend,enc),
7289 (cc == '$' || cc == '@' || cc == '{'))))) {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 str_buf_cat2(result, "\\");
7292 if (asciicompat || enc == resenc) {
7293 prev = p - n;
7294 continue;
7295 }
7296 }
7297 switch (c) {
7298 case '\n': cc = 'n'; break;
7299 case '\r': cc = 'r'; break;
7300 case '\t': cc = 't'; break;
7301 case '\f': cc = 'f'; break;
7302 case '\013': cc = 'v'; break;
7303 case '\010': cc = 'b'; break;
7304 case '\007': cc = 'a'; break;
7305 case 033: cc = 'e'; break;
7306 default: cc = 0; break;
7307 }
7308 if (cc) {
7309 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7310 buf[0] = '\\';
7311 buf[1] = (char)cc;
7312 str_buf_cat(result, buf, 2);
7313 prev = p;
7314 continue;
7315 }
7316 /* The special casing of 0x85 (NEXT_LINE) here is because
7317 * Oniguruma historically treats it as printable, but it
7318 * doesn't match the print POSIX bracket class or character
7319 * property in regexps.
7320 *
7321 * See Ruby Bug #16842 for details:
7322 * https://bugs.ruby-lang.org/issues/16842
7323 */
7324 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7325 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7326 continue;
7327 }
7328 else {
7329 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7330 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7331 prev = p;
7332 continue;
7333 }
7334 }
7335 if (p > prev) str_buf_cat(result, prev, p - prev);
7336 str_buf_cat2(result, "\"");
7337
7338 return result;
7339}
7340
/* True when +p+ lies before +e+ and the byte at +p+ is '$', '@' or '{';
 * used to decide whether a '#' just before +p+ has to be escaped. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7342
7343/*
7344 * call-seq:
7345 * dump -> string
7346 *
7347 * Returns a printable version of +self+, enclosed in double-quotes,
7348 * with special characters escaped, and with non-printing characters
7349 * replaced by hexadecimal notation:
7350 *
7351 * "hello \n ''".dump # => "\"hello \\n ''\""
7352 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7353 *
7354 * Related: String#undump (inverse of String#dump).
7355 *
7356 */
7357
7358VALUE
7360{
7361 int encidx = rb_enc_get_index(str);
7362 rb_encoding *enc = rb_enc_from_index(encidx);
7363 long len;
7364 const char *p, *pend;
7365 char *q, *qend;
7366 VALUE result;
7367 int u8 = (encidx == rb_utf8_encindex());
7368 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7369
7370 len = 2; /* "" */
7371 if (!rb_enc_asciicompat(enc)) {
7372 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7373 len += strlen(enc->name);
7374 }
7375
7376 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7377 while (p < pend) {
7378 int clen;
7379 unsigned char c = *p++;
7380
7381 switch (c) {
7382 case '"': case '\\':
7383 case '\n': case '\r':
7384 case '\t': case '\f':
7385 case '\013': case '\010': case '\007': case '\033':
7386 clen = 2;
7387 break;
7388
7389 case '#':
7390 clen = IS_EVSTR(p, pend) ? 2 : 1;
7391 break;
7392
7393 default:
7394 if (ISPRINT(c)) {
7395 clen = 1;
7396 }
7397 else {
7398 if (u8 && c > 0x7F) { /* \u notation */
7399 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7400 if (MBCLEN_CHARFOUND_P(n)) {
7401 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7402 if (cc <= 0xFFFF)
7403 clen = 6; /* \uXXXX */
7404 else if (cc <= 0xFFFFF)
7405 clen = 9; /* \u{XXXXX} */
7406 else
7407 clen = 10; /* \u{XXXXXX} */
7408 p += MBCLEN_CHARFOUND_LEN(n)-1;
7409 break;
7410 }
7411 }
7412 clen = 4; /* \xNN */
7413 }
7414 break;
7415 }
7416
7417 if (clen > LONG_MAX - len) {
7418 rb_raise(rb_eRuntimeError, "string size too big");
7419 }
7420 len += clen;
7421 }
7422
7423 result = rb_str_new(0, len);
7424 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7425 q = RSTRING_PTR(result); qend = q + len + 1;
7426
7427 *q++ = '"';
7428 while (p < pend) {
7429 unsigned char c = *p++;
7430
7431 if (c == '"' || c == '\\') {
7432 *q++ = '\\';
7433 *q++ = c;
7434 }
7435 else if (c == '#') {
7436 if (IS_EVSTR(p, pend)) *q++ = '\\';
7437 *q++ = '#';
7438 }
7439 else if (c == '\n') {
7440 *q++ = '\\';
7441 *q++ = 'n';
7442 }
7443 else if (c == '\r') {
7444 *q++ = '\\';
7445 *q++ = 'r';
7446 }
7447 else if (c == '\t') {
7448 *q++ = '\\';
7449 *q++ = 't';
7450 }
7451 else if (c == '\f') {
7452 *q++ = '\\';
7453 *q++ = 'f';
7454 }
7455 else if (c == '\013') {
7456 *q++ = '\\';
7457 *q++ = 'v';
7458 }
7459 else if (c == '\010') {
7460 *q++ = '\\';
7461 *q++ = 'b';
7462 }
7463 else if (c == '\007') {
7464 *q++ = '\\';
7465 *q++ = 'a';
7466 }
7467 else if (c == '\033') {
7468 *q++ = '\\';
7469 *q++ = 'e';
7470 }
7471 else if (ISPRINT(c)) {
7472 *q++ = c;
7473 }
7474 else {
7475 *q++ = '\\';
7476 if (u8) {
7477 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7478 if (MBCLEN_CHARFOUND_P(n)) {
7479 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7480 p += n;
7481 if (cc <= 0xFFFF)
7482 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7483 else
7484 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7485 q += strlen(q);
7486 continue;
7487 }
7488 }
7489 snprintf(q, qend-q, "x%02X", c);
7490 q += 3;
7491 }
7492 }
7493 *q++ = '"';
7494 *q = '\0';
7495 if (!rb_enc_asciicompat(enc)) {
7496 snprintf(q, qend-q, nonascii_suffix, enc->name);
7497 encidx = rb_ascii8bit_encindex();
7498 }
7499 /* result from dump is ASCII */
7500 rb_enc_associate_index(result, encidx);
7502 return result;
7503}
7504
/*
 * Maps the letter that follows a backslash in a dumped string
 * ('n', 'r', 't', 'f', 'v', 'b', 'a' or 'e') to the control character it
 * stands for.  Any other value yields -1, so the function never runs off
 * its end without a return value; undump_after_backslash() only passes
 * the letters listed above.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7528
7529static void
7530undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7531{
7532 const char *s = *ss;
7533 unsigned int c;
7534 int codelen;
7535 size_t hexlen;
7536 unsigned char buf[6];
7537 static rb_encoding *enc_utf8 = NULL;
7538
7539 switch (*s) {
7540 case '\\':
7541 case '"':
7542 case '#':
7543 rb_str_cat(undumped, s, 1); /* cat itself */
7544 s++;
7545 break;
7546 case 'n':
7547 case 'r':
7548 case 't':
7549 case 'f':
7550 case 'v':
7551 case 'b':
7552 case 'a':
7553 case 'e':
7554 *buf = unescape_ascii(*s);
7555 rb_str_cat(undumped, (char *)buf, 1);
7556 s++;
7557 break;
7558 case 'u':
7559 if (*binary) {
7560 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7561 }
7562 *utf8 = true;
7563 if (++s >= s_end) {
7564 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7565 }
7566 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7567 if (*penc != enc_utf8) {
7568 *penc = enc_utf8;
7569 rb_enc_associate(undumped, enc_utf8);
7570 }
7571 if (*s == '{') { /* handle \u{...} form */
7572 s++;
7573 for (;;) {
7574 if (s >= s_end) {
7575 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7576 }
7577 if (*s == '}') {
7578 s++;
7579 break;
7580 }
7581 if (ISSPACE(*s)) {
7582 s++;
7583 continue;
7584 }
7585 c = scan_hex(s, s_end-s, &hexlen);
7586 if (hexlen == 0 || hexlen > 6) {
7587 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7588 }
7589 if (c > 0x10ffff) {
7590 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7591 }
7592 if (0xd800 <= c && c <= 0xdfff) {
7593 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7594 }
7595 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7596 rb_str_cat(undumped, (char *)buf, codelen);
7597 s += hexlen;
7598 }
7599 }
7600 else { /* handle \uXXXX form */
7601 c = scan_hex(s, 4, &hexlen);
7602 if (hexlen != 4) {
7603 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7604 }
7605 if (0xd800 <= c && c <= 0xdfff) {
7606 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7607 }
7608 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7609 rb_str_cat(undumped, (char *)buf, codelen);
7610 s += hexlen;
7611 }
7612 break;
7613 case 'x':
7614 if (*utf8) {
7615 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7616 }
7617 *binary = true;
7618 if (++s >= s_end) {
7619 rb_raise(rb_eRuntimeError, "invalid hex escape");
7620 }
7621 *buf = scan_hex(s, 2, &hexlen);
7622 if (hexlen != 2) {
7623 rb_raise(rb_eRuntimeError, "invalid hex escape");
7624 }
7625 rb_str_cat(undumped, (char *)buf, 1);
7626 s += hexlen;
7627 break;
7628 default:
7629 rb_str_cat(undumped, s-1, 2);
7630 s++;
7631 }
7632
7633 *ss = s;
7634}
7635
7636static VALUE rb_str_is_ascii_only_p(VALUE str);
7637
7638/*
7639 * call-seq:
7640 * undump -> string
7641 *
7642 * Returns an unescaped version of +self+:
7643 *
7644 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7645 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7646 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7647 * s_undumped == s_orig # => true
7648 *
7649 * Related: String#dump (inverse of String#undump).
7650 *
7651 */
7652
7653static VALUE
7654str_undump(VALUE str)
7655{
7656 const char *s = RSTRING_PTR(str);
7657 const char *s_end = RSTRING_END(str);
7658 rb_encoding *enc = rb_enc_get(str);
7659 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7660 bool utf8 = false;
7661 bool binary = false;
7662 int w;
7663
7665 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7666 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7667 }
7668 if (!str_null_check(str, &w)) {
7669 rb_raise(rb_eRuntimeError, "string contains null byte");
7670 }
7671 if (RSTRING_LEN(str) < 2) goto invalid_format;
7672 if (*s != '"') goto invalid_format;
7673
7674 /* strip '"' at the start */
7675 s++;
7676
7677 for (;;) {
7678 if (s >= s_end) {
7679 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7680 }
7681
7682 if (*s == '"') {
7683 /* epilogue */
7684 s++;
7685 if (s == s_end) {
7686 /* ascii compatible dumped string */
7687 break;
7688 }
7689 else {
7690 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7691 static const char dup_suffix[] = ".dup";
7692 const char *encname;
7693 int encidx;
7694 ptrdiff_t size;
7695
7696 /* check separately for strings dumped by older versions */
7697 size = sizeof(dup_suffix) - 1;
7698 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7699
7700 size = sizeof(force_encoding_suffix) - 1;
7701 if (s_end - s <= size) goto invalid_format;
7702 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7703 s += size;
7704
7705 if (utf8) {
7706 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7707 }
7708
7709 encname = s;
7710 s = memchr(s, '"', s_end-s);
7711 size = s - encname;
7712 if (!s) goto invalid_format;
7713 if (s_end - s != 2) goto invalid_format;
7714 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7715
7716 encidx = rb_enc_find_index2(encname, (long)size);
7717 if (encidx < 0) {
7718 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7719 }
7720 rb_enc_associate_index(undumped, encidx);
7721 }
7722 break;
7723 }
7724
7725 if (*s == '\\') {
7726 s++;
7727 if (s >= s_end) {
7728 rb_raise(rb_eRuntimeError, "invalid escape");
7729 }
7730 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7731 }
7732 else {
7733 rb_str_cat(undumped, s++, 1);
7734 }
7735 }
7736
7737 RB_GC_GUARD(str);
7738
7739 return undumped;
7740invalid_format:
7741 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7742}
7743
7744static void
7745rb_str_check_dummy_enc(rb_encoding *enc)
7746{
7747 if (rb_enc_dummy_p(enc)) {
7748 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7749 rb_enc_name(enc));
7750 }
7751}
7752
7753static rb_encoding *
7754str_true_enc(VALUE str)
7755{
7756 rb_encoding *enc = STR_ENC_GET(str);
7757 rb_str_check_dummy_enc(enc);
7758 return enc;
7759}
7760
7761static OnigCaseFoldType
7762check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7763{
7764 if (argc==0)
7765 return flags;
7766 if (argc>2)
7767 rb_raise(rb_eArgError, "too many options");
7768 if (argv[0]==sym_turkic) {
7769 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7770 if (argc==2) {
7771 if (argv[1]==sym_lithuanian)
7772 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7773 else
7774 rb_raise(rb_eArgError, "invalid second option");
7775 }
7776 }
7777 else if (argv[0]==sym_lithuanian) {
7778 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7779 if (argc==2) {
7780 if (argv[1]==sym_turkic)
7781 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7782 else
7783 rb_raise(rb_eArgError, "invalid second option");
7784 }
7785 }
7786 else if (argc>1)
7787 rb_raise(rb_eArgError, "too many options");
7788 else if (argv[0]==sym_ascii)
7789 flags |= ONIGENC_CASE_ASCII_ONLY;
7790 else if (argv[0]==sym_fold) {
7791 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7792 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7793 else
7794 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7795 }
7796 else
7797 rb_raise(rb_eArgError, "invalid option");
7798 return flags;
7799}
7800
7801static inline bool
7802case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7803{
7804 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7805 return true;
7806 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7807}
7808
/*
 * Extra bytes added to the capacity of every case-mapping buffer.
 * 16 should be long enough to absorb any kind of single character length
 * increase; the value actually used is 20.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* set to non-zero to print buffer tuning information to stderr */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7814
7815struct mapping_buffer;
7816typedef struct mapping_buffer {
7817 size_t capa;
7818 size_t used;
7819 struct mapping_buffer *next;
7820 OnigUChar space[FLEX_ARY_LEN];
7822
7823static void
7824mapping_buffer_free(void *p)
7825{
7826 mapping_buffer *previous_buffer;
7827 mapping_buffer *current_buffer = p;
7828 while (current_buffer) {
7829 previous_buffer = current_buffer;
7830 current_buffer = current_buffer->next;
7831 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7832 }
7833}
7834
7835static const rb_data_type_t mapping_buffer_type = {
7836 "mapping_buffer",
7837 {0, mapping_buffer_free,},
7838 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7839};
7840
7841static VALUE
7842rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7843{
7844 VALUE target;
7845
7846 const OnigUChar *source_current, *source_end;
7847 int target_length = 0;
7848 VALUE buffer_anchor;
7849 mapping_buffer *current_buffer = 0;
7850 mapping_buffer **pre_buffer;
7851 size_t buffer_count = 0;
7852 int buffer_length_or_invalid;
7853
7854 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7855
7856 source_current = (OnigUChar*)RSTRING_PTR(source);
7857 source_end = (OnigUChar*)RSTRING_END(source);
7858
7859 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7860 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7861 while (source_current < source_end) {
7862 /* increase multiplier using buffer count to converge quickly */
7863 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7864 if (CASEMAP_DEBUG) {
7865 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7866 }
7867 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7868 *pre_buffer = current_buffer;
7869 pre_buffer = &current_buffer->next;
7870 current_buffer->next = NULL;
7871 current_buffer->capa = capa;
7872 buffer_length_or_invalid = enc->case_map(flags,
7873 &source_current, source_end,
7874 current_buffer->space,
7875 current_buffer->space+current_buffer->capa,
7876 enc);
7877 if (buffer_length_or_invalid < 0) {
7878 current_buffer = DATA_PTR(buffer_anchor);
7879 DATA_PTR(buffer_anchor) = 0;
7880 mapping_buffer_free(current_buffer);
7881 rb_raise(rb_eArgError, "input string invalid");
7882 }
7883 target_length += current_buffer->used = buffer_length_or_invalid;
7884 }
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7887 }
7888
7889 if (buffer_count==1) {
7890 target = rb_str_new((const char*)current_buffer->space, target_length);
7891 }
7892 else {
7893 char *target_current;
7894
7895 target = rb_str_new(0, target_length);
7896 target_current = RSTRING_PTR(target);
7897 current_buffer = DATA_PTR(buffer_anchor);
7898 while (current_buffer) {
7899 memcpy(target_current, current_buffer->space, current_buffer->used);
7900 target_current += current_buffer->used;
7901 current_buffer = current_buffer->next;
7902 }
7903 }
7904 current_buffer = DATA_PTR(buffer_anchor);
7905 DATA_PTR(buffer_anchor) = 0;
7906 mapping_buffer_free(current_buffer);
7907
7908 RB_GC_GUARD(buffer_anchor);
7909
7910 /* TODO: check about string terminator character */
7911 str_enc_copy_direct(target, source);
7912 /*ENC_CODERANGE_SET(mapped, cr);*/
7913
7914 return target;
7915}
7916
7917static VALUE
7918rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7919{
7920 const OnigUChar *source_current, *source_end;
7921 OnigUChar *target_current, *target_end;
7922 long old_length = RSTRING_LEN(source);
7923 int length_or_invalid;
7924
7925 if (old_length == 0) return Qnil;
7926
7927 source_current = (OnigUChar*)RSTRING_PTR(source);
7928 source_end = (OnigUChar*)RSTRING_END(source);
7929 if (source == target) {
7930 target_current = (OnigUChar*)source_current;
7931 target_end = (OnigUChar*)source_end;
7932 }
7933 else {
7934 target_current = (OnigUChar*)RSTRING_PTR(target);
7935 target_end = (OnigUChar*)RSTRING_END(target);
7936 }
7937
7938 length_or_invalid = onigenc_ascii_only_case_map(flags,
7939 &source_current, source_end,
7940 target_current, target_end, enc);
7941 if (length_or_invalid < 0)
7942 rb_raise(rb_eArgError, "input string invalid");
7943 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7944 fprintf(stderr, "problem with rb_str_ascii_casemap"
7945 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7946 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7947 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7948 }
7949
7950 str_enc_copy(target, source);
7951
7952 return target;
7953}
7954
7955static bool
7956upcase_single(VALUE str)
7957{
7958 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7959 bool modified = false;
7960
7961 while (s < send) {
7962 unsigned int c = *(unsigned char*)s;
7963
7964 if ('a' <= c && c <= 'z') {
7965 *s = 'A' + (c - 'a');
7966 modified = true;
7967 }
7968 s++;
7969 }
7970 return modified;
7971}
7972
7973/*
7974 * call-seq:
7975 * upcase!(*options) -> self or nil
7976 *
7977 * Upcases the characters in +self+;
7978 * returns +self+ if any changes were made, +nil+ otherwise:
7979 *
7980 * s = 'Hello World!' # => "Hello World!"
7981 * s.upcase! # => "HELLO WORLD!"
7982 * s # => "HELLO WORLD!"
7983 * s.upcase! # => nil
7984 *
7985 * The casing may be affected by the given +options+;
7986 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7987 *
7988 * Related: String#upcase, String#downcase, String#downcase!.
7989 *
7990 */
7991
7992static VALUE
7993rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7994{
7995 rb_encoding *enc;
7996 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7997
7998 flags = check_case_options(argc, argv, flags);
7999 str_modify_keep_cr(str);
8000 enc = str_true_enc(str);
8001 if (case_option_single_p(flags, enc, str)) {
8002 if (upcase_single(str))
8003 flags |= ONIGENC_CASE_MODIFIED;
8004 }
8005 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8006 rb_str_ascii_casemap(str, str, &flags, enc);
8007 else
8008 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8009
8010 if (ONIGENC_CASE_MODIFIED&flags) return str;
8011 return Qnil;
8012}
8013
8014
8015/*
8016 * call-seq:
8017 * upcase(*options) -> string
8018 *
8019 * Returns a string containing the upcased characters in +self+:
8020 *
8021 * s = 'Hello World!' # => "Hello World!"
8022 * s.upcase # => "HELLO WORLD!"
8023 *
8024 * The casing may be affected by the given +options+;
8025 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8026 *
8027 * Related: String#upcase!, String#downcase, String#downcase!.
8028 *
8029 */
8030
8031static VALUE
8032rb_str_upcase(int argc, VALUE *argv, VALUE str)
8033{
8034 rb_encoding *enc;
8035 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8036 VALUE ret;
8037
8038 flags = check_case_options(argc, argv, flags);
8039 enc = str_true_enc(str);
8040 if (case_option_single_p(flags, enc, str)) {
8041 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8042 str_enc_copy_direct(ret, str);
8043 upcase_single(ret);
8044 }
8045 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8046 ret = rb_str_new(0, RSTRING_LEN(str));
8047 rb_str_ascii_casemap(str, ret, &flags, enc);
8048 }
8049 else {
8050 ret = rb_str_casemap(str, &flags, enc);
8051 }
8052
8053 return ret;
8054}
8055
8056static bool
8057downcase_single(VALUE str)
8058{
8059 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8060 bool modified = false;
8061
8062 while (s < send) {
8063 unsigned int c = *(unsigned char*)s;
8064
8065 if ('A' <= c && c <= 'Z') {
8066 *s = 'a' + (c - 'A');
8067 modified = true;
8068 }
8069 s++;
8070 }
8071
8072 return modified;
8073}
8074
8075/*
8076 * call-seq:
8077 * downcase!(*options) -> self or nil
8078 *
8079 * Downcases the characters in +self+;
8080 * returns +self+ if any changes were made, +nil+ otherwise:
8081 *
8082 * s = 'Hello World!' # => "Hello World!"
8083 * s.downcase! # => "hello world!"
8084 * s # => "hello world!"
8085 * s.downcase! # => nil
8086 *
8087 * The casing may be affected by the given +options+;
8088 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8089 *
8090 * Related: String#downcase, String#upcase, String#upcase!.
8091 *
8092 */
8093
8094static VALUE
8095rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8096{
8097 rb_encoding *enc;
8098 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8099
8100 flags = check_case_options(argc, argv, flags);
8101 str_modify_keep_cr(str);
8102 enc = str_true_enc(str);
8103 if (case_option_single_p(flags, enc, str)) {
8104 if (downcase_single(str))
8105 flags |= ONIGENC_CASE_MODIFIED;
8106 }
8107 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8108 rb_str_ascii_casemap(str, str, &flags, enc);
8109 else
8110 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8111
8112 if (ONIGENC_CASE_MODIFIED&flags) return str;
8113 return Qnil;
8114}
8115
8116
8117/*
8118 * call-seq:
8119 * downcase(*options) -> string
8120 *
8121 * Returns a string containing the downcased characters in +self+:
8122 *
8123 * s = 'Hello World!' # => "Hello World!"
8124 * s.downcase # => "hello world!"
8125 *
8126 * The casing may be affected by the given +options+;
8127 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8128 *
8129 * Related: String#downcase!, String#upcase, String#upcase!.
8130 *
8131 */
8132
8133static VALUE
8134rb_str_downcase(int argc, VALUE *argv, VALUE str)
8135{
8136 rb_encoding *enc;
8137 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8138 VALUE ret;
8139
8140 flags = check_case_options(argc, argv, flags);
8141 enc = str_true_enc(str);
8142 if (case_option_single_p(flags, enc, str)) {
8143 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8144 str_enc_copy_direct(ret, str);
8145 downcase_single(ret);
8146 }
8147 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8148 ret = rb_str_new(0, RSTRING_LEN(str));
8149 rb_str_ascii_casemap(str, ret, &flags, enc);
8150 }
8151 else {
8152 ret = rb_str_casemap(str, &flags, enc);
8153 }
8154
8155 return ret;
8156}
8157
8158
8159/*
8160 * call-seq:
8161 * capitalize!(*options) -> self or nil
8162 *
8163 * Upcases the first character in +self+;
8164 * downcases the remaining characters;
8165 * returns +self+ if any changes were made, +nil+ otherwise:
8166 *
8167 * s = 'hello World!' # => "hello World!"
8168 * s.capitalize! # => "Hello world!"
8169 * s # => "Hello world!"
8170 * s.capitalize! # => nil
8171 *
8172 * The casing may be affected by the given +options+;
8173 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8174 *
8175 * Related: String#capitalize.
8176 *
8177 */
8178
8179static VALUE
8180rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8181{
8182 rb_encoding *enc;
8183 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8184
8185 flags = check_case_options(argc, argv, flags);
8186 str_modify_keep_cr(str);
8187 enc = str_true_enc(str);
8188 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8189 if (flags&ONIGENC_CASE_ASCII_ONLY)
8190 rb_str_ascii_casemap(str, str, &flags, enc);
8191 else
8192 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8193
8194 if (ONIGENC_CASE_MODIFIED&flags) return str;
8195 return Qnil;
8196}
8197
8198
8199/*
8200 * call-seq:
8201 * capitalize(*options) -> string
8202 *
8203 * Returns a string containing the characters in +self+;
8204 * the first character is upcased;
8205 * the remaining characters are downcased:
8206 *
8207 * s = 'hello World!' # => "hello World!"
8208 * s.capitalize # => "Hello world!"
8209 *
8210 * The casing may be affected by the given +options+;
8211 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8212 *
8213 * Related: String#capitalize!.
8214 *
8215 */
8216
8217static VALUE
8218rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8219{
8220 rb_encoding *enc;
8221 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8222 VALUE ret;
8223
8224 flags = check_case_options(argc, argv, flags);
8225 enc = str_true_enc(str);
8226 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8227 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8228 ret = rb_str_new(0, RSTRING_LEN(str));
8229 rb_str_ascii_casemap(str, ret, &flags, enc);
8230 }
8231 else {
8232 ret = rb_str_casemap(str, &flags, enc);
8233 }
8234 return ret;
8235}
8236
8237
8238/*
8239 * call-seq:
8240 * swapcase!(*options) -> self or nil
8241 *
8242 * Upcases each lowercase character in +self+;
8243 * downcases uppercase character;
8244 * returns +self+ if any changes were made, +nil+ otherwise:
8245 *
8246 * s = 'Hello World!' # => "Hello World!"
8247 * s.swapcase! # => "hELLO wORLD!"
8248 * s # => "hELLO wORLD!"
8249 * ''.swapcase! # => nil
8250 *
8251 * The casing may be affected by the given +options+;
8252 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8253 *
8254 * Related: String#swapcase.
8255 *
8256 */
8257
8258static VALUE
8259rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8260{
8261 rb_encoding *enc;
8262 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8263
8264 flags = check_case_options(argc, argv, flags);
8265 str_modify_keep_cr(str);
8266 enc = str_true_enc(str);
8267 if (flags&ONIGENC_CASE_ASCII_ONLY)
8268 rb_str_ascii_casemap(str, str, &flags, enc);
8269 else
8270 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8271
8272 if (ONIGENC_CASE_MODIFIED&flags) return str;
8273 return Qnil;
8274}
8275
8276
8277/*
8278 * call-seq:
8279 * swapcase(*options) -> string
8280 *
8281 * Returns a string containing the characters in +self+, with cases reversed;
8282 * each uppercase character is downcased;
8283 * each lowercase character is upcased:
8284 *
8285 * s = 'Hello World!' # => "Hello World!"
8286 * s.swapcase # => "hELLO wORLD!"
8287 *
8288 * The casing may be affected by the given +options+;
8289 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8290 *
8291 * Related: String#swapcase!.
8292 *
8293 */
8294
8295static VALUE
8296rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8297{
8298 rb_encoding *enc;
8299 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8300 VALUE ret;
8301
8302 flags = check_case_options(argc, argv, flags);
8303 enc = str_true_enc(str);
8304 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8305 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8306 ret = rb_str_new(0, RSTRING_LEN(str));
8307 rb_str_ascii_casemap(str, ret, &flags, enc);
8308 }
8309 else {
8310 ret = rb_str_casemap(str, &flags, enc);
8311 }
8312 return ret;
8313}
8314
/* pointer to unsigned bytes */
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while the characters of a range are being produced */
    unsigned int now;   /* code point returned most recently */
    unsigned int max;   /* last code point of the range being produced */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8322
8323static unsigned int
8324trnext(struct tr *t, rb_encoding *enc)
8325{
8326 int n;
8327
8328 for (;;) {
8329 nextpart:
8330 if (!t->gen) {
8331 if (t->p == t->pend) return -1;
8332 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8333 t->p += n;
8334 }
8335 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8336 t->p += n;
8337 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8338 t->p += n;
8339 if (t->p < t->pend) {
8340 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8341 t->p += n;
8342 if (t->now > c) {
8343 if (t->now < 0x80 && c < 0x80) {
8344 rb_raise(rb_eArgError,
8345 "invalid range \"%c-%c\" in string transliteration",
8346 t->now, c);
8347 }
8348 else {
8349 rb_raise(rb_eArgError, "invalid range in string transliteration");
8350 }
8351 continue; /* not reached */
8352 }
8353 else if (t->now < c) {
8354 t->gen = 1;
8355 t->max = c;
8356 }
8357 }
8358 }
8359 return t->now;
8360 }
8361 else {
8362 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8363 if (t->now == t->max) {
8364 t->gen = 0;
8365 goto nextpart;
8366 }
8367 }
8368 if (t->now < t->max) {
8369 return t->now;
8370 }
8371 else {
8372 t->gen = 0;
8373 return t->max;
8374 }
8375 }
8376 }
8377}
8378
8379static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8380
8381static VALUE
8382tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8383{
8384 const unsigned int errc = -1;
8385 unsigned int trans[256];
8386 rb_encoding *enc, *e1, *e2;
8387 struct tr trsrc, trrepl;
8388 int cflag = 0;
8389 unsigned int c, c0, last = 0;
8390 int modify = 0, i, l;
8391 unsigned char *s, *send;
8392 VALUE hash = 0;
8393 int singlebyte = single_byte_optimizable(str);
8394 int termlen;
8395 int cr;
8396
8397#define CHECK_IF_ASCII(c) \
8398 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8399 (cr = ENC_CODERANGE_VALID) : 0)
8400
8401 StringValue(src);
8402 StringValue(repl);
8403 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8404 if (RSTRING_LEN(repl) == 0) {
8405 return rb_str_delete_bang(1, &src, str);
8406 }
8407
8408 cr = ENC_CODERANGE(str);
8409 e1 = rb_enc_check(str, src);
8410 e2 = rb_enc_check(str, repl);
8411 if (e1 == e2) {
8412 enc = e1;
8413 }
8414 else {
8415 enc = rb_enc_check(src, repl);
8416 }
8417 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8418 if (RSTRING_LEN(src) > 1 &&
8419 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8420 trsrc.p + l < trsrc.pend) {
8421 cflag = 1;
8422 trsrc.p += l;
8423 }
8424 trrepl.p = RSTRING_PTR(repl);
8425 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8426 trsrc.gen = trrepl.gen = 0;
8427 trsrc.now = trrepl.now = 0;
8428 trsrc.max = trrepl.max = 0;
8429
8430 if (cflag) {
8431 for (i=0; i<256; i++) {
8432 trans[i] = 1;
8433 }
8434 while ((c = trnext(&trsrc, enc)) != errc) {
8435 if (c < 256) {
8436 trans[c] = errc;
8437 }
8438 else {
8439 if (!hash) hash = rb_hash_new();
8440 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8441 }
8442 }
8443 while ((c = trnext(&trrepl, enc)) != errc)
8444 /* retrieve last replacer */;
8445 last = trrepl.now;
8446 for (i=0; i<256; i++) {
8447 if (trans[i] != errc) {
8448 trans[i] = last;
8449 }
8450 }
8451 }
8452 else {
8453 unsigned int r;
8454
8455 for (i=0; i<256; i++) {
8456 trans[i] = errc;
8457 }
8458 while ((c = trnext(&trsrc, enc)) != errc) {
8459 r = trnext(&trrepl, enc);
8460 if (r == errc) r = trrepl.now;
8461 if (c < 256) {
8462 trans[c] = r;
8463 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8464 }
8465 else {
8466 if (!hash) hash = rb_hash_new();
8467 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8468 }
8469 }
8470 }
8471
8472 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8473 cr = ENC_CODERANGE_7BIT;
8474 str_modify_keep_cr(str);
8475 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8476 termlen = rb_enc_mbminlen(enc);
8477 if (sflag) {
8478 int clen, tlen;
8479 long offset, max = RSTRING_LEN(str);
8480 unsigned int save = -1;
8481 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8482
8483 while (s < send) {
8484 int may_modify = 0;
8485
8486 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8487 if (!MBCLEN_CHARFOUND_P(r)) {
8488 xfree(buf);
8489 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8490 }
8491 clen = MBCLEN_CHARFOUND_LEN(r);
8492 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8493
8494 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8495
8496 s += clen;
8497 if (c < 256) {
8498 c = trans[c];
8499 }
8500 else if (hash) {
8501 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8502 if (NIL_P(tmp)) {
8503 if (cflag) c = last;
8504 else c = errc;
8505 }
8506 else if (cflag) c = errc;
8507 else c = NUM2INT(tmp);
8508 }
8509 else {
8510 c = errc;
8511 }
8512 if (c != (unsigned int)-1) {
8513 if (save == c) {
8514 CHECK_IF_ASCII(c);
8515 continue;
8516 }
8517 save = c;
8518 tlen = rb_enc_codelen(c, enc);
8519 modify = 1;
8520 }
8521 else {
8522 save = -1;
8523 c = c0;
8524 if (enc != e1) may_modify = 1;
8525 }
8526 if ((offset = t - buf) + tlen > max) {
8527 size_t MAYBE_UNUSED(old) = max + termlen;
8528 max = offset + tlen + (send - s);
8529 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8530 t = buf + offset;
8531 }
8532 rb_enc_mbcput(c, t, enc);
8533 if (may_modify && memcmp(s, t, tlen) != 0) {
8534 modify = 1;
8535 }
8536 CHECK_IF_ASCII(c);
8537 t += tlen;
8538 }
8539 if (!STR_EMBED_P(str)) {
8540 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8541 }
8542 TERM_FILL((char *)t, termlen);
8543 RSTRING(str)->as.heap.ptr = (char *)buf;
8544 STR_SET_LEN(str, t - buf);
8545 STR_SET_NOEMBED(str);
8546 RSTRING(str)->as.heap.aux.capa = max;
8547 }
8548 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8549 while (s < send) {
8550 c = (unsigned char)*s;
8551 if (trans[c] != errc) {
8552 if (!cflag) {
8553 c = trans[c];
8554 *s = c;
8555 modify = 1;
8556 }
8557 else {
8558 *s = last;
8559 modify = 1;
8560 }
8561 }
8562 CHECK_IF_ASCII(c);
8563 s++;
8564 }
8565 }
8566 else {
8567 int clen, tlen;
8568 long offset, max = (long)((send - s) * 1.2);
8569 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8570
8571 while (s < send) {
8572 int may_modify = 0;
8573
8574 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8575 if (!MBCLEN_CHARFOUND_P(r)) {
8576 xfree(buf);
8577 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8578 }
8579 clen = MBCLEN_CHARFOUND_LEN(r);
8580 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8581
8582 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8583
8584 if (c < 256) {
8585 c = trans[c];
8586 }
8587 else if (hash) {
8588 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8589 if (NIL_P(tmp)) {
8590 if (cflag) c = last;
8591 else c = errc;
8592 }
8593 else if (cflag) c = errc;
8594 else c = NUM2INT(tmp);
8595 }
8596 else {
8597 c = cflag ? last : errc;
8598 }
8599 if (c != errc) {
8600 tlen = rb_enc_codelen(c, enc);
8601 modify = 1;
8602 }
8603 else {
8604 c = c0;
8605 if (enc != e1) may_modify = 1;
8606 }
8607 if ((offset = t - buf) + tlen > max) {
8608 size_t MAYBE_UNUSED(old) = max + termlen;
8609 max = offset + tlen + (long)((send - s) * 1.2);
8610 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8611 t = buf + offset;
8612 }
8613 if (s != t) {
8614 rb_enc_mbcput(c, t, enc);
8615 if (may_modify && memcmp(s, t, tlen) != 0) {
8616 modify = 1;
8617 }
8618 }
8619 CHECK_IF_ASCII(c);
8620 s += clen;
8621 t += tlen;
8622 }
8623 if (!STR_EMBED_P(str)) {
8624 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8625 }
8626 TERM_FILL((char *)t, termlen);
8627 RSTRING(str)->as.heap.ptr = (char *)buf;
8628 STR_SET_LEN(str, t - buf);
8629 STR_SET_NOEMBED(str);
8630 RSTRING(str)->as.heap.aux.capa = max;
8631 }
8632
8633 if (modify) {
8634 if (cr != ENC_CODERANGE_BROKEN)
8635 ENC_CODERANGE_SET(str, cr);
8636 rb_enc_associate(str, enc);
8637 return str;
8638 }
8639 return Qnil;
8640}
8641
8642
8643/*
8644 * call-seq:
8645 * tr!(selector, replacements) -> self or nil
8646 *
8647 * Like String#tr, but modifies +self+ in place.
8648 * Returns +self+ if any changes were made, +nil+ otherwise.
8649 *
8650 */
8651
8652static VALUE
8653rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8654{
8655 return tr_trans(str, src, repl, 0);
8656}
8657
8658
8659/*
8660 * call-seq:
8661 * tr(selector, replacements) -> new_string
8662 *
8663 * Returns a copy of +self+ with each character specified by string +selector+
8664 * translated to the corresponding character in string +replacements+.
8665 * The correspondence is _positional_:
8666 *
8667 * - Each occurrence of the first character specified by +selector+
8668 * is translated to the first character in +replacements+.
8669 * - Each occurrence of the second character specified by +selector+
8670 * is translated to the second character in +replacements+.
8671 * - And so on.
8672 *
8673 * Example:
8674 *
8675 * 'hello'.tr('el', 'ip') #=> "hippo"
8676 *
8677 * If +replacements+ is shorter than +selector+,
8678 * it is implicitly padded with its own last character:
8679 *
8680 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8681 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8682 *
8683 * Arguments +selector+ and +replacements+ must be valid character selectors
8684 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8685 * and may use any of its valid forms, including negation, ranges, and escaping:
8686 *
8687 * # Negation.
8688 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8689 * # Ranges.
8690 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8691 * # Escapes.
8692 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8693 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8694 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8695 *
8696 */
8697
8698static VALUE
8699rb_str_tr(VALUE str, VALUE src, VALUE repl)
8700{
8701 str = str_duplicate(rb_cString, str);
8702 tr_trans(str, src, repl, 0);
8703 return str;
8704}
8705
8706#define TR_TABLE_MAX (UCHAR_MAX+1)
8707#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8708static void
8709tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8710 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8711{
8712 const unsigned int errc = -1;
8713 char buf[TR_TABLE_MAX];
8714 struct tr tr;
8715 unsigned int c;
8716 VALUE table = 0, ptable = 0;
8717 int i, l, cflag = 0;
8718
8719 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8720 tr.gen = tr.now = tr.max = 0;
8721
8722 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8723 cflag = 1;
8724 tr.p += l;
8725 }
8726 if (first) {
8727 for (i=0; i<TR_TABLE_MAX; i++) {
8728 stable[i] = 1;
8729 }
8730 stable[TR_TABLE_MAX] = cflag;
8731 }
8732 else if (stable[TR_TABLE_MAX] && !cflag) {
8733 stable[TR_TABLE_MAX] = 0;
8734 }
8735 for (i=0; i<TR_TABLE_MAX; i++) {
8736 buf[i] = cflag;
8737 }
8738
8739 while ((c = trnext(&tr, enc)) != errc) {
8740 if (c < TR_TABLE_MAX) {
8741 buf[(unsigned char)c] = !cflag;
8742 }
8743 else {
8744 VALUE key = UINT2NUM(c);
8745
8746 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8747 if (cflag) {
8748 ptable = *ctablep;
8749 table = ptable ? ptable : rb_hash_new();
8750 *ctablep = table;
8751 }
8752 else {
8753 table = rb_hash_new();
8754 ptable = *tablep;
8755 *tablep = table;
8756 }
8757 }
8758 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8759 rb_hash_aset(table, key, Qtrue);
8760 }
8761 }
8762 }
8763 for (i=0; i<TR_TABLE_MAX; i++) {
8764 stable[i] = stable[i] && buf[i];
8765 }
8766 if (!table && !cflag) {
8767 *tablep = 0;
8768 }
8769}
8770
8771
8772static int
8773tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8774{
8775 if (c < TR_TABLE_MAX) {
8776 return table[c] != 0;
8777 }
8778 else {
8779 VALUE v = UINT2NUM(c);
8780
8781 if (del) {
8782 if (!NIL_P(rb_hash_lookup(del, v)) &&
8783 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8784 return TRUE;
8785 }
8786 }
8787 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8788 return FALSE;
8789 }
8790 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8791 }
8792}
8793
8794/*
8795 * call-seq:
8796 * delete!(*selectors) -> self or nil
8797 *
8798 * Like String#delete, but modifies +self+ in place.
8799 * Returns +self+ if any changes were made, +nil+ otherwise.
8800 *
8801 */
8802
8803static VALUE
8804rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8805{
8806 char squeez[TR_TABLE_SIZE];
8807 rb_encoding *enc = 0;
8808 char *s, *send, *t;
8809 VALUE del = 0, nodel = 0;
8810 int modify = 0;
8811 int i, ascompat, cr;
8812
8813 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8815 for (i=0; i<argc; i++) {
8816 VALUE s = argv[i];
8817
8818 StringValue(s);
8819 enc = rb_enc_check(str, s);
8820 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8821 }
8822
8823 str_modify_keep_cr(str);
8824 ascompat = rb_enc_asciicompat(enc);
8825 s = t = RSTRING_PTR(str);
8826 send = RSTRING_END(str);
8827 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8828 while (s < send) {
8829 unsigned int c;
8830 int clen;
8831
8832 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8833 if (squeez[c]) {
8834 modify = 1;
8835 }
8836 else {
8837 if (t != s) *t = c;
8838 t++;
8839 }
8840 s++;
8841 }
8842 else {
8843 c = rb_enc_codepoint_len(s, send, &clen, enc);
8844
8845 if (tr_find(c, squeez, del, nodel)) {
8846 modify = 1;
8847 }
8848 else {
8849 if (t != s) rb_enc_mbcput(c, t, enc);
8850 t += clen;
8852 }
8853 s += clen;
8854 }
8855 }
8856 TERM_FILL(t, TERM_LEN(str));
8857 STR_SET_LEN(str, t - RSTRING_PTR(str));
8858 ENC_CODERANGE_SET(str, cr);
8859
8860 if (modify) return str;
8861 return Qnil;
8862}
8863
8864
8865/*
8866 * call-seq:
8867 * delete(*selectors) -> new_string
8868 *
8869 * Returns a copy of +self+ with characters specified by +selectors+ removed
8870 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8871 *
8872 * "hello".delete "l","lo" #=> "heo"
8873 * "hello".delete "lo" #=> "he"
8874 * "hello".delete "aeiou", "^e" #=> "hell"
8875 * "hello".delete "ej-m" #=> "ho"
8876 *
8877 */
8878
8879static VALUE
8880rb_str_delete(int argc, VALUE *argv, VALUE str)
8881{
8882 str = str_duplicate(rb_cString, str);
8883 rb_str_delete_bang(argc, argv, str);
8884 return str;
8885}
8886
8887
8888/*
8889 * call-seq:
8890 * squeeze!(*selectors) -> self or nil
8891 *
8892 * Like String#squeeze, but modifies +self+ in place.
8893 * Returns +self+ if any changes were made, +nil+ otherwise.
8894 */
8895
8896static VALUE
8897rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8898{
8899 char squeez[TR_TABLE_SIZE];
8900 rb_encoding *enc = 0;
8901 VALUE del = 0, nodel = 0;
8902 unsigned char *s, *send, *t;
8903 int i, modify = 0;
8904 int ascompat, singlebyte = single_byte_optimizable(str);
8905 unsigned int save;
8906
8907 if (argc == 0) {
8908 enc = STR_ENC_GET(str);
8909 }
8910 else {
8911 for (i=0; i<argc; i++) {
8912 VALUE s = argv[i];
8913
8914 StringValue(s);
8915 enc = rb_enc_check(str, s);
8916 if (singlebyte && !single_byte_optimizable(s))
8917 singlebyte = 0;
8918 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8919 }
8920 }
8921
8922 str_modify_keep_cr(str);
8923 s = t = (unsigned char *)RSTRING_PTR(str);
8924 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8925 send = (unsigned char *)RSTRING_END(str);
8926 save = -1;
8927 ascompat = rb_enc_asciicompat(enc);
8928
8929 if (singlebyte) {
8930 while (s < send) {
8931 unsigned int c = *s++;
8932 if (c != save || (argc > 0 && !squeez[c])) {
8933 *t++ = save = c;
8934 }
8935 }
8936 }
8937 else {
8938 while (s < send) {
8939 unsigned int c;
8940 int clen;
8941
8942 if (ascompat && (c = *s) < 0x80) {
8943 if (c != save || (argc > 0 && !squeez[c])) {
8944 *t++ = save = c;
8945 }
8946 s++;
8947 }
8948 else {
8949 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8950
8951 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8952 if (t != s) rb_enc_mbcput(c, t, enc);
8953 save = c;
8954 t += clen;
8955 }
8956 s += clen;
8957 }
8958 }
8959 }
8960
8961 TERM_FILL((char *)t, TERM_LEN(str));
8962 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8963 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8964 modify = 1;
8965 }
8966
8967 if (modify) return str;
8968 return Qnil;
8969}
8970
8971
8972/*
8973 * call-seq:
8974 * squeeze(*selectors) -> new_string
8975 *
8976 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8977 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8978 *
8979 * "Squeezed" means that each multiple-character run of a selected character
8980 * is squeezed down to a single character;
8981 * with no arguments given, squeezes all characters:
8982 *
8983 * "yellow moon".squeeze #=> "yelow mon"
8984 * " now is the".squeeze(" ") #=> " now is the"
8985 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8986 *
8987 */
8988
8989static VALUE
8990rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8991{
8992 str = str_duplicate(rb_cString, str);
8993 rb_str_squeeze_bang(argc, argv, str);
8994 return str;
8995}
8996
8997
8998/*
8999 * call-seq:
9000 * tr_s!(selector, replacements) -> self or nil
9001 *
9002 * Like String#tr_s, but modifies +self+ in place.
9003 * Returns +self+ if any changes were made, +nil+ otherwise.
9004 *
9005 * Related: String#squeeze!.
9006 */
9007
9008static VALUE
9009rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9010{
9011 return tr_trans(str, src, repl, 1);
9012}
9013
9014
9015/*
9016 * call-seq:
9017 * tr_s(selector, replacements) -> string
9018 *
9019 * Like String#tr, but also squeezes the modified portions of the translated string;
9020 * returns a new string (translated and squeezed).
9021 *
9022 * 'hello'.tr_s('l', 'r') #=> "hero"
9023 * 'hello'.tr_s('el', '-') #=> "h-o"
9024 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9025 *
9026 * Related: String#squeeze.
9027 *
9028 */
9029
9030static VALUE
9031rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9032{
9033 str = str_duplicate(rb_cString, str);
9034 tr_trans(str, src, repl, 1);
9035 return str;
9036}
9037
9038
9039/*
9040 * call-seq:
9041 * count(*selectors) -> integer
9042 *
9043 * Returns the total number of characters in +self+
9044 * that are specified by the given +selectors+
9045 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9046 *
9047 * a = "hello world"
9048 * a.count "lo" #=> 5
9049 * a.count "lo", "o" #=> 2
9050 * a.count "hello", "^l" #=> 4
9051 * a.count "ej-m" #=> 4
9052 *
9053 * "hello^world".count "\\^aeiou" #=> 4
9054 * "hello-world".count "a\\-eo" #=> 4
9055 *
9056 * c = "hello world\\r\\n"
9057 * c.count "\\" #=> 2
9058 * c.count "\\A" #=> 0
9059 * c.count "X-\\w" #=> 3
9060 */
9061
9062static VALUE
9063rb_str_count(int argc, VALUE *argv, VALUE str)
9064{
9065 char table[TR_TABLE_SIZE];
9066 rb_encoding *enc = 0;
9067 VALUE del = 0, nodel = 0, tstr;
9068 char *s, *send;
9069 int i;
9070 int ascompat;
9071 size_t n = 0;
9072
9074
9075 tstr = argv[0];
9076 StringValue(tstr);
9077 enc = rb_enc_check(str, tstr);
9078 if (argc == 1) {
9079 const char *ptstr;
9080 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9081 (ptstr = RSTRING_PTR(tstr),
9082 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9083 !is_broken_string(str)) {
9084 int clen;
9085 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9086
9087 s = RSTRING_PTR(str);
9088 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9089 send = RSTRING_END(str);
9090 while (s < send) {
9091 if (*(unsigned char*)s++ == c) n++;
9092 }
9093 return SIZET2NUM(n);
9094 }
9095 }
9096
9097 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9098 for (i=1; i<argc; i++) {
9099 tstr = argv[i];
9100 StringValue(tstr);
9101 enc = rb_enc_check(str, tstr);
9102 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9103 }
9104
9105 s = RSTRING_PTR(str);
9106 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9107 send = RSTRING_END(str);
9108 ascompat = rb_enc_asciicompat(enc);
9109 while (s < send) {
9110 unsigned int c;
9111
9112 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9113 if (table[c]) {
9114 n++;
9115 }
9116 s++;
9117 }
9118 else {
9119 int clen;
9120 c = rb_enc_codepoint_len(s, send, &clen, enc);
9121 if (tr_find(c, table, del, nodel)) {
9122 n++;
9123 }
9124 s += clen;
9125 }
9126 }
9127
9128 return SIZET2NUM(n);
9129}
9130
9131static VALUE
9132rb_fs_check(VALUE val)
9133{
9134 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9135 val = rb_check_string_type(val);
9136 if (NIL_P(val)) return 0;
9137 }
9138 return val;
9139}
9140
/*
 * Byte-indexed lookup table: 1 for the six bytes treated as whitespace
 * (0x09..0x0d and 0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

/* The argument is narrowed to unsigned char before indexing the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9161
9162static long
9163split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9164{
9165 if (empty_count >= 0 && len == 0) {
9166 return empty_count + 1;
9167 }
9168 if (empty_count > 0) {
9169 /* make different substrings */
9170 if (result) {
9171 do {
9172 rb_ary_push(result, str_new_empty_String(str));
9173 } while (--empty_count > 0);
9174 }
9175 else {
9176 do {
9177 rb_yield(str_new_empty_String(str));
9178 } while (--empty_count > 0);
9179 }
9180 }
9181 str = rb_str_subseq(str, beg, len);
9182 if (result) {
9183 rb_ary_push(result, str);
9184 }
9185 else {
9186 rb_yield(str);
9187 }
9188 return empty_count;
9189}
9190
/* Strategy used to split a string. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9194
9195static split_type_t
9196literal_split_pattern(VALUE spat, split_type_t default_type)
9197{
9198 rb_encoding *enc = STR_ENC_GET(spat);
9199 const char *ptr;
9200 long len;
9201 RSTRING_GETMEM(spat, ptr, len);
9202 if (len == 0) {
9203 /* Special case - split into chars */
9204 return SPLIT_TYPE_CHARS;
9205 }
9206 else if (rb_enc_asciicompat(enc)) {
9207 if (len == 1 && ptr[0] == ' ') {
9208 return SPLIT_TYPE_AWK;
9209 }
9210 }
9211 else {
9212 int l;
9213 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9214 return SPLIT_TYPE_AWK;
9215 }
9216 }
9217 return default_type;
9218}
9219
9220/*
9221 * call-seq:
9222 * split(field_sep = $;, limit = 0) -> array
9223 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9224 *
9225 * :include: doc/string/split.rdoc
9226 *
9227 */
9228
9229static VALUE
9230rb_str_split_m(int argc, VALUE *argv, VALUE str)
9231{
9232 rb_encoding *enc;
9233 VALUE spat;
9234 VALUE limit;
9235 split_type_t split_type;
9236 long beg, end, i = 0, empty_count = -1;
9237 int lim = 0;
9238 VALUE result, tmp;
9239
9240 result = rb_block_given_p() ? Qfalse : Qnil;
9241 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9242 lim = NUM2INT(limit);
9243 if (lim <= 0) limit = Qnil;
9244 else if (lim == 1) {
9245 if (RSTRING_LEN(str) == 0)
9246 return result ? rb_ary_new2(0) : str;
9247 tmp = str_duplicate(rb_cString, str);
9248 if (!result) {
9249 rb_yield(tmp);
9250 return str;
9251 }
9252 return rb_ary_new3(1, tmp);
9253 }
9254 i = 1;
9255 }
9256 if (NIL_P(limit) && !lim) empty_count = 0;
9257
9258 enc = STR_ENC_GET(str);
9259 split_type = SPLIT_TYPE_REGEXP;
9260 if (!NIL_P(spat)) {
9261 spat = get_pat_quoted(spat, 0);
9262 }
9263 else if (NIL_P(spat = rb_fs)) {
9264 split_type = SPLIT_TYPE_AWK;
9265 }
9266 else if (!(spat = rb_fs_check(spat))) {
9267 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9268 }
9269 else {
9270 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9271 }
9272 if (split_type != SPLIT_TYPE_AWK) {
9273 switch (BUILTIN_TYPE(spat)) {
9274 case T_REGEXP:
9275 rb_reg_options(spat); /* check if uninitialized */
9276 tmp = RREGEXP_SRC(spat);
9277 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9278 if (split_type == SPLIT_TYPE_AWK) {
9279 spat = tmp;
9280 split_type = SPLIT_TYPE_STRING;
9281 }
9282 break;
9283
9284 case T_STRING:
9285 mustnot_broken(spat);
9286 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9287 break;
9288
9289 default:
9291 }
9292 }
9293
9294#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9295
9296 beg = 0;
9297 char *ptr = RSTRING_PTR(str);
9298 char *eptr = RSTRING_END(str);
9299 if (split_type == SPLIT_TYPE_AWK) {
9300 char *bptr = ptr;
9301 int skip = 1;
9302 unsigned int c;
9303
9304 if (result) result = rb_ary_new();
9305 end = beg;
9306 if (is_ascii_string(str)) {
9307 while (ptr < eptr) {
9308 c = (unsigned char)*ptr++;
9309 if (skip) {
9310 if (ascii_isspace(c)) {
9311 beg = ptr - bptr;
9312 }
9313 else {
9314 end = ptr - bptr;
9315 skip = 0;
9316 if (!NIL_P(limit) && lim <= i) break;
9317 }
9318 }
9319 else if (ascii_isspace(c)) {
9320 SPLIT_STR(beg, end-beg);
9321 skip = 1;
9322 beg = ptr - bptr;
9323 if (!NIL_P(limit)) ++i;
9324 }
9325 else {
9326 end = ptr - bptr;
9327 }
9328 }
9329 }
9330 else {
9331 while (ptr < eptr) {
9332 int n;
9333
9334 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9335 ptr += n;
9336 if (skip) {
9337 if (rb_isspace(c)) {
9338 beg = ptr - bptr;
9339 }
9340 else {
9341 end = ptr - bptr;
9342 skip = 0;
9343 if (!NIL_P(limit) && lim <= i) break;
9344 }
9345 }
9346 else if (rb_isspace(c)) {
9347 SPLIT_STR(beg, end-beg);
9348 skip = 1;
9349 beg = ptr - bptr;
9350 if (!NIL_P(limit)) ++i;
9351 }
9352 else {
9353 end = ptr - bptr;
9354 }
9355 }
9356 }
9357 }
9358 else if (split_type == SPLIT_TYPE_STRING) {
9359 char *str_start = ptr;
9360 char *substr_start = ptr;
9361 char *sptr = RSTRING_PTR(spat);
9362 long slen = RSTRING_LEN(spat);
9363
9364 if (result) result = rb_ary_new();
9365 mustnot_broken(str);
9366 enc = rb_enc_check(str, spat);
9367 while (ptr < eptr &&
9368 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9369 /* Check we are at the start of a char */
9370 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9371 if (t != ptr + end) {
9372 ptr = t;
9373 continue;
9374 }
9375 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9376 ptr += end + slen;
9377 substr_start = ptr;
9378 if (!NIL_P(limit) && lim <= ++i) break;
9379 }
9380 beg = ptr - str_start;
9381 }
9382 else if (split_type == SPLIT_TYPE_CHARS) {
9383 char *str_start = ptr;
9384 int n;
9385
9386 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9387 mustnot_broken(str);
9388 enc = rb_enc_get(str);
9389 while (ptr < eptr &&
9390 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9391 SPLIT_STR(ptr - str_start, n);
9392 ptr += n;
9393 if (!NIL_P(limit) && lim <= ++i) break;
9394 }
9395 beg = ptr - str_start;
9396 }
9397 else {
9398 if (result) result = rb_ary_new();
9399 long len = RSTRING_LEN(str);
9400 long start = beg;
9401 long idx;
9402 int last_null = 0;
9403 struct re_registers *regs;
9404 VALUE match = 0;
9405
9406 for (; rb_reg_search(spat, str, start, 0) >= 0;
9407 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9408 match = rb_backref_get();
9409 if (!result) rb_match_busy(match);
9410 regs = RMATCH_REGS(match);
9411 end = BEG(0);
9412 if (start == end && BEG(0) == END(0)) {
9413 if (!ptr) {
9414 SPLIT_STR(0, 0);
9415 break;
9416 }
9417 else if (last_null == 1) {
9418 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9419 beg = start;
9420 }
9421 else {
9422 if (start == len)
9423 start++;
9424 else
9425 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9426 last_null = 1;
9427 continue;
9428 }
9429 }
9430 else {
9431 SPLIT_STR(beg, end-beg);
9432 beg = start = END(0);
9433 }
9434 last_null = 0;
9435
9436 for (idx=1; idx < regs->num_regs; idx++) {
9437 if (BEG(idx) == -1) continue;
9438 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9439 }
9440 if (!NIL_P(limit) && lim <= ++i) break;
9441 }
9442 if (match) rb_match_unbusy(match);
9443 }
9444 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9445 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9446 }
9447
9448 return result ? result : str;
9449}
9450
9451VALUE
9452rb_str_split(VALUE str, const char *sep0)
9453{
9454 VALUE sep;
9455
9456 StringValue(str);
9457 sep = rb_str_new_cstr(sep0);
9458 return rb_str_split_m(1, &sep, str);
9459}
9460
9461#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9462
9463static inline int
9464enumerator_element(VALUE ary, VALUE e)
9465{
9466 if (ary) {
9467 rb_ary_push(ary, e);
9468 return 0;
9469 }
9470 else {
9471 rb_yield(e);
9472 return 1;
9473 }
9474}
9475
9476#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9477
9478static const char *
9479chomp_newline(const char *p, const char *e, rb_encoding *enc)
9480{
9481 const char *prev = rb_enc_prev_char(p, e, e, enc);
9482 if (rb_enc_is_newline(prev, e, enc)) {
9483 e = prev;
9484 prev = rb_enc_prev_char(p, e, e, enc);
9485 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9486 e = prev;
9487 }
9488 return e;
9489}
9490
9491static VALUE
9492get_rs(void)
9493{
9494 VALUE rs = rb_rs;
9495 if (!NIL_P(rs) &&
9496 (!RB_TYPE_P(rs, T_STRING) ||
9497 RSTRING_LEN(rs) != 1 ||
9498 RSTRING_PTR(rs)[0] != '\n')) {
9499 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9500 }
9501 return rs;
9502}
9503
9504#define rb_rs get_rs()
9505
9506static VALUE
9507rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9508{
9509 rb_encoding *enc;
9510 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9511 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9512 long pos, len, rslen;
9513 int rsnewline = 0;
9514
9515 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9516 rs = rb_rs;
9517 if (!NIL_P(opts)) {
9518 static ID keywords[1];
9519 if (!keywords[0]) {
9520 keywords[0] = rb_intern_const("chomp");
9521 }
9522 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9523 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9524 }
9525
9526 if (NIL_P(rs)) {
9527 if (!ENUM_ELEM(ary, str)) {
9528 return ary;
9529 }
9530 else {
9531 return orig;
9532 }
9533 }
9534
9535 if (!RSTRING_LEN(str)) goto end;
9536 str = rb_str_new_frozen(str);
9537 ptr = subptr = RSTRING_PTR(str);
9538 pend = RSTRING_END(str);
9539 len = RSTRING_LEN(str);
9540 StringValue(rs);
9541 rslen = RSTRING_LEN(rs);
9542
9543 if (rs == rb_default_rs)
9544 enc = rb_enc_get(str);
9545 else
9546 enc = rb_enc_check(str, rs);
9547
9548 if (rslen == 0) {
9549 /* paragraph mode */
9550 int n;
9551 const char *eol = NULL;
9552 subend = subptr;
9553 while (subend < pend) {
9554 long chomp_rslen = 0;
9555 do {
9556 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9557 n = 0;
9558 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9559 if (rb_enc_is_newline(subend + n, pend, enc)) {
9560 if (eol == subend) break;
9561 subend += rslen;
9562 if (subptr) {
9563 eol = subend;
9564 chomp_rslen = -rslen;
9565 }
9566 }
9567 else {
9568 if (!subptr) subptr = subend;
9569 subend += rslen;
9570 }
9571 rslen = 0;
9572 } while (subend < pend);
9573 if (!subptr) break;
9574 if (rslen == 0) chomp_rslen = 0;
9575 line = rb_str_subseq(str, subptr - ptr,
9576 subend - subptr + (chomp ? chomp_rslen : rslen));
9577 if (ENUM_ELEM(ary, line)) {
9578 str_mod_check(str, ptr, len);
9579 }
9580 subptr = eol = NULL;
9581 }
9582 goto end;
9583 }
9584 else {
9585 rsptr = RSTRING_PTR(rs);
9586 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9587 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9588 rsnewline = 1;
9589 }
9590 }
9591
9592 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9593 rs = rb_str_new(rsptr, rslen);
9594 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9595 rsptr = RSTRING_PTR(rs);
9596 rslen = RSTRING_LEN(rs);
9597 }
9598
9599 while (subptr < pend) {
9600 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9601 if (pos < 0) break;
9602 hit = subptr + pos;
9603 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9604 if (hit != adjusted) {
9605 subptr = adjusted;
9606 continue;
9607 }
9608 subend = hit += rslen;
9609 if (chomp) {
9610 if (rsnewline) {
9611 subend = chomp_newline(subptr, subend, enc);
9612 }
9613 else {
9614 subend -= rslen;
9615 }
9616 }
9617 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9618 if (ENUM_ELEM(ary, line)) {
9619 str_mod_check(str, ptr, len);
9620 }
9621 subptr = hit;
9622 }
9623
9624 if (subptr != pend) {
9625 if (chomp) {
9626 if (rsnewline) {
9627 pend = chomp_newline(subptr, pend, enc);
9628 }
9629 else if (pend - subptr >= rslen &&
9630 memcmp(pend - rslen, rsptr, rslen) == 0) {
9631 pend -= rslen;
9632 }
9633 }
9634 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9635 ENUM_ELEM(ary, line);
9636 RB_GC_GUARD(str);
9637 }
9638
9639 end:
9640 if (ary)
9641 return ary;
9642 else
9643 return orig;
9644}
9645
9646/*
9647 * call-seq:
9648 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9649 * each_line(line_sep = $/, chomp: false) -> enumerator
9650 *
9651 * :include: doc/string/each_line.rdoc
9652 *
9653 */
9654
9655static VALUE
9656rb_str_each_line(int argc, VALUE *argv, VALUE str)
9657{
9658 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9659 return rb_str_enumerate_lines(argc, argv, str, 0);
9660}
9661
9662/*
9663 * call-seq:
9664 * lines(line_sep = $/, chomp: false) -> array_of_strings
9665 *
9666 * Forms substrings ("lines") of +self+ according to the given arguments
9667 * (see String#each_line for details); returns the lines in an array.
9668 *
9669 */
9670
9671static VALUE
9672rb_str_lines(int argc, VALUE *argv, VALUE str)
9673{
9674 VALUE ary = WANTARRAY("lines", 0);
9675 return rb_str_enumerate_lines(argc, argv, str, ary);
9676}
9677
9678static VALUE
9679rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9680{
9681 return LONG2FIX(RSTRING_LEN(str));
9682}
9683
9684static VALUE
9685rb_str_enumerate_bytes(VALUE str, VALUE ary)
9686{
9687 long i;
9688
9689 for (i=0; i<RSTRING_LEN(str); i++) {
9690 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9691 }
9692 if (ary)
9693 return ary;
9694 else
9695 return str;
9696}
9697
9698/*
9699 * call-seq:
9700 * each_byte {|byte| ... } -> self
9701 * each_byte -> enumerator
9702 *
9703 * :include: doc/string/each_byte.rdoc
9704 *
9705 */
9706
9707static VALUE
9708rb_str_each_byte(VALUE str)
9709{
9710 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9711 return rb_str_enumerate_bytes(str, 0);
9712}
9713
9714/*
9715 * call-seq:
9716 * bytes -> array_of_bytes
9717 *
9718 * :include: doc/string/bytes.rdoc
9719 *
9720 */
9721
9722static VALUE
9723rb_str_bytes(VALUE str)
9724{
9725 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9726 return rb_str_enumerate_bytes(str, ary);
9727}
9728
9729static VALUE
9730rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9731{
9732 return rb_str_length(str);
9733}
9734
9735static VALUE
9736rb_str_enumerate_chars(VALUE str, VALUE ary)
9737{
9738 VALUE orig = str;
9739 long i, len, n;
9740 const char *ptr;
9741 rb_encoding *enc;
9742
9743 str = rb_str_new_frozen(str);
9744 ptr = RSTRING_PTR(str);
9745 len = RSTRING_LEN(str);
9746 enc = rb_enc_get(str);
9747
9749 for (i = 0; i < len; i += n) {
9750 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9751 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9752 }
9753 }
9754 else {
9755 for (i = 0; i < len; i += n) {
9756 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9757 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9758 }
9759 }
9760 RB_GC_GUARD(str);
9761 if (ary)
9762 return ary;
9763 else
9764 return orig;
9765}
9766
9767/*
9768 * call-seq:
9769 * each_char {|c| ... } -> self
9770 * each_char -> enumerator
9771 *
9772 * :include: doc/string/each_char.rdoc
9773 *
9774 */
9775
9776static VALUE
9777rb_str_each_char(VALUE str)
9778{
9779 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9780 return rb_str_enumerate_chars(str, 0);
9781}
9782
9783/*
9784 * call-seq:
9785 * chars -> array_of_characters
9786 *
9787 * :include: doc/string/chars.rdoc
9788 *
9789 */
9790
9791static VALUE
9792rb_str_chars(VALUE str)
9793{
9794 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9795 return rb_str_enumerate_chars(str, ary);
9796}
9797
9798static VALUE
9799rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9800{
9801 VALUE orig = str;
9802 int n;
9803 unsigned int c;
9804 const char *ptr, *end;
9805 rb_encoding *enc;
9806
9807 if (single_byte_optimizable(str))
9808 return rb_str_enumerate_bytes(str, ary);
9809
9810 str = rb_str_new_frozen(str);
9811 ptr = RSTRING_PTR(str);
9812 end = RSTRING_END(str);
9813 enc = STR_ENC_GET(str);
9814
9815 while (ptr < end) {
9816 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9817 ENUM_ELEM(ary, UINT2NUM(c));
9818 ptr += n;
9819 }
9820 RB_GC_GUARD(str);
9821 if (ary)
9822 return ary;
9823 else
9824 return orig;
9825}
9826
9827/*
9828 * call-seq:
9829 * each_codepoint {|integer| ... } -> self
9830 * each_codepoint -> enumerator
9831 *
9832 * :include: doc/string/each_codepoint.rdoc
9833 *
9834 */
9835
9836static VALUE
9837rb_str_each_codepoint(VALUE str)
9838{
9839 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9840 return rb_str_enumerate_codepoints(str, 0);
9841}
9842
9843/*
9844 * call-seq:
9845 * codepoints -> array_of_integers
9846 *
9847 * :include: doc/string/codepoints.rdoc
9848 *
9849 */
9850
9851static VALUE
9852rb_str_codepoints(VALUE str)
9853{
9854 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9855 return rb_str_enumerate_codepoints(str, ary);
9856}
9857
/*
 * Compiles a new regex for the pattern \X in encoding +enc+ and returns it.
 *
 * Every call compiles a new regex_t via onig_new(); nothing is cached here.
 * Compilation failure is reported through rb_fatal().
 */
9858static regex_t *
9859get_reg_grapheme_cluster(rb_encoding *enc)
9860{
9861 int encidx = rb_enc_to_index(enc);
9862
 /* Default pattern source: the two bytes '\' 'X' (sizeof - 1 drops the NUL). */
9863 const OnigUChar source_ascii[] = "\\X";
9864 const OnigUChar *source = source_ascii;
9865 size_t source_len = sizeof(source_ascii) - 1;
9866
 /*
  * For UTF-16/UTF-32 (either byte order) the same two characters are
  * emitted as 2- or 4-byte code units in the matching byte order.  The
  * helper macros exist only inside this switch and are #undef'ed below.
  * Any other encoding index keeps the default source set above.
  */
9867 switch (encidx) {
9868#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9869#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9870#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9871#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9872#define CASE_UTF(e) \
9873 case ENCINDEX_UTF_##e: { \
9874 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9875 source = source_UTF_##e; \
9876 source_len = sizeof(source_UTF_##e); \
9877 break; \
9878 }
9879 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9880#undef CASE_UTF
9881#undef CHARS_16BE
9882#undef CHARS_16LE
9883#undef CHARS_32BE
9884#undef CHARS_32LE
9885 }
9886
9887 regex_t *reg_grapheme_cluster;
9888 OnigErrorInfo einfo;
9889 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9890 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
 /* A non-zero result is an error code; turn it into a message and abort. */
9891 if (r) {
9892 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9893 onig_error_code_to_str(message, r, &einfo);
9894 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9895 }
9896
9897 return reg_grapheme_cluster;
9898}
9899
9900static regex_t *
9901get_cached_reg_grapheme_cluster(rb_encoding *enc)
9902{
9903 int encidx = rb_enc_to_index(enc);
9904 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9905
9906 if (encidx == rb_utf8_encindex()) {
9907 if (!reg_grapheme_cluster_utf8) {
9908 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9909 }
9910
9911 return reg_grapheme_cluster_utf8;
9912 }
9913
9914 return NULL;
9915}
9916
9917static VALUE
9918rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9919{
9920 size_t grapheme_cluster_count = 0;
9921 rb_encoding *enc = get_encoding(str);
9922 const char *ptr, *end;
9923
9924 if (!rb_enc_unicode_p(enc)) {
9925 return rb_str_length(str);
9926 }
9927
9928 bool cached_reg_grapheme_cluster = true;
9929 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9930 if (!reg_grapheme_cluster) {
9931 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9932 cached_reg_grapheme_cluster = false;
9933 }
9934
9935 ptr = RSTRING_PTR(str);
9936 end = RSTRING_END(str);
9937
9938 while (ptr < end) {
9939 OnigPosition len = onig_match(reg_grapheme_cluster,
9940 (const OnigUChar *)ptr, (const OnigUChar *)end,
9941 (const OnigUChar *)ptr, NULL, 0);
9942 if (len <= 0) break;
9943 grapheme_cluster_count++;
9944 ptr += len;
9945 }
9946
9947 if (!cached_reg_grapheme_cluster) {
9948 onig_free(reg_grapheme_cluster);
9949 }
9950
9951 return SIZET2NUM(grapheme_cluster_count);
9952}
9953
9954static VALUE
9955rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9956{
9957 VALUE orig = str;
9958 rb_encoding *enc = get_encoding(str);
9959 const char *ptr0, *ptr, *end;
9960
9961 if (!rb_enc_unicode_p(enc)) {
9962 return rb_str_enumerate_chars(str, ary);
9963 }
9964
9965 if (!ary) str = rb_str_new_frozen(str);
9966
9967 bool cached_reg_grapheme_cluster = true;
9968 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9969 if (!reg_grapheme_cluster) {
9970 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9971 cached_reg_grapheme_cluster = false;
9972 }
9973
9974 ptr0 = ptr = RSTRING_PTR(str);
9975 end = RSTRING_END(str);
9976
9977 while (ptr < end) {
9978 OnigPosition len = onig_match(reg_grapheme_cluster,
9979 (const OnigUChar *)ptr, (const OnigUChar *)end,
9980 (const OnigUChar *)ptr, NULL, 0);
9981 if (len <= 0) break;
9982 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9983 ptr += len;
9984 }
9985
9986 if (!cached_reg_grapheme_cluster) {
9987 onig_free(reg_grapheme_cluster);
9988 }
9989
9990 RB_GC_GUARD(str);
9991 if (ary)
9992 return ary;
9993 else
9994 return orig;
9995}
9996
9997/*
9998 * call-seq:
9999 * each_grapheme_cluster {|gc| ... } -> self
10000 * each_grapheme_cluster -> enumerator
10001 *
10002 * :include: doc/string/each_grapheme_cluster.rdoc
10003 *
10004 */
10005
10006static VALUE
10007rb_str_each_grapheme_cluster(VALUE str)
10008{
10009 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10010 return rb_str_enumerate_grapheme_clusters(str, 0);
10011}
10012
10013/*
10014 * call-seq:
10015 * grapheme_clusters -> array_of_grapheme_clusters
10016 *
10017 * :include: doc/string/grapheme_clusters.rdoc
10018 *
10019 */
10020
10021static VALUE
10022rb_str_grapheme_clusters(VALUE str)
10023{
10024 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10025 return rb_str_enumerate_grapheme_clusters(str, ary);
10026}
10027
10028static long
10029chopped_length(VALUE str)
10030{
10031 rb_encoding *enc = STR_ENC_GET(str);
10032 const char *p, *p2, *beg, *end;
10033
10034 beg = RSTRING_PTR(str);
10035 end = beg + RSTRING_LEN(str);
10036 if (beg >= end) return 0;
10037 p = rb_enc_prev_char(beg, end, end, enc);
10038 if (!p) return 0;
10039 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10040 p2 = rb_enc_prev_char(beg, p, end, enc);
10041 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10042 }
10043 return p - beg;
10044}
10045
10046/*
10047 * call-seq:
10048 * chop! -> self or nil
10049 *
10050 * Like String#chop, but modifies +self+ in place;
10051 * returns +nil+ if +self+ is empty, +self+ otherwise.
10052 *
10053 * Related: String#chomp!.
10054 */
10055
10056static VALUE
10057rb_str_chop_bang(VALUE str)
10058{
10059 str_modify_keep_cr(str);
10060 if (RSTRING_LEN(str) > 0) {
10061 long len;
10062 len = chopped_length(str);
10063 STR_SET_LEN(str, len);
10064 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10065 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10067 }
10068 return str;
10069 }
10070 return Qnil;
10071}
10072
10073
10074/*
10075 * call-seq:
10076 * chop -> new_string
10077 *
10078 * :include: doc/string/chop.rdoc
10079 *
10080 */
10081
10082static VALUE
10083rb_str_chop(VALUE str)
10084{
10085 return rb_str_subseq(str, 0, chopped_length(str));
10086}
10087
10088static long
10089smart_chomp(VALUE str, const char *e, const char *p)
10090{
10091 rb_encoding *enc = rb_enc_get(str);
10092 if (rb_enc_mbminlen(enc) > 1) {
10093 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10094 if (rb_enc_is_newline(pp, e, enc)) {
10095 e = pp;
10096 }
10097 pp = e - rb_enc_mbminlen(enc);
10098 if (pp >= p) {
10099 pp = rb_enc_left_char_head(p, pp, e, enc);
10100 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10101 e = pp;
10102 }
10103 }
10104 }
10105 else {
10106 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10107 case '\n':
10108 if (--e > p && *(e-1) == '\r') {
10109 --e;
10110 }
10111 break;
10112 case '\r':
10113 --e;
10114 break;
10115 }
10116 }
10117 return e - p;
10118}
10119
/*
 * Returns the byte length +str+ would have after chomping the record
 * separator +rs+ from its end.  +str+ itself is not modified.
 *
 * Cases, in the order they are tested:
 *  - empty +str+: 0
 *  - +rs+ is rb_default_rs: smart_chomp()
 *  - empty +rs+: every trailing "\n" (each with an optional preceding "\r")
 *    is removed
 *  - +rs+ longer than +str+: unchanged length
 *  - +rs+ is one minimum-length character that is a newline: smart_chomp()
 *  - otherwise: +rs+ is removed only if +str+ ends with exactly those bytes
 *    and the match starts on a character boundary
 */
10120static long
10121chompped_length(VALUE str, VALUE rs)
10122{
10123 rb_encoding *enc;
10124 int newline;
10125 char *pp, *e, *rsptr;
10126 long rslen;
10127 char *const p = RSTRING_PTR(str);
10128 long len = RSTRING_LEN(str);
10129
10130 if (len == 0) return 0;
10131 e = p + len;
10132 if (rs == rb_default_rs) {
10133 return smart_chomp(str, e, p);
10134 }
10135
10136 enc = rb_enc_get(str);
10137 RSTRING_GETMEM(rs, rsptr, rslen);
 /* Empty separator: strip all trailing newlines, not just one. */
10138 if (rslen == 0) {
10139 if (rb_enc_mbminlen(enc) > 1) {
 /* Wide encodings: step back one character at a time. */
10140 while (e > p) {
10141 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10142 if (!rb_enc_is_newline(pp, e, enc)) break;
10143 e = pp;
10144 pp -= rb_enc_mbminlen(enc);
10145 if (pp >= p) {
10146 pp = rb_enc_left_char_head(p, pp, e, enc);
10147 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10148 e = pp;
10149 }
10150 }
10151 }
10152 }
10153 else {
10154 while (e > p && *(e-1) == '\n') {
10155 --e;
10156 if (e > p && *(e-1) == '\r')
10157 --e;
10158 }
10159 }
10160 return e - p;
10161 }
10162 if (rslen > len) return len;
10163
 /* From here on enc refers to the separator's encoding. */
10164 enc = rb_enc_get(rs);
 /* Last byte of the separator; used below as a cheap pre-check. */
10165 newline = rsptr[rslen-1];
10166 if (rslen == rb_enc_mbminlen(enc)) {
10167 if (rslen == 1) {
10168 if (newline == '\n')
10169 return smart_chomp(str, e, p);
10170 }
10171 else {
10172 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10173 return smart_chomp(str, e, p);
10174 }
10175 }
10176
 /* enc becomes whatever rb_enc_check() yields for the pair (str, rs). */
10177 enc = rb_enc_check(str, rs);
10178 if (is_broken_string(rs)) {
10179 return len;
10180 }
10181 pp = e - rslen;
 /* Compare the last byte first; memcmp only runs for separators longer than 1 byte. */
10182 if (p[len-1] == newline &&
10183 (rslen <= 1 ||
10184 memcmp(rsptr, pp, rslen) == 0)) {
10185 if (at_char_boundary(p, pp, e, enc))
10186 return len - rslen;
10187 RB_GC_GUARD(rs);
10188 }
10189 return len;
10190}
10191
10197static VALUE
10198chomp_rs(int argc, const VALUE *argv)
10199{
10200 rb_check_arity(argc, 0, 1);
10201 if (argc > 0) {
10202 VALUE rs = argv[0];
10203 if (!NIL_P(rs)) StringValue(rs);
10204 return rs;
10205 }
10206 else {
10207 return rb_rs;
10208 }
10209}
10210
10211VALUE
10212rb_str_chomp_string(VALUE str, VALUE rs)
10213{
10214 long olen = RSTRING_LEN(str);
10215 long len = chompped_length(str, rs);
10216 if (len >= olen) return Qnil;
10217 str_modify_keep_cr(str);
10218 STR_SET_LEN(str, len);
10219 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10220 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10222 }
10223 return str;
10224}
10225
10226/*
10227 * call-seq:
10228 * chomp!(line_sep = $/) -> self or nil
10229 *
10230 * Like String#chomp, but modifies +self+ in place;
10231 * returns +nil+ if no modification made, +self+ otherwise.
10232 *
10233 */
10234
10235static VALUE
10236rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10237{
10238 VALUE rs;
10239 str_modifiable(str);
10240 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10241 rs = chomp_rs(argc, argv);
10242 if (NIL_P(rs)) return Qnil;
10243 return rb_str_chomp_string(str, rs);
10244}
10245
10246
10247/*
10248 * call-seq:
10249 * chomp(line_sep = $/) -> new_string
10250 *
10251 * :include: doc/string/chomp.rdoc
10252 *
10253 */
10254
10255static VALUE
10256rb_str_chomp(int argc, VALUE *argv, VALUE str)
10257{
10258 VALUE rs = chomp_rs(argc, argv);
10259 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10260 return rb_str_subseq(str, 0, chompped_length(str, rs));
10261}
10262
10263static long
10264lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10265{
10266 const char *const start = s;
10267
10268 if (!s || s >= e) return 0;
10269
10270 /* remove spaces at head */
10271 if (single_byte_optimizable(str)) {
10272 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10273 }
10274 else {
10275 while (s < e) {
10276 int n;
10277 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10278
10279 if (cc && !rb_isspace(cc)) break;
10280 s += n;
10281 }
10282 }
10283 return s - start;
10284}
10285
10286/*
10287 * call-seq:
10288 * lstrip! -> self or nil
10289 *
10290 * Like String#lstrip, except that any modifications are made in +self+;
10291 * returns +self+ if any modifications are made, +nil+ otherwise.
10292 *
10293 * Related: String#rstrip!, String#strip!.
10294 */
10295
10296static VALUE
10297rb_str_lstrip_bang(VALUE str)
10298{
10299 rb_encoding *enc;
10300 char *start, *s;
10301 long olen, loffset;
10302
10303 str_modify_keep_cr(str);
10304 enc = STR_ENC_GET(str);
10305 RSTRING_GETMEM(str, start, olen);
10306 loffset = lstrip_offset(str, start, start+olen, enc);
10307 if (loffset > 0) {
10308 long len = olen-loffset;
10309 s = start + loffset;
10310 memmove(start, s, len);
10311 STR_SET_LEN(str, len);
10312 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10313 return str;
10314 }
10315 return Qnil;
10316}
10317
10318
10319/*
10320 * call-seq:
10321 * lstrip -> new_string
10322 *
10323 * Returns a copy of +self+ with leading whitespace removed;
10324 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10325 *
10326 * whitespace = "\x00\t\n\v\f\r "
10327 * s = whitespace + 'abc' + whitespace
10328 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10329 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10330 *
10331 * Related: String#rstrip, String#strip.
10332 */
10333
10334static VALUE
10335rb_str_lstrip(VALUE str)
10336{
10337 char *start;
10338 long len, loffset;
10339 RSTRING_GETMEM(str, start, len);
10340 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10341 if (loffset <= 0) return str_duplicate(rb_cString, str);
10342 return rb_str_subseq(str, loffset, len - loffset);
10343}
10344
10345static long
10346rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10347{
10348 const char *t;
10349
10350 rb_str_check_dummy_enc(enc);
10352 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10353 }
10354 if (!s || s >= e) return 0;
10355 t = e;
10356
10357 /* remove trailing spaces or '\0's */
10358 if (single_byte_optimizable(str)) {
10359 unsigned char c;
10360 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10361 }
10362 else {
10363 char *tp;
10364
10365 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10366 unsigned int c = rb_enc_codepoint(tp, e, enc);
10367 if (c && !rb_isspace(c)) break;
10368 t = tp;
10369 }
10370 }
10371 return e - t;
10372}
10373
10374/*
10375 * call-seq:
10376 * rstrip! -> self or nil
10377 *
10378 * Like String#rstrip, except that any modifications are made in +self+;
10379 * returns +self+ if any modifications are made, +nil+ otherwise.
10380 *
10381 * Related: String#lstrip!, String#strip!.
10382 */
10383
10384static VALUE
10385rb_str_rstrip_bang(VALUE str)
10386{
10387 rb_encoding *enc;
10388 char *start;
10389 long olen, roffset;
10390
10391 str_modify_keep_cr(str);
10392 enc = STR_ENC_GET(str);
10393 RSTRING_GETMEM(str, start, olen);
10394 roffset = rstrip_offset(str, start, start+olen, enc);
10395 if (roffset > 0) {
10396 long len = olen - roffset;
10397
10398 STR_SET_LEN(str, len);
10399 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10400 return str;
10401 }
10402 return Qnil;
10403}
10404
10405
10406/*
10407 * call-seq:
10408 * rstrip -> new_string
10409 *
10410 * Returns a copy of the receiver with trailing whitespace removed;
10411 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10412 *
10413 * whitespace = "\x00\t\n\v\f\r "
10414 * s = whitespace + 'abc' + whitespace
10415 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10416 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10417 *
10418 * Related: String#lstrip, String#strip.
10419 */
10420
10421static VALUE
10422rb_str_rstrip(VALUE str)
10423{
10424 rb_encoding *enc;
10425 char *start;
10426 long olen, roffset;
10427
10428 enc = STR_ENC_GET(str);
10429 RSTRING_GETMEM(str, start, olen);
10430 roffset = rstrip_offset(str, start, start+olen, enc);
10431
10432 if (roffset <= 0) return str_duplicate(rb_cString, str);
10433 return rb_str_subseq(str, 0, olen-roffset);
10434}
10435
10436
10437/*
10438 * call-seq:
10439 * strip! -> self or nil
10440 *
10441 * Like String#strip, except that any modifications are made in +self+;
10442 * returns +self+ if any modifications are made, +nil+ otherwise.
10443 *
10444 * Related: String#lstrip!, String#rstrip!.
10445 */
10446
10447static VALUE
10448rb_str_strip_bang(VALUE str)
10449{
10450 char *start;
10451 long olen, loffset, roffset;
10452 rb_encoding *enc;
10453
10454 str_modify_keep_cr(str);
10455 enc = STR_ENC_GET(str);
10456 RSTRING_GETMEM(str, start, olen);
10457 loffset = lstrip_offset(str, start, start+olen, enc);
10458 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10459
10460 if (loffset > 0 || roffset > 0) {
10461 long len = olen-roffset;
10462 if (loffset > 0) {
10463 len -= loffset;
10464 memmove(start, start + loffset, len);
10465 }
10466 STR_SET_LEN(str, len);
10467 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10468 return str;
10469 }
10470 return Qnil;
10471}
10472
10473
10474/*
10475 * call-seq:
10476 * strip -> new_string
10477 *
10478 * Returns a copy of the receiver with leading and trailing whitespace removed;
10479 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10480 *
10481 * whitespace = "\x00\t\n\v\f\r "
10482 * s = whitespace + 'abc' + whitespace
10483 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10484 * s.strip # => "abc"
10485 *
10486 * Related: String#lstrip, String#rstrip.
10487 */
10488
10489static VALUE
10490rb_str_strip(VALUE str)
10491{
10492 char *start;
10493 long olen, loffset, roffset;
10494 rb_encoding *enc = STR_ENC_GET(str);
10495
10496 RSTRING_GETMEM(str, start, olen);
10497 loffset = lstrip_offset(str, start, start+olen, enc);
10498 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10499
10500 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10501 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10502}
10503
/*
 * Performs one step of String#scan.
 *
 * Searches +str+ for +pat+ starting at *start.  On a match, *start is
 * advanced past it and the result is returned: the matched substring when
 * the pattern is a String or has no capture groups, otherwise an array with
 * one entry per group (nil for groups whose start offset is negative).
 * Returns nil when rb_pat_search() reports no match (negative position).
 */
10504static VALUE
10505scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10506{
10507 VALUE result = Qnil;
10508 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10509 if (pos >= 0) {
 /* match is assigned only on the Regexp path below. */
10510 VALUE match;
10511 struct re_registers *regs;
10512 if (BUILTIN_TYPE(pat) == T_STRING) {
 /* String pattern: no registers; the match spans the pattern's byte length. */
10513 regs = NULL;
10514 end = pos + RSTRING_LEN(pat);
10515 }
10516 else {
 /* Regexp pattern: take the offsets from the current backref's registers. */
10517 match = rb_backref_get();
10518 regs = RMATCH_REGS(match);
10519 pos = BEG(0);
10520 end = END(0);
10521 }
10522
10523 if (pos == end) {
10524 rb_encoding *enc = STR_ENC_GET(str);
10525 /*
10526 * Always consume at least one character of the input string
10527 */
10528 if (RSTRING_LEN(str) > end)
10529 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10530 RSTRING_END(str), enc);
10531 else
10532 *start = end + 1;
10533 }
10534 else {
10535 *start = end;
10536 }
10537
10538 if (!regs || regs->num_regs == 1) {
 /* No capture groups: the result is the whole match. */
10539 result = rb_str_subseq(str, pos, end - pos);
10540 return result;
10541 }
10542 else {
 /* Register 0 is the whole match, so groups start at index 1. */
10543 result = rb_ary_new2(regs->num_regs);
10544 for (int i = 1; i < regs->num_regs; i++) {
10545 VALUE s = Qnil;
10546 if (BEG(i) >= 0) {
10547 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10548 }
10549
10550 rb_ary_push(result, s);
10551 }
10552 }
10553
 /* Reached only on the Regexp path with groups, where match has been assigned. */
10554 RB_GC_GUARD(match);
10555 }
10556
10557 return result;
10558}
10559
10560
10561/*
10562 * call-seq:
10563 * scan(string_or_regexp) -> array
10564 * scan(string_or_regexp) {|matches| ... } -> self
10565 *
10566 * Matches a pattern against +self+; the pattern is:
10567 *
10568 * - +string_or_regexp+ itself, if it is a Regexp.
10569 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10570 *
10571 * Iterates through +self+, generating a collection of matching results:
10572 *
10573 * - If the pattern contains no groups, each result is the
10574 * matched string, <code>$&</code>.
10575 * - If the pattern contains groups, each result is an array
10576 * containing one entry per group.
10577 *
10578 * With no block given, returns an array of the results:
10579 *
10580 * s = 'cruel world'
10581 * s.scan(/\w+/) # => ["cruel", "world"]
10582 * s.scan(/.../) # => ["cru", "el ", "wor"]
10583 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10584 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10585 *
10586 * With a block given, calls the block with each result; returns +self+:
10587 *
10588 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10589 * print "\n"
10590 * s.scan(/(.)(.)/) {|x,y| print y, x }
10591 * print "\n"
10592 *
10593 * Output:
10594 *
10595 * <<cruel>> <<world>>
10596 * rceu lowlr
10597 *
10598 */
10599
10600static VALUE
10601rb_str_scan(VALUE str, VALUE pat)
10602{
10603 VALUE result;
10604 long start = 0;
10605 long last = -1, prev = 0;
10606 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10607
10608 pat = get_pat_quoted(pat, 1);
10609 mustnot_broken(str);
10610 if (!rb_block_given_p()) {
10611 VALUE ary = rb_ary_new();
10612
10613 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10614 last = prev;
10615 prev = start;
10616 rb_ary_push(ary, result);
10617 }
10618 if (last >= 0) rb_pat_search(pat, str, last, 1);
10619 else rb_backref_set(Qnil);
10620 return ary;
10621 }
10622
10623 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10624 last = prev;
10625 prev = start;
10626 rb_yield(result);
10627 str_mod_check(str, p, len);
10628 }
10629 if (last >= 0) rb_pat_search(pat, str, last, 1);
10630 return str;
10631}
10632
10633
10634/*
10635 * call-seq:
10636 * hex -> integer
10637 *
10638 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10639 * (with an optional sign and an optional <code>0x</code>) and returns the
10640 * corresponding number;
10641 * returns zero if there is no such leading substring:
10642 *
10643 * '0x0a'.hex # => 10
10644 * '-1234'.hex # => -4660
10645 * '0'.hex # => 0
10646 * 'non-numeric'.hex # => 0
10647 *
10648 * Related: String#oct.
10649 *
10650 */
10651
10652static VALUE
10653rb_str_hex(VALUE str)
10654{
10655 return rb_str_to_inum(str, 16, FALSE);
10656}
10657
10658
10659/*
10660 * call-seq:
10661 * oct -> integer
10662 *
10663 * Interprets the leading substring of +self+ as a string of octal digits
10664 * (with an optional sign) and returns the corresponding number;
10665 * returns zero if there is no such leading substring:
10666 *
10667 * '123'.oct # => 83
10668 * '-377'.oct # => -255
10669 * '0377non-numeric'.oct # => 255
10670 * 'non-numeric'.oct # => 0
10671 *
10672 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10673 * see Kernel#Integer.
10674 *
10675 * Related: String#hex.
10676 *
10677 */
10678
10679static VALUE
10680rb_str_oct(VALUE str)
10681{
10682 return rb_str_to_inum(str, -8, FALSE);
10683}
10684
/*
 * Fallback used only when crypt_r() is unavailable: rb_str_crypt() takes
 * this lock around its call to crypt().
 */
10685#ifndef HAVE_CRYPT_R
10686# include "ruby/thread_native.h"
10687# include "ruby/atomic.h"
10688
/* The lock is statically initialized with PTHREAD_MUTEX_INITIALIZER. */
10689static struct {
10690 rb_nativethread_lock_t lock;
10691} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10692
/* Intentionally empty; rb_str_crypt() calls it before taking the lock. */
10693static void
10694crypt_mutex_initialize(void)
10695{
10696}
10697#endif
10698
10699/*
10700 * call-seq:
10701 * crypt(salt_str) -> new_string
10702 *
10703 * Returns the string generated by calling <code>crypt(3)</code>
10704 * standard library function with <code>str</code> and
10705 * <code>salt_str</code>, in this order, as its arguments. Please do
10706 * not use this method any longer. It is legacy; provided only for
10707 * backward compatibility with ruby scripts in earlier days. It is
10708 * bad to use in contemporary programs for several reasons:
10709 *
10710 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10711 * run. The generated string lacks data portability.
10712 *
10713 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10714 * (i.e. silently ends up in unexpected results).
10715 *
10716 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10717 * thread safe.
10718 *
10719 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10720 * very very weak. According to its manpage, Linux's traditional
10721 * <code>crypt(3)</code> output has only 2**56 variations; too
10722 * easy to brute force today. And this is the default behaviour.
10723 *
10724 * * In order to make things robust some OSes implement so-called
10725 * "modular" usage. To go through, you have to do a complex
10726 * build-up of the <code>salt_str</code> parameter, by hand.
10727 * Failure in generation of a proper salt string tends not to
10728 * yield any errors; typos in parameters are normally not
10729 * detectable.
10730 *
10731 * * For instance, in the following example, the second invocation
10732 * of String#crypt is wrong; it has a typo in "round=" (lacks
10733 * "s"). However the call does not fail and something unexpected
10734 * is generated.
10735 *
10736 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10737 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10738 *
10739 * * Even in the "modular" mode, some hash functions are considered
10740 * archaic and no longer recommended at all; for instance module
10741 * <code>$1$</code> is officially abandoned by its author: see
10742 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10743 * instance module <code>$3$</code> is considered completely
10744 * broken: see the manpage of FreeBSD.
10745 *
10746 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10747 * written above, <code>crypt(3)</code> on Mac OS never fails.
10748 * This means even if you build up a proper salt string it
10749 * generates a traditional DES hash anyways, and there is no way
10750 * for you to be aware of.
10751 *
10752 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10753 *
10754 * If for some reason you cannot migrate to other secure contemporary
10755 * password hashing algorithms, install the string-crypt gem and
10756 * <code>require 'string/crypt'</code> to continue using it.
10757 */
10758
10759static VALUE
10760rb_str_crypt(VALUE str, VALUE salt)
10761{
 /*
  * Two build variants:
  *  - HAVE_CRYPT_R: crypt_r() with a per-call struct crypt_data obtained
  *    through ALLOCV; CRYPT_END() releases that buffer.
  *  - otherwise: crypt() with crypt_mutex.lock held around the call;
  *    CRYPT_END() releases the lock.
  * CRYPT_END() must run on every path after the resource is acquired.
  */
10762#ifdef HAVE_CRYPT_R
10763 VALUE databuf;
10764 struct crypt_data *data;
10765# define CRYPT_END() ALLOCV_END(databuf)
10766#else
10767 extern char *crypt(const char *, const char *);
10768# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10769#endif
10770 VALUE result;
10771 const char *s, *saltp;
10772 char *res;
10773#ifdef BROKEN_CRYPT
10774 char salt_8bit_clean[3];
10775#endif
10776
10777 StringValue(salt);
10778 mustnot_wchar(str);
10779 mustnot_wchar(salt);
10780 s = StringValueCStr(str);
10781 saltp = RSTRING_PTR(salt);
 /* The salt needs at least two bytes, neither of them NUL. */
10782 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10783 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10784 }
10785
10786#ifdef BROKEN_CRYPT
 /* If either of the first two salt bytes is non-ASCII, use a 2-byte copy with the high bits masked off. */
10787 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10788 salt_8bit_clean[0] = saltp[0] & 0x7f;
10789 salt_8bit_clean[1] = saltp[1] & 0x7f;
10790 salt_8bit_clean[2] = '\0';
10791 saltp = salt_8bit_clean;
10792 }
10793#endif
10794#ifdef HAVE_CRYPT_R
10795 data = ALLOCV(databuf, sizeof(struct crypt_data));
10796# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10797 data->initialized = 0;
10798# endif
10799 res = crypt_r(s, saltp, data);
10800#else
10801 crypt_mutex_initialize();
10802 rb_nativethread_lock_lock(&crypt_mutex.lock);
10803 res = crypt(s, saltp);
10804#endif
10805 if (!res) {
 /* Save errno before CRYPT_END() runs, then raise the system error with it. */
10806 int err = errno;
10807 CRYPT_END();
10808 rb_syserr_fail(err, "crypt");
10809 }
 /* Copy the result into a Ruby string before CRYPT_END() releases the buffer or lock. */
10810 result = rb_str_new_cstr(res);
10811 CRYPT_END();
10812 return result;
10813}
10814
10815
10816/*
10817 * call-seq:
10818 * ord -> integer
10819 *
10820 * :include: doc/string/ord.rdoc
10821 *
10822 */
10823
10824static VALUE
10825rb_str_ord(VALUE s)
10826{
10827 unsigned int c;
10828
10829 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10830 return UINT2NUM(c);
10831}
10832/*
10833 * call-seq:
10834 * sum(n = 16) -> integer
10835 *
10836 * :include: doc/string/sum.rdoc
10837 *
10838 */
10839
10840static VALUE
10841rb_str_sum(int argc, VALUE *argv, VALUE str)
10842{
10843 int bits = 16;
10844 char *ptr, *p, *pend;
10845 long len;
10846 VALUE sum = INT2FIX(0);
10847 unsigned long sum0 = 0;
10848
10849 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10850 bits = 0;
10851 }
10852 ptr = p = RSTRING_PTR(str);
10853 len = RSTRING_LEN(str);
10854 pend = p + len;
10855
10856 while (p < pend) {
10857 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10858 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10859 str_mod_check(str, ptr, len);
10860 sum0 = 0;
10861 }
10862 sum0 += (unsigned char)*p;
10863 p++;
10864 }
10865
10866 if (bits == 0) {
10867 if (sum0) {
10868 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10869 }
10870 }
10871 else {
10872 if (sum == INT2FIX(0)) {
10873 if (bits < (int)sizeof(long)*CHAR_BIT) {
10874 sum0 &= (((unsigned long)1)<<bits)-1;
10875 }
10876 sum = LONG2FIX(sum0);
10877 }
10878 else {
10879 VALUE mod;
10880
10881 if (sum0) {
10882 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10883 }
10884
10885 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10886 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10887 sum = rb_funcall(sum, '&', 1, mod);
10888 }
10889 }
10890 return sum;
10891}
10892
/*
 * Shared implementation of String#ljust, #rjust and #center.
 *
 * argv holds the target width and an optional pad string (default " ").
 * jflag selects where the padding goes: 'l' puts it all on the right,
 * 'r' all on the left, anything else splits it with the smaller half on
 * the left.  Returns a duplicate of +str+ when the width is negative or not
 * larger than the current character length; otherwise a new padded string.
 * Raises ArgumentError for an empty pad string or an oversized result.
 */
10893static VALUE
10894rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10895{
10896 rb_encoding *enc;
10897 VALUE w;
 /* flen: pad length in bytes; fclen: pad length in characters. */
10898 long width, len, flen = 1, fclen = 1;
10899 VALUE res;
10900 char *p;
10901 const char *f = " ";
 /* llen/rlen: pad characters on each side; llen2/rlen2: lengths used for the partial pad copies. */
10902 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10903 VALUE pad;
10904 int singlebyte = 1, cr;
10905 int termlen;
10906
10907 rb_scan_args(argc, argv, "11", &w, &pad);
10908 enc = STR_ENC_GET(str);
10909 termlen = rb_enc_mbminlen(enc);
10910 width = NUM2LONG(w);
10911 if (argc == 2) {
10912 StringValue(pad);
10913 enc = rb_enc_check(str, pad);
10914 f = RSTRING_PTR(pad);
10915 flen = RSTRING_LEN(pad);
10916 fclen = str_strlen(pad, enc); /* rb_enc_check */
10917 singlebyte = single_byte_optimizable(pad);
10918 if (flen == 0 || fclen == 0) {
10919 rb_raise(rb_eArgError, "zero width padding");
10920 }
10921 }
10922 len = str_strlen(str, enc); /* rb_enc_check */
10923 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
 /* n: total number of pad characters to add. */
10924 n = width - len;
10925 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10926 rlen = n - llen;
10927 cr = ENC_CODERANGE(str);
10928 if (flen > 1) {
 /* Lengths for the leftover (llen % fclen, rlen % fclen) pad characters, as computed by str_offset(). */
10929 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10930 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10931 }
10932 size = RSTRING_LEN(str);
 /* Overflow-checked result size: each step assigns to len and is tested against LONG_MAX before the next one. */
10933 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10934 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10935 (len += llen2 + rlen2) >= LONG_MAX - size) {
10936 rb_raise(rb_eArgError, "argument too big");
10937 }
10938 len += size;
10939 res = str_enc_new(rb_cString, 0, len, enc);
10940 p = RSTRING_PTR(res);
 /* Left padding: memset for a pad of at most one byte, otherwise whole pad copies plus a partial one. */
10941 if (flen <= 1) {
10942 memset(p, *f, llen);
10943 p += llen;
10944 }
10945 else {
10946 while (llen >= fclen) {
10947 memcpy(p,f,flen);
10948 p += flen;
10949 llen -= fclen;
10950 }
10951 if (llen > 0) {
10952 memcpy(p, f, llen2);
10953 p += llen2;
10954 }
10955 }
 /* The original content goes between the two pads. */
10956 memcpy(p, RSTRING_PTR(str), size);
10957 p += size;
 /* Right padding, same scheme as the left side. */
10958 if (flen <= 1) {
10959 memset(p, *f, rlen);
10960 p += rlen;
10961 }
10962 else {
10963 while (rlen >= fclen) {
10964 memcpy(p,f,flen);
10965 p += flen;
10966 rlen -= fclen;
10967 }
10968 if (rlen > 0) {
10969 memcpy(p, f, rlen2);
10970 p += rlen2;
10971 }
10972 }
10973 TERM_FILL(p, termlen);
10974 STR_SET_LEN(res, p-RSTRING_PTR(res));
10975
 /* Combine the code ranges of str and pad; a broken result is left unset on res. */
10976 if (argc == 2)
10977 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10978 if (cr != ENC_CODERANGE_BROKEN)
10979 ENC_CODERANGE_SET(res, cr);
10980
10981 RB_GC_GUARD(pad);
10982 return res;
10983}
10984
10985
10986/*
10987 * call-seq:
10988 * ljust(size, pad_string = ' ') -> new_string
10989 *
10990 * :include: doc/string/ljust.rdoc
10991 *
10992 * Related: String#rjust, String#center.
10993 *
10994 */
10995
10996static VALUE
10997rb_str_ljust(int argc, VALUE *argv, VALUE str)
10998{
10999 return rb_str_justify(argc, argv, str, 'l');
11000}
11001
11002/*
11003 * call-seq:
11004 * rjust(size, pad_string = ' ') -> new_string
11005 *
11006 * :include: doc/string/rjust.rdoc
11007 *
11008 * Related: String#ljust, String#center.
11009 *
11010 */
11011
11012static VALUE
11013rb_str_rjust(int argc, VALUE *argv, VALUE str)
11014{
11015 return rb_str_justify(argc, argv, str, 'r');
11016}
11017
11018
11019/*
11020 * call-seq:
11021 * center(size, pad_string = ' ') -> new_string
11022 *
11023 * :include: doc/string/center.rdoc
11024 *
11025 * Related: String#ljust, String#rjust.
11026 *
11027 */
11028
11029static VALUE
11030rb_str_center(int argc, VALUE *argv, VALUE str)
11031{
11032 return rb_str_justify(argc, argv, str, 'c');
11033}
11034
11035/*
11036 * call-seq:
11037 * partition(string_or_regexp) -> [head, match, tail]
11038 *
11039 * :include: doc/string/partition.rdoc
11040 *
11041 */
11042
11043static VALUE
11044rb_str_partition(VALUE str, VALUE sep)
11045{
11046 long pos;
11047
11048 sep = get_pat_quoted(sep, 0);
11049 if (RB_TYPE_P(sep, T_REGEXP)) {
11050 if (rb_reg_search(sep, str, 0, 0) < 0) {
11051 goto failed;
11052 }
11053 VALUE match = rb_backref_get();
11054 struct re_registers *regs = RMATCH_REGS(match);
11055
11056 pos = BEG(0);
11057 sep = rb_str_subseq(str, pos, END(0) - pos);
11058 }
11059 else {
11060 pos = rb_str_index(str, sep, 0);
11061 if (pos < 0) goto failed;
11062 }
11063 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11064 sep,
11065 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11066 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11067
11068 failed:
11069 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11070}
11071
11072/*
11073 * call-seq:
11074 * rpartition(sep) -> [head, match, tail]
11075 *
11076 * :include: doc/string/rpartition.rdoc
11077 *
11078 */
11079
11080static VALUE
11081rb_str_rpartition(VALUE str, VALUE sep)
11082{
11083 long pos = RSTRING_LEN(str);
11084
11085 sep = get_pat_quoted(sep, 0);
11086 if (RB_TYPE_P(sep, T_REGEXP)) {
11087 if (rb_reg_search(sep, str, pos, 1) < 0) {
11088 goto failed;
11089 }
11090 VALUE match = rb_backref_get();
11091 struct re_registers *regs = RMATCH_REGS(match);
11092
11093 pos = BEG(0);
11094 sep = rb_str_subseq(str, pos, END(0) - pos);
11095 }
11096 else {
11097 pos = rb_str_sublen(str, pos);
11098 pos = rb_str_rindex(str, sep, pos);
11099 if (pos < 0) {
11100 goto failed;
11101 }
11102 }
11103
11104 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11105 sep,
11106 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11107 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11108 failed:
11109 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11110}
11111
11112/*
11113 * call-seq:
11114 * start_with?(*string_or_regexp) -> true or false
11115 *
11116 * :include: doc/string/start_with_p.rdoc
11117 *
11118 */
11119
11120static VALUE
11121rb_str_start_with(int argc, VALUE *argv, VALUE str)
11122{
11123 int i;
11124
11125 for (i=0; i<argc; i++) {
11126 VALUE tmp = argv[i];
11127 if (RB_TYPE_P(tmp, T_REGEXP)) {
11128 if (rb_reg_start_with_p(tmp, str))
11129 return Qtrue;
11130 }
11131 else {
11132 const char *p, *s, *e;
11133 long slen, tlen;
11134 rb_encoding *enc;
11135
11136 StringValue(tmp);
11137 enc = rb_enc_check(str, tmp);
11138 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11139 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11140 p = RSTRING_PTR(str);
11141 e = p + slen;
11142 s = p + tlen;
11143 if (!at_char_right_boundary(p, s, e, enc))
11144 continue;
11145 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11146 return Qtrue;
11147 }
11148 }
11149 return Qfalse;
11150}
11151
11152/*
11153 * call-seq:
11154 * end_with?(*strings) -> true or false
11155 *
11156 * :include: doc/string/end_with_p.rdoc
11157 *
11158 */
11159
11160static VALUE
11161rb_str_end_with(int argc, VALUE *argv, VALUE str)
11162{
11163 int i;
11164
11165 for (i=0; i<argc; i++) {
11166 VALUE tmp = argv[i];
11167 const char *p, *s, *e;
11168 long slen, tlen;
11169 rb_encoding *enc;
11170
11171 StringValue(tmp);
11172 enc = rb_enc_check(str, tmp);
11173 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11174 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11175 p = RSTRING_PTR(str);
11176 e = p + slen;
11177 s = e - tlen;
11178 if (!at_char_boundary(p, s, e, enc))
11179 continue;
11180 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11181 return Qtrue;
11182 }
11183 return Qfalse;
11184}
11185
11195static long
11196deleted_prefix_length(VALUE str, VALUE prefix)
11197{
11198 const char *strptr, *prefixptr;
11199 long olen, prefixlen;
11200 rb_encoding *enc = rb_enc_get(str);
11201
11202 StringValue(prefix);
11203
11204 if (!is_broken_string(prefix) ||
11205 !rb_enc_asciicompat(enc) ||
11206 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11207 enc = rb_enc_check(str, prefix);
11208 }
11209
11210 /* return 0 if not start with prefix */
11211 prefixlen = RSTRING_LEN(prefix);
11212 if (prefixlen <= 0) return 0;
11213 olen = RSTRING_LEN(str);
11214 if (olen < prefixlen) return 0;
11215 strptr = RSTRING_PTR(str);
11216 prefixptr = RSTRING_PTR(prefix);
11217 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11218 if (is_broken_string(prefix)) {
11219 if (!is_broken_string(str)) {
11220 /* prefix in a valid string cannot be broken */
11221 return 0;
11222 }
11223 const char *strend = strptr + olen;
11224 const char *after_prefix = strptr + prefixlen;
11225 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11226 /* prefix does not end at char-boundary */
11227 return 0;
11228 }
11229 }
11230 /* prefix part in `str` also should be valid. */
11231
11232 return prefixlen;
11233}
11234
11235/*
11236 * call-seq:
11237 * delete_prefix!(prefix) -> self or nil
11238 *
11239 * Like String#delete_prefix, except that +self+ is modified in place.
11240 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11241 *
11242 */
11243
11244static VALUE
11245rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11246{
11247 long prefixlen;
11248 str_modify_keep_cr(str);
11249
11250 prefixlen = deleted_prefix_length(str, prefix);
11251 if (prefixlen <= 0) return Qnil;
11252
11253 return rb_str_drop_bytes(str, prefixlen);
11254}
11255
11256/*
11257 * call-seq:
11258 * delete_prefix(prefix) -> new_string
11259 *
11260 * :include: doc/string/delete_prefix.rdoc
11261 *
11262 */
11263
11264static VALUE
11265rb_str_delete_prefix(VALUE str, VALUE prefix)
11266{
11267 long prefixlen;
11268
11269 prefixlen = deleted_prefix_length(str, prefix);
11270 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11271
11272 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11273}
11274
11284static long
11285deleted_suffix_length(VALUE str, VALUE suffix)
11286{
11287 const char *strptr, *suffixptr;
11288 long olen, suffixlen;
11289 rb_encoding *enc;
11290
11291 StringValue(suffix);
11292 if (is_broken_string(suffix)) return 0;
11293 enc = rb_enc_check(str, suffix);
11294
11295 /* return 0 if not start with suffix */
11296 suffixlen = RSTRING_LEN(suffix);
11297 if (suffixlen <= 0) return 0;
11298 olen = RSTRING_LEN(str);
11299 if (olen < suffixlen) return 0;
11300 strptr = RSTRING_PTR(str);
11301 suffixptr = RSTRING_PTR(suffix);
11302 const char *strend = strptr + olen;
11303 const char *before_suffix = strend - suffixlen;
11304 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11305 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11306
11307 return suffixlen;
11308}
11309
11310/*
11311 * call-seq:
11312 * delete_suffix!(suffix) -> self or nil
11313 *
11314 * Like String#delete_suffix, except that +self+ is modified in place.
11315 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11316 *
11317 */
11318
11319static VALUE
11320rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11321{
11322 long olen, suffixlen, len;
11323 str_modifiable(str);
11324
11325 suffixlen = deleted_suffix_length(str, suffix);
11326 if (suffixlen <= 0) return Qnil;
11327
11328 olen = RSTRING_LEN(str);
11329 str_modify_keep_cr(str);
11330 len = olen - suffixlen;
11331 STR_SET_LEN(str, len);
11332 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11333 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11335 }
11336 return str;
11337}
11338
11339/*
11340 * call-seq:
11341 * delete_suffix(suffix) -> new_string
11342 *
11343 * :include: doc/string/delete_suffix.rdoc
11344 *
11345 */
11346
11347static VALUE
11348rb_str_delete_suffix(VALUE str, VALUE suffix)
11349{
11350 long suffixlen;
11351
11352 suffixlen = deleted_suffix_length(str, suffix);
11353 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11354
11355 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11356}
11357
11358void
11359rb_str_setter(VALUE val, ID id, VALUE *var)
11360{
11361 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11362 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11363 }
11364 *var = val;
11365}
11366
11367static void
11368rb_fs_setter(VALUE val, ID id, VALUE *var)
11369{
11370 val = rb_fs_check(val);
11371 if (!val) {
11372 rb_raise(rb_eTypeError,
11373 "value of %"PRIsVALUE" must be String or Regexp",
11374 rb_id2str(id));
11375 }
11376 if (!NIL_P(val)) {
11377 rb_warn_deprecated("'$;'", NULL);
11378 }
11379 *var = val;
11380}
11381
11382
11383/*
11384 * call-seq:
11385 * force_encoding(encoding) -> self
11386 *
11387 * :include: doc/string/force_encoding.rdoc
11388 *
11389 */
11390
11391static VALUE
11392rb_str_force_encoding(VALUE str, VALUE enc)
11393{
11394 str_modifiable(str);
11395
11396 rb_encoding *encoding = rb_to_encoding(enc);
11397 int idx = rb_enc_to_index(encoding);
11398
11399 // If the encoding is unchanged, we do nothing.
11400 if (ENCODING_GET(str) == idx) {
11401 return str;
11402 }
11403
11404 rb_enc_associate_index(str, idx);
11405
11406 // If the coderange was 7bit and the new encoding is ASCII-compatible
11407 // we can keep the coderange.
11408 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11409 return str;
11410 }
11411
11413 return str;
11414}
11415
11416/*
11417 * call-seq:
11418 * b -> string
11419 *
11420 * :include: doc/string/b.rdoc
11421 *
11422 */
11423
11424static VALUE
11425rb_str_b(VALUE str)
11426{
11427 VALUE str2;
11428 if (STR_EMBED_P(str)) {
11429 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11430 }
11431 else {
11432 str2 = str_alloc_heap(rb_cString);
11433 }
11434 str_replace_shared_without_enc(str2, str);
11435
11436 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11437 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11438 // If we know the receiver's code range then we know the result's code range.
11439 int cr = ENC_CODERANGE(str);
11440 switch (cr) {
11441 case ENC_CODERANGE_7BIT:
11443 break;
11447 break;
11448 default:
11449 ENC_CODERANGE_CLEAR(str2);
11450 break;
11451 }
11452 }
11453
11454 return str2;
11455}
11456
11457/*
11458 * call-seq:
11459 * valid_encoding? -> true or false
11460 *
11461 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11462 *
11463 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11464 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11465 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11466 */
11467
11468static VALUE
11469rb_str_valid_encoding_p(VALUE str)
11470{
11471 int cr = rb_enc_str_coderange(str);
11472
11473 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11474}
11475
11476/*
11477 * call-seq:
11478 * ascii_only? -> true or false
11479 *
11480 * Returns +true+ if +self+ contains only ASCII characters,
11481 * +false+ otherwise:
11482 *
11483 * 'abc'.ascii_only? # => true
11484 * "abc\u{6666}".ascii_only? # => false
11485 *
11486 */
11487
11488static VALUE
11489rb_str_is_ascii_only_p(VALUE str)
11490{
11491 int cr = rb_enc_str_coderange(str);
11492
11493 return RBOOL(cr == ENC_CODERANGE_7BIT);
11494}
11495
11496VALUE
11498{
11499 static const char ellipsis[] = "...";
11500 const long ellipsislen = sizeof(ellipsis) - 1;
11501 rb_encoding *const enc = rb_enc_get(str);
11502 const long blen = RSTRING_LEN(str);
11503 const char *const p = RSTRING_PTR(str), *e = p + blen;
11504 VALUE estr, ret = 0;
11505
11506 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11507 if (len * rb_enc_mbminlen(enc) >= blen ||
11508 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11509 ret = str;
11510 }
11511 else if (len <= ellipsislen ||
11512 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11513 if (rb_enc_asciicompat(enc)) {
11514 ret = rb_str_new(ellipsis, len);
11515 rb_enc_associate(ret, enc);
11516 }
11517 else {
11518 estr = rb_usascii_str_new(ellipsis, len);
11519 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11520 }
11521 }
11522 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11523 rb_str_cat(ret, ellipsis, ellipsislen);
11524 }
11525 else {
11526 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11527 rb_enc_from_encoding(enc), 0, Qnil);
11528 rb_str_append(ret, estr);
11529 }
11530 return ret;
11531}
11532
11533static VALUE
11534str_compat_and_valid(VALUE str, rb_encoding *enc)
11535{
11536 int cr;
11537 str = StringValue(str);
11538 cr = rb_enc_str_coderange(str);
11539 if (cr == ENC_CODERANGE_BROKEN) {
11540 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11541 }
11542 else {
11543 rb_encoding *e = STR_ENC_GET(str);
11544 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11545 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11546 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11547 }
11548 }
11549 return str;
11550}
11551
11552static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11553
11554VALUE
11556{
11557 rb_encoding *enc = STR_ENC_GET(str);
11558 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11559}
11560
11561VALUE
11562rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11563{
11564 int cr = ENC_CODERANGE_UNKNOWN;
11565 if (enc == STR_ENC_GET(str)) {
11566 /* cached coderange makes sense only when enc equals the
11567 * actual encoding of str */
11568 cr = ENC_CODERANGE(str);
11569 }
11570 return enc_str_scrub(enc, str, repl, cr);
11571}
11572
11573static VALUE
11574enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11575{
11576 int encidx;
11577 VALUE buf = Qnil;
11578 const char *rep, *p, *e, *p1, *sp;
11579 long replen = -1;
11580 long slen;
11581
11582 if (rb_block_given_p()) {
11583 if (!NIL_P(repl))
11584 rb_raise(rb_eArgError, "both of block and replacement given");
11585 replen = 0;
11586 }
11587
11588 if (ENC_CODERANGE_CLEAN_P(cr))
11589 return Qnil;
11590
11591 if (!NIL_P(repl)) {
11592 repl = str_compat_and_valid(repl, enc);
11593 }
11594
11595 if (rb_enc_dummy_p(enc)) {
11596 return Qnil;
11597 }
11598 encidx = rb_enc_to_index(enc);
11599
11600#define DEFAULT_REPLACE_CHAR(str) do { \
11601 static const char replace[sizeof(str)-1] = str; \
11602 rep = replace; replen = (int)sizeof(replace); \
11603 } while (0)
11604
11605 slen = RSTRING_LEN(str);
11606 p = RSTRING_PTR(str);
11607 e = RSTRING_END(str);
11608 p1 = p;
11609 sp = p;
11610
11611 if (rb_enc_asciicompat(enc)) {
11612 int rep7bit_p;
11613 if (!replen) {
11614 rep = NULL;
11615 rep7bit_p = FALSE;
11616 }
11617 else if (!NIL_P(repl)) {
11618 rep = RSTRING_PTR(repl);
11619 replen = RSTRING_LEN(repl);
11620 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11621 }
11622 else if (encidx == rb_utf8_encindex()) {
11623 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11624 rep7bit_p = FALSE;
11625 }
11626 else {
11627 DEFAULT_REPLACE_CHAR("?");
11628 rep7bit_p = TRUE;
11629 }
11630 cr = ENC_CODERANGE_7BIT;
11631
11632 p = search_nonascii(p, e);
11633 if (!p) {
11634 p = e;
11635 }
11636 while (p < e) {
11637 int ret = rb_enc_precise_mbclen(p, e, enc);
11638 if (MBCLEN_NEEDMORE_P(ret)) {
11639 break;
11640 }
11641 else if (MBCLEN_CHARFOUND_P(ret)) {
11643 p += MBCLEN_CHARFOUND_LEN(ret);
11644 }
11645 else if (MBCLEN_INVALID_P(ret)) {
11646 /*
11647 * p1~p: valid ascii/multibyte chars
11648 * p ~e: invalid bytes + unknown bytes
11649 */
11650 long clen = rb_enc_mbmaxlen(enc);
11651 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11652 if (p > p1) {
11653 rb_str_buf_cat(buf, p1, p - p1);
11654 }
11655
11656 if (e - p < clen) clen = e - p;
11657 if (clen <= 2) {
11658 clen = 1;
11659 }
11660 else {
11661 const char *q = p;
11662 clen--;
11663 for (; clen > 1; clen--) {
11664 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11665 if (MBCLEN_NEEDMORE_P(ret)) break;
11666 if (MBCLEN_INVALID_P(ret)) continue;
11668 }
11669 }
11670 if (rep) {
11671 rb_str_buf_cat(buf, rep, replen);
11672 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11673 }
11674 else {
11675 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11676 str_mod_check(str, sp, slen);
11677 repl = str_compat_and_valid(repl, enc);
11678 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11681 }
11682 p += clen;
11683 p1 = p;
11684 p = search_nonascii(p, e);
11685 if (!p) {
11686 p = e;
11687 break;
11688 }
11689 }
11690 else {
11692 }
11693 }
11694 if (NIL_P(buf)) {
11695 if (p == e) {
11696 ENC_CODERANGE_SET(str, cr);
11697 return Qnil;
11698 }
11699 buf = rb_str_buf_new(RSTRING_LEN(str));
11700 }
11701 if (p1 < p) {
11702 rb_str_buf_cat(buf, p1, p - p1);
11703 }
11704 if (p < e) {
11705 if (rep) {
11706 rb_str_buf_cat(buf, rep, replen);
11707 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11708 }
11709 else {
11710 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11711 str_mod_check(str, sp, slen);
11712 repl = str_compat_and_valid(repl, enc);
11713 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11716 }
11717 }
11718 }
11719 else {
11720 /* ASCII incompatible */
11721 long mbminlen = rb_enc_mbminlen(enc);
11722 if (!replen) {
11723 rep = NULL;
11724 }
11725 else if (!NIL_P(repl)) {
11726 rep = RSTRING_PTR(repl);
11727 replen = RSTRING_LEN(repl);
11728 }
11729 else if (encidx == ENCINDEX_UTF_16BE) {
11730 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11731 }
11732 else if (encidx == ENCINDEX_UTF_16LE) {
11733 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11734 }
11735 else if (encidx == ENCINDEX_UTF_32BE) {
11736 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11737 }
11738 else if (encidx == ENCINDEX_UTF_32LE) {
11739 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11740 }
11741 else {
11742 DEFAULT_REPLACE_CHAR("?");
11743 }
11744
11745 while (p < e) {
11746 int ret = rb_enc_precise_mbclen(p, e, enc);
11747 if (MBCLEN_NEEDMORE_P(ret)) {
11748 break;
11749 }
11750 else if (MBCLEN_CHARFOUND_P(ret)) {
11751 p += MBCLEN_CHARFOUND_LEN(ret);
11752 }
11753 else if (MBCLEN_INVALID_P(ret)) {
11754 const char *q = p;
11755 long clen = rb_enc_mbmaxlen(enc);
11756 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11757 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11758
11759 if (e - p < clen) clen = e - p;
11760 if (clen <= mbminlen * 2) {
11761 clen = mbminlen;
11762 }
11763 else {
11764 clen -= mbminlen;
11765 for (; clen > mbminlen; clen-=mbminlen) {
11766 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11767 if (MBCLEN_NEEDMORE_P(ret)) break;
11768 if (MBCLEN_INVALID_P(ret)) continue;
11770 }
11771 }
11772 if (rep) {
11773 rb_str_buf_cat(buf, rep, replen);
11774 }
11775 else {
11776 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11777 str_mod_check(str, sp, slen);
11778 repl = str_compat_and_valid(repl, enc);
11779 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11780 }
11781 p += clen;
11782 p1 = p;
11783 }
11784 else {
11786 }
11787 }
11788 if (NIL_P(buf)) {
11789 if (p == e) {
11791 return Qnil;
11792 }
11793 buf = rb_str_buf_new(RSTRING_LEN(str));
11794 }
11795 if (p1 < p) {
11796 rb_str_buf_cat(buf, p1, p - p1);
11797 }
11798 if (p < e) {
11799 if (rep) {
11800 rb_str_buf_cat(buf, rep, replen);
11801 }
11802 else {
11803 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11804 str_mod_check(str, sp, slen);
11805 repl = str_compat_and_valid(repl, enc);
11806 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11807 }
11808 }
11810 }
11811 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11812 return buf;
11813}
11814
11815/*
11816 * call-seq:
11817 * scrub(replacement_string = default_replacement) -> new_string
11818 * scrub{|bytes| ... } -> new_string
11819 *
11820 * :include: doc/string/scrub.rdoc
11821 *
11822 */
11823static VALUE
11824str_scrub(int argc, VALUE *argv, VALUE str)
11825{
11826 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11827 VALUE new = rb_str_scrub(str, repl);
11828 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11829}
11830
11831/*
11832 * call-seq:
11833 * scrub! -> self
11834 * scrub!(replacement_string = default_replacement) -> self
11835 * scrub!{|bytes| ... } -> self
11836 *
11837 * Like String#scrub, except that any replacements are made in +self+.
11838 *
11839 */
11840static VALUE
11841str_scrub_bang(int argc, VALUE *argv, VALUE str)
11842{
11843 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11844 VALUE new = rb_str_scrub(str, repl);
11845 if (!NIL_P(new)) rb_str_replace(str, new);
11846 return str;
11847}
11848
11849static ID id_normalize;
11850static ID id_normalized_p;
11851static VALUE mUnicodeNormalize;
11852
11853static VALUE
11854unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11855{
11856 static int UnicodeNormalizeRequired = 0;
11857 VALUE argv2[2];
11858
11859 if (!UnicodeNormalizeRequired) {
11860 rb_require("unicode_normalize/normalize.rb");
11861 UnicodeNormalizeRequired = 1;
11862 }
11863 argv2[0] = str;
11864 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11865 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11866}
11867
11868/*
11869 * call-seq:
11870 * unicode_normalize(form = :nfc) -> string
11871 *
11872 * Returns a copy of +self+ with
11873 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11874 *
11875 * Argument +form+ must be one of the following symbols
11876 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11877 *
11878 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11879 * - +:nfd+: Canonical decomposition.
11880 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11881 * - +:nfkd+: Compatibility decomposition.
11882 *
11883 * The encoding of +self+ must be one of:
11884 *
11885 * - Encoding::UTF_8
11886 * - Encoding::UTF_16BE
11887 * - Encoding::UTF_16LE
11888 * - Encoding::UTF_32BE
11889 * - Encoding::UTF_32LE
11890 * - Encoding::GB18030
11891 * - Encoding::UCS_2BE
11892 * - Encoding::UCS_4BE
11893 *
11894 * Examples:
11895 *
11896 * "a\u0300".unicode_normalize # => "à" (U+00E0)
11897 * "\u00E0".unicode_normalize(:nfd) # => "à" (U+0061 U+0300)
11898 *
11899 * Related: String#unicode_normalize!, String#unicode_normalized?.
11900 */
11901static VALUE
11902rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11903{
11904 return unicode_normalize_common(argc, argv, str, id_normalize);
11905}
11906
11907/*
11908 * call-seq:
11909 * unicode_normalize!(form = :nfc) -> self
11910 *
11911 * Like String#unicode_normalize, except that the normalization
11912 * is performed on +self+.
11913 *
11914 * Related: String#unicode_normalized?.
11915 *
11916 */
11917static VALUE
11918rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11919{
11920 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11921}
11922
11923/* call-seq:
11924 * unicode_normalized?(form = :nfc) -> true or false
11925 *
11926 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11927 * +false+ otherwise.
11928 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11929 *
11930 * Examples:
11931 *
11932 * "a\u0300".unicode_normalized? # => false
11933 * "a\u0300".unicode_normalized?(:nfd) # => true
11934 * "\u00E0".unicode_normalized? # => true
11935 * "\u00E0".unicode_normalized?(:nfd) # => false
11936 *
11937 *
11938 * Raises an exception if +self+ is not in a Unicode encoding:
11939 *
11940 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11941 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11942 *
11943 * Related: String#unicode_normalize, String#unicode_normalize!.
11944 *
11945 */
11946static VALUE
11947rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11948{
11949 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11950}
11951
11952/**********************************************************************
11953 * Document-class: Symbol
11954 *
11955 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11956 *
11957 * You can create a +Symbol+ object explicitly with:
11958 *
11959 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11960 *
11961 * The same +Symbol+ object will be
11962 * created for a given name or string for the duration of a program's
11963 * execution, regardless of the context or meaning of that name. Thus
11964 * if <code>Fred</code> is a constant in one context, a method in
11965 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11966 * will be the same object in all three contexts.
11967 *
11968 * module One
11969 * class Fred
11970 * end
11971 * $f1 = :Fred
11972 * end
11973 * module Two
11974 * Fred = 1
11975 * $f2 = :Fred
11976 * end
11977 * def Fred()
11978 * end
11979 * $f3 = :Fred
11980 * $f1.object_id #=> 2514190
11981 * $f2.object_id #=> 2514190
11982 * $f3.object_id #=> 2514190
11983 *
11984 * Constant, method, and variable names are returned as symbols:
11985 *
11986 * module One
11987 * Two = 2
11988 * def three; 3 end
11989 * @four = 4
11990 * @@five = 5
11991 * $six = 6
11992 * end
11993 * seven = 7
11994 *
11995 * One.constants
11996 * # => [:Two]
11997 * One.instance_methods(true)
11998 * # => [:three]
11999 * One.instance_variables
12000 * # => [:@four]
12001 * One.class_variables
12002 * # => [:@@five]
12003 * global_variables.grep(/six/)
12004 * # => [:$six]
12005 * local_variables
12006 * # => [:seven]
12007 *
12008 * A +Symbol+ object differs from a String object in that
12009 * a +Symbol+ object represents an identifier, while a String object
12010 * represents text or data.
12011 *
12012 * == What's Here
12013 *
12014 * First, what's elsewhere. Class +Symbol+:
12015 *
12016 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12017 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12018 *
12019 * Here, class +Symbol+ provides methods that are useful for:
12020 *
12021 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12022 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12023 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12024 *
12025 * === Methods for Querying
12026 *
12027 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12028 * - #=~: Returns the index of the first substring in symbol that matches a
12029 * given Regexp or other object; returns +nil+ if no match is found.
12030 * - #[], #slice : Returns a substring of symbol
12031 * determined by a given index, start/length, or range, or string.
12032 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12033 * - #encoding: Returns the Encoding object that represents the encoding
12034 * of symbol.
12035 * - #end_with?: Returns +true+ if symbol ends with
12036 * any of the given strings.
12037 * - #match: Returns a MatchData object if symbol
12038 * matches a given Regexp; +nil+ otherwise.
12039 * - #match?: Returns +true+ if symbol
12040 * matches a given Regexp; +false+ otherwise.
12041 * - #length, #size: Returns the number of characters in symbol.
12042 * - #start_with?: Returns +true+ if symbol starts with
12043 * any of the given strings.
12044 *
12045 * === Methods for Comparing
12046 *
12047 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12048 * or larger than symbol.
12049 * - #==, #===: Returns +true+ if a given symbol has the same content and
12050 * encoding.
12051 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12052 * symbol is smaller than, equal to, or larger than symbol.
12053 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12054 * after Unicode case folding; +false+ otherwise.
12055 *
12056 * === Methods for Converting
12057 *
12058 * - #capitalize: Returns symbol with the first character upcased
12059 * and all other characters downcased.
12060 * - #downcase: Returns symbol with all characters downcased.
12061 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12062 * - #name: Returns the frozen string corresponding to symbol.
12063 * - #succ, #next: Returns the symbol that is the successor to symbol.
12064 * - #swapcase: Returns symbol with all upcase characters downcased
12065 * and all downcase characters upcased.
12066 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12067 * - #to_s, #id2name: Returns the string corresponding to +self+.
12068 * - #to_sym, #intern: Returns +self+.
12069 * - #upcase: Returns symbol with all characters upcased.
12070 *
12071 */
12072
12073
12074/*
12075 * call-seq:
12076 * symbol == object -> true or false
12077 *
12078 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12079 */
12080
12081#define sym_equal rb_obj_equal
12082
12083static int
12084sym_printable(const char *s, const char *send, rb_encoding *enc)
12085{
12086 while (s < send) {
12087 int n;
12088 int c = rb_enc_precise_mbclen(s, send, enc);
12089
12090 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12091 n = MBCLEN_CHARFOUND_LEN(c);
12092 c = rb_enc_mbc_to_codepoint(s, send, enc);
12093 if (!rb_enc_isprint(c, enc)) return FALSE;
12094 s += n;
12095 }
12096 return TRUE;
12097}
12098
12099int
12100rb_str_symname_p(VALUE sym)
12101{
12102 rb_encoding *enc;
12103 const char *ptr;
12104 long len;
12105 rb_encoding *resenc = rb_default_internal_encoding();
12106
12107 if (resenc == NULL) resenc = rb_default_external_encoding();
12108 enc = STR_ENC_GET(sym);
12109 ptr = RSTRING_PTR(sym);
12110 len = RSTRING_LEN(sym);
12111 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12112 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12113 return FALSE;
12114 }
12115 return TRUE;
12116}
12117
12118VALUE
12119rb_str_quote_unprintable(VALUE str)
12120{
12121 rb_encoding *enc;
12122 const char *ptr;
12123 long len;
12124 rb_encoding *resenc;
12125
12126 Check_Type(str, T_STRING);
12127 resenc = rb_default_internal_encoding();
12128 if (resenc == NULL) resenc = rb_default_external_encoding();
12129 enc = STR_ENC_GET(str);
12130 ptr = RSTRING_PTR(str);
12131 len = RSTRING_LEN(str);
12132 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12133 !sym_printable(ptr, ptr + len, enc)) {
12134 return rb_str_escape(str);
12135 }
12136 return str;
12137}
12138
12139VALUE
12140rb_id_quote_unprintable(ID id)
12141{
12142 VALUE str = rb_id2str(id);
12143 if (!rb_str_symname_p(str)) {
12144 return rb_str_escape(str);
12145 }
12146 return str;
12147}
12148
12149/*
12150 * call-seq:
12151 * inspect -> string
12152 *
12153 * Returns a string representation of +self+ (including the leading colon):
12154 *
12155 * :foo.inspect # => ":foo"
12156 *
12157 * Related: Symbol#to_s, Symbol#name.
12158 *
12159 */
12160
12161static VALUE
12162sym_inspect(VALUE sym)
12163{
12164 VALUE str = rb_sym2str(sym);
12165 const char *ptr;
12166 long len;
12167 char *dest;
12168
12169 if (!rb_str_symname_p(str)) {
12170 str = rb_str_inspect(str);
12171 len = RSTRING_LEN(str);
12172 rb_str_resize(str, len + 1);
12173 dest = RSTRING_PTR(str);
12174 memmove(dest + 1, dest, len);
12175 }
12176 else {
12177 rb_encoding *enc = STR_ENC_GET(str);
12178 VALUE orig_str = str;
12179
12180 len = RSTRING_LEN(orig_str);
12181 str = rb_enc_str_new(0, len + 1, enc);
12182
12183 // Get data pointer after allocation
12184 ptr = RSTRING_PTR(orig_str);
12185 dest = RSTRING_PTR(str);
12186 memcpy(dest + 1, ptr, len);
12187
12188 RB_GC_GUARD(orig_str);
12189 }
12190 dest[0] = ':';
12191
12193
12194 return str;
12195}
12196
12197VALUE
12199{
12200 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12201 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12202 return str;
12203}
12204
12205VALUE
12206rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12207{
12208 VALUE obj;
12209
12210 if (argc < 1) {
12211 rb_raise(rb_eArgError, "no receiver given");
12212 }
12213 obj = argv[0];
12214 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12215}
12216
12217/*
12218 * call-seq:
12219 * succ
12220 *
12221 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12222 *
12223 * :foo.succ # => :fop
12224 *
12225 * Related: String#succ.
12226 */
12227
12228static VALUE
12229sym_succ(VALUE sym)
12230{
12231 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12232}
12233
12234/*
12235 * call-seq:
12236 * symbol <=> object -> -1, 0, +1, or nil
12237 *
12238 * If +object+ is a symbol,
12239 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12240 *
12241 * :bar <=> :foo # => -1
12242 * :foo <=> :foo # => 0
12243 * :foo <=> :bar # => 1
12244 *
12245 * Otherwise, returns +nil+:
12246 *
12247 * :foo <=> 'bar' # => nil
12248 *
12249 * Related: String#<=>.
12250 */
12251
12252static VALUE
12253sym_cmp(VALUE sym, VALUE other)
12254{
12255 if (!SYMBOL_P(other)) {
12256 return Qnil;
12257 }
12258 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12259}
12260
12261/*
12262 * call-seq:
12263 * casecmp(object) -> -1, 0, 1, or nil
12264 *
12265 * :include: doc/symbol/casecmp.rdoc
12266 *
12267 */
12268
12269static VALUE
12270sym_casecmp(VALUE sym, VALUE other)
12271{
12272 if (!SYMBOL_P(other)) {
12273 return Qnil;
12274 }
12275 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12276}
12277
12278/*
12279 * call-seq:
12280 * casecmp?(object) -> true, false, or nil
12281 *
12282 * :include: doc/symbol/casecmp_p.rdoc
12283 *
12284 */
12285
12286static VALUE
12287sym_casecmp_p(VALUE sym, VALUE other)
12288{
12289 if (!SYMBOL_P(other)) {
12290 return Qnil;
12291 }
12292 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12293}
12294
12295/*
12296 * call-seq:
12297 * symbol =~ object -> integer or nil
12298 *
12299 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12300 * including possible updates to global variables;
12301 * see String#=~.
12302 *
12303 */
12304
12305static VALUE
12306sym_match(VALUE sym, VALUE other)
12307{
12308 return rb_str_match(rb_sym2str(sym), other);
12309}
12310
12311/*
12312 * call-seq:
12313 * match(pattern, offset = 0) -> matchdata or nil
12314 * match(pattern, offset = 0) {|matchdata| } -> object
12315 *
12316 * Equivalent to <tt>self.to_s.match</tt>,
12317 * including possible updates to global variables;
12318 * see String#match.
12319 *
12320 */
12321
12322static VALUE
12323sym_match_m(int argc, VALUE *argv, VALUE sym)
12324{
12325 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12326}
12327
12328/*
12329 * call-seq:
12330 * match?(pattern, offset = 0) -> true or false
12331 *
12332 * Equivalent to <tt>sym.to_s.match?</tt>;
12333 * see String#match?.
12334 *
12335 */
12336
12337static VALUE
12338sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12339{
12340 return rb_str_match_m_p(argc, argv, sym);
12341}
12342
12343/*
12344 * call-seq:
12345 * symbol[index] -> string or nil
12346 * symbol[start, length] -> string or nil
12347 * symbol[range] -> string or nil
12348 * symbol[regexp, capture = 0] -> string or nil
12349 * symbol[substring] -> string or nil
12350 *
12351 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12352 *
12353 */
12354
12355static VALUE
12356sym_aref(int argc, VALUE *argv, VALUE sym)
12357{
12358 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12359}
12360
12361/*
12362 * call-seq:
12363 * length -> integer
12364 *
12365 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12366 */
12367
12368static VALUE
12369sym_length(VALUE sym)
12370{
12371 return rb_str_length(rb_sym2str(sym));
12372}
12373
12374/*
12375 * call-seq:
12376 * empty? -> true or false
12377 *
12378 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12379 *
12380 */
12381
12382static VALUE
12383sym_empty(VALUE sym)
12384{
12385 return rb_str_empty(rb_sym2str(sym));
12386}
12387
12388/*
12389 * call-seq:
12390 * upcase(*options) -> symbol
12391 *
12392 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12393 *
12394 * See String#upcase.
12395 *
12396 */
12397
12398static VALUE
12399sym_upcase(int argc, VALUE *argv, VALUE sym)
12400{
12401 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12402}
12403
12404/*
12405 * call-seq:
12406 * downcase(*options) -> symbol
12407 *
12408 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12409 *
12410 * See String#downcase.
12411 *
12412 * Related: Symbol#upcase.
12413 *
12414 */
12415
12416static VALUE
12417sym_downcase(int argc, VALUE *argv, VALUE sym)
12418{
12419 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12420}
12421
12422/*
12423 * call-seq:
12424 * capitalize(*options) -> symbol
12425 *
12426 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12427 *
12428 * See String#capitalize.
12429 *
12430 */
12431
12432static VALUE
12433sym_capitalize(int argc, VALUE *argv, VALUE sym)
12434{
12435 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12436}
12437
12438/*
12439 * call-seq:
12440 * swapcase(*options) -> symbol
12441 *
12442 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12443 *
12444 * See String#swapcase.
12445 *
12446 */
12447
12448static VALUE
12449sym_swapcase(int argc, VALUE *argv, VALUE sym)
12450{
12451 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12452}
12453
12454/*
12455 * call-seq:
12456 * start_with?(*string_or_regexp) -> true or false
12457 *
12458 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12459 *
12460 */
12461
12462static VALUE
12463sym_start_with(int argc, VALUE *argv, VALUE sym)
12464{
12465 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12466}
12467
12468/*
12469 * call-seq:
12470 * end_with?(*strings) -> true or false
12471 *
12472 *
12473 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12474 *
12475 */
12476
12477static VALUE
12478sym_end_with(int argc, VALUE *argv, VALUE sym)
12479{
12480 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12481}
12482
12483/*
12484 * call-seq:
12485 * encoding -> encoding
12486 *
12487 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12488 *
12489 */
12490
12491static VALUE
12492sym_encoding(VALUE sym)
12493{
12494 return rb_obj_encoding(rb_sym2str(sym));
12495}
12496
12497static VALUE
12498string_for_symbol(VALUE name)
12499{
12500 if (!RB_TYPE_P(name, T_STRING)) {
12501 VALUE tmp = rb_check_string_type(name);
12502 if (NIL_P(tmp)) {
12503 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12504 name);
12505 }
12506 name = tmp;
12507 }
12508 return name;
12509}
12510
12511ID
12513{
12514 if (SYMBOL_P(name)) {
12515 return SYM2ID(name);
12516 }
12517 name = string_for_symbol(name);
12518 return rb_intern_str(name);
12519}
12520
12521VALUE
12523{
12524 if (SYMBOL_P(name)) {
12525 return name;
12526 }
12527 name = string_for_symbol(name);
12528 return rb_str_intern(name);
12529}
12530
12531/*
12532 * call-seq:
12533 * Symbol.all_symbols -> array_of_symbols
12534 *
12535 * Returns an array of all symbols currently in Ruby's symbol table:
12536 *
12537 * Symbol.all_symbols.size # => 9334
12538 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12539 *
12540 */
12541
12542static VALUE
12543sym_all_symbols(VALUE _)
12544{
12545 return rb_sym_all_symbols();
12546}
12547
12548VALUE
12549rb_str_to_interned_str(VALUE str)
12550{
12551 return rb_fstring(str);
12552}
12553
12554VALUE
12555rb_interned_str(const char *ptr, long len)
12556{
12557 struct RString fake_str;
12558 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12559}
12560
12561VALUE
12563{
12564 return rb_interned_str(ptr, strlen(ptr));
12565}
12566
12567VALUE
12568rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12569{
12570 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12571 rb_enc_autoload(enc);
12572 }
12573
12574 struct RString fake_str;
12575 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12576}
12577
12578VALUE
12579rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12580{
12581 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12582 rb_enc_autoload(enc);
12583 }
12584
12585 struct RString fake_str;
12586 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12587}
12588
12589VALUE
12591{
12592 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12593}
12594
#if USE_YJIT
/*
 * Appends codepoint to str.
 *
 * Fast path: when str's inlined encoding index is the ASCII-8BIT one and
 * the code is in 0..0xfe, a single byte is appended with
 * rb_str_buf_cat_byte().  Every other case -- including 0xff, which the
 * bound below excludes -- goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    bool appended = false;

    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(code >= 0 && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            appended = true;
        }
    }

    if (!appended) {
        rb_str_concat(str, codepoint);
    }
}
#endif
12611
/*
 * Sets up the String and Symbol classes: defines both under rb_cObject and
 * registers their singleton and instance methods, the UnicodeNormalize
 * module, the case-mapping option symbols and the hooked globals "$;" and
 * "$-F".
 *
 * NOTE(review): the listing numbers embedded in this block are not
 * contiguous (e.g. 12617 -> 12619, 12624 -> 12627, 12785 -> 12789), so some
 * statements appear to be absent from this copy -- confirm against the
 * complete file before treating this as the full registration list.
 */
12612void
12613Init_String(void)
12614{
12615 rb_cString = rb_define_class("String", rb_cObject);
12616 RUBY_ASSERT(rb_vm_fstring_table());
/* Visit every fstring table entry with fstring_set_class_i, passing the
 * new class (presumably to fix up strings created before String existed
 * -- TODO confirm). */
12617 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12619 rb_define_alloc_func(rb_cString, empty_str_alloc);
12620 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12621 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12622 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12623 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12624 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12627 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12628 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12629 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12630 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12633 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12634 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12635 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12636 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12639 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12640 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12641 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12642 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12643 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
/* "succ!" and "next!" share one implementation. */
12645 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12647 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12648 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12649 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12650 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12651 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12652 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12654 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12655 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12656 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12657 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12658 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12659 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12660 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12661 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12663 rb_define_method(rb_cString, "+@", str_uplus, 0);
12664 rb_define_method(rb_cString, "-@", str_uminus, 0);
12665 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
/* "dedup" is an alias of unary minus. */
12666 rb_define_alias(rb_cString, "dedup", "-@");
12667
/* "to_s" and "to_str" share one implementation. */
12668 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12669 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12670 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12671 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12674 rb_define_method(rb_cString, "undump", str_undump, 0);
12675
/* Store the symbols :ascii, :turkic, :lithuanian and :fold in sym_ascii,
 * sym_turkic, sym_lithuanian and sym_fold. */
12676 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12677 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12678 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12679 sym_fold = ID2SYM(rb_intern_const("fold"));
12680
/* Case mapping: non-destructive variants first, then the bang variants. */
12681 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12682 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12683 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12684 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12685
12686 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12687 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12688 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12689 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12690
12691 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12692 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12693 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12694 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12695 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12696 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12697 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12698 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12699 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12700 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12701 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12702 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12704 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12705 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
/* "intern" and "to_sym" share one implementation. */
12706 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12707 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12708 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12709
12710 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12711 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12712 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12713
12714 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12715
12716 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12717 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12718 rb_define_method(rb_cString, "center", rb_str_center, -1);
12719
/* Substitution and trimming: non-destructive variants, then bang variants. */
12720 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12721 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12722 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12723 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12724 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12725 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12726 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12727 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12728 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12729
12730 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12731 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12732 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12733 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12734 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12735 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12736 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12737 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12738 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12739
/* Character-set operations; "count" has no bang counterpart here. */
12740 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12741 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12742 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12743 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12744 rb_define_method(rb_cString, "count", rb_str_count, -1);
12745
12746 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12747 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12748 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12749 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12750
12751 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12752 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12753 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12754 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12755 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12756
12757 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12758
/* "slice" reuses the "[]" implementation (rb_str_aref_m). */
12759 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12760 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12761
12762 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12763 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12764
12765 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12766 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12767 rb_define_method(rb_cString, "b", rb_str_b, 0);
12768 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12769 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12770
12771 /* define UnicodeNormalize module here so that we don't have to look it up */
12772 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12773 id_normalize = rb_intern_const("normalize");
12774 id_normalized_p = rb_intern_const("normalized?");
12775
12776 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12777 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12778 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12779
/* rb_fs starts as nil; "$;" and "$-F" are both hooked to that one variable
 * with rb_fs_setter, and its address is registered with the GC. */
12780 rb_fs = Qnil;
12781 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12782 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12783 rb_gc_register_address(&rb_fs);
12784
/* Symbol: most instance methods below are the sym_* wrappers defined
 * earlier in this file. */
12785 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12789 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12790
/* "==" and "===" share sym_equal; "succ" and "next" share sym_succ. */
12791 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12792 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12793 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12794 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12795 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12796 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12797
12798 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12799 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12800 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12801 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12802
/* "[]"/"slice" share sym_aref; "length"/"size" share sym_length. */
12803 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12804 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12805 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12806 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12807 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12808 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12809 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12810
12811 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12812 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12813 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12814 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12815
12816 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12817 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12818
12819 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12820}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1190
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1098
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2348
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2169
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2638
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:936
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2427
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:675
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3508
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1294
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:909
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1159
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2945
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1178
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12568
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2266
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3630
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1107
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1399
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1300
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:928
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12590
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:793
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:430
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:678
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1836
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1058
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1842
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4219
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3716
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1926
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1694
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1464
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2417
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3695
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1375
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12198
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2489
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1351
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1688
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2973
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5287
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4064
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3070
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11497
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1730
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1141
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:963
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1470
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1933
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4050
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3463
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2355
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1951
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6524
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3078
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12562
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1381
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3661
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3020
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4166
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3287
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7245
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2709
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12555
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4120
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3937
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4095
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3637
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3195
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5797
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11555
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1644
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2867
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3167
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3270
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1153
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2665
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7359
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1363
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1660
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2369
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5715
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9452
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1147
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:894
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1792
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1924
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1941
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2956
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1287
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:986
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12522
ID rb_to_id(VALUE str)
Definition string.c:12512
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1865
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3500
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4463
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1393
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2844
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2728
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1387
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2739
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1721
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:8317
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113