Ruby 3.5.0dev (2025-02-20 revision 34098b669c0cbc024cd08e686891f1dfe0a10aaf)
string.c (34098b669c0cbc024cd08e686891f1dfe0a10aaf)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* Flags of RString
83 *
84 * 0: STR_SHARED (equal to ELTS_SHARED)
85 * The string is shared. The buffer this string points to is owned by
86 * another string (the shared root).
87 * 1: RSTRING_NOEMBED
88 * The string is not embedded. When a string is embedded, the contents
89 * follow the header. When a string is not embedded, the contents are
90 * stored in a separately allocated buffer.
91 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
92 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
93 * It emits a deprecation warning when mutated for the first time.
94 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
95 * The string was allocated by the `Symbol#to_s` method.
96 * It emits a deprecation warning when mutated for the first time.
97 * 4: STR_PRECOMPUTED_HASH
98 * The string is embedded and has its precomputed hashcode stored
99 * after the terminator.
100 * 5: STR_SHARED_ROOT
101 * Other strings may point to the contents of this string. When this
102 * flag is set, STR_SHARED must not be set.
103 * 6: STR_BORROWED
104 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
105 * to be unshared by rb_str_tmp_frozen_release.
106 * 7: STR_TMPLOCK
107 * The pointer to the buffer is passed to a system call such as
108 * read(2). Any modification and realloc is prohibited.
109 * 8-9: ENC_CODERANGE
110 * Stores the coderange of the string.
111 * 10-16: ENCODING
112 * Stores the encoding of the string.
113 * 17: RSTRING_FSTR
114 * The string is a fstring. The string is deduplicated in the fstring
115 * table.
116 * 18: STR_NOFREE
117 * Do not free this string's buffer when the string is reclaimed
118 * by the garbage collector. Used for when the string buffer is a C
119 * string literal.
120 * 19: STR_FAKESTR
121 * The string is not allocated or managed by the garbage collector.
122 * Typically, the string object header (struct RString) is temporarily
123 * allocated on C stack.
124 */
125
126#define RUBY_MAX_CHAR_LEN 16
127#define STR_PRECOMPUTED_HASH FL_USER4
128#define STR_SHARED_ROOT FL_USER5
129#define STR_BORROWED FL_USER6
130#define STR_TMPLOCK FL_USER7
131#define STR_NOFREE FL_USER18
132#define STR_FAKESTR FL_USER19
133
/* Mark `str` as a non-embedded (heap) string: set STR_NOEMBED and clear the
 * STR_SHARED, STR_SHARED_ROOT and STR_BORROWED flags. */
134#define STR_SET_NOEMBED(str) do {\
135 FL_SET((str), STR_NOEMBED);\
136 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
137} while (0)
/* Mark `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
138#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139
/* Store `n` as the byte length of `str`; no other field is touched. */
140#define STR_SET_LEN(str, n) do { \
141 RSTRING(str)->len = (n); \
142} while (0)
143
144static inline bool
145str_encindex_fastpath(int encindex)
146{
147 // The overwhelming majority of strings are in one of these 3 encodings.
148 switch (encindex) {
149 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_UTF_8:
151 case ENCINDEX_US_ASCII:
152 return true;
153 default:
154 return false;
155 }
156}
157
158static inline bool
159str_enc_fastpath(VALUE str)
160{
161 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
162}
163
/* Terminator length of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding.
 * Note that `str` may be evaluated more than once. */
164#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` bytes at `ptr`. The first byte is always
 * zeroed; only when termlen > 1 is the whole terminator memset to zero.
 * Both arguments are evaluated exactly once (copied into locals). */
165#define TERM_FILL(ptr, termlen) do {\
166 char *const term_fill_ptr = (ptr);\
167 const int term_fill_len = (termlen);\
168 *term_fill_ptr = '\0';\
169 if (UNLIKELY(term_fill_len > 1))\
170 memset(term_fill_ptr, 0, term_fill_len);\
171} while (0)
172
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * length derived from the string's own encoding (TERM_LEN). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise the contents are copied into a freshly allocated heap
 *   buffer and the string is switched to non-embedded.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 *
 * The macro arguments are expanded several times, so callers must pass
 * side-effect-free expressions. `capacity` and `termlen` are parenthesized
 * on every use so that compound expressions expand with the intended
 * precedence. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
196
/* Make `str` share the buffer of `shared_str`.
 * Does nothing when `str` carries STR_FAKESTR. Otherwise it asserts that the
 * pointer of `str` lies within [ptr, ptr + len] of `shared_str`, stores
 * `shared_str` in as.heap.aux.shared through RB_OBJ_WRITE, sets STR_SHARED on
 * `str` and STR_SHARED_ROOT on `shared_str`. A root whose class is 0 is
 * additionally flagged STR_BORROWED. */
197#define STR_SET_SHARED(str, shared_str) do { \
198 if (!FL_TEST(str, STR_FAKESTR)) { \
199 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
200 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
201 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
202 FL_SET((str), STR_SHARED); \
203 FL_SET((shared_str), STR_SHARED_ROOT); \
204 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
205 FL_SET_RAW((shared_str), STR_BORROWED); \
206 } \
207} while (0)
208
/* Heap buffer pointer of a non-embedded string. */
209#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Size in bytes of the heap buffer: the recorded capacity plus the
 * terminator length, which `capa` itself does not include. */
210#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
211/* TODO: include the terminator size in capa. */
212
/* Encoding object of `str`, looked up through get_encoding(). */
213#define STR_ENC_GET(str) get_encoding(str)
214
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined beforehand. */
215#if !defined SHARABLE_MIDDLE_SUBSTRING
216# define SHARABLE_MIDDLE_SUBSTRING 0
217#endif
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING
 * disabled it is true only when beg + len equals end, i.e. the substring
 * runs up to `end`; with it enabled the predicate is always 1. */
218#if !SHARABLE_MIDDLE_SUBSTRING
219#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
220#else
221#define SHARABLE_SUBSTRING_P(beg, len, end) 1
222#endif
223
224
225static inline long
226str_embed_capa(VALUE str)
227{
228 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
229}
230
231bool
232rb_str_reembeddable_p(VALUE str)
233{
234 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
235}
236
237static inline size_t
238rb_str_embed_size(long capa)
239{
240 return offsetof(struct RString, as.embed.ary) + capa;
241}
242
243size_t
244rb_str_size_as_embedded(VALUE str)
245{
246 size_t real_size;
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
249 }
250 /* if the string is not currently embedded, but it can be embedded, how
251 * much space would it require */
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
254 }
255 else {
256 real_size = sizeof(struct RString);
257 }
258
259 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
260 real_size += sizeof(st_index_t);
261 }
262
263 return real_size;
264}
265
/* True when an object large enough to embed `len` content bytes plus a
 * `termlen`-byte terminator can be allocated by the GC. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
271
272static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
273static VALUE str_new_frozen(VALUE klass, VALUE orig);
274static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
275static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
276static VALUE str_new(VALUE klass, const char *ptr, long len);
277static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
278static inline void str_modifiable(VALUE str);
279static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
280static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
281
282static inline void
283str_make_independent(VALUE str)
284{
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str), len, 0L, termlen);
288}
289
290static inline int str_dependent_p(VALUE str);
291
292void
293rb_str_make_independent(VALUE str)
294{
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
297 }
298}
299
300void
301rb_str_make_embedded(VALUE str)
302{
303 RUBY_ASSERT(rb_str_reembeddable_p(str));
304 RUBY_ASSERT(!STR_EMBED_P(str));
305
306 char *buf = RSTRING(str)->as.heap.ptr;
307 long len = RSTRING(str)->len;
308
309 STR_SET_EMBED(str);
310 STR_SET_LEN(str, len);
311
312 if (len > 0) {
313 memcpy(RSTRING_PTR(str), buf, len);
314 ruby_xfree(buf);
315 }
316
317 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
318}
319
/* Print a diagnostic to stderr saying that the function named `func` is
 * about to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
329
330/* symbols for [up|down|swap]case/capitalize options */
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
332
333static rb_encoding *
334get_encoding(VALUE str)
335{
336 return rb_enc_from_index(ENCODING_GET(str));
337}
338
339static void
340mustnot_broken(VALUE str)
341{
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
344 }
345}
346
347static void
348mustnot_wchar(VALUE str)
349{
350 rb_encoding *enc = STR_ENC_GET(str);
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
353 }
354}
355
356static int fstring_cmp(VALUE a, VALUE b);
357
358static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
359
360#if SIZEOF_LONG == SIZEOF_VOIDP
361#define PRECOMPUTED_FAKESTR_HASH 1
362#else
363#endif
364
#ifdef PRECOMPUTED_FAKESTR_HASH
/* Hash function of the fstring table. Fake strings carry their hash in the
 * capa field; every other string goes through rb_str_hash(). */
static st_index_t
fstring_hash(VALUE str)
{
    if (!FL_TEST_RAW(str, STR_FAKESTR)) {
        return rb_str_hash(str);
    }

    // register_fstring precomputes the hash and stores it in capa for fake strings
    return (st_index_t)RSTRING(str)->as.heap.aux.capa;
}
#else
#define fstring_hash rb_str_hash
#endif
380
/* st hash type used for the fstring table: entries are compared with
 * fstring_cmp and hashed with fstring_hash. */
381const struct st_hash_type rb_fstring_hash_type = {
382 fstring_cmp,
383 fstring_hash,
384};
385
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
386#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387
388static inline st_index_t
389str_do_hash(VALUE str)
390{
391 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
393 if (e && !is_ascii_string(str)) {
394 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
395 }
396 return h;
397}
398
399static VALUE
400str_store_precomputed_hash(VALUE str, st_index_t hash)
401{
402 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
403 RUBY_ASSERT(STR_EMBED_P(str));
404
405#if RUBY_DEBUG
406 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
407 size_t free_bytes = str_embed_capa(str) - used_bytes;
408 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
409#endif
410
411 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
412
413 FL_SET(str, STR_PRECOMPUTED_HASH);
414
415 return str;
416}
417
419 VALUE fstr;
420 bool copy;
421 bool force_precompute_hash;
422};
423
/* st_update() callback used by register_fstring().
 *
 * `data` points to a struct fstr_update_arg; the resulting string is
 * reported back through arg->fstr.
 *
 * - Entry already present: if the stored key is a garbage object, arg->fstr
 *   is set to Qundef and ST_DELETE is returned (register_fstring loops until
 *   arg->fstr is no longer Qundef). Otherwise the stored key is reported
 *   and ST_STOP is returned.
 * - Entry absent: a suitable frozen string is prepared (see the branches
 *   below), flagged RSTRING_FSTR, and written to both *key and *value.
 *
 * NOTE(review): the embedded listing numbers skip 486 inside the
 * STR_SHARED_P branch, so one source line is not visible here. */
424static int
425fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
426{
427 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
428 VALUE str = (VALUE)*key;
429
430 if (existing) {
431 /* because of lazy sweep, str may be unmarked already and swept
432 * at next time */
433
434 if (rb_objspace_garbage_object_p(str)) {
435 arg->fstr = Qundef;
436 return ST_DELETE;
437 }
438
439 arg->fstr = str;
440 return ST_STOP;
441 }
442 else {
443 // Unless the string is empty or binary, its coderange has been precomputed.
444 int coderange = ENC_CODERANGE(str);
445
/* Fake strings cannot be stored themselves: build a real string from the
 * fake string's pointer and length, either by copying the bytes
 * (arg->copy) or by referencing them via str_new_static(). */
446 if (FL_TEST_RAW(str, STR_FAKESTR)) {
447 if (arg->copy) {
448 VALUE new_str;
449 long len = RSTRING_LEN(str);
450 long capa = len + sizeof(st_index_t);
451 int term_len = TERM_LEN(str);
452
/* Forced precomputed hash: allocate an embedded string with room for
 * contents, terminator and hash, when such an object is allocatable. */
453 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
454 new_str = str_alloc_embed(rb_cString, capa + term_len);
455 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
456 STR_SET_LEN(new_str, RSTRING_LEN(str));
457 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
458 rb_enc_copy(new_str, str);
459 str_store_precomputed_hash(new_str, fstring_hash(str));
460 }
461 else {
462 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
463 rb_enc_copy(new_str, str);
464#ifdef PRECOMPUTED_FAKESTR_HASH
/* Opportunistically keep the hash (held in the fake string's capa) when the
 * new string happens to have enough spare capacity. */
465 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
466 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
467 }
468#endif
469 }
470 str = new_str;
471 }
472 else {
473 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
474 RSTRING(str)->len,
475 ENCODING_GET(str));
476 }
477 OBJ_FREEZE(str);
478 }
479 else {
/* Real string: make sure the stored object is frozen (and not merely
 * chilled), not shared, and a bare String. */
480 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
481 str = str_new_frozen(rb_cString, str);
482 }
483 if (STR_SHARED_P(str)) { /* str should not be shared */
484 /* shared substring */
485 str_make_independent(str);
487 }
488 if (!BARE_STRING_P(str)) {
489 str = str_new_frozen(rb_cString, str);
490 }
491 }
492
/* Carry over the coderange read from the original key, then mark the
 * result as an fstring. */
493 ENC_CODERANGE_SET(str, coderange);
494 RBASIC(str)->flags |= RSTRING_FSTR;
495
496 *key = *value = arg->fstr = str;
497 return ST_CONTINUE;
498 }
499}
500
/* Return the fstring (entry of the fstring table) for `str`.
 *
 * - A string already flagged RSTRING_FSTR is returned as is.
 * - A non-bare string (see BARE_STRING_P) is never replaced by the table
 *   entry: an embedded one is frozen and returned directly; a heap one is
 *   made to share the registered fstring's buffer, frozen, and returned.
 * - A bare string yields the registered table entry.
 *
 * Raises TypeError through Check_Type() when `str` is not a T_STRING.
 *
 * NOTE(review): the embedded listing numbers skip 520 inside the
 * STR_SHARED_ROOT branch, so one source line is not visible here. */
501VALUE
502rb_fstring(VALUE str)
503{
504 VALUE fstr;
505 int bare;
506
507 Check_Type(str, T_STRING);
508
509 if (FL_TEST(str, RSTRING_FSTR))
510 return str;
511
512 bare = BARE_STRING_P(str);
513 if (!bare) {
514 if (STR_EMBED_P(str)) {
515 OBJ_FREEZE(str);
516 return str;
517 }
518
/* A shared root that is not itself shared is returned unchanged. */
519 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
521 return str;
522 }
523 }
524
/* Only for strings that are not frozen, STR_NOFREE or chilled: resize to
 * the current length (presumably to drop unused capacity before
 * registering -- confirm against rb_str_resize). */
525 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
526 rb_str_resize(str, RSTRING_LEN(str));
527
528 fstr = register_fstring(str, false, false);
529
530 if (!bare) {
531 str_replace_shared_without_enc(str, fstr);
532 OBJ_FREEZE(str);
533 return str;
534 }
535 return fstr;
536}
537
538static VALUE
539register_fstring(VALUE str, bool copy, bool force_precompute_hash)
540{
541 struct fstr_update_arg args = {
542 .copy = copy,
543 .force_precompute_hash = force_precompute_hash
544 };
545
546#if SIZEOF_VOIDP == SIZEOF_LONG
547 if (FL_TEST_RAW(str, STR_FAKESTR)) {
548 // if the string hasn't been interned, we'll need the hash twice, so we
549 // compute it once and store it in capa
550 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
551 }
552#endif
553
554 RB_VM_LOCK_ENTER();
555 {
556 st_table *frozen_strings = rb_vm_fstring_table();
557 do {
558 args.fstr = str;
559 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
560 } while (UNDEF_P(args.fstr));
561 }
562 RB_VM_LOCK_LEAVE();
563
564 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
565 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
566 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
567 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
568
569 return args.fstr;
570}
571
/* Initialize the caller-provided `fake_str` as a non-embedded String of
 * class rb_cString that points at `name` (`len` bytes) with encoding index
 * `encidx`, and return it as a VALUE.
 *
 * The flags are T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR; the bytes
 * are referenced, not copied. A NULL `name` is replaced by "".
 *
 * NOTE(review): the embedded listing numbers skip 578 inside the `!name`
 * branch, so one source line is not visible here. */
572static VALUE
573setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
574{
575 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
576
577 if (!name) {
579 name = "";
580 }
581
582 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
583
584 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
585 fake_str->len = len;
586 fake_str->as.heap.ptr = (char *)name;
587 fake_str->as.heap.aux.capa = len;
588 return (VALUE)fake_str;
589}
590
591/*
592 * Set up a fake string which refers to a static string literal.
593 */
594VALUE
595rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
596{
597 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
598}
599
600/*
601 * The rb_fstring_new and rb_fstring_cstr family creates or looks up a frozen
602 * shared string which refers to a static string literal. `ptr` must
603 * point to a constant string.
604 */
605VALUE
606rb_fstring_new(const char *ptr, long len)
607{
608 struct RString fake_str;
609 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
610}
611
612VALUE
613rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
614{
615 struct RString fake_str;
616 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
617}
618
619VALUE
620rb_fstring_cstr(const char *ptr)
621{
622 return rb_fstring_new(ptr, strlen(ptr));
623}
624
625static int
626fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
627{
628 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
629 return ST_CONTINUE;
630}
631
632static int
633fstring_cmp(VALUE a, VALUE b)
634{
635 long alen, blen;
636 const char *aptr, *bptr;
637 RSTRING_GETMEM(a, aptr, alen);
638 RSTRING_GETMEM(b, bptr, blen);
639 return (alen != blen ||
640 ENCODING_GET(a) != ENCODING_GET(b) ||
641 memcmp(aptr, bptr, alen) != 0);
642}
643
/* Tell whether `str` can be handled one byte per character.
 *
 * True for ASCII-8BIT and US-ASCII, for any string whose cached coderange is
 * 7BIT, and for encodings whose maximum character length is 1. Otherwise the
 * answer is a conservative false.
 *
 * NOTE(review): the embedded listing numbers skip 654, so the statement that
 * belongs to the ENCINDEX_UTF_8 case is not visible here; confirm against
 * the full file before relying on the UTF-8 behavior. */
644static inline bool
645single_byte_optimizable(VALUE str)
646{
647 int encindex = ENCODING_GET(str);
648 switch (encindex) {
649 case ENCINDEX_ASCII_8BIT:
650 case ENCINDEX_US_ASCII:
651 return true;
652 case ENCINDEX_UTF_8:
653 // For UTF-8 it's worth scanning the string coderange when unknown.
655 }
656 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
657 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
658 return true;
659 }
660
661 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
662 return true;
663 }
664
665 /* Conservative. Possibly single byte.
666 * "\xa1" in Shift_JIS for example. */
667 return false;
668}
669
671
/* Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * The bulk of the range is tested one uintptr_t word at a time against
 * NONASCII_MASK (0x80 repeated in every byte); leading and trailing bytes
 * that do not fill a word are tested individually by the fall-through
 * switches. */
672static inline const char *
673search_nonascii(const char *p, const char *e)
674{
675 const uintptr_t *s, *t;
676
/* NONASCII_MASK: 0x80 in each byte of a uintptr_t (8- or 4-byte wide). */
677#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
678# if SIZEOF_UINTPTR_T == 8
679# define NONASCII_MASK UINT64_C(0x8080808080808080)
680# elif SIZEOF_UINTPTR_T == 4
681# define NONASCII_MASK UINT32_C(0x80808080)
682# else
683# error "don't know what to do."
684# endif
685#else
686# if SIZEOF_UINTPTR_T == 8
687# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
688# elif SIZEOF_UINTPTR_T == 4
689# define NONASCII_MASK 0x80808080UL /* or...? */
690# else
691# error "don't know what to do."
692# endif
693#endif
694
/* Word-wise scan; without unaligned access it is only attempted when at
 * least one word's worth of bytes remains. */
695 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
696#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking each skipped byte. The
 * cases fall through on purpose: entering at `l` checks p[-l] .. p[-1]. */
697 if ((uintptr_t)p % SIZEOF_VOIDP) {
698 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
699 p += l;
700 switch (l) {
701 default: UNREACHABLE;
702#if SIZEOF_VOIDP > 4
703 case 7: if (p[-7]&0x80) return p-7;
704 case 6: if (p[-6]&0x80) return p-6;
705 case 5: if (p[-5]&0x80) return p-5;
706 case 4: if (p[-4]&0x80) return p-4;
707#endif
708 case 3: if (p[-3]&0x80) return p-3;
709 case 2: if (p[-2]&0x80) return p-2;
710 case 1: if (p[-1]&0x80) return p-1;
711 case 0: break;
712 }
713 }
714#endif
715#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
716#define aligned_ptr(value) \
717 __builtin_assume_aligned((value), sizeof(uintptr_t))
718#else
719#define aligned_ptr(value) (uintptr_t *)(value)
720#endif
721 s = aligned_ptr(p);
/* t is the last position from which a complete word still fits before e. */
722 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
723#undef aligned_ptr
724 for (;s < t; s++) {
725 if (*s & NONASCII_MASK) {
/* Convert the position of the first set mask bit into a byte offset
 * (>>3); which end counts as "first" depends on the byte order. */
726#ifdef WORDS_BIGENDIAN
727 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
728#else
729 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
730#endif
731 }
732 }
733 p = (const char *)s;
734 }
735
/* Tail: fewer than one word of bytes remains; check them one by one,
 * again relying on fall-through from the entry case down to case 0. */
736 switch (e - p) {
737 default: UNREACHABLE;
738#if SIZEOF_VOIDP > 4
739 case 7: if (e[-7]&0x80) return e-7;
740 case 6: if (e[-6]&0x80) return e-6;
741 case 5: if (e[-5]&0x80) return e-5;
742 case 4: if (e[-4]&0x80) return e-4;
743#endif
744 case 3: if (e[-3]&0x80) return e-3;
745 case 2: if (e[-2]&0x80) return e-2;
746 case 1: if (e[-1]&0x80) return e-1;
747 case 0: return NULL;
748 }
749}
750
/* Scan the `len` bytes at `p` as text in encoding `enc` and return an
 * ENC_CODERANGE_* value.
 *
 * For ASCII-compatible encodings ENC_CODERANGE_7BIT is returned when no
 * byte has the high bit set; runs of ASCII bytes between multibyte
 * characters are skipped with search_nonascii(). Reaching the end of the
 * function yields ENC_CODERANGE_VALID.
 *
 * NOTE(review): the embedded listing numbers skip 759, 767 and 777, so the
 * statement after the ASCII-8BIT search and whatever follows each
 * rb_enc_precise_mbclen() call (presumably the invalid-sequence handling)
 * are not visible here -- confirm against the full file. */
751static int
752coderange_scan(const char *p, long len, rb_encoding *enc)
753{
754 const char *e = p + len;
755
756 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
757 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
758 p = search_nonascii(p, e);
760 }
761
762 if (rb_enc_asciicompat(enc)) {
763 p = search_nonascii(p, e);
764 if (!p) return ENC_CODERANGE_7BIT;
765 for (;;) {
766 int ret = rb_enc_precise_mbclen(p, e, enc);
768 p += MBCLEN_CHARFOUND_LEN(ret);
769 if (p == e) break;
770 p = search_nonascii(p, e);
771 if (!p) break;
772 }
773 }
774 else {
775 while (p < e) {
776 int ret = rb_enc_precise_mbclen(p, e, enc);
778 p += MBCLEN_CHARFOUND_LEN(ret);
779 }
780 }
781 return ENC_CODERANGE_VALID;
782}
783
/* Coderange scan of [s, e) in encoding `enc` that takes the coderange found
 * so far in `*cr` and may update it.
 *
 * Returns the number of bytes consumed: `e - s` when the whole range was
 * processed (immediately so when `*cr` is already ENC_CODERANGE_BROKEN), or
 * `p - s` at the position where rb_enc_precise_mbclen() did not find a
 * complete character.
 *
 * NOTE(review): the embedded listing numbers skip 796, 808, 821 and 827, so
 * several statements (presumably the remaining assignments to `*cr`) are not
 * visible here -- confirm against the full file. */
784long
785rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
786{
787 const char *p = s;
788
789 if (*cr == ENC_CODERANGE_BROKEN)
790 return e - s;
791
792 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
793 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
794 if (*cr == ENC_CODERANGE_VALID) return e - s;
795 p = search_nonascii(p, e);
797 return e - s;
798 }
799 else if (rb_enc_asciicompat(enc)) {
800 p = search_nonascii(p, e);
/* No high-bit byte: the range is 7BIT unless an earlier chunk was
 * already VALID. */
801 if (!p) {
802 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
803 return e - s;
804 }
805 for (;;) {
806 int ret = rb_enc_precise_mbclen(p, e, enc);
807 if (!MBCLEN_CHARFOUND_P(ret)) {
809 return p - s;
810 }
811 p += MBCLEN_CHARFOUND_LEN(ret);
812 if (p == e) break;
813 p = search_nonascii(p, e);
814 if (!p) break;
815 }
816 }
817 else {
818 while (p < e) {
819 int ret = rb_enc_precise_mbclen(p, e, enc);
820 if (!MBCLEN_CHARFOUND_P(ret)) {
822 return p - s;
823 }
824 p += MBCLEN_CHARFOUND_LEN(ret);
825 }
826 }
828 return e - s;
829}
830
831static inline void
832str_enc_copy(VALUE str1, VALUE str2)
833{
834 rb_enc_set_index(str1, ENCODING_GET(str2));
835}
836
837/* Like str_enc_copy, but does not check frozen status of str1.
838 * You should use this only if you're certain that str1 is not frozen. */
839static inline void
840str_enc_copy_direct(VALUE str1, VALUE str2)
841{
842 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
843 if (inlined_encoding == ENCODING_INLINE_MAX) {
844 rb_enc_set_index(str1, rb_enc_get_index(str2));
845 }
846 else {
847 ENCODING_SET_INLINED(str1, inlined_encoding);
848 }
849}
850
851static void
852rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
853{
854 /* this function is designed for copying encoding and coderange
855 * from src to new string "dest" which is made from the part of src.
856 */
857 str_enc_copy(dest, src);
858 if (RSTRING_LEN(dest) == 0) {
859 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
861 else
863 return;
864 }
865 switch (ENC_CODERANGE(src)) {
868 break;
870 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
871 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
873 else
875 break;
876 default:
877 break;
878 }
879}
880
/* Copy the encoding of `src` to `dest` with str_enc_copy().
 *
 * NOTE(review): the embedded listing numbers skip 885, so one statement of
 * this function (presumably the coderange copy its name suggests) is not
 * visible here -- confirm against the full file. */
881static void
882rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
883{
884 str_enc_copy(dest, src);
886}
887
888static int
889enc_coderange_scan(VALUE str, rb_encoding *enc)
890{
891 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
892}
893
894int
895rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
896{
897 return enc_coderange_scan(str, enc);
898}
899
900int
902{
903 int cr = ENC_CODERANGE(str);
904
905 if (cr == ENC_CODERANGE_UNKNOWN) {
906 cr = enc_coderange_scan(str, get_encoding(str));
907 ENC_CODERANGE_SET(str, cr);
908 }
909 return cr;
910}
911
912static inline bool
913rb_enc_str_asciicompat(VALUE str)
914{
915 int encindex = ENCODING_GET_INLINED(str);
916 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
917}
918
919int
921{
922 switch(ENC_CODERANGE(str)) {
924 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
926 return true;
927 default:
928 return false;
929 }
930}
931
932static inline void
933str_mod_check(VALUE s, const char *p, long len)
934{
935 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
936 rb_raise(rb_eRuntimeError, "string modified");
937 }
938}
939
940static size_t
941str_capacity(VALUE str, const int termlen)
942{
943 if (STR_EMBED_P(str)) {
944 return str_embed_capa(str) - termlen;
945 }
946 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
947 return RSTRING(str)->len;
948 }
949 else {
950 return RSTRING(str)->as.heap.aux.capa;
951 }
952}
953
954size_t
956{
957 return str_capacity(str, TERM_LEN(str));
958}
959
960static inline void
961must_not_null(const char *ptr)
962{
963 if (!ptr) {
964 rb_raise(rb_eArgError, "NULL pointer given");
965 }
966}
967
968static inline VALUE
969str_alloc_embed(VALUE klass, size_t capa)
970{
971 size_t size = rb_str_embed_size(capa);
972 RUBY_ASSERT(size > 0);
973 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
974
975 NEWOBJ_OF(str, struct RString, klass,
977
978 return (VALUE)str;
979}
980
981static inline VALUE
982str_alloc_heap(VALUE klass)
983{
984 NEWOBJ_OF(str, struct RString, klass,
985 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
986
987 return (VALUE)str;
988}
989
/* Allocate an embedded String of class `klass` with capacity 0 and zero the
 * whole embedded area of its slot.
 *
 * NOTE(review): the embedded listing numbers skip 996, so one statement
 * before the return is not visible here -- confirm against the full file. */
990static inline VALUE
991empty_str_alloc(VALUE klass)
992{
993 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
994 VALUE str = str_alloc_embed(klass, 0);
995 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
997 return str;
998}
999
1000static VALUE
1001str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1002{
1003 VALUE str;
1004
1005 if (len < 0) {
1006 rb_raise(rb_eArgError, "negative string size (or size too big)");
1007 }
1008
1009 if (enc == NULL) {
1010 enc = rb_ascii8bit_encoding();
1011 }
1012
1013 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1014
1015 int termlen = rb_enc_mbminlen(enc);
1016
1017 if (STR_EMBEDDABLE_P(len, termlen)) {
1018 str = str_alloc_embed(klass, len + termlen);
1019 if (len == 0) {
1020 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1021 }
1022 }
1023 else {
1024 str = str_alloc_heap(klass);
1025 RSTRING(str)->as.heap.aux.capa = len;
1026 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1027 * integer overflow. If we can STATIC_ASSERT that, the following
1028 * mul_add_mul can be reverted to a simple ALLOC_N. */
1029 RSTRING(str)->as.heap.ptr =
1030 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1031 }
1032
1033 rb_enc_raw_set(str, enc);
1034
1035 if (ptr) {
1036 memcpy(RSTRING_PTR(str), ptr, len);
1037 }
1038
1039 STR_SET_LEN(str, len);
1040 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1041 return str;
1042}
1043
1044static VALUE
1045str_new(VALUE klass, const char *ptr, long len)
1046{
1047 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1048}
1049
1050VALUE
1051rb_str_new(const char *ptr, long len)
1052{
1053 return str_new(rb_cString, ptr, len);
1054}
1055
1056VALUE
1057rb_usascii_str_new(const char *ptr, long len)
1058{
1059 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1060}
1061
1062VALUE
1063rb_utf8_str_new(const char *ptr, long len)
1064{
1065 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1066}
1067
1068VALUE
1069rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1070{
1071 return str_enc_new(rb_cString, ptr, len, enc);
1072}
1073
1074VALUE
1076{
1077 must_not_null(ptr);
1078 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1079 * memory regions, and that cannot be detected by the MSAN. Just
1080 * trust the programmer that the argument passed here is a sane C
1081 * string. */
1082 __msan_unpoison_string(ptr);
1083 return rb_str_new(ptr, strlen(ptr));
1084}
1085
1086VALUE
1088{
1089 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1090}
1091
1092VALUE
1094{
1095 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1096}
1097
1098VALUE
1100{
1101 must_not_null(ptr);
1102 if (rb_enc_mbminlen(enc) != 1) {
1103 rb_raise(rb_eArgError, "wchar encoding given");
1104 }
1105 return rb_enc_str_new(ptr, strlen(ptr), enc);
1106}
1107
1108static VALUE
1109str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1110{
1111 VALUE str;
1112
1113 if (len < 0) {
1114 rb_raise(rb_eArgError, "negative string size (or size too big)");
1115 }
1116
1117 if (!ptr) {
1118 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1119 }
1120 else {
1121 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1122 str = str_alloc_heap(klass);
1123 RSTRING(str)->len = len;
1124 RSTRING(str)->as.heap.ptr = (char *)ptr;
1125 RSTRING(str)->as.heap.aux.capa = len;
1126 RBASIC(str)->flags |= STR_NOFREE;
1127 rb_enc_associate_index(str, encindex);
1128 }
1129 return str;
1130}
1131
1132VALUE
1133rb_str_new_static(const char *ptr, long len)
1134{
1135 return str_new_static(rb_cString, ptr, len, 0);
1136}
1137
1138VALUE
1140{
1141 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1142}
1143
1144VALUE
1146{
1147 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1148}
1149
1150VALUE
1152{
1153 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1154}
1155
1156static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1157 rb_encoding *from, rb_encoding *to,
1158 int ecflags, VALUE ecopts);
1159
1160static inline bool
1161is_enc_ascii_string(VALUE str, rb_encoding *enc)
1162{
1163 int encidx = rb_enc_to_index(enc);
1164 if (rb_enc_get_index(str) == encidx)
1165 return is_ascii_string(str);
1166 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1167}
1168
1169VALUE
1170rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1171{
1172 long len;
1173 const char *ptr;
1174 VALUE newstr;
1175
1176 if (!to) return str;
1177 if (!from) from = rb_enc_get(str);
1178 if (from == to) return str;
1179 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1180 rb_is_ascii8bit_enc(to)) {
1181 if (STR_ENC_GET(str) != to) {
1182 str = rb_str_dup(str);
1183 rb_enc_associate(str, to);
1184 }
1185 return str;
1186 }
1187
1188 RSTRING_GETMEM(str, ptr, len);
1189 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1190 from, to, ecflags, ecopts);
1191 if (NIL_P(newstr)) {
1192 /* some error, return original */
1193 return str;
1194 }
1195 return newstr;
1196}
1197
1198VALUE
1199rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1200 rb_encoding *from, int ecflags, VALUE ecopts)
1201{
1202 long olen;
1203
1204 olen = RSTRING_LEN(newstr);
1205 if (ofs < -olen || olen < ofs)
1206 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1207 if (ofs < 0) ofs += olen;
1208 if (!from) {
1209 STR_SET_LEN(newstr, ofs);
1210 return rb_str_cat(newstr, ptr, len);
1211 }
1212
1213 rb_str_modify(newstr);
1214 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1215 rb_enc_get(newstr),
1216 ecflags, ecopts);
1217}
1218
1219VALUE
1220rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1221{
1222 STR_SET_LEN(str, 0);
1223 rb_enc_associate(str, enc);
1224 rb_str_cat(str, ptr, len);
1225 return str;
1226}
1227
1228static VALUE
1229str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1230 rb_encoding *from, rb_encoding *to,
1231 int ecflags, VALUE ecopts)
1232{
1233 rb_econv_t *ec;
1235 long olen;
1236 VALUE econv_wrapper;
1237 const unsigned char *start, *sp;
1238 unsigned char *dest, *dp;
1239 size_t converted_output = (size_t)ofs;
1240
1241 olen = rb_str_capacity(newstr);
1242
1243 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1244 RBASIC_CLEAR_CLASS(econv_wrapper);
1245 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1246 if (!ec) return Qnil;
1247 DATA_PTR(econv_wrapper) = ec;
1248
1249 sp = (unsigned char*)ptr;
1250 start = sp;
1251 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1252 (dp = dest + converted_output),
1253 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1255 /* destination buffer short */
1256 size_t converted_input = sp - start;
1257 size_t rest = len - converted_input;
1258 converted_output = dp - dest;
1259 rb_str_set_len(newstr, converted_output);
1260 if (converted_input && converted_output &&
1261 rest < (LONG_MAX / converted_output)) {
1262 rest = (rest * converted_output) / converted_input;
1263 }
1264 else {
1265 rest = olen;
1266 }
1267 olen += rest < 2 ? 2 : rest;
1268 rb_str_resize(newstr, olen);
1269 }
1270 DATA_PTR(econv_wrapper) = 0;
1271 RB_GC_GUARD(econv_wrapper);
1272 rb_econv_close(ec);
1273 switch (ret) {
1274 case econv_finished:
1275 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1276 rb_str_set_len(newstr, len);
1277 rb_enc_associate(newstr, to);
1278 return newstr;
1279
1280 default:
1281 return Qnil;
1282 }
1283}
1284
1285VALUE
1287{
1288 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1289}
1290
1291VALUE
1293{
1294 rb_encoding *ienc;
1295 VALUE str;
1296 const int eidx = rb_enc_to_index(eenc);
1297
1298 if (!ptr) {
1299 return rb_enc_str_new(ptr, len, eenc);
1300 }
1301
1302 /* ASCII-8BIT case, no conversion */
1303 if ((eidx == rb_ascii8bit_encindex()) ||
1304 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1305 return rb_str_new(ptr, len);
1306 }
1307 /* no default_internal or same encoding, no conversion */
1308 ienc = rb_default_internal_encoding();
1309 if (!ienc || eenc == ienc) {
1310 return rb_enc_str_new(ptr, len, eenc);
1311 }
1312 /* ASCII compatible, and ASCII only string, no conversion in
1313 * default_internal */
1314 if ((eidx == rb_ascii8bit_encindex()) ||
1315 (eidx == rb_usascii_encindex()) ||
1316 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1317 return rb_enc_str_new(ptr, len, ienc);
1318 }
1319 /* convert from the given encoding to default_internal */
1320 str = rb_enc_str_new(NULL, 0, ienc);
1321 /* when the conversion failed for some reason, just ignore the
1322 * default_internal and result in the given encoding as-is. */
1323 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1324 rb_str_initialize(str, ptr, len, eenc);
1325 }
1326 return str;
1327}
1328
1329VALUE
1330rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1331{
1332 int eidx = rb_enc_to_index(eenc);
1333 if (eidx == rb_usascii_encindex() &&
1334 !is_ascii_string(str)) {
1335 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1336 return str;
1337 }
1338 rb_enc_associate_index(str, eidx);
1339 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1340}
1341
1342VALUE
1343rb_external_str_new(const char *ptr, long len)
1344{
1345 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1346}
1347
1348VALUE
1350{
1351 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1352}
1353
1354VALUE
1355rb_locale_str_new(const char *ptr, long len)
1356{
1357 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1358}
1359
1360VALUE
1362{
1363 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1364}
1365
1366VALUE
1368{
1369 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1370}
1371
1372VALUE
1374{
1375 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1376}
1377
1378VALUE
1380{
1381 return rb_str_export_to_enc(str, rb_default_external_encoding());
1382}
1383
1384VALUE
1386{
1387 return rb_str_export_to_enc(str, rb_locale_encoding());
1388}
1389
1390VALUE
1392{
1393 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1394}
1395
1396static VALUE
1397str_replace_shared_without_enc(VALUE str2, VALUE str)
1398{
1399 const int termlen = TERM_LEN(str);
1400 char *ptr;
1401 long len;
1402
1403 RSTRING_GETMEM(str, ptr, len);
1404 if (str_embed_capa(str2) >= len + termlen) {
1405 char *ptr2 = RSTRING(str2)->as.embed.ary;
1406 STR_SET_EMBED(str2);
1407 memcpy(ptr2, RSTRING_PTR(str), len);
1408 TERM_FILL(ptr2+len, termlen);
1409 }
1410 else {
1411 VALUE root;
1412 if (STR_SHARED_P(str)) {
1413 root = RSTRING(str)->as.heap.aux.shared;
1414 RSTRING_GETMEM(str, ptr, len);
1415 }
1416 else {
1417 root = rb_str_new_frozen(str);
1418 RSTRING_GETMEM(root, ptr, len);
1419 }
1420 RUBY_ASSERT(OBJ_FROZEN(root));
1421
1422 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1423 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1424 rb_fatal("about to free a possible shared root");
1425 }
1426 char *ptr2 = STR_HEAP_PTR(str2);
1427 if (ptr2 != ptr) {
1428 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1429 }
1430 }
1431 FL_SET(str2, STR_NOEMBED);
1432 RSTRING(str2)->as.heap.ptr = ptr;
1433 STR_SET_SHARED(str2, root);
1434 }
1435
1436 STR_SET_LEN(str2, len);
1437
1438 return str2;
1439}
1440
1441static VALUE
1442str_replace_shared(VALUE str2, VALUE str)
1443{
1444 str_replace_shared_without_enc(str2, str);
1445 rb_enc_cr_str_exact_copy(str2, str);
1446 return str2;
1447}
1448
1449static VALUE
1450str_new_shared(VALUE klass, VALUE str)
1451{
1452 return str_replace_shared(str_alloc_heap(klass), str);
1453}
1454
1455VALUE
1457{
1458 return str_new_shared(rb_obj_class(str), str);
1459}
1460
1461VALUE
1463{
1464 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1465 return str_new_frozen(rb_obj_class(orig), orig);
1466}
1467
1468static VALUE
1469rb_str_new_frozen_String(VALUE orig)
1470{
1471 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1472 return str_new_frozen(rb_cString, orig);
1473}
1474
1475VALUE
1476rb_str_tmp_frozen_acquire(VALUE orig)
1477{
1478 if (OBJ_FROZEN_RAW(orig)) return orig;
1479 return str_new_frozen_buffer(0, orig, FALSE);
1480}
1481
1482VALUE
1483rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1484{
1485 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1486 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1487
1488 VALUE str = str_alloc_heap(0);
1489 OBJ_FREEZE(str);
1490 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1491 FL_SET(str, STR_SHARED_ROOT);
1492
1493 size_t capa = str_capacity(orig, TERM_LEN(orig));
1494
1495 /* If the string is embedded then we want to create a copy that is heap
1496 * allocated. If the string is shared then the shared root must be
1497 * embedded, so we want to create a copy. If the string is a shared root
1498 * then it must be embedded, so we want to create a copy. */
1499 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1500 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1501 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1502 }
1503 else {
1504 /* orig must be heap allocated and not shared, so we can safely transfer
1505 * the pointer to str. */
1506 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1507 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1508 RBASIC(orig)->flags &= ~STR_NOFREE;
1509 STR_SET_SHARED(orig, str);
1510 }
1511
1512 RSTRING(str)->len = RSTRING(orig)->len;
1513 RSTRING(str)->as.heap.aux.capa = capa;
1514
1515 return str;
1516}
1517
/*
 * Releases a temporary frozen string `tmp` that was handed out for `orig`.
 *
 * Nothing is done when tmp has a class (RBASIC_CLASS(tmp) != 0). When tmp
 * is the un-borrowed shared root of orig, and orig is neither tmp-locked
 * nor frozen, orig takes the buffer bookkeeping back (capa, STR_NOFREE)
 * and tmp is turned into an empty embedded string.
 *
 * NOTE(review): the listing numbers embedded in this copy skip from 1524 to
 * 1526 and from 1538 to 1540, so two source lines appear to be missing
 * here -- confirm against the upstream file before relying on this text.
 */
1518void
1519rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1520{
1521 if (RBASIC_CLASS(tmp) != 0)
1522 return;
1523
1524 if (STR_EMBED_P(tmp)) {
1526 }
1527 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1528 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1529 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1530
1531 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1532 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1533 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1534
1535 /* Unshare orig since the root (tmp) only has this one child. */
1536 FL_UNSET_RAW(orig, STR_SHARED);
1537 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
/* orig inherits STR_NOFREE from tmp if tmp had it set */
1538 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1540
1541 /* Make tmp embedded and empty so it is safe for sweeping. */
1542 STR_SET_EMBED(tmp);
1543 STR_SET_LEN(tmp, 0);
1544 }
1545 }
1546}
1547
1548static VALUE
1549str_new_frozen(VALUE klass, VALUE orig)
1550{
1551 return str_new_frozen_buffer(klass, orig, TRUE);
1552}
1553
1554static VALUE
1555heap_str_make_shared(VALUE klass, VALUE orig)
1556{
1557 RUBY_ASSERT(!STR_EMBED_P(orig));
1558 RUBY_ASSERT(!STR_SHARED_P(orig));
1559
1560 VALUE str = str_alloc_heap(klass);
1561 STR_SET_LEN(str, RSTRING_LEN(orig));
1562 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1563 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1564 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1565 RBASIC(orig)->flags &= ~STR_NOFREE;
1566 STR_SET_SHARED(orig, str);
1567 if (klass == 0)
1568 FL_UNSET_RAW(str, STR_BORROWED);
1569 return str;
1570}
1571
/*
 * Builds a frozen string of class klass with orig's content.
 *
 * copy_encoding selects whether orig's encoding/coderange are copied; when
 * it is false the result uses the ASCII-8BIT encoding and a terminator
 * length of 1.
 *
 * Strategy:
 *  - orig embedded, or short enough to embed: plain copy (str_enc_new).
 *  - orig shared: either return the existing shared root as is, or create a
 *    new shared string on that root trimmed to orig's window.
 *  - orig embeddable by its own length/terminator: embedded copy.
 *  - otherwise: heap_str_make_shared().
 *
 * NOTE(review): the listing numbers embedded in this copy skip from 1592 to
 * 1594, so one source line appears to be missing after the assertions --
 * confirm against the upstream file.
 */
1572static VALUE
1573str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1574{
1575 VALUE str;
1576
1577 long len = RSTRING_LEN(orig);
1578 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1579 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1580
1581 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1582 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1583 RUBY_ASSERT(STR_EMBED_P(str));
1584 }
1585 else {
1586 if (FL_TEST_RAW(orig, STR_SHARED)) {
1587 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's window; rest: bytes after it */
1588 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1589 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1590 RUBY_ASSERT(ofs >= 0);
1591 RUBY_ASSERT(rest >= 0);
1592 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1594
/* the root can be returned directly only if orig covers all of it and
 * class and encoding index agree */
1595 if ((ofs > 0) || (rest > 0) ||
1596 (klass != RBASIC(shared)->klass) ||
1597 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1598 str = str_new_shared(klass, shared);
1599 RUBY_ASSERT(!STR_EMBED_P(str));
/* narrow the new string to orig's window inside the root */
1600 RSTRING(str)->as.heap.ptr += ofs;
1601 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1602 }
1603 else {
/* a class-less root is marked as borrowed before being returned */
1604 if (RBASIC_CLASS(shared) == 0)
1605 FL_SET_RAW(shared, STR_BORROWED);
1606 return shared;
1607 }
1608 }
1609 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1610 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1611 STR_SET_EMBED(str);
1612 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1613 STR_SET_LEN(str, RSTRING_LEN(orig));
1614 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1615 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1616 }
1617 else {
1618 str = heap_str_make_shared(klass, orig);
1619 }
1620 }
1621
1622 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1623 OBJ_FREEZE(str);
1624 return str;
1625}
1626
1627VALUE
1628rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1629{
1630 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1631}
1632
1633static VALUE
1634str_new_empty_String(VALUE str)
1635{
1636 VALUE v = rb_str_new(0, 0);
1637 rb_enc_copy(v, str);
1638 return v;
1639}
1640
/* Lower bound applied to a requested capacity (see rb_str_init). */
#define STR_BUF_MIN_SIZE 63
1642
1643VALUE
1645{
1646 if (STR_EMBEDDABLE_P(capa, 1)) {
1647 return str_alloc_embed(rb_cString, capa + 1);
1648 }
1649
1650 VALUE str = str_alloc_heap(rb_cString);
1651
1652 RSTRING(str)->as.heap.aux.capa = capa;
1653 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1654 RSTRING(str)->as.heap.ptr[0] = '\0';
1655
1656 return str;
1657}
1658
1659VALUE
1661{
1662 VALUE str;
1663 long len = strlen(ptr);
1664
1665 str = rb_str_buf_new(len);
1666 rb_str_buf_cat(str, ptr, len);
1667
1668 return str;
1669}
1670
1671VALUE
1673{
1674 return str_new(0, 0, len);
1675}
1676
1677void
1679{
1680 if (STR_EMBED_P(str)) {
1681 RB_DEBUG_COUNTER_INC(obj_str_embed);
1682 }
1683 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1684 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1685 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1686 }
1687 else {
1688 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1689 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1690 }
1691}
1692
1693size_t
1694rb_str_memsize(VALUE str)
1695{
1696 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1697 return STR_HEAP_SIZE(str);
1698 }
1699 else {
1700 return 0;
1701 }
1702}
1703
1704VALUE
1706{
1707 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1708}
1709
1710static inline void str_discard(VALUE str);
1711static void str_shared_replace(VALUE str, VALUE str2);
1712
1713void
1715{
1716 if (str != str2) str_shared_replace(str, str2);
1717}
1718
1719static void
1720str_shared_replace(VALUE str, VALUE str2)
1721{
1722 rb_encoding *enc;
1723 int cr;
1724 int termlen;
1725
1726 RUBY_ASSERT(str2 != str);
1727 enc = STR_ENC_GET(str2);
1728 cr = ENC_CODERANGE(str2);
1729 str_discard(str);
1730 termlen = rb_enc_mbminlen(enc);
1731
1732 STR_SET_LEN(str, RSTRING_LEN(str2));
1733
1734 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1735 STR_SET_EMBED(str);
1736 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1737 rb_enc_associate(str, enc);
1738 ENC_CODERANGE_SET(str, cr);
1739 }
1740 else {
1741 if (STR_EMBED_P(str2)) {
1742 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1743 long len = RSTRING_LEN(str2);
1744 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1745
1746 char *new_ptr = ALLOC_N(char, len + termlen);
1747 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1748 RSTRING(str2)->as.heap.ptr = new_ptr;
1749 STR_SET_LEN(str2, len);
1750 RSTRING(str2)->as.heap.aux.capa = len;
1751 STR_SET_NOEMBED(str2);
1752 }
1753
1754 STR_SET_NOEMBED(str);
1755 FL_UNSET(str, STR_SHARED);
1756 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1757
1758 if (FL_TEST(str2, STR_SHARED)) {
1759 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1760 STR_SET_SHARED(str, shared);
1761 }
1762 else {
1763 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1764 }
1765
1766 /* abandon str2 */
1767 STR_SET_EMBED(str2);
1768 RSTRING_PTR(str2)[0] = 0;
1769 STR_SET_LEN(str2, 0);
1770 rb_enc_associate(str, enc);
1771 ENC_CODERANGE_SET(str, cr);
1772 }
1773}
1774
1775VALUE
1777{
1778 VALUE str;
1779
1780 if (RB_TYPE_P(obj, T_STRING)) {
1781 return obj;
1782 }
1783 str = rb_funcall(obj, idTo_s, 0);
1784 return rb_obj_as_string_result(str, obj);
1785}
1786
1787VALUE
1788rb_obj_as_string_result(VALUE str, VALUE obj)
1789{
1790 if (!RB_TYPE_P(str, T_STRING))
1791 return rb_any_to_s(obj);
1792 return str;
1793}
1794
/*
 * Replaces the content of str with that of str2 and returns str.
 *
 * When str2 is a shared string, str is pointed at the same buffer and the
 * same shared root, and str2's encoding/coderange are copied exactly.
 * Otherwise the work is delegated to str_replace_shared().
 *
 * NOTE(review): the listing numbers embedded in this copy skip from 1802 to
 * 1804, so one source line appears to be missing after the `shared`
 * declaration -- confirm against the upstream file.
 */
1795static VALUE
1796str_replace(VALUE str, VALUE str2)
1797{
1798 long len;
1799
1800 len = RSTRING_LEN(str2);
1801 if (STR_SHARED_P(str2)) {
1802 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1804 STR_SET_NOEMBED(str);
1805 STR_SET_LEN(str, len);
1806 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1807 STR_SET_SHARED(str, shared);
1808 rb_enc_cr_str_exact_copy(str, str2);
1809 }
1810 else {
1811 str_replace_shared(str, str2);
1812 }
1813
1814 return str;
1815}
1816
1817static inline VALUE
1818ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1819{
1820 size_t size = rb_str_embed_size(capa);
1821 RUBY_ASSERT(size > 0);
1822 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1823
1824 NEWOBJ_OF(str, struct RString, klass,
1826
1827 return (VALUE)str;
1828}
1829
1830static inline VALUE
1831ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1832{
1833 NEWOBJ_OF(str, struct RString, klass,
1834 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1835
1836 return (VALUE)str;
1837}
1838
1839static inline VALUE
1840str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1841{
1842 int encidx = 0;
1843 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1844 encidx = rb_enc_get_index(str);
1845 flags &= ~ENCODING_MASK;
1846 }
1847 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1848 if (encidx) rb_enc_associate_index(dup, encidx);
1849 return dup;
1850}
1851
1852static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1853
1854static inline VALUE
1855str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1856{
1857 VALUE flags = FL_TEST_RAW(str, flag_mask);
1858 long len = RSTRING_LEN(str);
1859
1860 RUBY_ASSERT(STR_EMBED_P(dup));
1861 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1862 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1863 STR_SET_LEN(dup, RSTRING_LEN(str));
1864 return str_duplicate_setup_encoding(str, dup, flags);
1865}
1866
/*
 * Fills the already allocated heap string dup so that it shares str's
 * buffer through a root string.
 *
 * The root is str's existing shared root when str is shared; when str is
 * not frozen a frozen copy made by str_new_frozen() becomes both the root
 * and the source; otherwise str itself is the root. The root is flagged
 * STR_SHARED_ROOT and dup is marked RSTRING_NOEMBED | STR_SHARED.
 *
 * NOTE(review): the listing numbers embedded in this copy skip from 1879 to
 * 1881, so one source line appears to be missing after the assertion --
 * confirm against the upstream file.
 */
1867static inline VALUE
1868str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1869{
1870 VALUE flags = FL_TEST_RAW(str, flag_mask);
1871 VALUE root = str;
1872 if (FL_TEST_RAW(str, STR_SHARED)) {
1873 root = RSTRING(str)->as.heap.aux.shared;
1874 }
1875 else if (UNLIKELY(!(flags & FL_FREEZE))) {
/* from here on, str refers to the frozen copy; flags are re-read from it */
1876 root = str = str_new_frozen(klass, str);
1877 flags = FL_TEST_RAW(str, flag_mask);
1878 }
1879 RUBY_ASSERT(!STR_SHARED_P(root));
1881
1882 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1883 FL_SET(root, STR_SHARED_ROOT);
1884 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1885 flags |= RSTRING_NOEMBED | STR_SHARED;
1886
1887 STR_SET_LEN(dup, RSTRING_LEN(str));
1888 return str_duplicate_setup_encoding(str, dup, flags);
1889}
1890
1891static inline VALUE
1892str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1893{
1894 if (STR_EMBED_P(str)) {
1895 return str_duplicate_setup_embed(klass, str, dup);
1896 }
1897 else {
1898 return str_duplicate_setup_heap(klass, str, dup);
1899 }
1900}
1901
1902static inline VALUE
1903str_duplicate(VALUE klass, VALUE str)
1904{
1905 VALUE dup;
1906 if (STR_EMBED_P(str)) {
1907 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1908 }
1909 else {
1910 dup = str_alloc_heap(klass);
1911 }
1912
1913 return str_duplicate_setup(klass, str, dup);
1914}
1915
1916VALUE
1918{
1919 return str_duplicate(rb_obj_class(str), str);
1920}
1921
1922/* :nodoc: */
1923VALUE
1924rb_str_dup_m(VALUE str)
1925{
1926 if (LIKELY(BARE_STRING_P(str))) {
1927 return str_duplicate(rb_obj_class(str), str);
1928 }
1929 else {
1930 return rb_obj_dup(str);
1931 }
1932}
1933
1934VALUE
1936{
1937 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1938 return str_duplicate(rb_cString, str);
1939}
1940
1941VALUE
1942rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1943{
1944 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1945 VALUE new_str, klass = rb_cString;
1946
1947 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1948 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1949 str_duplicate_setup_embed(klass, str, new_str);
1950 }
1951 else {
1952 new_str = ec_str_alloc_heap(ec, klass);
1953 str_duplicate_setup_heap(klass, str, new_str);
1954 }
1955 if (chilled) {
1956 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1957 }
1958 return new_str;
1959}
1960
1961VALUE
1962rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1963{
1964 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1965 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1966 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1967 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1968 return rb_str_freeze(str);
1969}
1970
1971/*
1972 *
1973 * call-seq:
1974 * String.new(string = '', **opts) -> new_string
1975 *
1976 * :include: doc/string/new.rdoc
1977 *
1978 */
1979
1980static VALUE
1981rb_str_init(int argc, VALUE *argv, VALUE str)
1982{
1983 static ID keyword_ids[2];
1984 VALUE orig, opt, venc, vcapa;
1985 VALUE kwargs[2];
1986 rb_encoding *enc = 0;
1987 int n;
1988
1989 if (!keyword_ids[0]) {
1990 keyword_ids[0] = rb_id_encoding();
1991 CONST_ID(keyword_ids[1], "capacity");
1992 }
1993
1994 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1995 if (!NIL_P(opt)) {
1996 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1997 venc = kwargs[0];
1998 vcapa = kwargs[1];
1999 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2000 enc = rb_to_encoding(venc);
2001 }
2002 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2003 long capa = NUM2LONG(vcapa);
2004 long len = 0;
2005 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2006
2007 if (capa < STR_BUF_MIN_SIZE) {
2008 capa = STR_BUF_MIN_SIZE;
2009 }
2010 if (n == 1) {
2011 StringValue(orig);
2012 len = RSTRING_LEN(orig);
2013 if (capa < len) {
2014 capa = len;
2015 }
2016 if (orig == str) n = 0;
2017 }
2018 str_modifiable(str);
2019 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2020 /* make noembed always */
2021 const size_t size = (size_t)capa + termlen;
2022 const char *const old_ptr = RSTRING_PTR(str);
2023 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2024 char *new_ptr = ALLOC_N(char, size);
2025 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2026 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2027 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2028 RSTRING(str)->as.heap.ptr = new_ptr;
2029 }
2030 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2031 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2032 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2033 }
2034 STR_SET_LEN(str, len);
2035 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2036 if (n == 1) {
2037 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2038 rb_enc_cr_str_exact_copy(str, orig);
2039 }
2040 FL_SET(str, STR_NOEMBED);
2041 RSTRING(str)->as.heap.aux.capa = capa;
2042 }
2043 else if (n == 1) {
2044 rb_str_replace(str, orig);
2045 }
2046 if (enc) {
2047 rb_enc_associate(str, enc);
2049 }
2050 }
2051 else if (n == 1) {
2052 rb_str_replace(str, orig);
2053 }
2054 return str;
2055}
2056
2057/* :nodoc: */
2058static VALUE
2059rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2060{
2061 if (klass != rb_cString) {
2062 return rb_class_new_instance_pass_kw(argc, argv, klass);
2063 }
2064
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2067 VALUE kwargs[2];
2068 rb_encoding *enc = NULL;
2069
2070 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2071 if (NIL_P(opt)) {
2072 return rb_class_new_instance_pass_kw(argc, argv, klass);
2073 }
2074
2075 keyword_ids[0] = rb_id_encoding();
2076 CONST_ID(keyword_ids[1], "capacity");
2077 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2078 encoding = kwargs[0];
2079 capacity = kwargs[1];
2080
2081 if (n == 1) {
2082 orig = StringValue(orig);
2083 }
2084 else {
2085 orig = Qnil;
2086 }
2087
2088 if (UNDEF_P(encoding)) {
2089 if (!NIL_P(orig)) {
2090 encoding = rb_obj_encoding(orig);
2091 }
2092 }
2093
2094 if (!UNDEF_P(encoding)) {
2095 enc = rb_to_encoding(encoding);
2096 }
2097
2098 // If capacity is nil, we're basically just duping `orig`.
2099 if (UNDEF_P(capacity)) {
2100 if (NIL_P(orig)) {
2101 VALUE empty_str = str_new(klass, "", 0);
2102 if (enc) {
2103 rb_enc_associate(empty_str, enc);
2104 }
2105 return empty_str;
2106 }
2107 VALUE copy = str_duplicate(klass, orig);
2108 rb_enc_associate(copy, enc);
2109 ENC_CODERANGE_CLEAR(copy);
2110 return copy;
2111 }
2112
2113 long capa = 0;
2114 capa = NUM2LONG(capacity);
2115 if (capa < 0) {
2116 capa = 0;
2117 }
2118
2119 if (!NIL_P(orig)) {
2120 long orig_capa = rb_str_capacity(orig);
2121 if (orig_capa > capa) {
2122 capa = orig_capa;
2123 }
2124 }
2125
2126 VALUE str = str_enc_new(klass, NULL, capa, enc);
2127 STR_SET_LEN(str, 0);
2128 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2129
2130 if (!NIL_P(orig)) {
2131 rb_str_buf_append(str, orig);
2132 }
2133
2134 return str;
2135}
2136
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lanes = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    lanes = (lanes >> 6) | (~lanes >> 7);
    lanes &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lanes);
#else
    lanes += (lanes >> 8);
    lanes += (lanes >> 16);
# if SIZEOF_VOIDP == 8
    lanes += (lanes >> 32);
# endif
    return (lanes & 0xF);
#endif
}
#endif
2176
2177static inline long
2178enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2179{
2180 long c;
2181 const char *q;
2182
2183 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2184 long diff = (long)(e - p);
2185 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2186 }
2187#ifdef NONASCII_MASK
2188 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2189 uintptr_t len = 0;
2190 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2191 const uintptr_t *s, *t;
2192 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2193 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2194 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2195 while (p < (const char *)s) {
2196 if (is_utf8_lead_byte(*p)) len++;
2197 p++;
2198 }
2199 while (s < t) {
2200 len += count_utf8_lead_bytes_with_word(s);
2201 s++;
2202 }
2203 p = (const char *)s;
2204 }
2205 while (p < e) {
2206 if (is_utf8_lead_byte(*p)) len++;
2207 p++;
2208 }
2209 return (long)len;
2210 }
2211#endif
2212 else if (rb_enc_asciicompat(enc)) {
2213 c = 0;
2214 if (ENC_CODERANGE_CLEAN_P(cr)) {
2215 while (p < e) {
2216 if (ISASCII(*p)) {
2217 q = search_nonascii(p, e);
2218 if (!q)
2219 return c + (e - p);
2220 c += q - p;
2221 p = q;
2222 }
2223 p += rb_enc_fast_mbclen(p, e, enc);
2224 c++;
2225 }
2226 }
2227 else {
2228 while (p < e) {
2229 if (ISASCII(*p)) {
2230 q = search_nonascii(p, e);
2231 if (!q)
2232 return c + (e - p);
2233 c += q - p;
2234 p = q;
2235 }
2236 p += rb_enc_mbclen(p, e, enc);
2237 c++;
2238 }
2239 }
2240 return c;
2241 }
2242
2243 for (c=0; p<e; c++) {
2244 p += rb_enc_mbclen(p, e, enc);
2245 }
2246 return c;
2247}
2248
2249long
2250rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2251{
2252 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2253}
2254
2255/* To get strlen with cr
2256 * Note that given cr is not used.
2257 */
2258long
2259rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2260{
2261 long c;
2262 const char *q;
2263 int ret;
2264
2265 *cr = 0;
2266 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2267 long diff = (long)(e - p);
2268 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2269 }
2270 else if (rb_enc_asciicompat(enc)) {
2271 c = 0;
2272 while (p < e) {
2273 if (ISASCII(*p)) {
2274 q = search_nonascii(p, e);
2275 if (!q) {
2276 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2277 return c + (e - p);
2278 }
2279 c += q - p;
2280 p = q;
2281 }
2282 ret = rb_enc_precise_mbclen(p, e, enc);
2283 if (MBCLEN_CHARFOUND_P(ret)) {
2284 *cr |= ENC_CODERANGE_VALID;
2285 p += MBCLEN_CHARFOUND_LEN(ret);
2286 }
2287 else {
2289 p++;
2290 }
2291 c++;
2292 }
2293 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2294 return c;
2295 }
2296
2297 for (c=0; p<e; c++) {
2298 ret = rb_enc_precise_mbclen(p, e, enc);
2299 if (MBCLEN_CHARFOUND_P(ret)) {
2300 *cr |= ENC_CODERANGE_VALID;
2301 p += MBCLEN_CHARFOUND_LEN(ret);
2302 }
2303 else {
2305 if (p + rb_enc_mbminlen(enc) <= e)
2306 p += rb_enc_mbminlen(enc);
2307 else
2308 p = e;
2309 }
2310 }
2311 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2312 return c;
2313}
2314
2315/* enc must be str's enc or rb_enc_check(str, str2) */
2316static long
2317str_strlen(VALUE str, rb_encoding *enc)
2318{
2319 const char *p, *e;
2320 int cr;
2321
2322 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2323 if (!enc) enc = STR_ENC_GET(str);
2324 p = RSTRING_PTR(str);
2325 e = RSTRING_END(str);
2326 cr = ENC_CODERANGE(str);
2327
2328 if (cr == ENC_CODERANGE_UNKNOWN) {
2329 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2330 if (cr) ENC_CODERANGE_SET(str, cr);
2331 return n;
2332 }
2333 else {
2334 return enc_strlen(p, e, enc, cr);
2335 }
2336}
2337
2338long
2340{
2341 return str_strlen(str, NULL);
2342}
2343
2344/*
2345 * call-seq:
2346 * length -> integer
2347 *
2348 * :include: doc/string/length.rdoc
2349 *
2350 */
2351
2352VALUE
2354{
2355 return LONG2NUM(str_strlen(str, NULL));
2356}
2357
2358/*
2359 * call-seq:
2360 * bytesize -> integer
2361 *
2362 * :include: doc/string/bytesize.rdoc
2363 *
2364 */
2365
2366VALUE
2367rb_str_bytesize(VALUE str)
2368{
2369 return LONG2NUM(RSTRING_LEN(str));
2370}
2371
2372/*
2373 * call-seq:
2374 * empty? -> true or false
2375 *
2376 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2377 *
2378 * "hello".empty? # => false
2379 * " ".empty? # => false
2380 * "".empty? # => true
2381 *
2382 */
2383
2384static VALUE
2385rb_str_empty(VALUE str)
2386{
2387 return RBOOL(RSTRING_LEN(str) == 0);
2388}
2389
2390/*
2391 * call-seq:
2392 * string + other_string -> new_string
2393 *
2394 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2395 *
2396 * "Hello from " + self.to_s # => "Hello from main"
2397 *
2398 */
2399
2400VALUE
2402{
2403 VALUE str3;
2404 rb_encoding *enc;
2405 char *ptr1, *ptr2, *ptr3;
2406 long len1, len2;
2407 int termlen;
2408
2409 StringValue(str2);
2410 enc = rb_enc_check_str(str1, str2);
2411 RSTRING_GETMEM(str1, ptr1, len1);
2412 RSTRING_GETMEM(str2, ptr2, len2);
2413 termlen = rb_enc_mbminlen(enc);
2414 if (len1 > LONG_MAX - len2) {
2415 rb_raise(rb_eArgError, "string size too big");
2416 }
2417 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2418 ptr3 = RSTRING_PTR(str3);
2419 memcpy(ptr3, ptr1, len1);
2420 memcpy(ptr3+len1, ptr2, len2);
2421 TERM_FILL(&ptr3[len1+len2], termlen);
2422
2423 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2425 RB_GC_GUARD(str1);
2426 RB_GC_GUARD(str2);
2427 return str3;
2428}
2429
2430/* A variant of rb_str_plus that does not raise but return Qundef instead. */
/*
 * Qundef is returned when either operand's encoding index is negative,
 * when the two indexes differ, or when the combined length would exceed
 * LONG_MAX; in every other case the call is forwarded to rb_str_plus().
 *
 * NOTE(review): the listing numbers embedded in this copy skip from 2433 to
 * 2436, so two source lines appear to be missing at the top of the body --
 * confirm against the upstream file.
 */
2431VALUE
2432rb_str_opt_plus(VALUE str1, VALUE str2)
2433{
2436 long len1, len2;
2437 MAYBE_UNUSED(char) *ptr1, *ptr2;
2438 RSTRING_GETMEM(str1, ptr1, len1);
2439 RSTRING_GETMEM(str2, ptr2, len2);
2440 int enc1 = rb_enc_get_index(str1);
2441 int enc2 = rb_enc_get_index(str2);
2442
2443 if (enc1 < 0) {
2444 return Qundef;
2445 }
2446 else if (enc2 < 0) {
2447 return Qundef;
2448 }
2449 else if (enc1 != enc2) {
2450 return Qundef;
2451 }
2452 else if (len1 > LONG_MAX - len2) {
2453 return Qundef;
2454 }
2455 else {
2456 return rb_str_plus(str1, str2);
2457 }
2458
2459}
2460
2461/*
2462 * call-seq:
2463 * string * integer -> new_string
2464 *
2465 * Returns a new +String+ containing +integer+ copies of +self+:
2466 *
2467 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2468 * "Ho! " * 0 # => ""
2469 *
2470 */
2471
2472VALUE
2474{
2475 VALUE str2;
2476 long n, len;
2477 char *ptr2;
2478 int termlen;
2479
2480 if (times == INT2FIX(1)) {
2481 return str_duplicate(rb_cString, str);
2482 }
2483 if (times == INT2FIX(0)) {
2484 str2 = str_alloc_embed(rb_cString, 0);
2485 rb_enc_copy(str2, str);
2486 return str2;
2487 }
2488 len = NUM2LONG(times);
2489 if (len < 0) {
2490 rb_raise(rb_eArgError, "negative argument");
2491 }
2492 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2493 if (STR_EMBEDDABLE_P(len, 1)) {
2494 str2 = str_alloc_embed(rb_cString, len + 1);
2495 memset(RSTRING_PTR(str2), 0, len + 1);
2496 }
2497 else {
2498 str2 = str_alloc_heap(rb_cString);
2499 RSTRING(str2)->as.heap.aux.capa = len;
2500 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2501 }
2502 STR_SET_LEN(str2, len);
2503 rb_enc_copy(str2, str);
2504 return str2;
2505 }
2506 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2507 rb_raise(rb_eArgError, "argument too big");
2508 }
2509
2510 len *= RSTRING_LEN(str);
2511 termlen = TERM_LEN(str);
2512 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2513 ptr2 = RSTRING_PTR(str2);
2514 if (len) {
2515 n = RSTRING_LEN(str);
2516 memcpy(ptr2, RSTRING_PTR(str), n);
2517 while (n <= len/2) {
2518 memcpy(ptr2 + n, ptr2, n);
2519 n *= 2;
2520 }
2521 memcpy(ptr2 + n, ptr2, len-n);
2522 }
2523 STR_SET_LEN(str2, len);
2524 TERM_FILL(&ptr2[len], termlen);
2525 rb_enc_cr_str_copy_for_substr(str2, str);
2526
2527 return str2;
2528}
2529
2530/*
2531 * call-seq:
2532 * string % object -> new_string
2533 *
2534 * Returns the result of formatting +object+ into the format specification +self+
2535 * (see Kernel#sprintf for formatting details):
2536 *
2537 * "%05d" % 123 # => "00123"
2538 *
2539 * If +self+ contains multiple substitutions, +object+ must be
2540 * an Array or Hash containing the values to be substituted:
2541 *
2542 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2543 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2544 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2545 *
2546 */
2547
2548static VALUE
2549rb_str_format_m(VALUE str, VALUE arg)
2550{
2551 VALUE tmp = rb_check_array_type(arg);
2552
2553 if (!NIL_P(tmp)) {
2554 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2555 }
2556 return rb_str_format(1, &arg, str);
2557}
2558
2559static inline void
2560rb_check_lockedtmp(VALUE str)
2561{
2562 if (FL_TEST(str, STR_TMPLOCK)) {
2563 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2564 }
2565}
2566
2567// If none of these flags are set, we know we have an modifiable string.
2568// If any is set, we need to do more detailed checks.
2569#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2570static inline void
2571str_modifiable(VALUE str)
2572{
2573 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2574 if (CHILLED_STRING_P(str)) {
2575 CHILLED_STRING_MUTATED(str);
2576 }
2577 rb_check_lockedtmp(str);
2578 rb_check_frozen(str);
2579 }
2580}
2581
2582static inline int
2583str_dependent_p(VALUE str)
2584{
2585 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2586 return FALSE;
2587 }
2588 else {
2589 return TRUE;
2590 }
2591}
2592
2593// If none of these flags are set, we know we have an independent string.
2594// If any is set, we need to do more detailed checks.
2595#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2596static inline int
2597str_independent(VALUE str)
2598{
2599 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2600 str_modifiable(str);
2601 return !str_dependent_p(str);
2602 }
2603 return TRUE;
2604}
2605
2606static void
2607str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2608{
2609 char *ptr;
2610 char *oldptr;
2611 long capa = len + expand;
2612
2613 if (len > capa) len = capa;
2614
2615 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2616 ptr = RSTRING(str)->as.heap.ptr;
2617 STR_SET_EMBED(str);
2618 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2619 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2620 STR_SET_LEN(str, len);
2621 return;
2622 }
2623
2624 ptr = ALLOC_N(char, (size_t)capa + termlen);
2625 oldptr = RSTRING_PTR(str);
2626 if (oldptr) {
2627 memcpy(ptr, oldptr, len);
2628 }
2629 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2630 xfree(oldptr);
2631 }
2632 STR_SET_NOEMBED(str);
2633 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2634 TERM_FILL(ptr + len, termlen);
2635 RSTRING(str)->as.heap.ptr = ptr;
2636 STR_SET_LEN(str, len);
2637 RSTRING(str)->as.heap.aux.capa = capa;
2638}
2639
2640void
2641rb_str_modify(VALUE str)
2642{
2643 if (!str_independent(str))
2644 str_make_independent(str);
2646}
2647
2648void
2650{
2651 int termlen = TERM_LEN(str);
2652 long len = RSTRING_LEN(str);
2653
2654 if (expand < 0) {
2655 rb_raise(rb_eArgError, "negative expanding string size");
2656 }
2657 if (expand >= LONG_MAX - len) {
2658 rb_raise(rb_eArgError, "string size too big");
2659 }
2660
2661 if (!str_independent(str)) {
2662 str_make_independent_expand(str, len, expand, termlen);
2663 }
2664 else if (expand > 0) {
2665 RESIZE_CAPA_TERM(str, len + expand, termlen);
2666 }
2668}
2669
2670/* As rb_str_modify(), but don't clear coderange */
2671static void
2672str_modify_keep_cr(VALUE str)
2673{
2674 if (!str_independent(str))
2675 str_make_independent(str);
2677 /* Force re-scan later */
2679}
2680
2681static inline void
2682str_discard(VALUE str)
2683{
2684 str_modifiable(str);
2685 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2686 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2687 RSTRING(str)->as.heap.ptr = 0;
2688 STR_SET_LEN(str, 0);
2689 }
2690}
2691
2692void
2694{
2695 int encindex = rb_enc_get_index(str);
2696
2697 if (RB_UNLIKELY(encindex == -1)) {
2698 rb_raise(rb_eTypeError, "not encoding capable object");
2699 }
2700
2701 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2702 return;
2703 }
2704
2705 rb_encoding *enc = rb_enc_from_index(encindex);
2706 if (!rb_enc_asciicompat(enc)) {
2707 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2708 }
2709}
2710
2711VALUE
2713{
2714 VALUE s = *ptr;
2715 if (!RB_TYPE_P(s, T_STRING)) {
2716 s = rb_str_to_str(s);
2717 *ptr = s;
2718 }
2719 return s;
2720}
2721
2722char *
2724{
2725 VALUE str = rb_string_value(ptr);
2726 return RSTRING_PTR(str);
2727}
2728
/* Returns 1 when the first n bytes at s are all zero (also when n <= 0), else 0. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2737
2738static const char *
2739str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2740{
2741 const char *e = s + len;
2742
2743 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2744 if (zero_filled(s, minlen)) return s;
2745 }
2746 return 0;
2747}
2748
2749static char *
2750str_fill_term(VALUE str, char *s, long len, int termlen)
2751{
2752 /* This function assumes that (capa + termlen) bytes of memory
2753 * is allocated, like many other functions in this file.
2754 */
2755 if (str_dependent_p(str)) {
2756 if (!zero_filled(s + len, termlen))
2757 str_make_independent_expand(str, len, 0L, termlen);
2758 }
2759 else {
2760 TERM_FILL(s + len, termlen);
2761 return s;
2762 }
2763 return RSTRING_PTR(str);
2764}
2765
2766void
2767rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2768{
2769 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2770 long len = RSTRING_LEN(str);
2771
2772 RUBY_ASSERT(capa >= len);
2773 if (capa - len < termlen) {
2774 rb_check_lockedtmp(str);
2775 str_make_independent_expand(str, len, 0L, termlen);
2776 }
2777 else if (str_dependent_p(str)) {
2778 if (termlen > oldtermlen)
2779 str_make_independent_expand(str, len, 0L, termlen);
2780 }
2781 else {
2782 if (!STR_EMBED_P(str)) {
2783 /* modify capa instead of realloc */
2784 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2785 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2786 }
2787 if (termlen > oldtermlen) {
2788 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2789 }
2790 }
2791
2792 return;
2793}
2794
2795static char *
2796str_null_check(VALUE str, int *w)
2797{
2798 char *s = RSTRING_PTR(str);
2799 long len = RSTRING_LEN(str);
2800 rb_encoding *enc = rb_enc_get(str);
2801 const int minlen = rb_enc_mbminlen(enc);
2802
2803 if (minlen > 1) {
2804 *w = 1;
2805 if (str_null_char(s, len, minlen, enc)) {
2806 return NULL;
2807 }
2808 return str_fill_term(str, s, len, minlen);
2809 }
2810 *w = 0;
2811 if (!s || memchr(s, 0, len)) {
2812 return NULL;
2813 }
2814 if (s[len]) {
2815 s = str_fill_term(str, s, len, minlen);
2816 }
2817 return s;
2818}
2819
2820char *
2821rb_str_to_cstr(VALUE str)
2822{
2823 int w;
2824 return str_null_check(str, &w);
2825}
2826
2827char *
2829{
2830 VALUE str = rb_string_value(ptr);
2831 int w;
2832 char *s = str_null_check(str, &w);
2833 if (!s) {
2834 if (w) {
2835 rb_raise(rb_eArgError, "string contains null char");
2836 }
2837 rb_raise(rb_eArgError, "string contains null byte");
2838 }
2839 return s;
2840}
2841
2842char *
2843rb_str_fill_terminator(VALUE str, const int newminlen)
2844{
2845 char *s = RSTRING_PTR(str);
2846 long len = RSTRING_LEN(str);
2847 return str_fill_term(str, s, len, newminlen);
2848}
2849
2850VALUE
2852{
2853 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2854 return str;
2855}
2856
2857/*
2858 * call-seq:
2859 * String.try_convert(object) -> object, new_string, or nil
2860 *
2861 * If +object+ is a +String+ object, returns +object+.
2862 *
2863 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2864 * calls <tt>object.to_str</tt> and returns the result.
2865 *
2866 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2867 *
2868 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2869 */
2870static VALUE
2871rb_str_s_try_convert(VALUE dummy, VALUE str)
2872{
2873 return rb_check_string_type(str);
2874}
2875
2876static char*
2877str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2878{
2879 long nth = *nthp;
2880 if (rb_enc_mbmaxlen(enc) == 1) {
2881 p += nth;
2882 }
2883 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2884 p += nth * rb_enc_mbmaxlen(enc);
2885 }
2886 else if (rb_enc_asciicompat(enc)) {
2887 const char *p2, *e2;
2888 int n;
2889
2890 while (p < e && 0 < nth) {
2891 e2 = p + nth;
2892 if (e < e2) {
2893 *nthp = nth;
2894 return (char *)e;
2895 }
2896 if (ISASCII(*p)) {
2897 p2 = search_nonascii(p, e2);
2898 if (!p2) {
2899 nth -= e2 - p;
2900 *nthp = nth;
2901 return (char *)e2;
2902 }
2903 nth -= p2 - p;
2904 p = p2;
2905 }
2906 n = rb_enc_mbclen(p, e, enc);
2907 p += n;
2908 nth--;
2909 }
2910 *nthp = nth;
2911 if (nth != 0) {
2912 return (char *)e;
2913 }
2914 return (char *)p;
2915 }
2916 else {
2917 while (p < e && nth--) {
2918 p += rb_enc_mbclen(p, e, enc);
2919 }
2920 }
2921 if (p > e) p = e;
2922 *nthp = nth;
2923 return (char*)p;
2924}
2925
2926char*
2927rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2928{
2929 return str_nth_len(p, e, &nth, enc);
2930}
2931
2932static char*
2933str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2934{
2935 if (singlebyte)
2936 p += nth;
2937 else {
2938 p = str_nth_len(p, e, &nth, enc);
2939 }
2940 if (!p) return 0;
2941 if (p > e) p = e;
2942 return (char *)p;
2943}
2944
2945/* char offset to byte offset */
2946static long
2947str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2948{
2949 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2950 if (!pp) return e - p;
2951 return pp - p;
2952}
2953
2954long
2955rb_str_offset(VALUE str, long pos)
2956{
2957 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2958 STR_ENC_GET(str), single_byte_optimizable(str));
2959}
2960
2961#ifdef NONASCII_MASK
2962static char *
2963str_utf8_nth(const char *p, const char *e, long *nthp)
2964{
2965 long nth = *nthp;
2966 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2967 const uintptr_t *s, *t;
2968 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2969 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2970 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2971 while (p < (const char *)s) {
2972 if (is_utf8_lead_byte(*p)) nth--;
2973 p++;
2974 }
2975 do {
2976 nth -= count_utf8_lead_bytes_with_word(s);
2977 s++;
2978 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2979 p = (char *)s;
2980 }
2981 while (p < e) {
2982 if (is_utf8_lead_byte(*p)) {
2983 if (nth == 0) break;
2984 nth--;
2985 }
2986 p++;
2987 }
2988 *nthp = nth;
2989 return (char *)p;
2990}
2991
2992static long
2993str_utf8_offset(const char *p, const char *e, long nth)
2994{
2995 const char *pp = str_utf8_nth(p, e, &nth);
2996 return pp - p;
2997}
2998#endif
2999
3000/* byte offset to char offset */
3001long
3002rb_str_sublen(VALUE str, long pos)
3003{
3004 if (single_byte_optimizable(str) || pos < 0)
3005 return pos;
3006 else {
3007 char *p = RSTRING_PTR(str);
3008 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3009 }
3010}
3011
3012static VALUE
3013str_subseq(VALUE str, long beg, long len)
3014{
3015 VALUE str2;
3016
3017 RUBY_ASSERT(beg >= 0);
3018 RUBY_ASSERT(len >= 0);
3019 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3020
3021 const int termlen = TERM_LEN(str);
3022 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3023 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3024 RB_GC_GUARD(str);
3025 return str2;
3026 }
3027
3028 str2 = str_alloc_heap(rb_cString);
3029 if (str_embed_capa(str2) >= len + termlen) {
3030 char *ptr2 = RSTRING(str2)->as.embed.ary;
3031 STR_SET_EMBED(str2);
3032 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3033 TERM_FILL(ptr2+len, termlen);
3034
3035 STR_SET_LEN(str2, len);
3036 RB_GC_GUARD(str);
3037 }
3038 else {
3039 str_replace_shared(str2, str);
3040 RUBY_ASSERT(!STR_EMBED_P(str2));
3041 ENC_CODERANGE_CLEAR(str2);
3042 RSTRING(str2)->as.heap.ptr += beg;
3043 if (RSTRING_LEN(str2) > len) {
3044 STR_SET_LEN(str2, len);
3045 }
3046 }
3047
3048 return str2;
3049}
3050
3051VALUE
3052rb_str_subseq(VALUE str, long beg, long len)
3053{
3054 VALUE str2 = str_subseq(str, beg, len);
3055 rb_enc_cr_str_copy_for_substr(str2, str);
3056 return str2;
3057}
3058
3059char *
3060rb_str_subpos(VALUE str, long beg, long *lenp)
3061{
3062 long len = *lenp;
3063 long slen = -1L;
3064 const long blen = RSTRING_LEN(str);
3065 rb_encoding *enc = STR_ENC_GET(str);
3066 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3067
3068 if (len < 0) return 0;
3069 if (beg < 0 && -beg < 0) return 0;
3070 if (!blen) {
3071 len = 0;
3072 }
3073 if (single_byte_optimizable(str)) {
3074 if (beg > blen) return 0;
3075 if (beg < 0) {
3076 beg += blen;
3077 if (beg < 0) return 0;
3078 }
3079 if (len > blen - beg)
3080 len = blen - beg;
3081 if (len < 0) return 0;
3082 p = s + beg;
3083 goto end;
3084 }
3085 if (beg < 0) {
3086 if (len > -beg) len = -beg;
3087 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3088 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3089 beg = -beg;
3090 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3091 p = e;
3092 if (!p) return 0;
3093 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3094 if (!p) return 0;
3095 len = e - p;
3096 goto end;
3097 }
3098 else {
3099 slen = str_strlen(str, enc);
3100 beg += slen;
3101 if (beg < 0) return 0;
3102 p = s + beg;
3103 if (len == 0) goto end;
3104 }
3105 }
3106 else if (beg > 0 && beg > blen) {
3107 return 0;
3108 }
3109 if (len == 0) {
3110 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3111 p = s + beg;
3112 }
3113#ifdef NONASCII_MASK
3114 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3115 enc == rb_utf8_encoding()) {
3116 p = str_utf8_nth(s, e, &beg);
3117 if (beg > 0) return 0;
3118 len = str_utf8_offset(p, e, len);
3119 }
3120#endif
3121 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3122 int char_sz = rb_enc_mbmaxlen(enc);
3123
3124 p = s + beg * char_sz;
3125 if (p > e) {
3126 return 0;
3127 }
3128 else if (len * char_sz > e - p)
3129 len = e - p;
3130 else
3131 len *= char_sz;
3132 }
3133 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3134 if (beg > 0) return 0;
3135 len = 0;
3136 }
3137 else {
3138 len = str_offset(p, e, len, enc, 0);
3139 }
3140 end:
3141 *lenp = len;
3142 RB_GC_GUARD(str);
3143 return p;
3144}
3145
3146static VALUE str_substr(VALUE str, long beg, long len, int empty);
3147
3148VALUE
3149rb_str_substr(VALUE str, long beg, long len)
3150{
3151 return str_substr(str, beg, len, TRUE);
3152}
3153
3154VALUE
3155rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3156{
3157 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3158}
3159
3160static VALUE
3161str_substr(VALUE str, long beg, long len, int empty)
3162{
3163 char *p = rb_str_subpos(str, beg, &len);
3164
3165 if (!p) return Qnil;
3166 if (!len && !empty) return Qnil;
3167
3168 beg = p - RSTRING_PTR(str);
3169
3170 VALUE str2 = str_subseq(str, beg, len);
3171 rb_enc_cr_str_copy_for_substr(str2, str);
3172 return str2;
3173}
3174
3175/* :nodoc: */
3176VALUE
3178{
3179 if (CHILLED_STRING_P(str)) {
3180 FL_UNSET_RAW(str, STR_CHILLED);
3181 }
3182
3183 if (OBJ_FROZEN(str)) return str;
3184 rb_str_resize(str, RSTRING_LEN(str));
3185 return rb_obj_freeze(str);
3186}
3187
3188/*
3189 * call-seq:
3190 * +string -> new_string or self
3191 *
3192 * Returns +self+ if +self+ is not frozen and can be mutated
3193 * without warning issuance.
3194 *
3195 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3196 */
3197static VALUE
3198str_uplus(VALUE str)
3199{
3200 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3201 return rb_str_dup(str);
3202 }
3203 else {
3204 return str;
3205 }
3206}
3207
3208/*
3209 * call-seq:
3210 * -string -> frozen_string
3211 * dedup -> frozen_string
3212 *
3213 * Returns a frozen, possibly pre-existing copy of the string.
3214 *
3215 * The returned +String+ will be deduplicated as long as it does not have
3216 * any instance variables set on it and is not a String subclass.
3217 *
3218 * Note that <tt>-string</tt> variant is more convenient for defining
3219 * constants:
3220 *
3221 * FILENAME = -'config/database.yml'
3222 *
 * while +dedup+ is better suited for use in chains
 * of calculations:
3225 *
3226 * @url_list.concat(urls.map(&:dedup))
3227 *
3228 */
3229static VALUE
3230str_uminus(VALUE str)
3231{
3232 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3233 str = rb_str_dup(str);
3234 }
3235 return rb_fstring(str);
3236}
3237
/*
 * rb_str_dup_frozen is kept as another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; presumably
 * it emits a real rb_str_dup_frozen(VALUE) symbol that forwards to
 * rb_str_new_frozen -- confirm against its definition.
 * The #define that follows makes later uses of rb_str_dup_frozen in this file
 * expand directly to rb_str_new_frozen.
 */
3238RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3239#define rb_str_dup_frozen rb_str_new_frozen
3240
3241VALUE
3243{
3244 if (FL_TEST(str, STR_TMPLOCK)) {
3245 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3246 }
3247 FL_SET(str, STR_TMPLOCK);
3248 return str;
3249}
3250
3251VALUE
3253{
3254 if (!FL_TEST(str, STR_TMPLOCK)) {
3255 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3256 }
3257 FL_UNSET(str, STR_TMPLOCK);
3258 return str;
3259}
3260
3261VALUE
3262rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3263{
3264 rb_str_locktmp(str);
3265 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3266}
3267
3268void
3270{
3271 long capa;
3272 const int termlen = TERM_LEN(str);
3273
3274 str_modifiable(str);
3275 if (STR_SHARED_P(str)) {
3276 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3277 }
3278 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3279 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3280 }
3281
3282 int cr = ENC_CODERANGE(str);
3283 if (len == 0) {
3284 /* Empty string does not contain non-ASCII */
3286 }
3287 else if (cr == ENC_CODERANGE_UNKNOWN) {
3288 /* Leave unknown. */
3289 }
3290 else if (len > RSTRING_LEN(str)) {
3291 if (ENC_CODERANGE_CLEAN_P(cr)) {
3292 /* Update the coderange regarding the extended part. */
3293 const char *const prev_end = RSTRING_END(str);
3294 const char *const new_end = RSTRING_PTR(str) + len;
3295 rb_encoding *enc = rb_enc_get(str);
3296 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3297 ENC_CODERANGE_SET(str, cr);
3298 }
3299 else if (cr == ENC_CODERANGE_BROKEN) {
3300 /* May be valid now, by appended part. */
3302 }
3303 }
3304 else if (len < RSTRING_LEN(str)) {
3305 if (cr != ENC_CODERANGE_7BIT) {
3306 /* ASCII-only string is keeping after truncated. Valid
3307 * and broken may be invalid or valid, leave unknown. */
3309 }
3310 }
3311
3312 STR_SET_LEN(str, len);
3313 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3314}
3315
3316VALUE
3317rb_str_resize(VALUE str, long len)
3318{
3319 if (len < 0) {
3320 rb_raise(rb_eArgError, "negative string size (or size too big)");
3321 }
3322
3323 int independent = str_independent(str);
3324 long slen = RSTRING_LEN(str);
3325 const int termlen = TERM_LEN(str);
3326
3327 if (slen > len || (termlen != 1 && slen < len)) {
3329 }
3330
3331 {
3332 long capa;
3333 if (STR_EMBED_P(str)) {
3334 if (len == slen) return str;
3335 if (str_embed_capa(str) >= len + termlen) {
3336 STR_SET_LEN(str, len);
3337 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3338 return str;
3339 }
3340 str_make_independent_expand(str, slen, len - slen, termlen);
3341 }
3342 else if (str_embed_capa(str) >= len + termlen) {
3343 char *ptr = STR_HEAP_PTR(str);
3344 STR_SET_EMBED(str);
3345 if (slen > len) slen = len;
3346 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3347 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3348 STR_SET_LEN(str, len);
3349 if (independent) ruby_xfree(ptr);
3350 return str;
3351 }
3352 else if (!independent) {
3353 if (len == slen) return str;
3354 str_make_independent_expand(str, slen, len - slen, termlen);
3355 }
3356 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3357 (capa - len) > (len < 1024 ? len : 1024)) {
3358 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3359 (size_t)len + termlen, STR_HEAP_SIZE(str));
3360 RSTRING(str)->as.heap.aux.capa = len;
3361 }
3362 else if (len == slen) return str;
3363 STR_SET_LEN(str, len);
3364 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3365 }
3366 return str;
3367}
3368
3369static void
3370str_ensure_available_capa(VALUE str, long len)
3371{
3372 str_modify_keep_cr(str);
3373
3374 const int termlen = TERM_LEN(str);
3375 long olen = RSTRING_LEN(str);
3376
3377 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3378 rb_raise(rb_eArgError, "string sizes too big");
3379 }
3380
3381 long total = olen + len;
3382 long capa = str_capacity(str, termlen);
3383
3384 if (capa < total) {
3385 if (total >= LONG_MAX / 2) {
3386 capa = total;
3387 }
3388 while (total > capa) {
3389 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3390 }
3391 RESIZE_CAPA_TERM(str, capa, termlen);
3392 }
3393}
3394
3395static VALUE
3396str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3397{
3398 if (keep_cr) {
3399 str_modify_keep_cr(str);
3400 }
3401 else {
3402 rb_str_modify(str);
3403 }
3404 if (len == 0) return 0;
3405
3406 long total, olen, off = -1;
3407 char *sptr;
3408 const int termlen = TERM_LEN(str);
3409
3410 RSTRING_GETMEM(str, sptr, olen);
3411 if (ptr >= sptr && ptr <= sptr + olen) {
3412 off = ptr - sptr;
3413 }
3414
3415 long capa = str_capacity(str, termlen);
3416
3417 if (olen > LONG_MAX - len) {
3418 rb_raise(rb_eArgError, "string sizes too big");
3419 }
3420 total = olen + len;
3421 if (capa < total) {
3422 if (total >= LONG_MAX / 2) {
3423 capa = total;
3424 }
3425 while (total > capa) {
3426 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3427 }
3428 RESIZE_CAPA_TERM(str, capa, termlen);
3429 sptr = RSTRING_PTR(str);
3430 }
3431 if (off != -1) {
3432 ptr = sptr + off;
3433 }
3434 memcpy(sptr + olen, ptr, len);
3435 STR_SET_LEN(str, total);
3436 TERM_FILL(sptr + total, termlen); /* sentinel */
3437
3438 return str;
3439}
3440
/* Coderange-clearing shorthands for str_buf_cat4(); str_buf_cat2 takes a string literal. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3443
3444VALUE
3445rb_str_cat(VALUE str, const char *ptr, long len)
3446{
3447 if (len == 0) return str;
3448 if (len < 0) {
3449 rb_raise(rb_eArgError, "negative string size (or size too big)");
3450 }
3451 return str_buf_cat(str, ptr, len);
3452}
3453
3454VALUE
3455rb_str_cat_cstr(VALUE str, const char *ptr)
3456{
3457 must_not_null(ptr);
3458 return rb_str_buf_cat(str, ptr, strlen(ptr));
3459}
3460
3461static void
3462rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3463{
3464 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3465
3466 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3467 if (UNLIKELY(!str_independent(str))) {
3468 str_make_independent(str);
3469 }
3470
3471 long string_length = -1;
3472 const int null_terminator_length = 1;
3473 char *sptr;
3474 RSTRING_GETMEM(str, sptr, string_length);
3475
3476 // Ensure the resulting string wouldn't be too long.
3477 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3478 rb_raise(rb_eArgError, "string sizes too big");
3479 }
3480
3481 long string_capacity = str_capacity(str, null_terminator_length);
3482
3483 // Get the code range before any modifications since those might clear the code range.
3484 int cr = ENC_CODERANGE(str);
3485
3486 // Check if the string has spare string_capacity to write the new byte.
3487 if (LIKELY(string_capacity >= string_length + 1)) {
3488 // In fast path we can write the new byte and note the string's new length.
3489 sptr[string_length] = byte;
3490 STR_SET_LEN(str, string_length + 1);
3491 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3492 }
3493 else {
3494 // If there's not enough string_capacity, make a call into the general string concatenation function.
3495 str_buf_cat(str, (char *)&byte, 1);
3496 }
3497
3498 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3499 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3500 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3501 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3502 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3503 if (ISASCII(byte)) {
3505 }
3506 else {
3508
3509 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3510 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3511 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3512 }
3513 }
3514 }
3515}
3516
/*
 * Alternative public names for the append functions: rb_str_buf_cat maps to
 * rb_str_cat, while rb_str_buf_cat2 and rb_str_cat2 both map to
 * rb_str_cat_cstr.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; presumably
 * each line emits a real function with the first signature that forwards to
 * the second name with the listed arguments -- confirm against its definition.
 */
3517RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3518RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3519RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3520
3521static VALUE
3522rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3523 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3524{
3525 int str_encindex = ENCODING_GET(str);
3526 int res_encindex;
3527 int str_cr, res_cr;
3528 rb_encoding *str_enc, *ptr_enc;
3529
3530 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3531
3532 if (str_encindex == ptr_encindex) {
3533 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3534 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3535 }
3536 }
3537 else {
3538 str_enc = rb_enc_from_index(str_encindex);
3539 ptr_enc = rb_enc_from_index(ptr_encindex);
3540 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3541 if (len == 0)
3542 return str;
3543 if (RSTRING_LEN(str) == 0) {
3544 rb_str_buf_cat(str, ptr, len);
3545 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3546 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3547 return str;
3548 }
3549 goto incompatible;
3550 }
3551 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3552 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3553 }
3554 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3555 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3556 str_cr = rb_enc_str_coderange(str);
3557 }
3558 }
3559 }
3560 if (ptr_cr_ret)
3561 *ptr_cr_ret = ptr_cr;
3562
3563 if (str_encindex != ptr_encindex &&
3564 str_cr != ENC_CODERANGE_7BIT &&
3565 ptr_cr != ENC_CODERANGE_7BIT) {
3566 str_enc = rb_enc_from_index(str_encindex);
3567 ptr_enc = rb_enc_from_index(ptr_encindex);
3568 goto incompatible;
3569 }
3570
3571 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3572 res_encindex = str_encindex;
3573 res_cr = ENC_CODERANGE_UNKNOWN;
3574 }
3575 else if (str_cr == ENC_CODERANGE_7BIT) {
3576 if (ptr_cr == ENC_CODERANGE_7BIT) {
3577 res_encindex = str_encindex;
3578 res_cr = ENC_CODERANGE_7BIT;
3579 }
3580 else {
3581 res_encindex = ptr_encindex;
3582 res_cr = ptr_cr;
3583 }
3584 }
3585 else if (str_cr == ENC_CODERANGE_VALID) {
3586 res_encindex = str_encindex;
3587 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3588 res_cr = str_cr;
3589 else
3590 res_cr = ptr_cr;
3591 }
3592 else { /* str_cr == ENC_CODERANGE_BROKEN */
3593 res_encindex = str_encindex;
3594 res_cr = str_cr;
3595 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3596 }
3597
3598 if (len < 0) {
3599 rb_raise(rb_eArgError, "negative string size (or size too big)");
3600 }
3601 str_buf_cat(str, ptr, len);
3602 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3603 return str;
3604
3605 incompatible:
3606 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3607 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3609}
3610
3611VALUE
3612rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3613{
3614 return rb_enc_cr_str_buf_cat(str, ptr, len,
3615 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3616}
3617
3618VALUE
3620{
3621 /* ptr must reference NUL terminated ASCII string. */
3622 int encindex = ENCODING_GET(str);
3623 rb_encoding *enc = rb_enc_from_index(encindex);
3624 if (rb_enc_asciicompat(enc)) {
3625 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3626 encindex, ENC_CODERANGE_7BIT, 0);
3627 }
3628 else {
3629 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3630 while (*ptr) {
3631 unsigned int c = (unsigned char)*ptr;
3632 int len = rb_enc_codelen(c, enc);
3633 rb_enc_mbcput(c, buf, enc);
3634 rb_enc_cr_str_buf_cat(str, buf, len,
3635 encindex, ENC_CODERANGE_VALID, 0);
3636 ptr++;
3637 }
3638 return str;
3639 }
3640}
3641
3642VALUE
3644{
3645 int str2_cr = rb_enc_str_coderange(str2);
3646
3647 if (str_enc_fastpath(str)) {
3648 switch (str2_cr) {
3649 case ENC_CODERANGE_7BIT:
3650 // If RHS is 7bit we can do simple concatenation
3651 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3652 RB_GC_GUARD(str2);
3653 return str;
3655 // If RHS is valid, we can do simple concatenation if encodings are the same
3656 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3657 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3658 int str_cr = ENC_CODERANGE(str);
3659 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3660 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3661 }
3662 RB_GC_GUARD(str2);
3663 return str;
3664 }
3665 }
3666 }
3667
3668 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3669 ENCODING_GET(str2), str2_cr, &str2_cr);
3670
3671 ENC_CODERANGE_SET(str2, str2_cr);
3672
3673 return str;
3674}
3675
3676VALUE
3678{
3679 StringValue(str2);
3680 return rb_str_buf_append(str, str2);
3681}
3682
3683VALUE
3684rb_str_concat_literals(size_t num, const VALUE *strary)
3685{
3686 VALUE str;
3687 size_t i, s = 0;
3688 unsigned long len = 1;
3689
3690 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3691 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3692
3693 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3694 str = rb_str_buf_new(len);
3695 str_enc_copy_direct(str, strary[0]);
3696
3697 for (i = s; i < num; ++i) {
3698 const VALUE v = strary[i];
3699 int encidx = ENCODING_GET(v);
3700
3701 rb_str_buf_append(str, v);
3702 if (encidx != ENCINDEX_US_ASCII) {
3703 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3704 rb_enc_set_index(str, encidx);
3705 }
3706 }
3707 return str;
3708}
3709
3710/*
3711 * call-seq:
3712 * concat(*objects) -> string
3713 *
3714 * Concatenates each object in +objects+ to +self+ and returns +self+:
3715 *
3716 * s = 'foo'
3717 * s.concat('bar', 'baz') # => "foobarbaz"
3718 * s # => "foobarbaz"
3719 *
3720 * For each given object +object+ that is an Integer,
3721 * the value is considered a codepoint and converted to a character before concatenation:
3722 *
3723 * s = 'foo'
3724 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3725 *
3726 * Related: String#<<, which takes a single argument.
3727 */
3728static VALUE
3729rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3730{
3731 str_modifiable(str);
3732
3733 if (argc == 1) {
3734 return rb_str_concat(str, argv[0]);
3735 }
3736 else if (argc > 1) {
3737 int i;
3738 VALUE arg_str = rb_str_tmp_new(0);
3739 rb_enc_copy(arg_str, str);
3740 for (i = 0; i < argc; i++) {
3741 rb_str_concat(arg_str, argv[i]);
3742 }
3743 rb_str_buf_append(str, arg_str);
3744 }
3745
3746 return str;
3747}
3748
3749/*
3750 * call-seq:
3751 * append_as_bytes(*objects) -> string
3752 *
3753 * Concatenates each object in +objects+ into +self+ without any encoding
3754 * validation or conversion and returns +self+:
3755 *
3756 * s = 'foo'
3757 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3758 * s.valid_encoding? # => false
3759 * s.append_as_bytes("\xAC 12")
3760 * s.valid_encoding? # => true
3761 *
3762 * For each given object +object+ that is an Integer,
3763 * the value is considered a Byte. If the Integer is bigger
3764 * than one byte, only the lower byte is considered, similar to String#setbyte:
3765 *
3766 * s = ""
3767 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3768 *
3769 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3770 */
3771
3772VALUE
3773rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3774{
3775 long needed_capacity = 0;
3776 volatile VALUE t0;
3777 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3778
3779 for (int index = 0; index < argc; index++) {
3780 VALUE obj = argv[index];
3781 enum ruby_value_type type = types[index] = rb_type(obj);
3782 switch (type) {
3783 case T_FIXNUM:
3784 case T_BIGNUM:
3785 needed_capacity++;
3786 break;
3787 case T_STRING:
3788 needed_capacity += RSTRING_LEN(obj);
3789 break;
3790 default:
3791 rb_raise(
3793 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3794 rb_obj_class(obj)
3795 );
3796 break;
3797 }
3798 }
3799
3800 str_ensure_available_capa(str, needed_capacity);
3801 char *sptr = RSTRING_END(str);
3802
3803 for (int index = 0; index < argc; index++) {
3804 VALUE obj = argv[index];
3805 enum ruby_value_type type = types[index];
3806 switch (type) {
3807 case T_FIXNUM:
3808 case T_BIGNUM: {
3809 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3810 char byte = (char)(NUM2INT(obj) & 0xFF);
3811 *sptr = byte;
3812 sptr++;
3813 break;
3814 }
3815 case T_STRING: {
3816 const char *ptr;
3817 long len;
3818 RSTRING_GETMEM(obj, ptr, len);
3819 memcpy(sptr, ptr, len);
3820 sptr += len;
3821 break;
3822 }
3823 default:
3824 rb_bug("append_as_bytes arguments should have been validated");
3825 }
3826 }
3827
3828 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3829 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3830
3831 int cr = ENC_CODERANGE(str);
3832 switch (cr) {
3833 case ENC_CODERANGE_7BIT: {
3834 for (int index = 0; index < argc; index++) {
3835 VALUE obj = argv[index];
3836 enum ruby_value_type type = types[index];
3837 switch (type) {
3838 case T_FIXNUM:
3839 case T_BIGNUM: {
3840 if (!ISASCII(NUM2INT(obj))) {
3841 goto clear_cr;
3842 }
3843 break;
3844 }
3845 case T_STRING: {
3846 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3847 goto clear_cr;
3848 }
3849 break;
3850 }
3851 default:
3852 rb_bug("append_as_bytes arguments should have been validated");
3853 }
3854 }
3855 break;
3856 }
3858 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3859 goto keep_cr;
3860 }
3861 else {
3862 goto clear_cr;
3863 }
3864 break;
3865 default:
3866 goto clear_cr;
3867 break;
3868 }
3869
3870 RB_GC_GUARD(t0);
3871
3872 clear_cr:
3873 // If no fast path was hit, we clear the coderange.
3874 // append_as_bytes is predominently meant to be used in
3875 // buffering situation, hence it's likely the coderange
3876 // will never be scanned, so it's not worth spending time
3877 // precomputing the coderange except for simple and common
3878 // situations.
3880 keep_cr:
3881 return str;
3882}
3883
3884/*
3885 * call-seq:
3886 * string << object -> string
3887 *
3888 * Concatenates +object+ to +self+ and returns +self+:
3889 *
3890 * s = 'foo'
3891 * s << 'bar' # => "foobar"
3892 * s # => "foobar"
3893 *
3894 * If +object+ is an Integer,
3895 * the value is considered a codepoint and converted to a character before concatenation:
3896 *
3897 * s = 'foo'
3898 * s << 33 # => "foo!"
3899 *
3900 * If that codepoint is not representable in the encoding of
3901 * _string_, RangeError is raised.
3902 *
3903 * s = 'foo'
3904 * s.encoding # => <Encoding:UTF-8>
3905 * s << 0x00110000 # 1114112 out of char range (RangeError)
3906 * s = 'foo'.encode(Encoding::EUC_JP)
3907 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3908 *
3909 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3910 * is automatically promoted to ASCII-8BIT.
3911 *
3912 * s = 'foo'.encode(Encoding::US_ASCII)
3913 * s << 0xff
3914 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3915 *
3916 * Related: String#concat, which takes multiple arguments.
3917 */
3918VALUE
3920{
3921 unsigned int code;
3922 rb_encoding *enc = STR_ENC_GET(str1);
3923 int encidx;
3924
3925 if (RB_INTEGER_TYPE_P(str2)) {
3926 if (rb_num_to_uint(str2, &code) == 0) {
3927 }
3928 else if (FIXNUM_P(str2)) {
3929 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3930 }
3931 else {
3932 rb_raise(rb_eRangeError, "bignum out of char range");
3933 }
3934 }
3935 else {
3936 return rb_str_append(str1, str2);
3937 }
3938
3939 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3940
3941 if (encidx >= 0) {
3942 rb_str_buf_cat_byte(str1, (unsigned char)code);
3943 }
3944 else {
3945 long pos = RSTRING_LEN(str1);
3946 int cr = ENC_CODERANGE(str1);
3947 int len;
3948 char *buf;
3949
3950 switch (len = rb_enc_codelen(code, enc)) {
3951 case ONIGERR_INVALID_CODE_POINT_VALUE:
3952 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3953 break;
3954 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3955 case 0:
3956 rb_raise(rb_eRangeError, "%u out of char range", code);
3957 break;
3958 }
3959 buf = ALLOCA_N(char, len + 1);
3960 rb_enc_mbcput(code, buf, enc);
3961 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3962 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3963 }
3964 rb_str_resize(str1, pos+len);
3965 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3966 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3968 }
3969 else if (cr == ENC_CODERANGE_BROKEN) {
3971 }
3972 ENC_CODERANGE_SET(str1, cr);
3973 }
3974 return str1;
3975}
3976
3977int
3978rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3979{
3980 int encidx = rb_enc_to_index(enc);
3981
3982 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3983 /* US-ASCII automatically extended to ASCII-8BIT */
3984 if (code > 0xFF) {
3985 rb_raise(rb_eRangeError, "%u out of char range", code);
3986 }
3987 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3988 return ENCINDEX_ASCII_8BIT;
3989 }
3990 return encidx;
3991 }
3992 else {
3993 return -1;
3994 }
3995}
3996
3997/*
3998 * call-seq:
3999 * prepend(*other_strings) -> string
4000 *
4001 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4002 *
4003 * s = 'foo'
4004 * s.prepend('bar', 'baz') # => "barbazfoo"
4005 * s # => "barbazfoo"
4006 *
4007 * Related: String#concat.
4008 */
4009
4010static VALUE
4011rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4012{
4013 str_modifiable(str);
4014
4015 if (argc == 1) {
4016 rb_str_update(str, 0L, 0L, argv[0]);
4017 }
4018 else if (argc > 1) {
4019 int i;
4020 VALUE arg_str = rb_str_tmp_new(0);
4021 rb_enc_copy(arg_str, str);
4022 for (i = 0; i < argc; i++) {
4023 rb_str_append(arg_str, argv[i]);
4024 }
4025 rb_str_update(str, 0L, 0L, arg_str);
4026 }
4027
4028 return str;
4029}
4030
4031st_index_t
4033{
4034 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4035 st_index_t precomputed_hash;
4036 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4037
4038 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4039 return precomputed_hash;
4040 }
4041
4042 return str_do_hash(str);
4043}
4044
4045int
4047{
4048 long len1, len2;
4049 const char *ptr1, *ptr2;
4050 RSTRING_GETMEM(str1, ptr1, len1);
4051 RSTRING_GETMEM(str2, ptr2, len2);
4052 return (len1 != len2 ||
4053 !rb_str_comparable(str1, str2) ||
4054 memcmp(ptr1, ptr2, len1) != 0);
4055}
4056
4057/*
4058 * call-seq:
4059 * hash -> integer
4060 *
4061 * Returns the integer hash value for +self+.
4062 * The value is based on the length, content and encoding of +self+.
4063 *
4064 * Related: Object#hash.
4065 */
4066
4067static VALUE
4068rb_str_hash_m(VALUE str)
4069{
4070 st_index_t hval = rb_str_hash(str);
4071 return ST2FIX(hval);
4072}
4073
/* Smaller of two values; note that each argument may be evaluated twice. */
#define lesser(a,b) (((a)<(b))?(a):(b))
4075
4076int
4078{
4079 int idx1, idx2;
4080 int rc1, rc2;
4081
4082 if (RSTRING_LEN(str1) == 0) return TRUE;
4083 if (RSTRING_LEN(str2) == 0) return TRUE;
4084 idx1 = ENCODING_GET(str1);
4085 idx2 = ENCODING_GET(str2);
4086 if (idx1 == idx2) return TRUE;
4087 rc1 = rb_enc_str_coderange(str1);
4088 rc2 = rb_enc_str_coderange(str2);
4089 if (rc1 == ENC_CODERANGE_7BIT) {
4090 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4091 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4092 return TRUE;
4093 }
4094 if (rc2 == ENC_CODERANGE_7BIT) {
4095 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4096 return TRUE;
4097 }
4098 return FALSE;
4099}
4100
4101int
4103{
4104 long len1, len2;
4105 const char *ptr1, *ptr2;
4106 int retval;
4107
4108 if (str1 == str2) return 0;
4109 RSTRING_GETMEM(str1, ptr1, len1);
4110 RSTRING_GETMEM(str2, ptr2, len2);
4111 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4112 if (len1 == len2) {
4113 if (!rb_str_comparable(str1, str2)) {
4114 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4115 return 1;
4116 return -1;
4117 }
4118 return 0;
4119 }
4120 if (len1 > len2) return 1;
4121 return -1;
4122 }
4123 if (retval > 0) return 1;
4124 return -1;
4125}
4126
4127/*
4128 * call-seq:
4129 * string == object -> true or false
4130 * string === object -> true or false
4131 *
4132 * Returns +true+ if +object+ has the same length and content
4133 * as +self+; +false+ otherwise:
4134 *
4135 * s = 'foo'
4136 * s == 'foo' # => true
4137 * s == 'food' # => false
4138 * s == 'FOO' # => false
4139 *
4140 * Returns +false+ if the two strings' encodings are not compatible:
4141 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4142 *
4143 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4144 * two strings are compared using <code>object.==</code>.
4145 */
4146
4147VALUE
4149{
4150 if (str1 == str2) return Qtrue;
4151 if (!RB_TYPE_P(str2, T_STRING)) {
4152 if (!rb_respond_to(str2, idTo_str)) {
4153 return Qfalse;
4154 }
4155 return rb_equal(str2, str1);
4156 }
4157 return rb_str_eql_internal(str1, str2);
4158}
4159
4160/*
4161 * call-seq:
4162 * eql?(object) -> true or false
4163 *
4164 * Returns +true+ if +object+ has the same length and content
4165 * as +self+; +false+ otherwise:
4166 *
4167 * s = 'foo'
4168 * s.eql?('foo') # => true
4169 * s.eql?('food') # => false
4170 * s.eql?('FOO') # => false
4171 *
4172 * Returns +false+ if the two strings' encodings are not compatible:
4173 *
4174 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4175 *
4176 */
4177
4178VALUE
4179rb_str_eql(VALUE str1, VALUE str2)
4180{
4181 if (str1 == str2) return Qtrue;
4182 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4183 return rb_str_eql_internal(str1, str2);
4184}
4185
4186/*
4187 * call-seq:
4188 * string <=> other_string -> -1, 0, 1, or nil
4189 *
4190 * Compares +self+ and +other_string+, returning:
4191 *
4192 * - -1 if +other_string+ is larger.
4193 * - 0 if the two are equal.
4194 * - 1 if +other_string+ is smaller.
4195 * - +nil+ if the two are incomparable.
4196 *
4197 * Examples:
4198 *
4199 * 'foo' <=> 'foo' # => 0
4200 * 'foo' <=> 'food' # => -1
4201 * 'food' <=> 'foo' # => 1
4202 * 'FOO' <=> 'foo' # => -1
4203 * 'foo' <=> 'FOO' # => 1
4204 * 'foo' <=> 1 # => nil
4205 *
4206 */
4207
4208static VALUE
4209rb_str_cmp_m(VALUE str1, VALUE str2)
4210{
4211 int result;
4212 VALUE s = rb_check_string_type(str2);
4213 if (NIL_P(s)) {
4214 return rb_invcmp(str1, str2);
4215 }
4216 result = rb_str_cmp(str1, s);
4217 return INT2FIX(result);
4218}
4219
4220static VALUE str_casecmp(VALUE str1, VALUE str2);
4221static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4222
4223/*
4224 * call-seq:
4225 * casecmp(other_string) -> -1, 0, 1, or nil
4226 *
4227 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4228 *
4229 * - -1 if <tt>other_string.downcase</tt> is larger.
4230 * - 0 if the two are equal.
4231 * - 1 if <tt>other_string.downcase</tt> is smaller.
4232 * - +nil+ if the two are incomparable.
4233 *
4234 * Examples:
4235 *
4236 * 'foo'.casecmp('foo') # => 0
4237 * 'foo'.casecmp('food') # => -1
4238 * 'food'.casecmp('foo') # => 1
4239 * 'FOO'.casecmp('foo') # => 0
4240 * 'foo'.casecmp('FOO') # => 0
4241 * 'foo'.casecmp(1) # => nil
4242 *
4243 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4244 *
4245 * Related: String#casecmp?.
4246 *
4247 */
4248
4249static VALUE
4250rb_str_casecmp(VALUE str1, VALUE str2)
4251{
4252 VALUE s = rb_check_string_type(str2);
4253 if (NIL_P(s)) {
4254 return Qnil;
4255 }
4256 return str_casecmp(str1, s);
4257}
4258
4259static VALUE
4260str_casecmp(VALUE str1, VALUE str2)
4261{
4262 long len;
4263 rb_encoding *enc;
4264 const char *p1, *p1end, *p2, *p2end;
4265
4266 enc = rb_enc_compatible(str1, str2);
4267 if (!enc) {
4268 return Qnil;
4269 }
4270
4271 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4272 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4273 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4274 while (p1 < p1end && p2 < p2end) {
4275 if (*p1 != *p2) {
4276 unsigned int c1 = TOLOWER(*p1 & 0xff);
4277 unsigned int c2 = TOLOWER(*p2 & 0xff);
4278 if (c1 != c2)
4279 return INT2FIX(c1 < c2 ? -1 : 1);
4280 }
4281 p1++;
4282 p2++;
4283 }
4284 }
4285 else {
4286 while (p1 < p1end && p2 < p2end) {
4287 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4288 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4289
4290 if (0 <= c1 && 0 <= c2) {
4291 c1 = TOLOWER(c1);
4292 c2 = TOLOWER(c2);
4293 if (c1 != c2)
4294 return INT2FIX(c1 < c2 ? -1 : 1);
4295 }
4296 else {
4297 int r;
4298 l1 = rb_enc_mbclen(p1, p1end, enc);
4299 l2 = rb_enc_mbclen(p2, p2end, enc);
4300 len = l1 < l2 ? l1 : l2;
4301 r = memcmp(p1, p2, len);
4302 if (r != 0)
4303 return INT2FIX(r < 0 ? -1 : 1);
4304 if (l1 != l2)
4305 return INT2FIX(l1 < l2 ? -1 : 1);
4306 }
4307 p1 += l1;
4308 p2 += l2;
4309 }
4310 }
4311 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4312 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4313 return INT2FIX(-1);
4314}
4315
4316/*
4317 * call-seq:
4318 * casecmp?(other_string) -> true, false, or nil
4319 *
4320 * Returns +true+ if +self+ and +other_string+ are equal after
4321 * Unicode case folding, otherwise +false+:
4322 *
4323 * 'foo'.casecmp?('foo') # => true
4324 * 'foo'.casecmp?('food') # => false
4325 * 'food'.casecmp?('foo') # => false
4326 * 'FOO'.casecmp?('foo') # => true
4327 * 'foo'.casecmp?('FOO') # => true
4328 *
4329 * Returns +nil+ if the two values are incomparable:
4330 *
4331 * 'foo'.casecmp?(1) # => nil
4332 *
4333 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4334 *
4335 * Related: String#casecmp.
4336 *
4337 */
4338
4339static VALUE
4340rb_str_casecmp_p(VALUE str1, VALUE str2)
4341{
4342 VALUE s = rb_check_string_type(str2);
4343 if (NIL_P(s)) {
4344 return Qnil;
4345 }
4346 return str_casecmp_p(str1, s);
4347}
4348
4349static VALUE
4350str_casecmp_p(VALUE str1, VALUE str2)
4351{
4352 rb_encoding *enc;
4353 VALUE folded_str1, folded_str2;
4354 VALUE fold_opt = sym_fold;
4355
4356 enc = rb_enc_compatible(str1, str2);
4357 if (!enc) {
4358 return Qnil;
4359 }
4360
4361 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4362 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4363
4364 return rb_str_eql(folded_str1, folded_str2);
4365}
4366
4367static long
4368strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4369 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4370{
4371 const char *search_start = str_ptr;
4372 long pos, search_len = str_len - offset;
4373
4374 for (;;) {
4375 const char *t;
4376 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4377 if (pos < 0) return pos;
4378 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4379 if (t == search_start + pos) break;
4380 search_len -= t - search_start;
4381 if (search_len <= 0) return -1;
4382 offset += t - search_start;
4383 search_start = t;
4384 }
4385 return pos + offset;
4386}
4387
4388/* found index in byte */
4389#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4390#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4391
4392static long
4393rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4394{
4395 const char *str_ptr, *str_ptr_end, *sub_ptr;
4396 long str_len, sub_len;
4397 rb_encoding *enc;
4398
4399 enc = rb_enc_check(str, sub);
4400 if (is_broken_string(sub)) return -1;
4401
4402 str_ptr = RSTRING_PTR(str);
4403 str_ptr_end = RSTRING_END(str);
4404 str_len = RSTRING_LEN(str);
4405 sub_ptr = RSTRING_PTR(sub);
4406 sub_len = RSTRING_LEN(sub);
4407
4408 if (str_len < sub_len) return -1;
4409
4410 if (offset != 0) {
4411 long str_len_char, sub_len_char;
4412 int single_byte = single_byte_optimizable(str);
4413 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4414 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4415 if (offset < 0) {
4416 offset += str_len_char;
4417 if (offset < 0) return -1;
4418 }
4419 if (str_len_char - offset < sub_len_char) return -1;
4420 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4421 str_ptr += offset;
4422 }
4423 if (sub_len == 0) return offset;
4424
4425 /* need proceed one character at a time */
4426 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4427}
4428
4429
4430/*
4431 * call-seq:
4432 * index(substring, offset = 0) -> integer or nil
4433 * index(regexp, offset = 0) -> integer or nil
4434 *
4435 * :include: doc/string/index.rdoc
4436 *
4437 */
4438
4439static VALUE
4440rb_str_index_m(int argc, VALUE *argv, VALUE str)
4441{
4442 VALUE sub;
4443 VALUE initpos;
4444 rb_encoding *enc = STR_ENC_GET(str);
4445 long pos;
4446
4447 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4448 long slen = str_strlen(str, enc); /* str's enc */
4449 pos = NUM2LONG(initpos);
4450 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4451 if (RB_TYPE_P(sub, T_REGEXP)) {
4453 }
4454 return Qnil;
4455 }
4456 }
4457 else {
4458 pos = 0;
4459 }
4460
4461 if (RB_TYPE_P(sub, T_REGEXP)) {
4462 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4463 enc, single_byte_optimizable(str));
4464
4465 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4466 VALUE match = rb_backref_get();
4467 struct re_registers *regs = RMATCH_REGS(match);
4468 pos = rb_str_sublen(str, BEG(0));
4469 return LONG2NUM(pos);
4470 }
4471 }
4472 else {
4473 StringValue(sub);
4474 pos = rb_str_index(str, sub, pos);
4475 if (pos >= 0) {
4476 pos = rb_str_sublen(str, pos);
4477 return LONG2NUM(pos);
4478 }
4479 }
4480 return Qnil;
4481}
4482
4483/* Ensure that the given pos is a valid character boundary.
4484 * Note that in this function, "character" means a code point
4485 * (Unicode scalar value), not a grapheme cluster.
4486 */
4487static void
4488str_ensure_byte_pos(VALUE str, long pos)
4489{
4490 if (!single_byte_optimizable(str)) {
4491 const char *s = RSTRING_PTR(str);
4492 const char *e = RSTRING_END(str);
4493 const char *p = s + pos;
4494 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4495 rb_raise(rb_eIndexError,
4496 "offset %ld does not land on character boundary", pos);
4497 }
4498 }
4499}
4500
4501/*
4502 * call-seq:
4503 * byteindex(substring, offset = 0) -> integer or nil
4504 * byteindex(regexp, offset = 0) -> integer or nil
4505 *
4506 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4507 * or +nil+ if none found:
4508 *
4509 * 'foo'.byteindex('f') # => 0
4510 * 'foo'.byteindex('o') # => 1
4511 * 'foo'.byteindex('oo') # => 1
4512 * 'foo'.byteindex('ooo') # => nil
4513 *
4514 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4515 * or +nil+ if none found:
4516 *
4517 * 'foo'.byteindex(/f/) # => 0
4518 * 'foo'.byteindex(/o/) # => 1
4519 * 'foo'.byteindex(/oo/) # => 1
4520 * 'foo'.byteindex(/ooo/) # => nil
4521 *
4522 * Integer argument +offset+, if given, specifies the byte-based position in the
4523 * string to begin the search:
4524 *
4525 * 'foo'.byteindex('o', 1) # => 1
4526 * 'foo'.byteindex('o', 2) # => 2
4527 * 'foo'.byteindex('o', 3) # => nil
4528 *
4529 * If +offset+ is negative, counts backward from the end of +self+:
4530 *
4531 * 'foo'.byteindex('o', -1) # => 2
4532 * 'foo'.byteindex('o', -2) # => 1
4533 * 'foo'.byteindex('o', -3) # => 1
4534 * 'foo'.byteindex('o', -4) # => nil
4535 *
4536 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4537 * raised.
4538 *
4539 * Related: String#index, String#byterindex.
4540 */
4541
4542static VALUE
4543rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4544{
4545 VALUE sub;
4546 VALUE initpos;
4547 long pos;
4548
4549 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4550 long slen = RSTRING_LEN(str);
4551 pos = NUM2LONG(initpos);
4552 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4553 if (RB_TYPE_P(sub, T_REGEXP)) {
4555 }
4556 return Qnil;
4557 }
4558 }
4559 else {
4560 pos = 0;
4561 }
4562
4563 str_ensure_byte_pos(str, pos);
4564
4565 if (RB_TYPE_P(sub, T_REGEXP)) {
4566 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4567 VALUE match = rb_backref_get();
4568 struct re_registers *regs = RMATCH_REGS(match);
4569 pos = BEG(0);
4570 return LONG2NUM(pos);
4571 }
4572 }
4573 else {
4574 StringValue(sub);
4575 pos = rb_str_byteindex(str, sub, pos);
4576 if (pos >= 0) return LONG2NUM(pos);
4577 }
4578 return Qnil;
4579}
4580
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(): scans the first +search_len+
 * bytes of +search_str+ from the end towards the start and returns a
 * pointer to the last byte equal to +chr+, or a null pointer if none.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long i = search_len - 1; i >= 0; i--) {
        if ((unsigned char)search_str[i] == chr) {
            return (void *)(search_str + i);
        }
    }

    return ((void *)0);
}
#endif
4593
4594static long
4595str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4596{
4597 char *hit, *adjusted;
4598 int c;
4599 long slen, searchlen;
4600 char *sbeg, *e, *t;
4601
4602 sbeg = RSTRING_PTR(str);
4603 slen = RSTRING_LEN(sub);
4604 if (slen == 0) return s - sbeg;
4605 e = RSTRING_END(str);
4606 t = RSTRING_PTR(sub);
4607 c = *t & 0xff;
4608 searchlen = s - sbeg + 1;
4609
4610 if (memcmp(s, t, slen) == 0) {
4611 return s - sbeg;
4612 }
4613
4614 do {
4615 hit = memrchr(sbeg, c, searchlen);
4616 if (!hit) break;
4617 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4618 if (hit != adjusted) {
4619 searchlen = adjusted - sbeg;
4620 continue;
4621 }
4622 if (memcmp(hit, t, slen) == 0)
4623 return hit - sbeg;
4624 searchlen = adjusted - sbeg;
4625 } while (searchlen > 0);
4626
4627 return -1;
4628}
4629
4630/* found index in byte */
4631static long
4632rb_str_rindex(VALUE str, VALUE sub, long pos)
4633{
4634 long len, slen;
4635 char *sbeg, *s;
4636 rb_encoding *enc;
4637 int singlebyte;
4638
4639 enc = rb_enc_check(str, sub);
4640 if (is_broken_string(sub)) return -1;
4641 singlebyte = single_byte_optimizable(str);
4642 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4643 slen = str_strlen(sub, enc); /* rb_enc_check */
4644
4645 /* substring longer than string */
4646 if (len < slen) return -1;
4647 if (len - pos < slen) pos = len - slen;
4648 if (len == 0) return pos;
4649
4650 sbeg = RSTRING_PTR(str);
4651
4652 if (pos == 0) {
4653 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4654 return 0;
4655 else
4656 return -1;
4657 }
4658
4659 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4660 return str_rindex(str, sub, s, enc);
4661}
4662
4663/*
4664 * call-seq:
4665 * rindex(substring, offset = self.length) -> integer or nil
4666 * rindex(regexp, offset = self.length) -> integer or nil
4667 *
4668 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4669 * or +nil+ if none found:
4670 *
4671 * 'foo'.rindex('f') # => 0
4672 * 'foo'.rindex('o') # => 2
4673 * 'foo'.rindex('oo') # => 1
4674 * 'foo'.rindex('ooo') # => nil
4675 *
4676 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4677 * or +nil+ if none found:
4678 *
4679 * 'foo'.rindex(/f/) # => 0
4680 * 'foo'.rindex(/o/) # => 2
4681 * 'foo'.rindex(/oo/) # => 1
4682 * 'foo'.rindex(/ooo/) # => nil
4683 *
4684 * The _last_ match means starting at the possible last position, not
4685 * the last of longest matches.
4686 *
4687 * 'foo'.rindex(/o+/) # => 2
4688 * $~ #=> #<MatchData "o">
4689 *
4690 * To get the last longest match, needs to combine with negative
4691 * lookbehind.
4692 *
4693 * 'foo'.rindex(/(?<!o)o+/) # => 1
4694 * $~ #=> #<MatchData "oo">
4695 *
4696 * Or String#index with negative lookforward.
4697 *
4698 * 'foo'.index(/o+(?!.*o)/) # => 1
4699 * $~ #=> #<MatchData "oo">
4700 *
4701 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4702 * string to _end_ the search:
4703 *
4704 * 'foo'.rindex('o', 0) # => nil
4705 * 'foo'.rindex('o', 1) # => 1
4706 * 'foo'.rindex('o', 2) # => 2
4707 * 'foo'.rindex('o', 3) # => 2
4708 *
4709 * If +offset+ is a negative Integer, the maximum starting position in the
4710 * string to _end_ the search is the sum of the string's length and +offset+:
4711 *
4712 * 'foo'.rindex('o', -1) # => 2
4713 * 'foo'.rindex('o', -2) # => 1
4714 * 'foo'.rindex('o', -3) # => nil
4715 * 'foo'.rindex('o', -4) # => nil
4716 *
4717 * Related: String#index.
4718 */
4719
4720static VALUE
4721rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4722{
4723 VALUE sub;
4724 VALUE initpos;
4725 rb_encoding *enc = STR_ENC_GET(str);
4726 long pos, len = str_strlen(str, enc); /* str's enc */
4727
4728 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4729 pos = NUM2LONG(initpos);
4730 if (pos < 0 && (pos += len) < 0) {
4731 if (RB_TYPE_P(sub, T_REGEXP)) {
4733 }
4734 return Qnil;
4735 }
4736 if (pos > len) pos = len;
4737 }
4738 else {
4739 pos = len;
4740 }
4741
4742 if (RB_TYPE_P(sub, T_REGEXP)) {
4743 /* enc = rb_enc_check(str, sub); */
4744 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4745 enc, single_byte_optimizable(str));
4746
4747 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4748 VALUE match = rb_backref_get();
4749 struct re_registers *regs = RMATCH_REGS(match);
4750 pos = rb_str_sublen(str, BEG(0));
4751 return LONG2NUM(pos);
4752 }
4753 }
4754 else {
4755 StringValue(sub);
4756 pos = rb_str_rindex(str, sub, pos);
4757 if (pos >= 0) {
4758 pos = rb_str_sublen(str, pos);
4759 return LONG2NUM(pos);
4760 }
4761 }
4762 return Qnil;
4763}
4764
4765static long
4766rb_str_byterindex(VALUE str, VALUE sub, long pos)
4767{
4768 long len, slen;
4769 char *sbeg, *s;
4770 rb_encoding *enc;
4771
4772 enc = rb_enc_check(str, sub);
4773 if (is_broken_string(sub)) return -1;
4774 len = RSTRING_LEN(str);
4775 slen = RSTRING_LEN(sub);
4776
4777 /* substring longer than string */
4778 if (len < slen) return -1;
4779 if (len - pos < slen) pos = len - slen;
4780 if (len == 0) return pos;
4781
4782 sbeg = RSTRING_PTR(str);
4783
4784 if (pos == 0) {
4785 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4786 return 0;
4787 else
4788 return -1;
4789 }
4790
4791 s = sbeg + pos;
4792 return str_rindex(str, sub, s, enc);
4793}
4794
4795
4796/*
4797 * call-seq:
4798 * byterindex(substring, offset = self.bytesize) -> integer or nil
4799 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4800 *
4801 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4802 * or +nil+ if none found:
4803 *
4804 * 'foo'.byterindex('f') # => 0
4805 * 'foo'.byterindex('o') # => 2
4806 * 'foo'.byterindex('oo') # => 1
4807 * 'foo'.byterindex('ooo') # => nil
4808 *
4809 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4810 * or +nil+ if none found:
4811 *
4812 * 'foo'.byterindex(/f/) # => 0
4813 * 'foo'.byterindex(/o/) # => 2
4814 * 'foo'.byterindex(/oo/) # => 1
4815 * 'foo'.byterindex(/ooo/) # => nil
4816 *
4817 * The _last_ match means starting at the possible last position, not
4818 * the last of longest matches.
4819 *
4820 * 'foo'.byterindex(/o+/) # => 2
4821 * $~ #=> #<MatchData "o">
4822 *
4823 * To get the last longest match, needs to combine with negative
4824 * lookbehind.
4825 *
4826 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4827 * $~ #=> #<MatchData "oo">
4828 *
4829 * Or String#byteindex with negative lookforward.
4830 *
4831 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4832 * $~ #=> #<MatchData "oo">
4833 *
4834 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4835 * string to _end_ the search:
4836 *
4837 * 'foo'.byterindex('o', 0) # => nil
4838 * 'foo'.byterindex('o', 1) # => 1
4839 * 'foo'.byterindex('o', 2) # => 2
4840 * 'foo'.byterindex('o', 3) # => 2
4841 *
4842 * If +offset+ is a negative Integer, the maximum starting position in the
4843 * string to _end_ the search is the sum of the string's length and +offset+:
4844 *
4845 * 'foo'.byterindex('o', -1) # => 2
4846 * 'foo'.byterindex('o', -2) # => 1
4847 * 'foo'.byterindex('o', -3) # => nil
4848 * 'foo'.byterindex('o', -4) # => nil
4849 *
4850 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4851 * raised.
4852 *
4853 * Related: String#byteindex.
4854 */
4855
4856static VALUE
4857rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4858{
4859 VALUE sub;
4860 VALUE initpos;
4861 long pos, len = RSTRING_LEN(str);
4862
4863 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4864 pos = NUM2LONG(initpos);
4865 if (pos < 0 && (pos += len) < 0) {
4866 if (RB_TYPE_P(sub, T_REGEXP)) {
4868 }
4869 return Qnil;
4870 }
4871 if (pos > len) pos = len;
4872 }
4873 else {
4874 pos = len;
4875 }
4876
4877 str_ensure_byte_pos(str, pos);
4878
4879 if (RB_TYPE_P(sub, T_REGEXP)) {
4880 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4881 VALUE match = rb_backref_get();
4882 struct re_registers *regs = RMATCH_REGS(match);
4883 pos = BEG(0);
4884 return LONG2NUM(pos);
4885 }
4886 }
4887 else {
4888 StringValue(sub);
4889 pos = rb_str_byterindex(str, sub, pos);
4890 if (pos >= 0) return LONG2NUM(pos);
4891 }
4892 return Qnil;
4893}
4894
4895/*
4896 * call-seq:
4897 * string =~ regexp -> integer or nil
4898 * string =~ object -> integer or nil
4899 *
4900 * Returns the Integer index of the first substring that matches
4901 * the given +regexp+, or +nil+ if no match found:
4902 *
4903 * 'foo' =~ /f/ # => 0
4904 * 'foo' =~ /o/ # => 1
4905 * 'foo' =~ /x/ # => nil
4906 *
4907 * Note: also updates Regexp@Global+Variables.
4908 *
4909 * If the given +object+ is not a Regexp, returns the value
4910 * returned by <tt>object =~ self</tt>.
4911 *
4912 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4913 * (see Regexp#=~):
4914 *
4915 * number= nil
4916 * "no. 9" =~ /(?<number>\d+)/
4917 * number # => nil (not assigned)
4918 * /(?<number>\d+)/ =~ "no. 9"
4919 * number #=> "9"
4920 *
4921 */
4922
4923static VALUE
4924rb_str_match(VALUE x, VALUE y)
4925{
4926 switch (OBJ_BUILTIN_TYPE(y)) {
4927 case T_STRING:
4928 rb_raise(rb_eTypeError, "type mismatch: String given");
4929
4930 case T_REGEXP:
4931 return rb_reg_match(y, x);
4932
4933 default:
4934 return rb_funcall(y, idEqTilde, 1, x);
4935 }
4936}
4937
4938
4939static VALUE get_pat(VALUE);
4940
4941
4942/*
4943 * call-seq:
4944 * match(pattern, offset = 0) -> matchdata or nil
4945 * match(pattern, offset = 0) {|matchdata| ... } -> object
4946 *
4947 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4948 *
4949 * Note: also updates Regexp@Global+Variables.
4950 *
4951 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4952 * regexp = Regexp.new(pattern)
4953 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4954 * (see Regexp#match):
4955 * matchdata = regexp.match(self)
4956 *
4957 * With no block given, returns the computed +matchdata+:
4958 *
4959 * 'foo'.match('f') # => #<MatchData "f">
4960 * 'foo'.match('o') # => #<MatchData "o">
4961 * 'foo'.match('x') # => nil
4962 *
4963 * If Integer argument +offset+ is given, the search begins at index +offset+:
4964 *
4965 * 'foo'.match('f', 1) # => nil
4966 * 'foo'.match('o', 1) # => #<MatchData "o">
4967 *
4968 * With a block given, calls the block with the computed +matchdata+
4969 * and returns the block's return value:
4970 *
4971 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4972 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4973 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4974 *
4975 */
4976
4977static VALUE
4978rb_str_match_m(int argc, VALUE *argv, VALUE str)
4979{
4980 VALUE re, result;
4981 if (argc < 1)
4982 rb_check_arity(argc, 1, 2);
4983 re = argv[0];
4984 argv[0] = str;
4985 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4986 if (!NIL_P(result) && rb_block_given_p()) {
4987 return rb_yield(result);
4988 }
4989 return result;
4990}
4991
4992/*
4993 * call-seq:
4994 * match?(pattern, offset = 0) -> true or false
4995 *
4996 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4997 *
4998 * Note: does not update Regexp@Global+Variables.
4999 *
5000 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5001 * regexp = Regexp.new(pattern)
5002 *
5003 * Returns +true+ if <tt>self+.match(regexp)</tt> returns a MatchData object,
5004 * +false+ otherwise:
5005 *
5006 * 'foo'.match?(/o/) # => true
5007 * 'foo'.match?('o') # => true
5008 * 'foo'.match?(/x/) # => false
5009 *
5010 * If Integer argument +offset+ is given, the search begins at index +offset+:
5011 * 'foo'.match?('f', 1) # => false
5012 * 'foo'.match?('o', 1) # => true
5013 *
5014 */
5015
5016static VALUE
5017rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5018{
5019 VALUE re;
5020 rb_check_arity(argc, 1, 2);
5021 re = get_pat(argv[0]);
5022 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5023}
5024
/* Outcome of looking for the neighbor (successor or predecessor) of a
 * single character. */
enum neighbor_char {
    /* the bytes are not a character that can be stepped */
    NEIGHBOR_NOT_CHAR,
    /* the neighbor was found and written in place */
    NEIGHBOR_FOUND,
    /* no neighbor of the same length exists; the caller has to carry */
    NEIGHBOR_WRAPPED
};
5030
5031static enum neighbor_char
5032enc_succ_char(char *p, long len, rb_encoding *enc)
5033{
5034 long i;
5035 int l;
5036
5037 if (rb_enc_mbminlen(enc) > 1) {
5038 /* wchar, trivial case */
5039 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5040 if (!MBCLEN_CHARFOUND_P(r)) {
5041 return NEIGHBOR_NOT_CHAR;
5042 }
5043 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5044 l = rb_enc_code_to_mbclen(c, enc);
5045 if (!l) return NEIGHBOR_NOT_CHAR;
5046 if (l != len) return NEIGHBOR_WRAPPED;
5047 rb_enc_mbcput(c, p, enc);
5048 r = rb_enc_precise_mbclen(p, p + len, enc);
5049 if (!MBCLEN_CHARFOUND_P(r)) {
5050 return NEIGHBOR_NOT_CHAR;
5051 }
5052 return NEIGHBOR_FOUND;
5053 }
5054 while (1) {
5055 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5056 p[i] = '\0';
5057 if (i < 0)
5058 return NEIGHBOR_WRAPPED;
5059 ++((unsigned char*)p)[i];
5060 l = rb_enc_precise_mbclen(p, p+len, enc);
5061 if (MBCLEN_CHARFOUND_P(l)) {
5062 l = MBCLEN_CHARFOUND_LEN(l);
5063 if (l == len) {
5064 return NEIGHBOR_FOUND;
5065 }
5066 else {
5067 memset(p+l, 0xff, len-l);
5068 }
5069 }
5070 if (MBCLEN_INVALID_P(l) && i < len-1) {
5071 long len2;
5072 int l2;
5073 for (len2 = len-1; 0 < len2; len2--) {
5074 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5075 if (!MBCLEN_INVALID_P(l2))
5076 break;
5077 }
5078 memset(p+len2+1, 0xff, len-(len2+1));
5079 }
5080 }
5081}
5082
5083static enum neighbor_char
5084enc_pred_char(char *p, long len, rb_encoding *enc)
5085{
5086 long i;
5087 int l;
5088 if (rb_enc_mbminlen(enc) > 1) {
5089 /* wchar, trivial case */
5090 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5091 if (!MBCLEN_CHARFOUND_P(r)) {
5092 return NEIGHBOR_NOT_CHAR;
5093 }
5094 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5095 if (!c) return NEIGHBOR_NOT_CHAR;
5096 --c;
5097 l = rb_enc_code_to_mbclen(c, enc);
5098 if (!l) return NEIGHBOR_NOT_CHAR;
5099 if (l != len) return NEIGHBOR_WRAPPED;
5100 rb_enc_mbcput(c, p, enc);
5101 r = rb_enc_precise_mbclen(p, p + len, enc);
5102 if (!MBCLEN_CHARFOUND_P(r)) {
5103 return NEIGHBOR_NOT_CHAR;
5104 }
5105 return NEIGHBOR_FOUND;
5106 }
5107 while (1) {
5108 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5109 p[i] = '\xff';
5110 if (i < 0)
5111 return NEIGHBOR_WRAPPED;
5112 --((unsigned char*)p)[i];
5113 l = rb_enc_precise_mbclen(p, p+len, enc);
5114 if (MBCLEN_CHARFOUND_P(l)) {
5115 l = MBCLEN_CHARFOUND_LEN(l);
5116 if (l == len) {
5117 return NEIGHBOR_FOUND;
5118 }
5119 else {
5120 memset(p+l, 0, len-l);
5121 }
5122 }
5123 if (MBCLEN_INVALID_P(l) && i < len-1) {
5124 long len2;
5125 int l2;
5126 for (len2 = len-1; 0 < len2; len2--) {
5127 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5128 if (!MBCLEN_INVALID_P(l2))
5129 break;
5130 }
5131 memset(p+len2+1, 0, len-(len2+1));
5132 }
5133 }
5134}
5135
5136/*
5137 overwrite +p+ by succeeding letter in +enc+ and returns
5138 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5139 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5140 assuming each ranges are successive, and mbclen
5141 never change in each ranges.
5142 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5143 character.
5144 */
5145static enum neighbor_char
5146enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5147{
5148 enum neighbor_char ret;
5149 unsigned int c;
5150 int ctype;
5151 int range;
5152 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5153
5154 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5155 int try;
5156 const int max_gaps = 1;
5157
5158 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5159 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5160 ctype = ONIGENC_CTYPE_DIGIT;
5161 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5162 ctype = ONIGENC_CTYPE_ALPHA;
5163 else
5164 return NEIGHBOR_NOT_CHAR;
5165
5166 MEMCPY(save, p, char, len);
5167 for (try = 0; try <= max_gaps; ++try) {
5168 ret = enc_succ_char(p, len, enc);
5169 if (ret == NEIGHBOR_FOUND) {
5170 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5171 if (rb_enc_isctype(c, ctype, enc))
5172 return NEIGHBOR_FOUND;
5173 }
5174 }
5175 MEMCPY(p, save, char, len);
5176 range = 1;
5177 while (1) {
5178 MEMCPY(save, p, char, len);
5179 ret = enc_pred_char(p, len, enc);
5180 if (ret == NEIGHBOR_FOUND) {
5181 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5182 if (!rb_enc_isctype(c, ctype, enc)) {
5183 MEMCPY(p, save, char, len);
5184 break;
5185 }
5186 }
5187 else {
5188 MEMCPY(p, save, char, len);
5189 break;
5190 }
5191 range++;
5192 }
5193 if (range == 1) {
5194 return NEIGHBOR_NOT_CHAR;
5195 }
5196
5197 if (ctype != ONIGENC_CTYPE_DIGIT) {
5198 MEMCPY(carry, p, char, len);
5199 return NEIGHBOR_WRAPPED;
5200 }
5201
5202 MEMCPY(carry, p, char, len);
5203 enc_succ_char(carry, len, enc);
5204 return NEIGHBOR_WRAPPED;
5205}
5206
5207
5208static VALUE str_succ(VALUE str);
5209
5210/*
5211 * call-seq:
5212 * succ -> new_str
5213 *
5214 * Returns the successor to +self+. The successor is calculated by
5215 * incrementing characters.
5216 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
5219 *
5220 * 'THX1138'.succ # => "THX1139"
5221 * '<<koala>>'.succ # => "<<koalb>>"
5222 * '***'.succ # => '**+'
5223 *
5224 * The successor to a digit is another digit, "carrying" to the next-left
5225 * character for a "rollover" from 9 to 0, and prepending another digit
5226 * if necessary:
5227 *
5228 * '00'.succ # => "01"
5229 * '09'.succ # => "10"
5230 * '99'.succ # => "100"
5231 *
5232 * The successor to a letter is another letter of the same case,
5233 * carrying to the next-left character for a rollover,
5234 * and prepending another same-case letter if necessary:
5235 *
5236 * 'aa'.succ # => "ab"
5237 * 'az'.succ # => "ba"
5238 * 'zz'.succ # => "aaa"
5239 * 'AA'.succ # => "AB"
5240 * 'AZ'.succ # => "BA"
5241 * 'ZZ'.succ # => "AAA"
5242 *
5243 * The successor to a non-alphanumeric character is the next character
5244 * in the underlying character set's collating sequence,
5245 * carrying to the next-left character for a rollover,
5246 * and prepending another character if necessary:
5247 *
5248 * s = 0.chr * 3
5249 * s # => "\x00\x00\x00"
5250 * s.succ # => "\x00\x00\x01"
5251 * s = 255.chr * 3
5252 * s # => "\xFF\xFF\xFF"
5253 * s.succ # => "\x01\x00\x00\x00"
5254 *
5255 * Carrying can occur between and among mixtures of alphanumeric characters:
5256 *
5257 * s = 'zz99zz99'
5258 * s.succ # => "aaa00aa00"
5259 * s = '99zz99zz'
5260 * s.succ # => "100aa00aa"
5261 *
5262 * The successor to an empty +String+ is a new empty +String+:
5263 *
5264 * ''.succ # => ""
5265 *
5266 */
5267
5268VALUE
5270{
5271 VALUE str;
5272 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5273 rb_enc_cr_str_copy_for_substr(str, orig);
5274 return str_succ(str);
5275}
5276
5277static VALUE
5278str_succ(VALUE str)
5279{
5280 rb_encoding *enc;
5281 char *sbeg, *s, *e, *last_alnum = 0;
5282 int found_alnum = 0;
5283 long l, slen;
5284 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5285 long carry_pos = 0, carry_len = 1;
5286 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5287
5288 slen = RSTRING_LEN(str);
5289 if (slen == 0) return str;
5290
5291 enc = STR_ENC_GET(str);
5292 sbeg = RSTRING_PTR(str);
5293 s = e = sbeg + slen;
5294
5295 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5296 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5297 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5298 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5299 break;
5300 }
5301 }
5302 l = rb_enc_precise_mbclen(s, e, enc);
5303 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5304 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5305 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5306 switch (neighbor) {
5307 case NEIGHBOR_NOT_CHAR:
5308 continue;
5309 case NEIGHBOR_FOUND:
5310 return str;
5311 case NEIGHBOR_WRAPPED:
5312 last_alnum = s;
5313 break;
5314 }
5315 found_alnum = 1;
5316 carry_pos = s - sbeg;
5317 carry_len = l;
5318 }
5319 if (!found_alnum) { /* str contains no alnum */
5320 s = e;
5321 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5322 enum neighbor_char neighbor;
5323 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5324 l = rb_enc_precise_mbclen(s, e, enc);
5325 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5326 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5327 MEMCPY(tmp, s, char, l);
5328 neighbor = enc_succ_char(tmp, l, enc);
5329 switch (neighbor) {
5330 case NEIGHBOR_FOUND:
5331 MEMCPY(s, tmp, char, l);
5332 return str;
5333 break;
5334 case NEIGHBOR_WRAPPED:
5335 MEMCPY(s, tmp, char, l);
5336 break;
5337 case NEIGHBOR_NOT_CHAR:
5338 break;
5339 }
5340 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5341 /* wrapped to \0...\0. search next valid char. */
5342 enc_succ_char(s, l, enc);
5343 }
5344 if (!rb_enc_asciicompat(enc)) {
5345 MEMCPY(carry, s, char, l);
5346 carry_len = l;
5347 }
5348 carry_pos = s - sbeg;
5349 }
5351 }
5352 RESIZE_CAPA(str, slen + carry_len);
5353 sbeg = RSTRING_PTR(str);
5354 s = sbeg + carry_pos;
5355 memmove(s + carry_len, s, slen - carry_pos);
5356 memmove(s, carry, carry_len);
5357 slen += carry_len;
5358 STR_SET_LEN(str, slen);
5359 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5361 return str;
5362}
5363
5364
5365/*
5366 * call-seq:
5367 * succ! -> self
5368 *
5369 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5370 */
5371
5372static VALUE
5373rb_str_succ_bang(VALUE str)
5374{
5375 rb_str_modify(str);
5376 str_succ(str);
5377 return str;
5378}
5379
/* Returns 1 when each of the first +len+ bytes of +s+ satisfies
 * ISDIGIT (trivially so when +len+ is not positive), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5389
5390static int
5391str_upto_i(VALUE str, VALUE arg)
5392{
5393 rb_yield(str);
5394 return 0;
5395}
5396
5397/*
5398 * call-seq:
5399 * upto(other_string, exclusive = false) {|string| ... } -> self
5400 * upto(other_string, exclusive = false) -> new_enumerator
5401 *
5402 * With a block given, calls the block with each +String+ value
5403 * returned by successive calls to String#succ;
5404 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5405 * the sequence terminates when value +other_string+ is reached;
5406 * returns +self+:
5407 *
5408 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5409 * Output:
5410 *
5411 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5412 *
5413 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5414 *
5415 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5416 *
5417 * Output:
5418 *
5419 * a8 a9 b0 b1 b2 b3 b4 b5
5420 *
5421 * If +other_string+ would not be reached, does not call the block:
5422 *
5423 * '25'.upto('5') {|s| fail s }
5424 * 'aa'.upto('a') {|s| fail s }
5425 *
5426 * With no block given, returns a new Enumerator:
5427 *
5428 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5429 *
5430 */
5431
5432static VALUE
5433rb_str_upto(int argc, VALUE *argv, VALUE beg)
5434{
5435 VALUE end, exclusive;
5436
5437 rb_scan_args(argc, argv, "11", &end, &exclusive);
5438 RETURN_ENUMERATOR(beg, argc, argv);
5439 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5440}
5441
5442VALUE
5443rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5444{
5445 VALUE current, after_end;
5446 ID succ;
5447 int n, ascii;
5448 rb_encoding *enc;
5449
5450 CONST_ID(succ, "succ");
5451 StringValue(end);
5452 enc = rb_enc_check(beg, end);
5453 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5454 /* single character */
5455 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5456 char c = RSTRING_PTR(beg)[0];
5457 char e = RSTRING_PTR(end)[0];
5458
5459 if (c > e || (excl && c == e)) return beg;
5460 for (;;) {
5461 VALUE str = rb_enc_str_new(&c, 1, enc);
5463 if ((*each)(str, arg)) break;
5464 if (!excl && c == e) break;
5465 c++;
5466 if (excl && c == e) break;
5467 }
5468 return beg;
5469 }
5470 /* both edges are all digits */
5471 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5472 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5473 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5474 VALUE b, e;
5475 int width;
5476
5477 width = RSTRING_LENINT(beg);
5478 b = rb_str_to_inum(beg, 10, FALSE);
5479 e = rb_str_to_inum(end, 10, FALSE);
5480 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5481 long bi = FIX2LONG(b);
5482 long ei = FIX2LONG(e);
5483 rb_encoding *usascii = rb_usascii_encoding();
5484
5485 while (bi <= ei) {
5486 if (excl && bi == ei) break;
5487 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5488 bi++;
5489 }
5490 }
5491 else {
5492 ID op = excl ? '<' : idLE;
5493 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5494
5495 args[0] = INT2FIX(width);
5496 while (rb_funcall(b, op, 1, e)) {
5497 args[1] = b;
5498 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5499 b = rb_funcallv(b, succ, 0, 0);
5500 }
5501 }
5502 return beg;
5503 }
5504 /* normal case */
5505 n = rb_str_cmp(beg, end);
5506 if (n > 0 || (excl && n == 0)) return beg;
5507
5508 after_end = rb_funcallv(end, succ, 0, 0);
5509 current = str_duplicate(rb_cString, beg);
5510 while (!rb_str_equal(current, after_end)) {
5511 VALUE next = Qnil;
5512 if (excl || !rb_str_equal(current, end))
5513 next = rb_funcallv(current, succ, 0, 0);
5514 if ((*each)(current, arg)) break;
5515 if (NIL_P(next)) break;
5516 current = next;
5517 StringValue(current);
5518 if (excl && rb_str_equal(current, end)) break;
5519 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5520 break;
5521 }
5522
5523 return beg;
5524}
5525
5526VALUE
5527rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5528{
5529 VALUE current;
5530 ID succ;
5531
5532 CONST_ID(succ, "succ");
5533 /* both edges are all digits */
5534 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5535 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5536 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5537 int width = RSTRING_LENINT(beg);
5538 b = rb_str_to_inum(beg, 10, FALSE);
5539 if (FIXNUM_P(b)) {
5540 long bi = FIX2LONG(b);
5541 rb_encoding *usascii = rb_usascii_encoding();
5542
5543 while (FIXABLE(bi)) {
5544 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5545 bi++;
5546 }
5547 b = LONG2NUM(bi);
5548 }
5549 args[0] = INT2FIX(width);
5550 while (1) {
5551 args[1] = b;
5552 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5553 b = rb_funcallv(b, succ, 0, 0);
5554 }
5555 }
5556 /* normal case */
5557 current = str_duplicate(rb_cString, beg);
5558 while (1) {
5559 VALUE next = rb_funcallv(current, succ, 0, 0);
5560 if ((*each)(current, arg)) break;
5561 current = next;
5562 StringValue(current);
5563 if (RSTRING_LEN(current) == 0)
5564 break;
5565 }
5566
5567 return beg;
5568}
5569
5570static int
5571include_range_i(VALUE str, VALUE arg)
5572{
5573 VALUE *argp = (VALUE *)arg;
5574 if (!rb_equal(str, *argp)) return 0;
5575 *argp = Qnil;
5576 return 1;
5577}
5578
5579VALUE
5580rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5581{
5582 beg = rb_str_new_frozen(beg);
5583 StringValue(end);
5584 end = rb_str_new_frozen(end);
5585 if (NIL_P(val)) return Qfalse;
5586 val = rb_check_string_type(val);
5587 if (NIL_P(val)) return Qfalse;
5588 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5589 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5590 rb_enc_asciicompat(STR_ENC_GET(val))) {
5591 const char *bp = RSTRING_PTR(beg);
5592 const char *ep = RSTRING_PTR(end);
5593 const char *vp = RSTRING_PTR(val);
5594 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5595 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5596 return Qfalse;
5597 else {
5598 char b = *bp;
5599 char e = *ep;
5600 char v = *vp;
5601
5602 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5603 if (b <= v && v < e) return Qtrue;
5604 return RBOOL(!RTEST(exclusive) && v == e);
5605 }
5606 }
5607 }
5608#if 0
5609 /* both edges are all digits */
5610 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5611 all_digits_p(bp, RSTRING_LEN(beg)) &&
5612 all_digits_p(ep, RSTRING_LEN(end))) {
5613 /* TODO */
5614 }
5615#endif
5616 }
5617 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5618
5619 return RBOOL(NIL_P(val));
5620}
5621
5622static VALUE
5623rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5624{
5625 if (rb_reg_search(re, str, 0, 0) >= 0) {
5626 VALUE match = rb_backref_get();
5627 int nth = rb_reg_backref_number(match, backref);
5628 return rb_reg_nth_match(nth, match);
5629 }
5630 return Qnil;
5631}
5632
5633static VALUE
5634rb_str_aref(VALUE str, VALUE indx)
5635{
5636 long idx;
5637
5638 if (FIXNUM_P(indx)) {
5639 idx = FIX2LONG(indx);
5640 }
5641 else if (RB_TYPE_P(indx, T_REGEXP)) {
5642 return rb_str_subpat(str, indx, INT2FIX(0));
5643 }
5644 else if (RB_TYPE_P(indx, T_STRING)) {
5645 if (rb_str_index(str, indx, 0) != -1)
5646 return str_duplicate(rb_cString, indx);
5647 return Qnil;
5648 }
5649 else {
5650 /* check if indx is Range */
5651 long beg, len = str_strlen(str, NULL);
5652 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5653 case Qfalse:
5654 break;
5655 case Qnil:
5656 return Qnil;
5657 default:
5658 return rb_str_substr(str, beg, len);
5659 }
5660 idx = NUM2LONG(indx);
5661 }
5662
5663 return str_substr(str, idx, 1, FALSE);
5664}
5665
5666
5667/*
5668 * call-seq:
5669 * string[index] -> new_string or nil
5670 * string[start, length] -> new_string or nil
5671 * string[range] -> new_string or nil
5672 * string[regexp, capture = 0] -> new_string or nil
5673 * string[substring] -> new_string or nil
5674 *
5675 * Returns the substring of +self+ specified by the arguments.
5676 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5677 *
5678 *
5679 */
5680
5681static VALUE
5682rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5683{
5684 if (argc == 2) {
5685 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5686 return rb_str_subpat(str, argv[0], argv[1]);
5687 }
5688 else {
5689 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5690 }
5691 }
5692 rb_check_arity(argc, 1, 2);
5693 return rb_str_aref(str, argv[0]);
5694}
5695
5696VALUE
5698{
5699 char *ptr = RSTRING_PTR(str);
5700 long olen = RSTRING_LEN(str), nlen;
5701
5702 str_modifiable(str);
5703 if (len > olen) len = olen;
5704 nlen = olen - len;
5705 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5706 char *oldptr = ptr;
5707 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5708 STR_SET_EMBED(str);
5709 ptr = RSTRING(str)->as.embed.ary;
5710 memmove(ptr, oldptr + len, nlen);
5711 if (fl == STR_NOEMBED) xfree(oldptr);
5712 }
5713 else {
5714 if (!STR_SHARED_P(str)) {
5715 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5716 rb_enc_cr_str_exact_copy(shared, str);
5717 OBJ_FREEZE(shared);
5718 }
5719 ptr = RSTRING(str)->as.heap.ptr += len;
5720 }
5721 STR_SET_LEN(str, nlen);
5722
5723 if (!SHARABLE_MIDDLE_SUBSTRING) {
5724 TERM_FILL(ptr + nlen, TERM_LEN(str));
5725 }
5727 return str;
5728}
5729
5730static void
5731rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5732{
5733 char *sptr;
5734 long slen;
5735 int cr;
5736
5737 if (beg == 0 && vlen == 0) {
5738 rb_str_drop_bytes(str, len);
5739 return;
5740 }
5741
5742 str_modify_keep_cr(str);
5743 RSTRING_GETMEM(str, sptr, slen);
5744 if (len < vlen) {
5745 /* expand string */
5746 RESIZE_CAPA(str, slen + vlen - len);
5747 sptr = RSTRING_PTR(str);
5748 }
5749
5751 cr = rb_enc_str_coderange(val);
5752 else
5754
5755 if (vlen != len) {
5756 memmove(sptr + beg + vlen,
5757 sptr + beg + len,
5758 slen - (beg + len));
5759 }
5760 if (vlen < beg && len < 0) {
5761 MEMZERO(sptr + slen, char, -len);
5762 }
5763 if (vlen > 0) {
5764 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5765 }
5766 slen += vlen - len;
5767 STR_SET_LEN(str, slen);
5768 TERM_FILL(&sptr[slen], TERM_LEN(str));
5769 ENC_CODERANGE_SET(str, cr);
5770}
5771
5772static inline void
5773rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5774{
5775 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5776}
5777
5778void
5779rb_str_update(VALUE str, long beg, long len, VALUE val)
5780{
5781 long slen;
5782 char *p, *e;
5783 rb_encoding *enc;
5784 int singlebyte = single_byte_optimizable(str);
5785 int cr;
5786
5787 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5788
5789 StringValue(val);
5790 enc = rb_enc_check(str, val);
5791 slen = str_strlen(str, enc); /* rb_enc_check */
5792
5793 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5794 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5795 }
5796 if (beg < 0) {
5797 beg += slen;
5798 }
5799 RUBY_ASSERT(beg >= 0);
5800 RUBY_ASSERT(beg <= slen);
5801
5802 if (len > slen - beg) {
5803 len = slen - beg;
5804 }
5805 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5806 if (!p) p = RSTRING_END(str);
5807 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5808 if (!e) e = RSTRING_END(str);
5809 /* error check */
5810 beg = p - RSTRING_PTR(str); /* physical position */
5811 len = e - p; /* physical length */
5812 rb_str_update_0(str, beg, len, val);
5813 rb_enc_associate(str, enc);
5815 if (cr != ENC_CODERANGE_BROKEN)
5816 ENC_CODERANGE_SET(str, cr);
5817}
5818
5819static void
5820rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5821{
5822 int nth;
5823 VALUE match;
5824 long start, end, len;
5825 rb_encoding *enc;
5826 struct re_registers *regs;
5827
5828 if (rb_reg_search(re, str, 0, 0) < 0) {
5829 rb_raise(rb_eIndexError, "regexp not matched");
5830 }
5831 match = rb_backref_get();
5832 nth = rb_reg_backref_number(match, backref);
5833 regs = RMATCH_REGS(match);
5834 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5835 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5836 }
5837 if (nth < 0) {
5838 nth += regs->num_regs;
5839 }
5840
5841 start = BEG(nth);
5842 if (start == -1) {
5843 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5844 }
5845 end = END(nth);
5846 len = end - start;
5847 StringValue(val);
5848 enc = rb_enc_check_str(str, val);
5849 rb_str_update_0(str, start, len, val);
5850 rb_enc_associate(str, enc);
5851}
5852
5853static VALUE
5854rb_str_aset(VALUE str, VALUE indx, VALUE val)
5855{
5856 long idx, beg;
5857
5858 switch (TYPE(indx)) {
5859 case T_REGEXP:
5860 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5861 return val;
5862
5863 case T_STRING:
5864 beg = rb_str_index(str, indx, 0);
5865 if (beg < 0) {
5866 rb_raise(rb_eIndexError, "string not matched");
5867 }
5868 beg = rb_str_sublen(str, beg);
5869 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5870 return val;
5871
5872 default:
5873 /* check if indx is Range */
5874 {
5875 long beg, len;
5876 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5877 rb_str_update(str, beg, len, val);
5878 return val;
5879 }
5880 }
5881 /* FALLTHROUGH */
5882
5883 case T_FIXNUM:
5884 idx = NUM2LONG(indx);
5885 rb_str_update(str, idx, 1, val);
5886 return val;
5887 }
5888}
5889
5890/*
5891 * call-seq:
5892 * string[index] = new_string
5893 * string[start, length] = new_string
5894 * string[range] = new_string
5895 * string[regexp, capture = 0] = new_string
5896 * string[substring] = new_string
5897 *
5898 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5899 * See {String Slices}[rdoc-ref:String@String+Slices].
5900 *
5901 * A few examples:
5902 *
5903 * s = 'foo'
5904 * s[2] = 'rtune' # => "rtune"
5905 * s # => "fortune"
5906 * s[1, 5] = 'init' # => "init"
5907 * s # => "finite"
5908 * s[3..4] = 'al' # => "al"
5909 * s # => "finale"
5910 * s[/e$/] = 'ly' # => "ly"
5911 * s # => "finally"
5912 * s['lly'] = 'ncial' # => "ncial"
5913 * s # => "financial"
5914 *
5915 */
5916
5917static VALUE
5918rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5919{
5920 if (argc == 3) {
5921 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5922 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5923 }
5924 else {
5925 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5926 }
5927 return argv[2];
5928 }
5929 rb_check_arity(argc, 2, 3);
5930 return rb_str_aset(str, argv[0], argv[1]);
5931}
5932
5933/*
5934 * call-seq:
5935 * insert(index, other_string) -> self
5936 *
5937 * Inserts the given +other_string+ into +self+; returns +self+.
5938 *
5939 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5940 *
5941 * 'foo'.insert(1, 'bar') # => "fbaroo"
5942 *
5943 * If the Integer +index+ is negative, counts backward from the end of +self+
5944 * and inserts +other_string+ at offset <tt>index+1</tt>
5945 * (that is, _after_ <tt>self[index]</tt>):
5946 *
5947 * 'foo'.insert(-2, 'bar') # => "fobaro"
5948 *
5949 */
5950
5951static VALUE
5952rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5953{
5954 long pos = NUM2LONG(idx);
5955
5956 if (pos == -1) {
5957 return rb_str_append(str, str2);
5958 }
5959 else if (pos < 0) {
5960 pos++;
5961 }
5962 rb_str_update(str, pos, 0, str2);
5963 return str;
5964}
5965
5966
5967/*
5968 * call-seq:
5969 * slice!(index) -> new_string or nil
5970 * slice!(start, length) -> new_string or nil
5971 * slice!(range) -> new_string or nil
5972 * slice!(regexp, capture = 0) -> new_string or nil
5973 * slice!(substring) -> new_string or nil
5974 *
5975 * Removes and returns the substring of +self+ specified by the arguments.
5976 * See {String Slices}[rdoc-ref:String@String+Slices].
5977 *
5978 * A few examples:
5979 *
5980 * string = "This is a string"
5981 * string.slice!(2) #=> "i"
5982 * string.slice!(3..6) #=> " is "
5983 * string.slice!(/s.*t/) #=> "sa st"
5984 * string.slice!("r") #=> "r"
5985 * string #=> "Thing"
5986 *
5987 */
5988
5989static VALUE
5990rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5991{
5992 VALUE result = Qnil;
5993 VALUE indx;
5994 long beg, len = 1;
5995 char *p;
5996
5997 rb_check_arity(argc, 1, 2);
5998 str_modify_keep_cr(str);
5999 indx = argv[0];
6000 if (RB_TYPE_P(indx, T_REGEXP)) {
6001 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6002 VALUE match = rb_backref_get();
6003 struct re_registers *regs = RMATCH_REGS(match);
6004 int nth = 0;
6005 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6006 if ((nth += regs->num_regs) <= 0) return Qnil;
6007 }
6008 else if (nth >= regs->num_regs) return Qnil;
6009 beg = BEG(nth);
6010 len = END(nth) - beg;
6011 goto subseq;
6012 }
6013 else if (argc == 2) {
6014 beg = NUM2LONG(indx);
6015 len = NUM2LONG(argv[1]);
6016 goto num_index;
6017 }
6018 else if (FIXNUM_P(indx)) {
6019 beg = FIX2LONG(indx);
6020 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6021 if (!len) return Qnil;
6022 beg = p - RSTRING_PTR(str);
6023 goto subseq;
6024 }
6025 else if (RB_TYPE_P(indx, T_STRING)) {
6026 beg = rb_str_index(str, indx, 0);
6027 if (beg == -1) return Qnil;
6028 len = RSTRING_LEN(indx);
6029 result = str_duplicate(rb_cString, indx);
6030 goto squash;
6031 }
6032 else {
6033 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6034 case Qnil:
6035 return Qnil;
6036 case Qfalse:
6037 beg = NUM2LONG(indx);
6038 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6039 if (!len) return Qnil;
6040 beg = p - RSTRING_PTR(str);
6041 goto subseq;
6042 default:
6043 goto num_index;
6044 }
6045 }
6046
6047 num_index:
6048 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6049 beg = p - RSTRING_PTR(str);
6050
6051 subseq:
6052 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6053 rb_enc_cr_str_copy_for_substr(result, str);
6054
6055 squash:
6056 if (len > 0) {
6057 if (beg == 0) {
6058 rb_str_drop_bytes(str, len);
6059 }
6060 else {
6061 char *sptr = RSTRING_PTR(str);
6062 long slen = RSTRING_LEN(str);
6063 if (beg + len > slen) /* pathological check */
6064 len = slen - beg;
6065 memmove(sptr + beg,
6066 sptr + beg + len,
6067 slen - (beg + len));
6068 slen -= len;
6069 STR_SET_LEN(str, slen);
6070 TERM_FILL(&sptr[slen], TERM_LEN(str));
6071 }
6072 }
6073 return result;
6074}
6075
6076static VALUE
6077get_pat(VALUE pat)
6078{
6079 VALUE val;
6080
6081 switch (OBJ_BUILTIN_TYPE(pat)) {
6082 case T_REGEXP:
6083 return pat;
6084
6085 case T_STRING:
6086 break;
6087
6088 default:
6089 val = rb_check_string_type(pat);
6090 if (NIL_P(val)) {
6091 Check_Type(pat, T_REGEXP);
6092 }
6093 pat = val;
6094 }
6095
6096 return rb_reg_regcomp(pat);
6097}
6098
6099static VALUE
6100get_pat_quoted(VALUE pat, int check)
6101{
6102 VALUE val;
6103
6104 switch (OBJ_BUILTIN_TYPE(pat)) {
6105 case T_REGEXP:
6106 return pat;
6107
6108 case T_STRING:
6109 break;
6110
6111 default:
6112 val = rb_check_string_type(pat);
6113 if (NIL_P(val)) {
6114 Check_Type(pat, T_REGEXP);
6115 }
6116 pat = val;
6117 }
6118 if (check && is_broken_string(pat)) {
6119 rb_exc_raise(rb_reg_check_preprocess(pat));
6120 }
6121 return pat;
6122}
6123
6124static long
6125rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6126{
6127 if (BUILTIN_TYPE(pat) == T_STRING) {
6128 pos = rb_str_byteindex(str, pat, pos);
6129 if (set_backref_str) {
6130 if (pos >= 0) {
6131 str = rb_str_new_frozen_String(str);
6132 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6133 }
6134 else {
6136 }
6137 }
6138 return pos;
6139 }
6140 else {
6141 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6142 }
6143}
6144
6145
6146/*
6147 * call-seq:
6148 * sub!(pattern, replacement) -> self or nil
6149 * sub!(pattern) {|match| ... } -> self or nil
6150 *
6151 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6152 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6153 *
6154 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6155 *
6156 * Related: String#sub, String#gsub, String#gsub!.
6157 *
6158 */
6159
6160static VALUE
6161rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6162{
6163 VALUE pat, repl, hash = Qnil;
6164 int iter = 0;
6165 long plen;
6166 int min_arity = rb_block_given_p() ? 1 : 2;
6167 long beg;
6168
6169 rb_check_arity(argc, min_arity, 2);
6170 if (argc == 1) {
6171 iter = 1;
6172 }
6173 else {
6174 repl = argv[1];
6175 hash = rb_check_hash_type(argv[1]);
6176 if (NIL_P(hash)) {
6177 StringValue(repl);
6178 }
6179 }
6180
6181 pat = get_pat_quoted(argv[0], 1);
6182
6183 str_modifiable(str);
6184 beg = rb_pat_search(pat, str, 0, 1);
6185 if (beg >= 0) {
6186 rb_encoding *enc;
6187 int cr = ENC_CODERANGE(str);
6188 long beg0, end0;
6189 VALUE match, match0 = Qnil;
6190 struct re_registers *regs;
6191 char *p, *rp;
6192 long len, rlen;
6193
6194 match = rb_backref_get();
6195 regs = RMATCH_REGS(match);
6196 if (RB_TYPE_P(pat, T_STRING)) {
6197 beg0 = beg;
6198 end0 = beg0 + RSTRING_LEN(pat);
6199 match0 = pat;
6200 }
6201 else {
6202 beg0 = BEG(0);
6203 end0 = END(0);
6204 if (iter) match0 = rb_reg_nth_match(0, match);
6205 }
6206
6207 if (iter || !NIL_P(hash)) {
6208 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6209
6210 if (iter) {
6211 repl = rb_obj_as_string(rb_yield(match0));
6212 }
6213 else {
6214 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6215 repl = rb_obj_as_string(repl);
6216 }
6217 str_mod_check(str, p, len);
6218 rb_check_frozen(str);
6219 }
6220 else {
6221 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6222 }
6223
6224 enc = rb_enc_compatible(str, repl);
6225 if (!enc) {
6226 rb_encoding *str_enc = STR_ENC_GET(str);
6227 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6228 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6229 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6230 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6231 rb_enc_inspect_name(str_enc),
6232 rb_enc_inspect_name(STR_ENC_GET(repl)));
6233 }
6234 enc = STR_ENC_GET(repl);
6235 }
6236 rb_str_modify(str);
6237 rb_enc_associate(str, enc);
6239 int cr2 = ENC_CODERANGE(repl);
6240 if (cr2 == ENC_CODERANGE_BROKEN ||
6241 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6243 else
6244 cr = cr2;
6245 }
6246 plen = end0 - beg0;
6247 rlen = RSTRING_LEN(repl);
6248 len = RSTRING_LEN(str);
6249 if (rlen > plen) {
6250 RESIZE_CAPA(str, len + rlen - plen);
6251 }
6252 p = RSTRING_PTR(str);
6253 if (rlen != plen) {
6254 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6255 }
6256 rp = RSTRING_PTR(repl);
6257 memmove(p + beg0, rp, rlen);
6258 len += rlen - plen;
6259 STR_SET_LEN(str, len);
6260 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6261 ENC_CODERANGE_SET(str, cr);
6262
6263 RB_GC_GUARD(match);
6264
6265 return str;
6266 }
6267 return Qnil;
6268}
6269
6270
6271/*
6272 * call-seq:
6273 * sub(pattern, replacement) -> new_string
6274 * sub(pattern) {|match| ... } -> new_string
6275 *
6276 * Returns a copy of +self+ with only the first occurrence
6277 * (not all occurrences) of the given +pattern+ replaced.
6278 *
6279 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6280 *
6281 * Related: String#sub!, String#gsub, String#gsub!.
6282 *
6283 */
6284
6285static VALUE
6286rb_str_sub(int argc, VALUE *argv, VALUE str)
6287{
6288 str = str_duplicate(rb_cString, str);
6289 rb_str_sub_bang(argc, argv, str);
6290 return str;
6291}
6292
6293static VALUE
6294str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6295{
6296 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6297 long beg, beg0, end0;
6298 long offset, blen, slen, len, last;
6299 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6300 char *sp, *cp;
6301 int need_backref = -1;
6302 rb_encoding *str_enc;
6303
6304 switch (argc) {
6305 case 1:
6306 RETURN_ENUMERATOR(str, argc, argv);
6307 mode = ITER;
6308 break;
6309 case 2:
6310 repl = argv[1];
6311 hash = rb_check_hash_type(argv[1]);
6312 if (NIL_P(hash)) {
6313 StringValue(repl);
6314 }
6315 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6316 mode = FAST_MAP;
6317 }
6318 else {
6319 mode = MAP;
6320 }
6321 break;
6322 default:
6323 rb_error_arity(argc, 1, 2);
6324 }
6325
6326 pat = get_pat_quoted(argv[0], 1);
6327 beg = rb_pat_search(pat, str, 0, need_backref);
6328 if (beg < 0) {
6329 if (bang) return Qnil; /* no match, no substitution */
6330 return str_duplicate(rb_cString, str);
6331 }
6332
6333 offset = 0;
6334 blen = RSTRING_LEN(str) + 30; /* len + margin */
6335 dest = rb_str_buf_new(blen);
6336 sp = RSTRING_PTR(str);
6337 slen = RSTRING_LEN(str);
6338 cp = sp;
6339 str_enc = STR_ENC_GET(str);
6340 rb_enc_associate(dest, str_enc);
6341 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6342
6343 do {
6344 VALUE match = rb_backref_get();
6345 struct re_registers *regs = RMATCH_REGS(match);
6346 if (RB_TYPE_P(pat, T_STRING)) {
6347 beg0 = beg;
6348 end0 = beg0 + RSTRING_LEN(pat);
6349 match0 = pat;
6350 }
6351 else {
6352 beg0 = BEG(0);
6353 end0 = END(0);
6354 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6355 }
6356
6357 if (mode) {
6358 if (mode == ITER) {
6359 val = rb_obj_as_string(rb_yield(match0));
6360 }
6361 else {
6362 struct RString fake_str;
6363 VALUE key;
6364 if (mode == FAST_MAP) {
6365 // It is safe to use a fake_str here because we established that it won't escape,
6366 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6367 // default proc.
6368 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6369 }
6370 else {
6371 key = rb_str_subseq(str, beg0, end0 - beg0);
6372 }
6373 val = rb_hash_aref(hash, key);
6374 val = rb_obj_as_string(val);
6375 }
6376 str_mod_check(str, sp, slen);
6377 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6378 rb_raise(rb_eRuntimeError, "block should not cheat");
6379 }
6380 }
6381 else if (need_backref) {
6382 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6383 if (need_backref < 0) {
6384 need_backref = val != repl;
6385 }
6386 }
6387 else {
6388 val = repl;
6389 }
6390
6391 len = beg0 - offset; /* copy pre-match substr */
6392 if (len) {
6393 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6394 }
6395
6396 rb_str_buf_append(dest, val);
6397
6398 last = offset;
6399 offset = end0;
6400 if (beg0 == end0) {
6401 /*
6402 * Always consume at least one character of the input string
6403 * in order to prevent infinite loops.
6404 */
6405 if (RSTRING_LEN(str) <= end0) break;
6406 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6407 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6408 offset = end0 + len;
6409 }
6410 cp = RSTRING_PTR(str) + offset;
6411 if (offset > RSTRING_LEN(str)) break;
6412 beg = rb_pat_search(pat, str, offset, need_backref);
6413
6414 RB_GC_GUARD(match);
6415 } while (beg >= 0);
6416 if (RSTRING_LEN(str) > offset) {
6417 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6418 }
6419 rb_pat_search(pat, str, last, 1);
6420 if (bang) {
6421 str_shared_replace(str, dest);
6422 }
6423 else {
6424 str = dest;
6425 }
6426
6427 return str;
6428}
6429
6430
6431/*
6432 * call-seq:
6433 * gsub!(pattern, replacement) -> self or nil
6434 * gsub!(pattern) {|match| ... } -> self or nil
6435 * gsub!(pattern) -> an_enumerator
6436 *
6437 * Performs the specified substring replacement(s) on +self+;
6438 * returns +self+ if any replacement occurred, +nil+ otherwise.
6439 *
6440 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6441 *
6442 * Returns an Enumerator if no +replacement+ and no block given.
6443 *
6444 * Related: String#sub, String#gsub, String#sub!.
6445 *
6446 */
6447
6448static VALUE
6449rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6450{
6451 str_modify_keep_cr(str);
6452 return str_gsub(argc, argv, str, 1);
6453}
6454
6455
6456/*
6457 * call-seq:
6458 * gsub(pattern, replacement) -> new_string
6459 * gsub(pattern) {|match| ... } -> new_string
6460 * gsub(pattern) -> enumerator
6461 *
6462 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6463 *
6464 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6465 *
6466 * Returns an Enumerator if no +replacement+ and no block given.
6467 *
6468 * Related: String#sub, String#sub!, String#gsub!.
6469 *
6470 */
6471
6472static VALUE
6473rb_str_gsub(int argc, VALUE *argv, VALUE str)
6474{
6475 return str_gsub(argc, argv, str, 0);
6476}
6477
6478
6479/*
6480 * call-seq:
6481 * replace(other_string) -> self
6482 *
6483 * Replaces the contents of +self+ with the contents of +other_string+:
6484 *
6485 * s = 'foo' # => "foo"
6486 * s.replace('bar') # => "bar"
6487 *
6488 */
6489
6490VALUE
6492{
6493 str_modifiable(str);
6494 if (str == str2) return str;
6495
6496 StringValue(str2);
6497 str_discard(str);
6498 return str_replace(str, str2);
6499}
6500
6501/*
6502 * call-seq:
6503 * clear -> self
6504 *
6505 * Removes the contents of +self+:
6506 *
6507 * s = 'foo' # => "foo"
6508 * s.clear # => ""
6509 *
6510 */
6511
6512static VALUE
6513rb_str_clear(VALUE str)
6514{
6515 str_discard(str);
6516 STR_SET_EMBED(str);
6517 STR_SET_LEN(str, 0);
6518 RSTRING_PTR(str)[0] = 0;
6519 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6521 else
6523 return str;
6524}
6525
6526/*
6527 * call-seq:
6528 * chr -> string
6529 *
6530 * Returns a string containing the first character of +self+:
6531 *
6532 * s = 'foo' # => "foo"
6533 * s.chr # => "f"
6534 *
6535 */
6536
6537static VALUE
6538rb_str_chr(VALUE str)
6539{
6540 return rb_str_substr(str, 0, 1);
6541}
6542
6543/*
6544 * call-seq:
6545 * getbyte(index) -> integer or nil
6546 *
6547 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6548 *
6549 * s = 'abcde' # => "abcde"
6550 * s.getbyte(0) # => 97
6551 * s.getbyte(-1) # => 101
6552 * s.getbyte(5) # => nil
6553 *
6554 * Related: String#setbyte.
6555 */
6556VALUE
6557rb_str_getbyte(VALUE str, VALUE index)
6558{
6559 long pos = NUM2LONG(index);
6560
6561 if (pos < 0)
6562 pos += RSTRING_LEN(str);
6563 if (pos < 0 || RSTRING_LEN(str) <= pos)
6564 return Qnil;
6565
6566 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6567}
6568
6569/*
6570 * call-seq:
6571 * setbyte(index, integer) -> integer
6572 *
6573 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6574 *
6575 * s = 'abcde' # => "abcde"
6576 * s.setbyte(0, 98) # => 98
6577 * s # => "bbcde"
6578 *
6579 * Related: String#getbyte.
6580 */
6581VALUE
6582rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6583{
6584 long pos = NUM2LONG(index);
6585 long len = RSTRING_LEN(str);
6586 char *ptr, *head, *left = 0;
6587 rb_encoding *enc;
6588 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6589
6590 if (pos < -len || len <= pos)
6591 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6592 if (pos < 0)
6593 pos += len;
6594
6595 VALUE v = rb_to_int(value);
6596 VALUE w = rb_int_and(v, INT2FIX(0xff));
6597 char byte = (char)(NUM2INT(w) & 0xFF);
6598
6599 if (!str_independent(str))
6600 str_make_independent(str);
6601 enc = STR_ENC_GET(str);
6602 head = RSTRING_PTR(str);
6603 ptr = &head[pos];
6604 if (!STR_EMBED_P(str)) {
6605 cr = ENC_CODERANGE(str);
6606 switch (cr) {
6607 case ENC_CODERANGE_7BIT:
6608 left = ptr;
6609 *ptr = byte;
6610 if (ISASCII(byte)) goto end;
6611 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6612 if (!MBCLEN_CHARFOUND_P(nlen))
6614 else
6616 goto end;
6618 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6619 width = rb_enc_precise_mbclen(left, head+len, enc);
6620 *ptr = byte;
6621 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6622 if (!MBCLEN_CHARFOUND_P(nlen))
6624 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6626 goto end;
6627 }
6628 }
6630 *ptr = byte;
6631
6632 end:
6633 return value;
6634}
6635
6636static VALUE
6637str_byte_substr(VALUE str, long beg, long len, int empty)
6638{
6639 long n = RSTRING_LEN(str);
6640
6641 if (beg > n || len < 0) return Qnil;
6642 if (beg < 0) {
6643 beg += n;
6644 if (beg < 0) return Qnil;
6645 }
6646 if (len > n - beg)
6647 len = n - beg;
6648 if (len <= 0) {
6649 if (!empty) return Qnil;
6650 len = 0;
6651 }
6652
6653 VALUE str2 = str_subseq(str, beg, len);
6654
6655 str_enc_copy_direct(str2, str);
6656
6657 if (RSTRING_LEN(str2) == 0) {
6658 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6660 else
6662 }
6663 else {
6664 switch (ENC_CODERANGE(str)) {
6665 case ENC_CODERANGE_7BIT:
6667 break;
6668 default:
6670 break;
6671 }
6672 }
6673
6674 return str2;
6675}
6676
6677VALUE
6678rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6679{
6680 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6681}
6682
6683static VALUE
6684str_byte_aref(VALUE str, VALUE indx)
6685{
6686 long idx;
6687 if (FIXNUM_P(indx)) {
6688 idx = FIX2LONG(indx);
6689 }
6690 else {
6691 /* check if indx is Range */
6692 long beg, len = RSTRING_LEN(str);
6693
6694 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6695 case Qfalse:
6696 break;
6697 case Qnil:
6698 return Qnil;
6699 default:
6700 return str_byte_substr(str, beg, len, TRUE);
6701 }
6702
6703 idx = NUM2LONG(indx);
6704 }
6705 return str_byte_substr(str, idx, 1, FALSE);
6706}
6707
6708/*
6709 * call-seq:
6710 * byteslice(index, length = 1) -> string or nil
6711 * byteslice(range) -> string or nil
6712 *
6713 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6714 *
6715 * With integer arguments +index+ and +length+ given,
6716 * returns the substring beginning at the given +index+
6717 * of the given +length+ (if possible),
6718 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6719 *
6720 * s = '0123456789' # => "0123456789"
6721 * s.byteslice(2) # => "2"
6722 * s.byteslice(200) # => nil
6723 * s.byteslice(4, 3) # => "456"
6724 * s.byteslice(4, 30) # => "456789"
6725 * s.byteslice(4, -1) # => nil
6726 * s.byteslice(40, 2) # => nil
6727 *
6728 * In either case above, counts backwards from the end of +self+
6729 * if +index+ is negative:
6730 *
6731 * s = '0123456789' # => "0123456789"
6732 * s.byteslice(-4) # => "6"
6733 * s.byteslice(-4, 3) # => "678"
6734 *
6735 * With Range argument +range+ given, returns
6736 * <tt>byteslice(range.begin, range.size)</tt>:
6737 *
6738 * s = '0123456789' # => "0123456789"
6739 * s.byteslice(4..6) # => "456"
6740 * s.byteslice(-6..-4) # => "456"
6741 * s.byteslice(5..2) # => "" # range.size is zero.
6742 * s.byteslice(40..42) # => nil
6743 *
6744 * In all cases, a returned string has the same encoding as +self+:
6745 *
6746 * s.encoding # => #<Encoding:UTF-8>
6747 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6748 *
6749 */
6750
6751static VALUE
6752rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6753{
6754 if (argc == 2) {
6755 long beg = NUM2LONG(argv[0]);
6756 long len = NUM2LONG(argv[1]);
6757 return str_byte_substr(str, beg, len, TRUE);
6758 }
6759 rb_check_arity(argc, 1, 2);
6760 return str_byte_aref(str, argv[0]);
6761}
6762
6763static void
6764str_check_beg_len(VALUE str, long *beg, long *len)
6765{
6766 long end, slen = RSTRING_LEN(str);
6767
6768 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6769 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6770 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6771 }
6772 if (*beg < 0) {
6773 *beg += slen;
6774 }
6775 RUBY_ASSERT(*beg >= 0);
6776 RUBY_ASSERT(*beg <= slen);
6777
6778 if (*len > slen - *beg) {
6779 *len = slen - *beg;
6780 }
6781 end = *beg + *len;
6782 str_ensure_byte_pos(str, *beg);
6783 str_ensure_byte_pos(str, end);
6784}
6785
6786/*
6787 * call-seq:
6788 * bytesplice(index, length, str) -> string
6789 * bytesplice(index, length, str, str_index, str_length) -> string
6790 * bytesplice(range, str) -> string
6791 * bytesplice(range, str, str_range) -> string
6792 *
6793 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6794 * The portion of the string affected is determined using
6795 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6796 * If the replacement string is not the same length as the text it is replacing,
6797 * the string will be adjusted accordingly.
6798 *
6799 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6800 *
 * The forms that take Integer arguments raise an IndexError if a value is out
 * of range; the Range forms raise a RangeError.
6803 * If the beginning or ending offset does not land on character (codepoint)
6804 * boundary, an IndexError will be raised.
6805 */
6806
6807static VALUE
6808rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6809{
6810 long beg, len, vbeg, vlen;
6811 VALUE val;
6812 int cr;
6813
6814 rb_check_arity(argc, 2, 5);
6815 if (!(argc == 2 || argc == 3 || argc == 5)) {
6816 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6817 }
6818 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6819 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6820 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6821 rb_builtin_class_name(argv[0]));
6822 }
6823 val = argv[1];
6824 StringValue(val);
6825 if (argc == 2) {
6826 /* bytesplice(range, str) */
6827 vbeg = 0;
6828 vlen = RSTRING_LEN(val);
6829 }
6830 else {
6831 /* bytesplice(range, str, str_range) */
6832 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6833 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6834 rb_builtin_class_name(argv[2]));
6835 }
6836 }
6837 }
6838 else {
6839 beg = NUM2LONG(argv[0]);
6840 len = NUM2LONG(argv[1]);
6841 val = argv[2];
6842 StringValue(val);
6843 if (argc == 3) {
6844 /* bytesplice(index, length, str) */
6845 vbeg = 0;
6846 vlen = RSTRING_LEN(val);
6847 }
6848 else {
6849 /* bytesplice(index, length, str, str_index, str_length) */
6850 vbeg = NUM2LONG(argv[3]);
6851 vlen = NUM2LONG(argv[4]);
6852 }
6853 }
6854 str_check_beg_len(str, &beg, &len);
6855 str_check_beg_len(val, &vbeg, &vlen);
6856 str_modify_keep_cr(str);
6857
6858 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6859 rb_enc_associate(str, rb_enc_check(str, val));
6860 }
6861
6862 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6864 if (cr != ENC_CODERANGE_BROKEN)
6865 ENC_CODERANGE_SET(str, cr);
6866 return str;
6867}
6868
6869/*
6870 * call-seq:
6871 * reverse -> string
6872 *
6873 * Returns a new string with the characters from +self+ in reverse order.
6874 *
6875 * 'stressed'.reverse # => "desserts"
6876 *
6877 */
6878
6879static VALUE
6880rb_str_reverse(VALUE str)
6881{
6882 rb_encoding *enc;
6883 VALUE rev;
6884 char *s, *e, *p;
6885 int cr;
6886
6887 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6888 enc = STR_ENC_GET(str);
6889 rev = rb_str_new(0, RSTRING_LEN(str));
6890 s = RSTRING_PTR(str); e = RSTRING_END(str);
6891 p = RSTRING_END(rev);
6892 cr = ENC_CODERANGE(str);
6893
6894 if (RSTRING_LEN(str) > 1) {
6895 if (single_byte_optimizable(str)) {
6896 while (s < e) {
6897 *--p = *s++;
6898 }
6899 }
6900 else if (cr == ENC_CODERANGE_VALID) {
6901 while (s < e) {
6902 int clen = rb_enc_fast_mbclen(s, e, enc);
6903
6904 p -= clen;
6905 memcpy(p, s, clen);
6906 s += clen;
6907 }
6908 }
6909 else {
6910 cr = rb_enc_asciicompat(enc) ?
6912 while (s < e) {
6913 int clen = rb_enc_mbclen(s, e, enc);
6914
6915 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6916 p -= clen;
6917 memcpy(p, s, clen);
6918 s += clen;
6919 }
6920 }
6921 }
6922 STR_SET_LEN(rev, RSTRING_LEN(str));
6923 str_enc_copy_direct(rev, str);
6924 ENC_CODERANGE_SET(rev, cr);
6925
6926 return rev;
6927}
6928
6929
6930/*
6931 * call-seq:
6932 * reverse! -> self
6933 *
6934 * Returns +self+ with its characters reversed:
6935 *
6936 * s = 'stressed'
6937 * s.reverse! # => "desserts"
6938 * s # => "desserts"
6939 *
6940 */
6941
6942static VALUE
6943rb_str_reverse_bang(VALUE str)
6944{
6945 if (RSTRING_LEN(str) > 1) {
6946 if (single_byte_optimizable(str)) {
6947 char *s, *e, c;
6948
6949 str_modify_keep_cr(str);
6950 s = RSTRING_PTR(str);
6951 e = RSTRING_END(str) - 1;
6952 while (s < e) {
6953 c = *s;
6954 *s++ = *e;
6955 *e-- = c;
6956 }
6957 }
6958 else {
6959 str_shared_replace(str, rb_str_reverse(str));
6960 }
6961 }
6962 else {
6963 str_modify_keep_cr(str);
6964 }
6965 return str;
6966}
6967
6968
6969/*
6970 * call-seq:
6971 * include?(other_string) -> true or false
6972 *
6973 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6974 *
6975 * s = 'foo'
6976 * s.include?('f') # => true
6977 * s.include?('fo') # => true
6978 * s.include?('food') # => false
6979 *
6980 */
6981
6982VALUE
6983rb_str_include(VALUE str, VALUE arg)
6984{
6985 long i;
6986
6987 StringValue(arg);
6988 i = rb_str_index(str, arg, 0);
6989
6990 return RBOOL(i != -1);
6991}
6992
6993
6994/*
6995 * call-seq:
6996 * to_i(base = 10) -> integer
6997 *
6998 * Returns the result of interpreting leading characters in +self+
6999 * as an integer in the given +base+ (which must be in (0, 2..36)):
7000 *
7001 * '123456'.to_i # => 123456
7002 * '123def'.to_i(16) # => 1195503
7003 *
 * With +base+ zero, +self+ may contain leading characters
7005 * to specify the actual base:
7006 *
7007 * '123def'.to_i(0) # => 123
7008 * '0123def'.to_i(0) # => 83
7009 * '0b123def'.to_i(0) # => 1
7010 * '0o123def'.to_i(0) # => 83
7011 * '0d123def'.to_i(0) # => 123
7012 * '0x123def'.to_i(0) # => 1195503
7013 *
7014 * Characters past a leading valid number (in the given +base+) are ignored:
7015 *
7016 * '12.345'.to_i # => 12
7017 * '12345'.to_i(2) # => 1
7018 *
7019 * Returns zero if there is no leading valid number:
7020 *
7021 * 'abcdef'.to_i # => 0
7022 * '2'.to_i(2) # => 0
7023 *
7024 */
7025
7026static VALUE
7027rb_str_to_i(int argc, VALUE *argv, VALUE str)
7028{
7029 int base = 10;
7030
7031 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7032 rb_raise(rb_eArgError, "invalid radix %d", base);
7033 }
7034 return rb_str_to_inum(str, base, FALSE);
7035}
7036
7037
7038/*
7039 * call-seq:
7040 * to_f -> float
7041 *
7042 * Returns the result of interpreting leading characters in +self+ as a Float:
7043 *
7044 * '3.14159'.to_f # => 3.14159
7045 * '1.234e-2'.to_f # => 0.01234
7046 *
 * Characters past a leading valid number are ignored:
7048 *
7049 * '3.14 (pi to two places)'.to_f # => 3.14
7050 *
7051 * Returns zero if there is no leading valid number:
7052 *
7053 * 'abcdef'.to_f # => 0.0
7054 *
7055 */
7056
7057static VALUE
7058rb_str_to_f(VALUE str)
7059{
7060 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7061}
7062
7063
7064/*
7065 * call-seq:
7066 * to_s -> self or string
7067 *
 * Returns +self+ if +self+ is a +String+,
 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7070 */
7071
7072static VALUE
7073rb_str_to_s(VALUE str)
7074{
7075 if (rb_obj_class(str) != rb_cString) {
7076 return str_duplicate(rb_cString, str);
7077 }
7078 return str;
7079}
7080
#if 0
/* Disabled helper: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7092
7093#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7094
7095int
7096rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7097{
7098 char buf[CHAR_ESC_LEN + 1];
7099 int l;
7100
7101#if SIZEOF_INT > 4
7102 c &= 0xffffffff;
7103#endif
7104 if (unicode_p) {
7105 if (c < 0x7F && ISPRINT(c)) {
7106 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7107 }
7108 else if (c < 0x10000) {
7109 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7110 }
7111 else {
7112 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7113 }
7114 }
7115 else {
7116 if (c < 0x100) {
7117 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7118 }
7119 else {
7120 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7121 }
7122 }
7123 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7124 rb_str_buf_cat(result, buf, l);
7125 return l;
7126}
7127
/*
 * Returns the backslash escape used for control character +c+
 * (e.g. "\\n" for a newline, "\\c?" for 0x7f), or NULL when +c+ has no
 * dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } table[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].code == c) {
            return table[i].escape;
        }
    }
    return NULL;
}
7145
7146VALUE
7147rb_str_escape(VALUE str)
7148{
7149 int encidx = ENCODING_GET(str);
7150 rb_encoding *enc = rb_enc_from_index(encidx);
7151 const char *p = RSTRING_PTR(str);
7152 const char *pend = RSTRING_END(str);
7153 const char *prev = p;
7154 char buf[CHAR_ESC_LEN + 1];
7155 VALUE result = rb_str_buf_new(0);
7156 int unicode_p = rb_enc_unicode_p(enc);
7157 int asciicompat = rb_enc_asciicompat(enc);
7158
7159 while (p < pend) {
7160 unsigned int c;
7161 const char *cc;
7162 int n = rb_enc_precise_mbclen(p, pend, enc);
7163 if (!MBCLEN_CHARFOUND_P(n)) {
7164 if (p > prev) str_buf_cat(result, prev, p - prev);
7165 n = rb_enc_mbminlen(enc);
7166 if (pend < p + n)
7167 n = (int)(pend - p);
7168 while (n--) {
7169 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7170 str_buf_cat(result, buf, strlen(buf));
7171 prev = ++p;
7172 }
7173 continue;
7174 }
7175 n = MBCLEN_CHARFOUND_LEN(n);
7176 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7177 p += n;
7178 cc = ruby_escaped_char(c);
7179 if (cc) {
7180 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7181 str_buf_cat(result, cc, strlen(cc));
7182 prev = p;
7183 }
7184 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7185 }
7186 else {
7187 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7188 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7189 prev = p;
7190 }
7191 }
7192 if (p > prev) str_buf_cat(result, prev, p - prev);
7193 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7194
7195 return result;
7196}
7197
7198/*
7199 * call-seq:
7200 * inspect -> string
7201 *
7202 * Returns a printable version of +self+, enclosed in double-quotes,
7203 * and with special characters escaped:
7204 *
7205 * s = "foo\tbar\tbaz\n"
7206 * s.inspect
7207 * # => "\"foo\\tbar\\tbaz\\n\""
7208 *
7209 */
7210
7211VALUE
7213{
7214 int encidx = ENCODING_GET(str);
7215 rb_encoding *enc = rb_enc_from_index(encidx);
7216 const char *p, *pend, *prev;
7217 char buf[CHAR_ESC_LEN + 1];
7218 VALUE result = rb_str_buf_new(0);
7219 rb_encoding *resenc = rb_default_internal_encoding();
7220 int unicode_p = rb_enc_unicode_p(enc);
7221 int asciicompat = rb_enc_asciicompat(enc);
7222
7223 if (resenc == NULL) resenc = rb_default_external_encoding();
7224 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7225 rb_enc_associate(result, resenc);
7226 str_buf_cat2(result, "\"");
7227
7228 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7229 prev = p;
7230 while (p < pend) {
7231 unsigned int c, cc;
7232 int n;
7233
7234 n = rb_enc_precise_mbclen(p, pend, enc);
7235 if (!MBCLEN_CHARFOUND_P(n)) {
7236 if (p > prev) str_buf_cat(result, prev, p - prev);
7237 n = rb_enc_mbminlen(enc);
7238 if (pend < p + n)
7239 n = (int)(pend - p);
7240 while (n--) {
7241 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7242 str_buf_cat(result, buf, strlen(buf));
7243 prev = ++p;
7244 }
7245 continue;
7246 }
7247 n = MBCLEN_CHARFOUND_LEN(n);
7248 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7249 p += n;
7250 if ((asciicompat || unicode_p) &&
7251 (c == '"'|| c == '\\' ||
7252 (c == '#' &&
7253 p < pend &&
7254 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7255 (cc = rb_enc_codepoint(p,pend,enc),
7256 (cc == '$' || cc == '@' || cc == '{'))))) {
7257 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7258 str_buf_cat2(result, "\\");
7259 if (asciicompat || enc == resenc) {
7260 prev = p - n;
7261 continue;
7262 }
7263 }
7264 switch (c) {
7265 case '\n': cc = 'n'; break;
7266 case '\r': cc = 'r'; break;
7267 case '\t': cc = 't'; break;
7268 case '\f': cc = 'f'; break;
7269 case '\013': cc = 'v'; break;
7270 case '\010': cc = 'b'; break;
7271 case '\007': cc = 'a'; break;
7272 case 033: cc = 'e'; break;
7273 default: cc = 0; break;
7274 }
7275 if (cc) {
7276 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7277 buf[0] = '\\';
7278 buf[1] = (char)cc;
7279 str_buf_cat(result, buf, 2);
7280 prev = p;
7281 continue;
7282 }
7283 /* The special casing of 0x85 (NEXT_LINE) here is because
7284 * Oniguruma historically treats it as printable, but it
7285 * doesn't match the print POSIX bracket class or character
7286 * property in regexps.
7287 *
7288 * See Ruby Bug #16842 for details:
7289 * https://bugs.ruby-lang.org/issues/16842
7290 */
7291 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7292 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7293 continue;
7294 }
7295 else {
7296 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7297 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7298 prev = p;
7299 continue;
7300 }
7301 }
7302 if (p > prev) str_buf_cat(result, prev, p - prev);
7303 str_buf_cat2(result, "\"");
7304
7305 return result;
7306}
7307
/*
 * True when +p+ lies before +e+ and points at '$', '@' or '{'.
 * Both arguments are evaluated more than once.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7309
7310/*
7311 * call-seq:
7312 * dump -> string
7313 *
7314 * Returns a printable version of +self+, enclosed in double-quotes,
7315 * with special characters escaped, and with non-printing characters
7316 * replaced by hexadecimal notation:
7317 *
7318 * "hello \n ''".dump # => "\"hello \\n ''\""
7319 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7320 *
7321 * Related: String#undump (inverse of String#dump).
7322 *
7323 */
7324
7325VALUE
7327{
7328 int encidx = rb_enc_get_index(str);
7329 rb_encoding *enc = rb_enc_from_index(encidx);
7330 long len;
7331 const char *p, *pend;
7332 char *q, *qend;
7333 VALUE result;
7334 int u8 = (encidx == rb_utf8_encindex());
7335 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7336
7337 len = 2; /* "" */
7338 if (!rb_enc_asciicompat(enc)) {
7339 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7340 len += strlen(enc->name);
7341 }
7342
7343 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7344 while (p < pend) {
7345 int clen;
7346 unsigned char c = *p++;
7347
7348 switch (c) {
7349 case '"': case '\\':
7350 case '\n': case '\r':
7351 case '\t': case '\f':
7352 case '\013': case '\010': case '\007': case '\033':
7353 clen = 2;
7354 break;
7355
7356 case '#':
7357 clen = IS_EVSTR(p, pend) ? 2 : 1;
7358 break;
7359
7360 default:
7361 if (ISPRINT(c)) {
7362 clen = 1;
7363 }
7364 else {
7365 if (u8 && c > 0x7F) { /* \u notation */
7366 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7367 if (MBCLEN_CHARFOUND_P(n)) {
7368 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7369 if (cc <= 0xFFFF)
7370 clen = 6; /* \uXXXX */
7371 else if (cc <= 0xFFFFF)
7372 clen = 9; /* \u{XXXXX} */
7373 else
7374 clen = 10; /* \u{XXXXXX} */
7375 p += MBCLEN_CHARFOUND_LEN(n)-1;
7376 break;
7377 }
7378 }
7379 clen = 4; /* \xNN */
7380 }
7381 break;
7382 }
7383
7384 if (clen > LONG_MAX - len) {
7385 rb_raise(rb_eRuntimeError, "string size too big");
7386 }
7387 len += clen;
7388 }
7389
7390 result = rb_str_new(0, len);
7391 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7392 q = RSTRING_PTR(result); qend = q + len + 1;
7393
7394 *q++ = '"';
7395 while (p < pend) {
7396 unsigned char c = *p++;
7397
7398 if (c == '"' || c == '\\') {
7399 *q++ = '\\';
7400 *q++ = c;
7401 }
7402 else if (c == '#') {
7403 if (IS_EVSTR(p, pend)) *q++ = '\\';
7404 *q++ = '#';
7405 }
7406 else if (c == '\n') {
7407 *q++ = '\\';
7408 *q++ = 'n';
7409 }
7410 else if (c == '\r') {
7411 *q++ = '\\';
7412 *q++ = 'r';
7413 }
7414 else if (c == '\t') {
7415 *q++ = '\\';
7416 *q++ = 't';
7417 }
7418 else if (c == '\f') {
7419 *q++ = '\\';
7420 *q++ = 'f';
7421 }
7422 else if (c == '\013') {
7423 *q++ = '\\';
7424 *q++ = 'v';
7425 }
7426 else if (c == '\010') {
7427 *q++ = '\\';
7428 *q++ = 'b';
7429 }
7430 else if (c == '\007') {
7431 *q++ = '\\';
7432 *q++ = 'a';
7433 }
7434 else if (c == '\033') {
7435 *q++ = '\\';
7436 *q++ = 'e';
7437 }
7438 else if (ISPRINT(c)) {
7439 *q++ = c;
7440 }
7441 else {
7442 *q++ = '\\';
7443 if (u8) {
7444 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7445 if (MBCLEN_CHARFOUND_P(n)) {
7446 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7447 p += n;
7448 if (cc <= 0xFFFF)
7449 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7450 else
7451 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7452 q += strlen(q);
7453 continue;
7454 }
7455 }
7456 snprintf(q, qend-q, "x%02X", c);
7457 q += 3;
7458 }
7459 }
7460 *q++ = '"';
7461 *q = '\0';
7462 if (!rb_enc_asciicompat(enc)) {
7463 snprintf(q, qend-q, nonascii_suffix, enc->name);
7464 encidx = rb_ascii8bit_encindex();
7465 }
7466 /* result from dump is ASCII */
7467 rb_enc_associate_index(result, encidx);
7469 return result;
7470}
7471
/*
 * Maps the letter of a single-character backslash escape (the 'n' of "\n",
 * and so on) to the character it denotes.
 *
 * Returns the unescaped character, or -1 when c is not one of the letters
 * handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* never fall off the end of a non-void function */
        return -1;
    }
}
7495
7496static void
7497undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7498{
7499 const char *s = *ss;
7500 unsigned int c;
7501 int codelen;
7502 size_t hexlen;
7503 unsigned char buf[6];
7504 static rb_encoding *enc_utf8 = NULL;
7505
7506 switch (*s) {
7507 case '\\':
7508 case '"':
7509 case '#':
7510 rb_str_cat(undumped, s, 1); /* cat itself */
7511 s++;
7512 break;
7513 case 'n':
7514 case 'r':
7515 case 't':
7516 case 'f':
7517 case 'v':
7518 case 'b':
7519 case 'a':
7520 case 'e':
7521 *buf = unescape_ascii(*s);
7522 rb_str_cat(undumped, (char *)buf, 1);
7523 s++;
7524 break;
7525 case 'u':
7526 if (*binary) {
7527 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7528 }
7529 *utf8 = true;
7530 if (++s >= s_end) {
7531 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7532 }
7533 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7534 if (*penc != enc_utf8) {
7535 *penc = enc_utf8;
7536 rb_enc_associate(undumped, enc_utf8);
7537 }
7538 if (*s == '{') { /* handle \u{...} form */
7539 s++;
7540 for (;;) {
7541 if (s >= s_end) {
7542 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7543 }
7544 if (*s == '}') {
7545 s++;
7546 break;
7547 }
7548 if (ISSPACE(*s)) {
7549 s++;
7550 continue;
7551 }
7552 c = scan_hex(s, s_end-s, &hexlen);
7553 if (hexlen == 0 || hexlen > 6) {
7554 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7555 }
7556 if (c > 0x10ffff) {
7557 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7558 }
7559 if (0xd800 <= c && c <= 0xdfff) {
7560 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7561 }
7562 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7563 rb_str_cat(undumped, (char *)buf, codelen);
7564 s += hexlen;
7565 }
7566 }
7567 else { /* handle \uXXXX form */
7568 c = scan_hex(s, 4, &hexlen);
7569 if (hexlen != 4) {
7570 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7571 }
7572 if (0xd800 <= c && c <= 0xdfff) {
7573 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7574 }
7575 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7576 rb_str_cat(undumped, (char *)buf, codelen);
7577 s += hexlen;
7578 }
7579 break;
7580 case 'x':
7581 if (*utf8) {
7582 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7583 }
7584 *binary = true;
7585 if (++s >= s_end) {
7586 rb_raise(rb_eRuntimeError, "invalid hex escape");
7587 }
7588 *buf = scan_hex(s, 2, &hexlen);
7589 if (hexlen != 2) {
7590 rb_raise(rb_eRuntimeError, "invalid hex escape");
7591 }
7592 rb_str_cat(undumped, (char *)buf, 1);
7593 s += hexlen;
7594 break;
7595 default:
7596 rb_str_cat(undumped, s-1, 2);
7597 s++;
7598 }
7599
7600 *ss = s;
7601}
7602
7603static VALUE rb_str_is_ascii_only_p(VALUE str);
7604
7605/*
7606 * call-seq:
7607 * undump -> string
7608 *
7609 * Returns an unescaped version of +self+:
7610 *
7611 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7612 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7613 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7614 * s_undumped == s_orig # => true
7615 *
7616 * Related: String#dump (inverse of String#undump).
7617 *
7618 */
7619
7620static VALUE
7621str_undump(VALUE str)
7622{
7623 const char *s = RSTRING_PTR(str);
7624 const char *s_end = RSTRING_END(str);
7625 rb_encoding *enc = rb_enc_get(str);
7626 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7627 bool utf8 = false;
7628 bool binary = false;
7629 int w;
7630
7632 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7633 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7634 }
7635 if (!str_null_check(str, &w)) {
7636 rb_raise(rb_eRuntimeError, "string contains null byte");
7637 }
7638 if (RSTRING_LEN(str) < 2) goto invalid_format;
7639 if (*s != '"') goto invalid_format;
7640
7641 /* strip '"' at the start */
7642 s++;
7643
7644 for (;;) {
7645 if (s >= s_end) {
7646 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7647 }
7648
7649 if (*s == '"') {
7650 /* epilogue */
7651 s++;
7652 if (s == s_end) {
7653 /* ascii compatible dumped string */
7654 break;
7655 }
7656 else {
7657 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7658 static const char dup_suffix[] = ".dup";
7659 const char *encname;
7660 int encidx;
7661 ptrdiff_t size;
7662
7663 /* check separately for strings dumped by older versions */
7664 size = sizeof(dup_suffix) - 1;
7665 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7666
7667 size = sizeof(force_encoding_suffix) - 1;
7668 if (s_end - s <= size) goto invalid_format;
7669 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7670 s += size;
7671
7672 if (utf8) {
7673 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7674 }
7675
7676 encname = s;
7677 s = memchr(s, '"', s_end-s);
7678 size = s - encname;
7679 if (!s) goto invalid_format;
7680 if (s_end - s != 2) goto invalid_format;
7681 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7682
7683 encidx = rb_enc_find_index2(encname, (long)size);
7684 if (encidx < 0) {
7685 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7686 }
7687 rb_enc_associate_index(undumped, encidx);
7688 }
7689 break;
7690 }
7691
7692 if (*s == '\\') {
7693 s++;
7694 if (s >= s_end) {
7695 rb_raise(rb_eRuntimeError, "invalid escape");
7696 }
7697 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7698 }
7699 else {
7700 rb_str_cat(undumped, s++, 1);
7701 }
7702 }
7703
7704 RB_GC_GUARD(str);
7705
7706 return undumped;
7707invalid_format:
7708 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7709}
7710
7711static void
7712rb_str_check_dummy_enc(rb_encoding *enc)
7713{
7714 if (rb_enc_dummy_p(enc)) {
7715 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7716 rb_enc_name(enc));
7717 }
7718}
7719
7720static rb_encoding *
7721str_true_enc(VALUE str)
7722{
7723 rb_encoding *enc = STR_ENC_GET(str);
7724 rb_str_check_dummy_enc(enc);
7725 return enc;
7726}
7727
7728static OnigCaseFoldType
7729check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7730{
7731 if (argc==0)
7732 return flags;
7733 if (argc>2)
7734 rb_raise(rb_eArgError, "too many options");
7735 if (argv[0]==sym_turkic) {
7736 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7737 if (argc==2) {
7738 if (argv[1]==sym_lithuanian)
7739 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7740 else
7741 rb_raise(rb_eArgError, "invalid second option");
7742 }
7743 }
7744 else if (argv[0]==sym_lithuanian) {
7745 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7746 if (argc==2) {
7747 if (argv[1]==sym_turkic)
7748 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7749 else
7750 rb_raise(rb_eArgError, "invalid second option");
7751 }
7752 }
7753 else if (argc>1)
7754 rb_raise(rb_eArgError, "too many options");
7755 else if (argv[0]==sym_ascii)
7756 flags |= ONIGENC_CASE_ASCII_ONLY;
7757 else if (argv[0]==sym_fold) {
7758 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7759 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7760 else
7761 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7762 }
7763 else
7764 rb_raise(rb_eArgError, "invalid option");
7765 return flags;
7766}
7767
7768static inline bool
7769case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7770{
7771 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7772 return true;
7773 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7774}
7775
/* 20 should be long enough to absorb any kind of single character length increase */
7777#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7778#ifndef CASEMAP_DEBUG
7779# define CASEMAP_DEBUG 0
7780#endif
7781
7782struct mapping_buffer;
7783typedef struct mapping_buffer {
7784 size_t capa;
7785 size_t used;
7786 struct mapping_buffer *next;
7787 OnigUChar space[FLEX_ARY_LEN];
7789
7790static void
7791mapping_buffer_free(void *p)
7792{
7793 mapping_buffer *previous_buffer;
7794 mapping_buffer *current_buffer = p;
7795 while (current_buffer) {
7796 previous_buffer = current_buffer;
7797 current_buffer = current_buffer->next;
7798 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7799 }
7800}
7801
7802static const rb_data_type_t mapping_buffer_type = {
7803 "mapping_buffer",
7804 {0, mapping_buffer_free,},
7805 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7806};
7807
7808static VALUE
7809rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7810{
7811 VALUE target;
7812
7813 const OnigUChar *source_current, *source_end;
7814 int target_length = 0;
7815 VALUE buffer_anchor;
7816 mapping_buffer *current_buffer = 0;
7817 mapping_buffer **pre_buffer;
7818 size_t buffer_count = 0;
7819 int buffer_length_or_invalid;
7820
7821 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7822
7823 source_current = (OnigUChar*)RSTRING_PTR(source);
7824 source_end = (OnigUChar*)RSTRING_END(source);
7825
7826 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7827 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7828 while (source_current < source_end) {
7829 /* increase multiplier using buffer count to converge quickly */
7830 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7831 if (CASEMAP_DEBUG) {
7832 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7833 }
7834 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7835 *pre_buffer = current_buffer;
7836 pre_buffer = &current_buffer->next;
7837 current_buffer->next = NULL;
7838 current_buffer->capa = capa;
7839 buffer_length_or_invalid = enc->case_map(flags,
7840 &source_current, source_end,
7841 current_buffer->space,
7842 current_buffer->space+current_buffer->capa,
7843 enc);
7844 if (buffer_length_or_invalid < 0) {
7845 current_buffer = DATA_PTR(buffer_anchor);
7846 DATA_PTR(buffer_anchor) = 0;
7847 mapping_buffer_free(current_buffer);
7848 rb_raise(rb_eArgError, "input string invalid");
7849 }
7850 target_length += current_buffer->used = buffer_length_or_invalid;
7851 }
7852 if (CASEMAP_DEBUG) {
7853 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7854 }
7855
7856 if (buffer_count==1) {
7857 target = rb_str_new((const char*)current_buffer->space, target_length);
7858 }
7859 else {
7860 char *target_current;
7861
7862 target = rb_str_new(0, target_length);
7863 target_current = RSTRING_PTR(target);
7864 current_buffer = DATA_PTR(buffer_anchor);
7865 while (current_buffer) {
7866 memcpy(target_current, current_buffer->space, current_buffer->used);
7867 target_current += current_buffer->used;
7868 current_buffer = current_buffer->next;
7869 }
7870 }
7871 current_buffer = DATA_PTR(buffer_anchor);
7872 DATA_PTR(buffer_anchor) = 0;
7873 mapping_buffer_free(current_buffer);
7874
7875 RB_GC_GUARD(buffer_anchor);
7876
7877 /* TODO: check about string terminator character */
7878 str_enc_copy_direct(target, source);
7879 /*ENC_CODERANGE_SET(mapped, cr);*/
7880
7881 return target;
7882}
7883
7884static VALUE
7885rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7886{
7887 const OnigUChar *source_current, *source_end;
7888 OnigUChar *target_current, *target_end;
7889 long old_length = RSTRING_LEN(source);
7890 int length_or_invalid;
7891
7892 if (old_length == 0) return Qnil;
7893
7894 source_current = (OnigUChar*)RSTRING_PTR(source);
7895 source_end = (OnigUChar*)RSTRING_END(source);
7896 if (source == target) {
7897 target_current = (OnigUChar*)source_current;
7898 target_end = (OnigUChar*)source_end;
7899 }
7900 else {
7901 target_current = (OnigUChar*)RSTRING_PTR(target);
7902 target_end = (OnigUChar*)RSTRING_END(target);
7903 }
7904
7905 length_or_invalid = onigenc_ascii_only_case_map(flags,
7906 &source_current, source_end,
7907 target_current, target_end, enc);
7908 if (length_or_invalid < 0)
7909 rb_raise(rb_eArgError, "input string invalid");
7910 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7911 fprintf(stderr, "problem with rb_str_ascii_casemap"
7912 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7913 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7914 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7915 }
7916
7917 str_enc_copy(target, source);
7918
7919 return target;
7920}
7921
7922static bool
7923upcase_single(VALUE str)
7924{
7925 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7926 bool modified = false;
7927
7928 while (s < send) {
7929 unsigned int c = *(unsigned char*)s;
7930
7931 if ('a' <= c && c <= 'z') {
7932 *s = 'A' + (c - 'a');
7933 modified = true;
7934 }
7935 s++;
7936 }
7937 return modified;
7938}
7939
7940/*
7941 * call-seq:
7942 * upcase!(*options) -> self or nil
7943 *
7944 * Upcases the characters in +self+;
7945 * returns +self+ if any changes were made, +nil+ otherwise:
7946 *
7947 * s = 'Hello World!' # => "Hello World!"
7948 * s.upcase! # => "HELLO WORLD!"
7949 * s # => "HELLO WORLD!"
7950 * s.upcase! # => nil
7951 *
7952 * The casing may be affected by the given +options+;
7953 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7954 *
7955 * Related: String#upcase, String#downcase, String#downcase!.
7956 *
7957 */
7958
7959static VALUE
7960rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7961{
7962 rb_encoding *enc;
7963 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7964
7965 flags = check_case_options(argc, argv, flags);
7966 str_modify_keep_cr(str);
7967 enc = str_true_enc(str);
7968 if (case_option_single_p(flags, enc, str)) {
7969 if (upcase_single(str))
7970 flags |= ONIGENC_CASE_MODIFIED;
7971 }
7972 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7973 rb_str_ascii_casemap(str, str, &flags, enc);
7974 else
7975 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7976
7977 if (ONIGENC_CASE_MODIFIED&flags) return str;
7978 return Qnil;
7979}
7980
7981
7982/*
7983 * call-seq:
7984 * upcase(*options) -> string
7985 *
7986 * Returns a string containing the upcased characters in +self+:
7987 *
7988 * s = 'Hello World!' # => "Hello World!"
7989 * s.upcase # => "HELLO WORLD!"
7990 *
7991 * The casing may be affected by the given +options+;
7992 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7993 *
7994 * Related: String#upcase!, String#downcase, String#downcase!.
7995 *
7996 */
7997
7998static VALUE
7999rb_str_upcase(int argc, VALUE *argv, VALUE str)
8000{
8001 rb_encoding *enc;
8002 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8003 VALUE ret;
8004
8005 flags = check_case_options(argc, argv, flags);
8006 enc = str_true_enc(str);
8007 if (case_option_single_p(flags, enc, str)) {
8008 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8009 str_enc_copy_direct(ret, str);
8010 upcase_single(ret);
8011 }
8012 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8013 ret = rb_str_new(0, RSTRING_LEN(str));
8014 rb_str_ascii_casemap(str, ret, &flags, enc);
8015 }
8016 else {
8017 ret = rb_str_casemap(str, &flags, enc);
8018 }
8019
8020 return ret;
8021}
8022
8023static bool
8024downcase_single(VALUE str)
8025{
8026 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8027 bool modified = false;
8028
8029 while (s < send) {
8030 unsigned int c = *(unsigned char*)s;
8031
8032 if ('A' <= c && c <= 'Z') {
8033 *s = 'a' + (c - 'A');
8034 modified = true;
8035 }
8036 s++;
8037 }
8038
8039 return modified;
8040}
8041
8042/*
8043 * call-seq:
8044 * downcase!(*options) -> self or nil
8045 *
8046 * Downcases the characters in +self+;
8047 * returns +self+ if any changes were made, +nil+ otherwise:
8048 *
8049 * s = 'Hello World!' # => "Hello World!"
8050 * s.downcase! # => "hello world!"
8051 * s # => "hello world!"
8052 * s.downcase! # => nil
8053 *
8054 * The casing may be affected by the given +options+;
8055 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8056 *
8057 * Related: String#downcase, String#upcase, String#upcase!.
8058 *
8059 */
8060
8061static VALUE
8062rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8063{
8064 rb_encoding *enc;
8065 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8066
8067 flags = check_case_options(argc, argv, flags);
8068 str_modify_keep_cr(str);
8069 enc = str_true_enc(str);
8070 if (case_option_single_p(flags, enc, str)) {
8071 if (downcase_single(str))
8072 flags |= ONIGENC_CASE_MODIFIED;
8073 }
8074 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8075 rb_str_ascii_casemap(str, str, &flags, enc);
8076 else
8077 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8078
8079 if (ONIGENC_CASE_MODIFIED&flags) return str;
8080 return Qnil;
8081}
8082
8083
8084/*
8085 * call-seq:
8086 * downcase(*options) -> string
8087 *
8088 * Returns a string containing the downcased characters in +self+:
8089 *
8090 * s = 'Hello World!' # => "Hello World!"
8091 * s.downcase # => "hello world!"
8092 *
8093 * The casing may be affected by the given +options+;
8094 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8095 *
8096 * Related: String#downcase!, String#upcase, String#upcase!.
8097 *
8098 */
8099
8100static VALUE
8101rb_str_downcase(int argc, VALUE *argv, VALUE str)
8102{
8103 rb_encoding *enc;
8104 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8105 VALUE ret;
8106
8107 flags = check_case_options(argc, argv, flags);
8108 enc = str_true_enc(str);
8109 if (case_option_single_p(flags, enc, str)) {
8110 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8111 str_enc_copy_direct(ret, str);
8112 downcase_single(ret);
8113 }
8114 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8115 ret = rb_str_new(0, RSTRING_LEN(str));
8116 rb_str_ascii_casemap(str, ret, &flags, enc);
8117 }
8118 else {
8119 ret = rb_str_casemap(str, &flags, enc);
8120 }
8121
8122 return ret;
8123}
8124
8125
8126/*
8127 * call-seq:
8128 * capitalize!(*options) -> self or nil
8129 *
8130 * Upcases the first character in +self+;
8131 * downcases the remaining characters;
8132 * returns +self+ if any changes were made, +nil+ otherwise:
8133 *
8134 * s = 'hello World!' # => "hello World!"
8135 * s.capitalize! # => "Hello world!"
8136 * s # => "Hello world!"
8137 * s.capitalize! # => nil
8138 *
8139 * The casing may be affected by the given +options+;
8140 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8141 *
8142 * Related: String#capitalize.
8143 *
8144 */
8145
8146static VALUE
8147rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8148{
8149 rb_encoding *enc;
8150 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8151
8152 flags = check_case_options(argc, argv, flags);
8153 str_modify_keep_cr(str);
8154 enc = str_true_enc(str);
8155 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8156 if (flags&ONIGENC_CASE_ASCII_ONLY)
8157 rb_str_ascii_casemap(str, str, &flags, enc);
8158 else
8159 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8160
8161 if (ONIGENC_CASE_MODIFIED&flags) return str;
8162 return Qnil;
8163}
8164
8165
8166/*
8167 * call-seq:
8168 * capitalize(*options) -> string
8169 *
8170 * Returns a string containing the characters in +self+;
8171 * the first character is upcased;
8172 * the remaining characters are downcased:
8173 *
8174 * s = 'hello World!' # => "hello World!"
8175 * s.capitalize # => "Hello world!"
8176 *
8177 * The casing may be affected by the given +options+;
8178 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8179 *
8180 * Related: String#capitalize!.
8181 *
8182 */
8183
8184static VALUE
8185rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8186{
8187 rb_encoding *enc;
8188 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8189 VALUE ret;
8190
8191 flags = check_case_options(argc, argv, flags);
8192 enc = str_true_enc(str);
8193 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8194 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8195 ret = rb_str_new(0, RSTRING_LEN(str));
8196 rb_str_ascii_casemap(str, ret, &flags, enc);
8197 }
8198 else {
8199 ret = rb_str_casemap(str, &flags, enc);
8200 }
8201 return ret;
8202}
8203
8204
8205/*
8206 * call-seq:
8207 * swapcase!(*options) -> self or nil
8208 *
8209 * Upcases each lowercase character in +self+;
8210 * downcases uppercase character;
8211 * returns +self+ if any changes were made, +nil+ otherwise:
8212 *
8213 * s = 'Hello World!' # => "Hello World!"
8214 * s.swapcase! # => "hELLO wORLD!"
8215 * s # => "hELLO wORLD!"
8216 * ''.swapcase! # => nil
8217 *
8218 * The casing may be affected by the given +options+;
8219 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8220 *
8221 * Related: String#swapcase.
8222 *
8223 */
8224
8225static VALUE
8226rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8227{
8228 rb_encoding *enc;
8229 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8230
8231 flags = check_case_options(argc, argv, flags);
8232 str_modify_keep_cr(str);
8233 enc = str_true_enc(str);
8234 if (flags&ONIGENC_CASE_ASCII_ONLY)
8235 rb_str_ascii_casemap(str, str, &flags, enc);
8236 else
8237 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8238
8239 if (ONIGENC_CASE_MODIFIED&flags) return str;
8240 return Qnil;
8241}
8242
8243
8244/*
8245 * call-seq:
8246 * swapcase(*options) -> string
8247 *
8248 * Returns a string containing the characters in +self+, with cases reversed;
8249 * each uppercase character is downcased;
8250 * each lowercase character is upcased:
8251 *
8252 * s = 'Hello World!' # => "Hello World!"
8253 * s.swapcase # => "hELLO wORLD!"
8254 *
8255 * The casing may be affected by the given +options+;
8256 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8257 *
8258 * Related: String#swapcase!.
8259 *
8260 */
8261
8262static VALUE
8263rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8264{
8265 rb_encoding *enc;
8266 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8267 VALUE ret;
8268
8269 flags = check_case_options(argc, argv, flags);
8270 enc = str_true_enc(str);
8271 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8272 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8273 ret = rb_str_new(0, RSTRING_LEN(str));
8274 rb_str_ascii_casemap(str, ret, &flags, enc);
8275 }
8276 else {
8277 ret = rb_str_casemap(str, &flags, enc);
8278 }
8279 return ret;
8280}
8281
8282typedef unsigned char *USTR;
8283
/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while the code points of a range are being generated */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being generated */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8289
8290static unsigned int
8291trnext(struct tr *t, rb_encoding *enc)
8292{
8293 int n;
8294
8295 for (;;) {
8296 nextpart:
8297 if (!t->gen) {
8298 if (t->p == t->pend) return -1;
8299 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8300 t->p += n;
8301 }
8302 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8303 t->p += n;
8304 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8305 t->p += n;
8306 if (t->p < t->pend) {
8307 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8308 t->p += n;
8309 if (t->now > c) {
8310 if (t->now < 0x80 && c < 0x80) {
8311 rb_raise(rb_eArgError,
8312 "invalid range \"%c-%c\" in string transliteration",
8313 t->now, c);
8314 }
8315 else {
8316 rb_raise(rb_eArgError, "invalid range in string transliteration");
8317 }
8318 continue; /* not reached */
8319 }
8320 else if (t->now < c) {
8321 t->gen = 1;
8322 t->max = c;
8323 }
8324 }
8325 }
8326 return t->now;
8327 }
8328 else {
8329 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8330 if (t->now == t->max) {
8331 t->gen = 0;
8332 goto nextpart;
8333 }
8334 }
8335 if (t->now < t->max) {
8336 return t->now;
8337 }
8338 else {
8339 t->gen = 0;
8340 return t->max;
8341 }
8342 }
8343 }
8344}
8345
8346static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8347
8348static VALUE
8349tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8350{
8351 const unsigned int errc = -1;
8352 unsigned int trans[256];
8353 rb_encoding *enc, *e1, *e2;
8354 struct tr trsrc, trrepl;
8355 int cflag = 0;
8356 unsigned int c, c0, last = 0;
8357 int modify = 0, i, l;
8358 unsigned char *s, *send;
8359 VALUE hash = 0;
8360 int singlebyte = single_byte_optimizable(str);
8361 int termlen;
8362 int cr;
8363
8364#define CHECK_IF_ASCII(c) \
8365 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8366 (cr = ENC_CODERANGE_VALID) : 0)
8367
8368 StringValue(src);
8369 StringValue(repl);
8370 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8371 if (RSTRING_LEN(repl) == 0) {
8372 return rb_str_delete_bang(1, &src, str);
8373 }
8374
8375 cr = ENC_CODERANGE(str);
8376 e1 = rb_enc_check(str, src);
8377 e2 = rb_enc_check(str, repl);
8378 if (e1 == e2) {
8379 enc = e1;
8380 }
8381 else {
8382 enc = rb_enc_check(src, repl);
8383 }
8384 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8385 if (RSTRING_LEN(src) > 1 &&
8386 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8387 trsrc.p + l < trsrc.pend) {
8388 cflag = 1;
8389 trsrc.p += l;
8390 }
8391 trrepl.p = RSTRING_PTR(repl);
8392 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8393 trsrc.gen = trrepl.gen = 0;
8394 trsrc.now = trrepl.now = 0;
8395 trsrc.max = trrepl.max = 0;
8396
8397 if (cflag) {
8398 for (i=0; i<256; i++) {
8399 trans[i] = 1;
8400 }
8401 while ((c = trnext(&trsrc, enc)) != errc) {
8402 if (c < 256) {
8403 trans[c] = errc;
8404 }
8405 else {
8406 if (!hash) hash = rb_hash_new();
8407 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8408 }
8409 }
8410 while ((c = trnext(&trrepl, enc)) != errc)
8411 /* retrieve last replacer */;
8412 last = trrepl.now;
8413 for (i=0; i<256; i++) {
8414 if (trans[i] != errc) {
8415 trans[i] = last;
8416 }
8417 }
8418 }
8419 else {
8420 unsigned int r;
8421
8422 for (i=0; i<256; i++) {
8423 trans[i] = errc;
8424 }
8425 while ((c = trnext(&trsrc, enc)) != errc) {
8426 r = trnext(&trrepl, enc);
8427 if (r == errc) r = trrepl.now;
8428 if (c < 256) {
8429 trans[c] = r;
8430 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8431 }
8432 else {
8433 if (!hash) hash = rb_hash_new();
8434 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8435 }
8436 }
8437 }
8438
8439 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8440 cr = ENC_CODERANGE_7BIT;
8441 str_modify_keep_cr(str);
8442 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8443 termlen = rb_enc_mbminlen(enc);
8444 if (sflag) {
8445 int clen, tlen;
8446 long offset, max = RSTRING_LEN(str);
8447 unsigned int save = -1;
8448 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8449
8450 while (s < send) {
8451 int may_modify = 0;
8452
8453 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8454 if (!MBCLEN_CHARFOUND_P(r)) {
8455 xfree(buf);
8456 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8457 }
8458 clen = MBCLEN_CHARFOUND_LEN(r);
8459 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8460
8461 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8462
8463 s += clen;
8464 if (c < 256) {
8465 c = trans[c];
8466 }
8467 else if (hash) {
8468 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8469 if (NIL_P(tmp)) {
8470 if (cflag) c = last;
8471 else c = errc;
8472 }
8473 else if (cflag) c = errc;
8474 else c = NUM2INT(tmp);
8475 }
8476 else {
8477 c = errc;
8478 }
8479 if (c != (unsigned int)-1) {
8480 if (save == c) {
8481 CHECK_IF_ASCII(c);
8482 continue;
8483 }
8484 save = c;
8485 tlen = rb_enc_codelen(c, enc);
8486 modify = 1;
8487 }
8488 else {
8489 save = -1;
8490 c = c0;
8491 if (enc != e1) may_modify = 1;
8492 }
8493 if ((offset = t - buf) + tlen > max) {
8494 size_t MAYBE_UNUSED(old) = max + termlen;
8495 max = offset + tlen + (send - s);
8496 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8497 t = buf + offset;
8498 }
8499 rb_enc_mbcput(c, t, enc);
8500 if (may_modify && memcmp(s, t, tlen) != 0) {
8501 modify = 1;
8502 }
8503 CHECK_IF_ASCII(c);
8504 t += tlen;
8505 }
8506 if (!STR_EMBED_P(str)) {
8507 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8508 }
8509 TERM_FILL((char *)t, termlen);
8510 RSTRING(str)->as.heap.ptr = (char *)buf;
8511 STR_SET_LEN(str, t - buf);
8512 STR_SET_NOEMBED(str);
8513 RSTRING(str)->as.heap.aux.capa = max;
8514 }
8515 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8516 while (s < send) {
8517 c = (unsigned char)*s;
8518 if (trans[c] != errc) {
8519 if (!cflag) {
8520 c = trans[c];
8521 *s = c;
8522 modify = 1;
8523 }
8524 else {
8525 *s = last;
8526 modify = 1;
8527 }
8528 }
8529 CHECK_IF_ASCII(c);
8530 s++;
8531 }
8532 }
8533 else {
8534 int clen, tlen;
8535 long offset, max = (long)((send - s) * 1.2);
8536 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8537
8538 while (s < send) {
8539 int may_modify = 0;
8540
8541 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8542 if (!MBCLEN_CHARFOUND_P(r)) {
8543 xfree(buf);
8544 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8545 }
8546 clen = MBCLEN_CHARFOUND_LEN(r);
8547 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8548
8549 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8550
8551 if (c < 256) {
8552 c = trans[c];
8553 }
8554 else if (hash) {
8555 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8556 if (NIL_P(tmp)) {
8557 if (cflag) c = last;
8558 else c = errc;
8559 }
8560 else if (cflag) c = errc;
8561 else c = NUM2INT(tmp);
8562 }
8563 else {
8564 c = cflag ? last : errc;
8565 }
8566 if (c != errc) {
8567 tlen = rb_enc_codelen(c, enc);
8568 modify = 1;
8569 }
8570 else {
8571 c = c0;
8572 if (enc != e1) may_modify = 1;
8573 }
8574 if ((offset = t - buf) + tlen > max) {
8575 size_t MAYBE_UNUSED(old) = max + termlen;
8576 max = offset + tlen + (long)((send - s) * 1.2);
8577 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8578 t = buf + offset;
8579 }
8580 if (s != t) {
8581 rb_enc_mbcput(c, t, enc);
8582 if (may_modify && memcmp(s, t, tlen) != 0) {
8583 modify = 1;
8584 }
8585 }
8586 CHECK_IF_ASCII(c);
8587 s += clen;
8588 t += tlen;
8589 }
8590 if (!STR_EMBED_P(str)) {
8591 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8592 }
8593 TERM_FILL((char *)t, termlen);
8594 RSTRING(str)->as.heap.ptr = (char *)buf;
8595 STR_SET_LEN(str, t - buf);
8596 STR_SET_NOEMBED(str);
8597 RSTRING(str)->as.heap.aux.capa = max;
8598 }
8599
8600 if (modify) {
8601 if (cr != ENC_CODERANGE_BROKEN)
8602 ENC_CODERANGE_SET(str, cr);
8603 rb_enc_associate(str, enc);
8604 return str;
8605 }
8606 return Qnil;
8607}
8608
8609
8610/*
8611 * call-seq:
8612 * tr!(selector, replacements) -> self or nil
8613 *
8614 * Like String#tr, but modifies +self+ in place.
8615 * Returns +self+ if any changes were made, +nil+ otherwise.
8616 *
8617 */
8618
8619static VALUE
8620rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8621{
8622 return tr_trans(str, src, repl, 0);
8623}
8624
8625
8626/*
8627 * call-seq:
8628 * tr(selector, replacements) -> new_string
8629 *
8630 * Returns a copy of +self+ with each character specified by string +selector+
8631 * translated to the corresponding character in string +replacements+.
8632 * The correspondence is _positional_:
8633 *
8634 * - Each occurrence of the first character specified by +selector+
8635 * is translated to the first character in +replacements+.
8636 * - Each occurrence of the second character specified by +selector+
8637 * is translated to the second character in +replacements+.
8638 * - And so on.
8639 *
8640 * Example:
8641 *
8642 * 'hello'.tr('el', 'ip') #=> "hippo"
8643 *
8644 * If +replacements+ is shorter than +selector+,
8645 * it is implicitly padded with its own last character:
8646 *
8647 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8648 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8649 *
8650 * Arguments +selector+ and +replacements+ must be valid character selectors
8651 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8652 * and may use any of its valid forms, including negation, ranges, and escaping:
8653 *
8654 * # Negation.
8655 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8656 * # Ranges.
8657 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8658 * # Escapes.
8659 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8660 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8661 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8662 *
8663 */
8664
8665static VALUE
8666rb_str_tr(VALUE str, VALUE src, VALUE repl)
8667{
8668 str = str_duplicate(rb_cString, str);
8669 tr_trans(str, src, repl, 0);
8670 return str;
8671}
8672
8673#define TR_TABLE_MAX (UCHAR_MAX+1)
8674#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8675static void
8676tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8677 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8678{
8679 const unsigned int errc = -1;
8680 char buf[TR_TABLE_MAX];
8681 struct tr tr;
8682 unsigned int c;
8683 VALUE table = 0, ptable = 0;
8684 int i, l, cflag = 0;
8685
8686 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8687 tr.gen = tr.now = tr.max = 0;
8688
8689 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8690 cflag = 1;
8691 tr.p += l;
8692 }
8693 if (first) {
8694 for (i=0; i<TR_TABLE_MAX; i++) {
8695 stable[i] = 1;
8696 }
8697 stable[TR_TABLE_MAX] = cflag;
8698 }
8699 else if (stable[TR_TABLE_MAX] && !cflag) {
8700 stable[TR_TABLE_MAX] = 0;
8701 }
8702 for (i=0; i<TR_TABLE_MAX; i++) {
8703 buf[i] = cflag;
8704 }
8705
8706 while ((c = trnext(&tr, enc)) != errc) {
8707 if (c < TR_TABLE_MAX) {
8708 buf[(unsigned char)c] = !cflag;
8709 }
8710 else {
8711 VALUE key = UINT2NUM(c);
8712
8713 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8714 if (cflag) {
8715 ptable = *ctablep;
8716 table = ptable ? ptable : rb_hash_new();
8717 *ctablep = table;
8718 }
8719 else {
8720 table = rb_hash_new();
8721 ptable = *tablep;
8722 *tablep = table;
8723 }
8724 }
8725 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8726 rb_hash_aset(table, key, Qtrue);
8727 }
8728 }
8729 }
8730 for (i=0; i<TR_TABLE_MAX; i++) {
8731 stable[i] = stable[i] && buf[i];
8732 }
8733 if (!table && !cflag) {
8734 *tablep = 0;
8735 }
8736}
8737
8738
8739static int
8740tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8741{
8742 if (c < TR_TABLE_MAX) {
8743 return table[c] != 0;
8744 }
8745 else {
8746 VALUE v = UINT2NUM(c);
8747
8748 if (del) {
8749 if (!NIL_P(rb_hash_lookup(del, v)) &&
8750 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8751 return TRUE;
8752 }
8753 }
8754 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8755 return FALSE;
8756 }
8757 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8758 }
8759}
8760
8761/*
8762 * call-seq:
8763 * delete!(*selectors) -> self or nil
8764 *
8765 * Like String#delete, but modifies +self+ in place.
8766 * Returns +self+ if any changes were made, +nil+ otherwise.
8767 *
8768 */
8769
8770static VALUE
8771rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8772{
8773 char squeez[TR_TABLE_SIZE];
8774 rb_encoding *enc = 0;
8775 char *s, *send, *t;
8776 VALUE del = 0, nodel = 0;
8777 int modify = 0;
8778 int i, ascompat, cr;
8779
8780 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8782 for (i=0; i<argc; i++) {
8783 VALUE s = argv[i];
8784
8785 StringValue(s);
8786 enc = rb_enc_check(str, s);
8787 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8788 }
8789
8790 str_modify_keep_cr(str);
8791 ascompat = rb_enc_asciicompat(enc);
8792 s = t = RSTRING_PTR(str);
8793 send = RSTRING_END(str);
8794 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8795 while (s < send) {
8796 unsigned int c;
8797 int clen;
8798
8799 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8800 if (squeez[c]) {
8801 modify = 1;
8802 }
8803 else {
8804 if (t != s) *t = c;
8805 t++;
8806 }
8807 s++;
8808 }
8809 else {
8810 c = rb_enc_codepoint_len(s, send, &clen, enc);
8811
8812 if (tr_find(c, squeez, del, nodel)) {
8813 modify = 1;
8814 }
8815 else {
8816 if (t != s) rb_enc_mbcput(c, t, enc);
8817 t += clen;
8819 }
8820 s += clen;
8821 }
8822 }
8823 TERM_FILL(t, TERM_LEN(str));
8824 STR_SET_LEN(str, t - RSTRING_PTR(str));
8825 ENC_CODERANGE_SET(str, cr);
8826
8827 if (modify) return str;
8828 return Qnil;
8829}
8830
8831
8832/*
8833 * call-seq:
8834 * delete(*selectors) -> new_string
8835 *
8836 * Returns a copy of +self+ with characters specified by +selectors+ removed
8837 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8838 *
8839 * "hello".delete "l","lo" #=> "heo"
8840 * "hello".delete "lo" #=> "he"
8841 * "hello".delete "aeiou", "^e" #=> "hell"
8842 * "hello".delete "ej-m" #=> "ho"
8843 *
8844 */
8845
8846static VALUE
8847rb_str_delete(int argc, VALUE *argv, VALUE str)
8848{
8849 str = str_duplicate(rb_cString, str);
8850 rb_str_delete_bang(argc, argv, str);
8851 return str;
8852}
8853
8854
8855/*
8856 * call-seq:
8857 * squeeze!(*selectors) -> self or nil
8858 *
8859 * Like String#squeeze, but modifies +self+ in place.
8860 * Returns +self+ if any changes were made, +nil+ otherwise.
8861 */
8862
8863static VALUE
8864rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8865{
8866 char squeez[TR_TABLE_SIZE];
8867 rb_encoding *enc = 0;
8868 VALUE del = 0, nodel = 0;
8869 unsigned char *s, *send, *t;
8870 int i, modify = 0;
8871 int ascompat, singlebyte = single_byte_optimizable(str);
8872 unsigned int save;
8873
8874 if (argc == 0) {
8875 enc = STR_ENC_GET(str);
8876 }
8877 else {
8878 for (i=0; i<argc; i++) {
8879 VALUE s = argv[i];
8880
8881 StringValue(s);
8882 enc = rb_enc_check(str, s);
8883 if (singlebyte && !single_byte_optimizable(s))
8884 singlebyte = 0;
8885 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8886 }
8887 }
8888
8889 str_modify_keep_cr(str);
8890 s = t = (unsigned char *)RSTRING_PTR(str);
8891 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8892 send = (unsigned char *)RSTRING_END(str);
8893 save = -1;
8894 ascompat = rb_enc_asciicompat(enc);
8895
8896 if (singlebyte) {
8897 while (s < send) {
8898 unsigned int c = *s++;
8899 if (c != save || (argc > 0 && !squeez[c])) {
8900 *t++ = save = c;
8901 }
8902 }
8903 }
8904 else {
8905 while (s < send) {
8906 unsigned int c;
8907 int clen;
8908
8909 if (ascompat && (c = *s) < 0x80) {
8910 if (c != save || (argc > 0 && !squeez[c])) {
8911 *t++ = save = c;
8912 }
8913 s++;
8914 }
8915 else {
8916 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8917
8918 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8919 if (t != s) rb_enc_mbcput(c, t, enc);
8920 save = c;
8921 t += clen;
8922 }
8923 s += clen;
8924 }
8925 }
8926 }
8927
8928 TERM_FILL((char *)t, TERM_LEN(str));
8929 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8930 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8931 modify = 1;
8932 }
8933
8934 if (modify) return str;
8935 return Qnil;
8936}
8937
8938
8939/*
8940 * call-seq:
8941 * squeeze(*selectors) -> new_string
8942 *
8943 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8944 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8945 *
8946 * "Squeezed" means that each multiple-character run of a selected character
8947 * is squeezed down to a single character;
8948 * with no arguments given, squeezes all characters:
8949 *
8950 * "yellow moon".squeeze #=> "yelow mon"
8951 * " now is the".squeeze(" ") #=> " now is the"
8952 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8953 *
8954 */
8955
8956static VALUE
8957rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8958{
8959 str = str_duplicate(rb_cString, str);
8960 rb_str_squeeze_bang(argc, argv, str);
8961 return str;
8962}
8963
8964
8965/*
8966 * call-seq:
8967 * tr_s!(selector, replacements) -> self or nil
8968 *
8969 * Like String#tr_s, but modifies +self+ in place.
8970 * Returns +self+ if any changes were made, +nil+ otherwise.
8971 *
8972 * Related: String#squeeze!.
8973 */
8974
8975static VALUE
8976rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8977{
8978 return tr_trans(str, src, repl, 1);
8979}
8980
8981
8982/*
8983 * call-seq:
8984 * tr_s(selector, replacements) -> string
8985 *
8986 * Like String#tr, but also squeezes the modified portions of the translated string;
8987 * returns a new string (translated and squeezed).
8988 *
8989 * 'hello'.tr_s('l', 'r') #=> "hero"
8990 * 'hello'.tr_s('el', '-') #=> "h-o"
8991 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8992 *
8993 * Related: String#squeeze.
8994 *
8995 */
8996
8997static VALUE
8998rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8999{
9000 str = str_duplicate(rb_cString, str);
9001 tr_trans(str, src, repl, 1);
9002 return str;
9003}
9004
9005
9006/*
9007 * call-seq:
9008 * count(*selectors) -> integer
9009 *
9010 * Returns the total number of characters in +self+
9011 * that are specified by the given +selectors+
9012 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9013 *
9014 * a = "hello world"
9015 * a.count "lo" #=> 5
9016 * a.count "lo", "o" #=> 2
9017 * a.count "hello", "^l" #=> 4
9018 * a.count "ej-m" #=> 4
9019 *
9020 * "hello^world".count "\\^aeiou" #=> 4
9021 * "hello-world".count "a\\-eo" #=> 4
9022 *
9023 * c = "hello world\\r\\n"
9024 * c.count "\\" #=> 2
9025 * c.count "\\A" #=> 0
9026 * c.count "X-\\w" #=> 3
9027 */
9028
9029static VALUE
9030rb_str_count(int argc, VALUE *argv, VALUE str)
9031{
9032 char table[TR_TABLE_SIZE];
9033 rb_encoding *enc = 0;
9034 VALUE del = 0, nodel = 0, tstr;
9035 char *s, *send;
9036 int i;
9037 int ascompat;
9038 size_t n = 0;
9039
9041
9042 tstr = argv[0];
9043 StringValue(tstr);
9044 enc = rb_enc_check(str, tstr);
9045 if (argc == 1) {
9046 const char *ptstr;
9047 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9048 (ptstr = RSTRING_PTR(tstr),
9049 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9050 !is_broken_string(str)) {
9051 int clen;
9052 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9053
9054 s = RSTRING_PTR(str);
9055 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9056 send = RSTRING_END(str);
9057 while (s < send) {
9058 if (*(unsigned char*)s++ == c) n++;
9059 }
9060 return SIZET2NUM(n);
9061 }
9062 }
9063
9064 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9065 for (i=1; i<argc; i++) {
9066 tstr = argv[i];
9067 StringValue(tstr);
9068 enc = rb_enc_check(str, tstr);
9069 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9070 }
9071
9072 s = RSTRING_PTR(str);
9073 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9074 send = RSTRING_END(str);
9075 ascompat = rb_enc_asciicompat(enc);
9076 while (s < send) {
9077 unsigned int c;
9078
9079 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9080 if (table[c]) {
9081 n++;
9082 }
9083 s++;
9084 }
9085 else {
9086 int clen;
9087 c = rb_enc_codepoint_len(s, send, &clen, enc);
9088 if (tr_find(c, table, del, nodel)) {
9089 n++;
9090 }
9091 s += clen;
9092 }
9093 }
9094
9095 return SIZET2NUM(n);
9096}
9097
9098static VALUE
9099rb_fs_check(VALUE val)
9100{
9101 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9102 val = rb_check_string_type(val);
9103 if (NIL_P(val)) return 0;
9104 }
9105 return val;
9106}
9107
/*
 * Byte-indexed lookup table: 1 for the six bytes treated as whitespace
 * (0x09..0x0d and 0x20), 0 for every other byte value.  Slots that are not
 * named below are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1,  /* horizontal tab */
    [0x0a] = 1,  /* line feed */
    [0x0b] = 1,  /* vertical tab */
    [0x0c] = 1,  /* form feed */
    [0x0d] = 1,  /* carriage return */
    [0x20] = 1,  /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9128
9129static long
9130split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9131{
9132 if (empty_count >= 0 && len == 0) {
9133 return empty_count + 1;
9134 }
9135 if (empty_count > 0) {
9136 /* make different substrings */
9137 if (result) {
9138 do {
9139 rb_ary_push(result, str_new_empty_String(str));
9140 } while (--empty_count > 0);
9141 }
9142 else {
9143 do {
9144 rb_yield(str_new_empty_String(str));
9145 } while (--empty_count > 0);
9146 }
9147 }
9148 str = rb_str_subseq(str, beg, len);
9149 if (result) {
9150 rb_ary_push(result, str);
9151 }
9152 else {
9153 rb_yield(str);
9154 }
9155 return empty_count;
9156}
9157
/* Strategy that rb_str_split_m uses to cut the receiver into fields. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
9161
9162static split_type_t
9163literal_split_pattern(VALUE spat, split_type_t default_type)
9164{
9165 rb_encoding *enc = STR_ENC_GET(spat);
9166 const char *ptr;
9167 long len;
9168 RSTRING_GETMEM(spat, ptr, len);
9169 if (len == 0) {
9170 /* Special case - split into chars */
9171 return SPLIT_TYPE_CHARS;
9172 }
9173 else if (rb_enc_asciicompat(enc)) {
9174 if (len == 1 && ptr[0] == ' ') {
9175 return SPLIT_TYPE_AWK;
9176 }
9177 }
9178 else {
9179 int l;
9180 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9181 return SPLIT_TYPE_AWK;
9182 }
9183 }
9184 return default_type;
9185}
9186
9187/*
9188 * call-seq:
9189 * split(field_sep = $;, limit = 0) -> array
9190 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9191 *
9192 * :include: doc/string/split.rdoc
9193 *
9194 */
9195
9196static VALUE
9197rb_str_split_m(int argc, VALUE *argv, VALUE str)
9198{
9199 rb_encoding *enc;
9200 VALUE spat;
9201 VALUE limit;
9202 split_type_t split_type;
9203 long beg, end, i = 0, empty_count = -1;
9204 int lim = 0;
9205 VALUE result, tmp;
9206
9207 result = rb_block_given_p() ? Qfalse : Qnil;
9208 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9209 lim = NUM2INT(limit);
9210 if (lim <= 0) limit = Qnil;
9211 else if (lim == 1) {
9212 if (RSTRING_LEN(str) == 0)
9213 return result ? rb_ary_new2(0) : str;
9214 tmp = str_duplicate(rb_cString, str);
9215 if (!result) {
9216 rb_yield(tmp);
9217 return str;
9218 }
9219 return rb_ary_new3(1, tmp);
9220 }
9221 i = 1;
9222 }
9223 if (NIL_P(limit) && !lim) empty_count = 0;
9224
9225 enc = STR_ENC_GET(str);
9226 split_type = SPLIT_TYPE_REGEXP;
9227 if (!NIL_P(spat)) {
9228 spat = get_pat_quoted(spat, 0);
9229 }
9230 else if (NIL_P(spat = rb_fs)) {
9231 split_type = SPLIT_TYPE_AWK;
9232 }
9233 else if (!(spat = rb_fs_check(spat))) {
9234 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9235 }
9236 else {
9237 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9238 }
9239 if (split_type != SPLIT_TYPE_AWK) {
9240 switch (BUILTIN_TYPE(spat)) {
9241 case T_REGEXP:
9242 rb_reg_options(spat); /* check if uninitialized */
9243 tmp = RREGEXP_SRC(spat);
9244 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9245 if (split_type == SPLIT_TYPE_AWK) {
9246 spat = tmp;
9247 split_type = SPLIT_TYPE_STRING;
9248 }
9249 break;
9250
9251 case T_STRING:
9252 mustnot_broken(spat);
9253 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9254 break;
9255
9256 default:
9258 }
9259 }
9260
9261#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9262
9263 beg = 0;
9264 char *ptr = RSTRING_PTR(str);
9265 char *eptr = RSTRING_END(str);
9266 if (split_type == SPLIT_TYPE_AWK) {
9267 char *bptr = ptr;
9268 int skip = 1;
9269 unsigned int c;
9270
9271 if (result) result = rb_ary_new();
9272 end = beg;
9273 if (is_ascii_string(str)) {
9274 while (ptr < eptr) {
9275 c = (unsigned char)*ptr++;
9276 if (skip) {
9277 if (ascii_isspace(c)) {
9278 beg = ptr - bptr;
9279 }
9280 else {
9281 end = ptr - bptr;
9282 skip = 0;
9283 if (!NIL_P(limit) && lim <= i) break;
9284 }
9285 }
9286 else if (ascii_isspace(c)) {
9287 SPLIT_STR(beg, end-beg);
9288 skip = 1;
9289 beg = ptr - bptr;
9290 if (!NIL_P(limit)) ++i;
9291 }
9292 else {
9293 end = ptr - bptr;
9294 }
9295 }
9296 }
9297 else {
9298 while (ptr < eptr) {
9299 int n;
9300
9301 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9302 ptr += n;
9303 if (skip) {
9304 if (rb_isspace(c)) {
9305 beg = ptr - bptr;
9306 }
9307 else {
9308 end = ptr - bptr;
9309 skip = 0;
9310 if (!NIL_P(limit) && lim <= i) break;
9311 }
9312 }
9313 else if (rb_isspace(c)) {
9314 SPLIT_STR(beg, end-beg);
9315 skip = 1;
9316 beg = ptr - bptr;
9317 if (!NIL_P(limit)) ++i;
9318 }
9319 else {
9320 end = ptr - bptr;
9321 }
9322 }
9323 }
9324 }
9325 else if (split_type == SPLIT_TYPE_STRING) {
9326 char *str_start = ptr;
9327 char *substr_start = ptr;
9328 char *sptr = RSTRING_PTR(spat);
9329 long slen = RSTRING_LEN(spat);
9330
9331 if (result) result = rb_ary_new();
9332 mustnot_broken(str);
9333 enc = rb_enc_check(str, spat);
9334 while (ptr < eptr &&
9335 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9336 /* Check we are at the start of a char */
9337 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9338 if (t != ptr + end) {
9339 ptr = t;
9340 continue;
9341 }
9342 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9343 ptr += end + slen;
9344 substr_start = ptr;
9345 if (!NIL_P(limit) && lim <= ++i) break;
9346 }
9347 beg = ptr - str_start;
9348 }
9349 else if (split_type == SPLIT_TYPE_CHARS) {
9350 char *str_start = ptr;
9351 int n;
9352
9353 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9354 mustnot_broken(str);
9355 enc = rb_enc_get(str);
9356 while (ptr < eptr &&
9357 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9358 SPLIT_STR(ptr - str_start, n);
9359 ptr += n;
9360 if (!NIL_P(limit) && lim <= ++i) break;
9361 }
9362 beg = ptr - str_start;
9363 }
9364 else {
9365 if (result) result = rb_ary_new();
9366 long len = RSTRING_LEN(str);
9367 long start = beg;
9368 long idx;
9369 int last_null = 0;
9370 struct re_registers *regs;
9371 VALUE match = 0;
9372
9373 for (; rb_reg_search(spat, str, start, 0) >= 0;
9374 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9375 match = rb_backref_get();
9376 if (!result) rb_match_busy(match);
9377 regs = RMATCH_REGS(match);
9378 end = BEG(0);
9379 if (start == end && BEG(0) == END(0)) {
9380 if (!ptr) {
9381 SPLIT_STR(0, 0);
9382 break;
9383 }
9384 else if (last_null == 1) {
9385 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9386 beg = start;
9387 }
9388 else {
9389 if (start == len)
9390 start++;
9391 else
9392 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9393 last_null = 1;
9394 continue;
9395 }
9396 }
9397 else {
9398 SPLIT_STR(beg, end-beg);
9399 beg = start = END(0);
9400 }
9401 last_null = 0;
9402
9403 for (idx=1; idx < regs->num_regs; idx++) {
9404 if (BEG(idx) == -1) continue;
9405 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9406 }
9407 if (!NIL_P(limit) && lim <= ++i) break;
9408 }
9409 if (match) rb_match_unbusy(match);
9410 }
9411 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9412 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9413 }
9414
9415 return result ? result : str;
9416}
9417
9418VALUE
9419rb_str_split(VALUE str, const char *sep0)
9420{
9421 VALUE sep;
9422
9423 StringValue(str);
9424 sep = rb_str_new_cstr(sep0);
9425 return rb_str_split_m(1, &sep, str);
9426}
9427
9428#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9429
9430static inline int
9431enumerator_element(VALUE ary, VALUE e)
9432{
9433 if (ary) {
9434 rb_ary_push(ary, e);
9435 return 0;
9436 }
9437 else {
9438 rb_yield(e);
9439 return 1;
9440 }
9441}
9442
9443#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9444
9445static const char *
9446chomp_newline(const char *p, const char *e, rb_encoding *enc)
9447{
9448 const char *prev = rb_enc_prev_char(p, e, e, enc);
9449 if (rb_enc_is_newline(prev, e, enc)) {
9450 e = prev;
9451 prev = rb_enc_prev_char(p, e, e, enc);
9452 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9453 e = prev;
9454 }
9455 return e;
9456}
9457
9458static VALUE
9459get_rs(void)
9460{
9461 VALUE rs = rb_rs;
9462 if (!NIL_P(rs) &&
9463 (!RB_TYPE_P(rs, T_STRING) ||
9464 RSTRING_LEN(rs) != 1 ||
9465 RSTRING_PTR(rs)[0] != '\n')) {
9466 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9467 }
9468 return rs;
9469}
9470
9471#define rb_rs get_rs()
9472
9473static VALUE
9474rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9475{
9476 rb_encoding *enc;
9477 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9478 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9479 long pos, len, rslen;
9480 int rsnewline = 0;
9481
9482 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9483 rs = rb_rs;
9484 if (!NIL_P(opts)) {
9485 static ID keywords[1];
9486 if (!keywords[0]) {
9487 keywords[0] = rb_intern_const("chomp");
9488 }
9489 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9490 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9491 }
9492
9493 if (NIL_P(rs)) {
9494 if (!ENUM_ELEM(ary, str)) {
9495 return ary;
9496 }
9497 else {
9498 return orig;
9499 }
9500 }
9501
9502 if (!RSTRING_LEN(str)) goto end;
9503 str = rb_str_new_frozen(str);
9504 ptr = subptr = RSTRING_PTR(str);
9505 pend = RSTRING_END(str);
9506 len = RSTRING_LEN(str);
9507 StringValue(rs);
9508 rslen = RSTRING_LEN(rs);
9509
9510 if (rs == rb_default_rs)
9511 enc = rb_enc_get(str);
9512 else
9513 enc = rb_enc_check(str, rs);
9514
9515 if (rslen == 0) {
9516 /* paragraph mode */
9517 int n;
9518 const char *eol = NULL;
9519 subend = subptr;
9520 while (subend < pend) {
9521 long chomp_rslen = 0;
9522 do {
9523 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9524 n = 0;
9525 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9526 if (rb_enc_is_newline(subend + n, pend, enc)) {
9527 if (eol == subend) break;
9528 subend += rslen;
9529 if (subptr) {
9530 eol = subend;
9531 chomp_rslen = -rslen;
9532 }
9533 }
9534 else {
9535 if (!subptr) subptr = subend;
9536 subend += rslen;
9537 }
9538 rslen = 0;
9539 } while (subend < pend);
9540 if (!subptr) break;
9541 if (rslen == 0) chomp_rslen = 0;
9542 line = rb_str_subseq(str, subptr - ptr,
9543 subend - subptr + (chomp ? chomp_rslen : rslen));
9544 if (ENUM_ELEM(ary, line)) {
9545 str_mod_check(str, ptr, len);
9546 }
9547 subptr = eol = NULL;
9548 }
9549 goto end;
9550 }
9551 else {
9552 rsptr = RSTRING_PTR(rs);
9553 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9554 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9555 rsnewline = 1;
9556 }
9557 }
9558
9559 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9560 rs = rb_str_new(rsptr, rslen);
9561 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9562 rsptr = RSTRING_PTR(rs);
9563 rslen = RSTRING_LEN(rs);
9564 }
9565
9566 while (subptr < pend) {
9567 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9568 if (pos < 0) break;
9569 hit = subptr + pos;
9570 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9571 if (hit != adjusted) {
9572 subptr = adjusted;
9573 continue;
9574 }
9575 subend = hit += rslen;
9576 if (chomp) {
9577 if (rsnewline) {
9578 subend = chomp_newline(subptr, subend, enc);
9579 }
9580 else {
9581 subend -= rslen;
9582 }
9583 }
9584 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9585 if (ENUM_ELEM(ary, line)) {
9586 str_mod_check(str, ptr, len);
9587 }
9588 subptr = hit;
9589 }
9590
9591 if (subptr != pend) {
9592 if (chomp) {
9593 if (rsnewline) {
9594 pend = chomp_newline(subptr, pend, enc);
9595 }
9596 else if (pend - subptr >= rslen &&
9597 memcmp(pend - rslen, rsptr, rslen) == 0) {
9598 pend -= rslen;
9599 }
9600 }
9601 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9602 ENUM_ELEM(ary, line);
9603 RB_GC_GUARD(str);
9604 }
9605
9606 end:
9607 if (ary)
9608 return ary;
9609 else
9610 return orig;
9611}
9612
9613/*
9614 * call-seq:
9615 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9616 * each_line(line_sep = $/, chomp: false) -> enumerator
9617 *
9618 * :include: doc/string/each_line.rdoc
9619 *
9620 */
9621
9622static VALUE
9623rb_str_each_line(int argc, VALUE *argv, VALUE str)
9624{
9625 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9626 return rb_str_enumerate_lines(argc, argv, str, 0);
9627}
9628
9629/*
9630 * call-seq:
9631 * lines(Line_sep = $/, chomp: false) -> array_of_strings
9632 *
9633 * Forms substrings ("lines") of +self+ according to the given arguments
9634 * (see String#each_line for details); returns the lines in an array.
9635 *
9636 */
9637
9638static VALUE
9639rb_str_lines(int argc, VALUE *argv, VALUE str)
9640{
9641 VALUE ary = WANTARRAY("lines", 0);
9642 return rb_str_enumerate_lines(argc, argv, str, ary);
9643}
9644
9645static VALUE
9646rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9647{
9648 return LONG2FIX(RSTRING_LEN(str));
9649}
9650
9651static VALUE
9652rb_str_enumerate_bytes(VALUE str, VALUE ary)
9653{
9654 long i;
9655
9656 for (i=0; i<RSTRING_LEN(str); i++) {
9657 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9658 }
9659 if (ary)
9660 return ary;
9661 else
9662 return str;
9663}
9664
9665/*
9666 * call-seq:
9667 * each_byte {|byte| ... } -> self
9668 * each_byte -> enumerator
9669 *
9670 * :include: doc/string/each_byte.rdoc
9671 *
9672 */
9673
9674static VALUE
9675rb_str_each_byte(VALUE str)
9676{
9677 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9678 return rb_str_enumerate_bytes(str, 0);
9679}
9680
9681/*
9682 * call-seq:
9683 * bytes -> array_of_bytes
9684 *
9685 * :include: doc/string/bytes.rdoc
9686 *
9687 */
9688
9689static VALUE
9690rb_str_bytes(VALUE str)
9691{
9692 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9693 return rb_str_enumerate_bytes(str, ary);
9694}
9695
9696static VALUE
9697rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9698{
9699 return rb_str_length(str);
9700}
9701
9702static VALUE
9703rb_str_enumerate_chars(VALUE str, VALUE ary)
9704{
9705 VALUE orig = str;
9706 long i, len, n;
9707 const char *ptr;
9708 rb_encoding *enc;
9709
9710 str = rb_str_new_frozen(str);
9711 ptr = RSTRING_PTR(str);
9712 len = RSTRING_LEN(str);
9713 enc = rb_enc_get(str);
9714
9716 for (i = 0; i < len; i += n) {
9717 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9718 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9719 }
9720 }
9721 else {
9722 for (i = 0; i < len; i += n) {
9723 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9724 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9725 }
9726 }
9727 RB_GC_GUARD(str);
9728 if (ary)
9729 return ary;
9730 else
9731 return orig;
9732}
9733
9734/*
9735 * call-seq:
9736 * each_char {|c| ... } -> self
9737 * each_char -> enumerator
9738 *
9739 * :include: doc/string/each_char.rdoc
9740 *
9741 */
9742
9743static VALUE
9744rb_str_each_char(VALUE str)
9745{
9746 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9747 return rb_str_enumerate_chars(str, 0);
9748}
9749
9750/*
9751 * call-seq:
9752 * chars -> array_of_characters
9753 *
9754 * :include: doc/string/chars.rdoc
9755 *
9756 */
9757
9758static VALUE
9759rb_str_chars(VALUE str)
9760{
9761 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9762 return rb_str_enumerate_chars(str, ary);
9763}
9764
9765static VALUE
9766rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9767{
9768 VALUE orig = str;
9769 int n;
9770 unsigned int c;
9771 const char *ptr, *end;
9772 rb_encoding *enc;
9773
9774 if (single_byte_optimizable(str))
9775 return rb_str_enumerate_bytes(str, ary);
9776
9777 str = rb_str_new_frozen(str);
9778 ptr = RSTRING_PTR(str);
9779 end = RSTRING_END(str);
9780 enc = STR_ENC_GET(str);
9781
9782 while (ptr < end) {
9783 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9784 ENUM_ELEM(ary, UINT2NUM(c));
9785 ptr += n;
9786 }
9787 RB_GC_GUARD(str);
9788 if (ary)
9789 return ary;
9790 else
9791 return orig;
9792}
9793
9794/*
9795 * call-seq:
9796 * each_codepoint {|integer| ... } -> self
9797 * each_codepoint -> enumerator
9798 *
9799 * :include: doc/string/each_codepoint.rdoc
9800 *
9801 */
9802
9803static VALUE
9804rb_str_each_codepoint(VALUE str)
9805{
9806 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9807 return rb_str_enumerate_codepoints(str, 0);
9808}
9809
9810/*
9811 * call-seq:
9812 * codepoints -> array_of_integers
9813 *
9814 * :include: doc/string/codepoints.rdoc
9815 *
9816 */
9817
9818static VALUE
9819rb_str_codepoints(VALUE str)
9820{
9821 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9822 return rb_str_enumerate_codepoints(str, ary);
9823}
9824
9825static regex_t *
9826get_reg_grapheme_cluster(rb_encoding *enc)
9827{
9828 int encidx = rb_enc_to_index(enc);
9829
9830 const OnigUChar source_ascii[] = "\\X";
9831 const OnigUChar *source = source_ascii;
9832 size_t source_len = sizeof(source_ascii) - 1;
9833
9834 switch (encidx) {
9835#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9836#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9837#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9838#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9839#define CASE_UTF(e) \
9840 case ENCINDEX_UTF_##e: { \
9841 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9842 source = source_UTF_##e; \
9843 source_len = sizeof(source_UTF_##e); \
9844 break; \
9845 }
9846 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9847#undef CASE_UTF
9848#undef CHARS_16BE
9849#undef CHARS_16LE
9850#undef CHARS_32BE
9851#undef CHARS_32LE
9852 }
9853
9854 regex_t *reg_grapheme_cluster;
9855 OnigErrorInfo einfo;
9856 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9857 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9858 if (r) {
9859 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9860 onig_error_code_to_str(message, r, &einfo);
9861 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9862 }
9863
9864 return reg_grapheme_cluster;
9865}
9866
9867static regex_t *
9868get_cached_reg_grapheme_cluster(rb_encoding *enc)
9869{
9870 int encidx = rb_enc_to_index(enc);
9871 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9872
9873 if (encidx == rb_utf8_encindex()) {
9874 if (!reg_grapheme_cluster_utf8) {
9875 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9876 }
9877
9878 return reg_grapheme_cluster_utf8;
9879 }
9880
9881 return NULL;
9882}
9883
9884static VALUE
9885rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9886{
9887 size_t grapheme_cluster_count = 0;
9888 rb_encoding *enc = get_encoding(str);
9889 const char *ptr, *end;
9890
9891 if (!rb_enc_unicode_p(enc)) {
9892 return rb_str_length(str);
9893 }
9894
9895 bool cached_reg_grapheme_cluster = true;
9896 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9897 if (!reg_grapheme_cluster) {
9898 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9899 cached_reg_grapheme_cluster = false;
9900 }
9901
9902 ptr = RSTRING_PTR(str);
9903 end = RSTRING_END(str);
9904
9905 while (ptr < end) {
9906 OnigPosition len = onig_match(reg_grapheme_cluster,
9907 (const OnigUChar *)ptr, (const OnigUChar *)end,
9908 (const OnigUChar *)ptr, NULL, 0);
9909 if (len <= 0) break;
9910 grapheme_cluster_count++;
9911 ptr += len;
9912 }
9913
9914 if (!cached_reg_grapheme_cluster) {
9915 onig_free(reg_grapheme_cluster);
9916 }
9917
9918 return SIZET2NUM(grapheme_cluster_count);
9919}
9920
9921static VALUE
9922rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9923{
9924 VALUE orig = str;
9925 rb_encoding *enc = get_encoding(str);
9926 const char *ptr0, *ptr, *end;
9927
9928 if (!rb_enc_unicode_p(enc)) {
9929 return rb_str_enumerate_chars(str, ary);
9930 }
9931
9932 if (!ary) str = rb_str_new_frozen(str);
9933
9934 bool cached_reg_grapheme_cluster = true;
9935 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9936 if (!reg_grapheme_cluster) {
9937 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9938 cached_reg_grapheme_cluster = false;
9939 }
9940
9941 ptr0 = ptr = RSTRING_PTR(str);
9942 end = RSTRING_END(str);
9943
9944 while (ptr < end) {
9945 OnigPosition len = onig_match(reg_grapheme_cluster,
9946 (const OnigUChar *)ptr, (const OnigUChar *)end,
9947 (const OnigUChar *)ptr, NULL, 0);
9948 if (len <= 0) break;
9949 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9950 ptr += len;
9951 }
9952
9953 if (!cached_reg_grapheme_cluster) {
9954 onig_free(reg_grapheme_cluster);
9955 }
9956
9957 RB_GC_GUARD(str);
9958 if (ary)
9959 return ary;
9960 else
9961 return orig;
9962}
9963
9964/*
9965 * call-seq:
9966 * each_grapheme_cluster {|gc| ... } -> self
9967 * each_grapheme_cluster -> enumerator
9968 *
9969 * :include: doc/string/each_grapheme_cluster.rdoc
9970 *
9971 */
9972
9973static VALUE
9974rb_str_each_grapheme_cluster(VALUE str)
9975{
9976 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9977 return rb_str_enumerate_grapheme_clusters(str, 0);
9978}
9979
9980/*
9981 * call-seq:
9982 * grapheme_clusters -> array_of_grapheme_clusters
9983 *
9984 * :include: doc/string/grapheme_clusters.rdoc
9985 *
9986 */
9987
9988static VALUE
9989rb_str_grapheme_clusters(VALUE str)
9990{
9991 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9992 return rb_str_enumerate_grapheme_clusters(str, ary);
9993}
9994
9995static long
9996chopped_length(VALUE str)
9997{
9998 rb_encoding *enc = STR_ENC_GET(str);
9999 const char *p, *p2, *beg, *end;
10000
10001 beg = RSTRING_PTR(str);
10002 end = beg + RSTRING_LEN(str);
10003 if (beg >= end) return 0;
10004 p = rb_enc_prev_char(beg, end, end, enc);
10005 if (!p) return 0;
10006 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10007 p2 = rb_enc_prev_char(beg, p, end, enc);
10008 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10009 }
10010 return p - beg;
10011}
10012
10013/*
10014 * call-seq:
10015 * chop! -> self or nil
10016 *
10017 * Like String#chop, but modifies +self+ in place;
10018 * returns +nil+ if +self+ is empty, +self+ otherwise.
10019 *
10020 * Related: String#chomp!.
10021 */
10022
10023static VALUE
10024rb_str_chop_bang(VALUE str)
10025{
10026 str_modify_keep_cr(str);
10027 if (RSTRING_LEN(str) > 0) {
10028 long len;
10029 len = chopped_length(str);
10030 STR_SET_LEN(str, len);
10031 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10032 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10034 }
10035 return str;
10036 }
10037 return Qnil;
10038}
10039
10040
10041/*
10042 * call-seq:
10043 * chop -> new_string
10044 *
10045 * :include: doc/string/chop.rdoc
10046 *
10047 */
10048
10049static VALUE
10050rb_str_chop(VALUE str)
10051{
10052 return rb_str_subseq(str, 0, chopped_length(str));
10053}
10054
10055static long
10056smart_chomp(VALUE str, const char *e, const char *p)
10057{
10058 rb_encoding *enc = rb_enc_get(str);
10059 if (rb_enc_mbminlen(enc) > 1) {
10060 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10061 if (rb_enc_is_newline(pp, e, enc)) {
10062 e = pp;
10063 }
10064 pp = e - rb_enc_mbminlen(enc);
10065 if (pp >= p) {
10066 pp = rb_enc_left_char_head(p, pp, e, enc);
10067 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10068 e = pp;
10069 }
10070 }
10071 }
10072 else {
10073 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10074 case '\n':
10075 if (--e > p && *(e-1) == '\r') {
10076 --e;
10077 }
10078 break;
10079 case '\r':
10080 --e;
10081 break;
10082 }
10083 }
10084 return e - p;
10085}
10086
10087static long
10088chompped_length(VALUE str, VALUE rs)
10089{
10090 rb_encoding *enc;
10091 int newline;
10092 char *pp, *e, *rsptr;
10093 long rslen;
10094 char *const p = RSTRING_PTR(str);
10095 long len = RSTRING_LEN(str);
10096
10097 if (len == 0) return 0;
10098 e = p + len;
10099 if (rs == rb_default_rs) {
10100 return smart_chomp(str, e, p);
10101 }
10102
10103 enc = rb_enc_get(str);
10104 RSTRING_GETMEM(rs, rsptr, rslen);
10105 if (rslen == 0) {
10106 if (rb_enc_mbminlen(enc) > 1) {
10107 while (e > p) {
10108 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10109 if (!rb_enc_is_newline(pp, e, enc)) break;
10110 e = pp;
10111 pp -= rb_enc_mbminlen(enc);
10112 if (pp >= p) {
10113 pp = rb_enc_left_char_head(p, pp, e, enc);
10114 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10115 e = pp;
10116 }
10117 }
10118 }
10119 }
10120 else {
10121 while (e > p && *(e-1) == '\n') {
10122 --e;
10123 if (e > p && *(e-1) == '\r')
10124 --e;
10125 }
10126 }
10127 return e - p;
10128 }
10129 if (rslen > len) return len;
10130
10131 enc = rb_enc_get(rs);
10132 newline = rsptr[rslen-1];
10133 if (rslen == rb_enc_mbminlen(enc)) {
10134 if (rslen == 1) {
10135 if (newline == '\n')
10136 return smart_chomp(str, e, p);
10137 }
10138 else {
10139 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10140 return smart_chomp(str, e, p);
10141 }
10142 }
10143
10144 enc = rb_enc_check(str, rs);
10145 if (is_broken_string(rs)) {
10146 return len;
10147 }
10148 pp = e - rslen;
10149 if (p[len-1] == newline &&
10150 (rslen <= 1 ||
10151 memcmp(rsptr, pp, rslen) == 0)) {
10152 if (at_char_boundary(p, pp, e, enc))
10153 return len - rslen;
10154 RB_GC_GUARD(rs);
10155 }
10156 return len;
10157}
10158
10164static VALUE
10165chomp_rs(int argc, const VALUE *argv)
10166{
10167 rb_check_arity(argc, 0, 1);
10168 if (argc > 0) {
10169 VALUE rs = argv[0];
10170 if (!NIL_P(rs)) StringValue(rs);
10171 return rs;
10172 }
10173 else {
10174 return rb_rs;
10175 }
10176}
10177
10178VALUE
10179rb_str_chomp_string(VALUE str, VALUE rs)
10180{
10181 long olen = RSTRING_LEN(str);
10182 long len = chompped_length(str, rs);
10183 if (len >= olen) return Qnil;
10184 str_modify_keep_cr(str);
10185 STR_SET_LEN(str, len);
10186 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10187 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10189 }
10190 return str;
10191}
10192
10193/*
10194 * call-seq:
10195 * chomp!(line_sep = $/) -> self or nil
10196 *
10197 * Like String#chomp, but modifies +self+ in place;
10198 * returns +nil+ if no modification made, +self+ otherwise.
10199 *
10200 */
10201
10202static VALUE
10203rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10204{
10205 VALUE rs;
10206 str_modifiable(str);
10207 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10208 rs = chomp_rs(argc, argv);
10209 if (NIL_P(rs)) return Qnil;
10210 return rb_str_chomp_string(str, rs);
10211}
10212
10213
10214/*
10215 * call-seq:
10216 * chomp(line_sep = $/) -> new_string
10217 *
10218 * :include: doc/string/chomp.rdoc
10219 *
10220 */
10221
10222static VALUE
10223rb_str_chomp(int argc, VALUE *argv, VALUE str)
10224{
10225 VALUE rs = chomp_rs(argc, argv);
10226 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10227 return rb_str_subseq(str, 0, chompped_length(str, rs));
10228}
10229
10230static long
10231lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10232{
10233 const char *const start = s;
10234
10235 if (!s || s >= e) return 0;
10236
10237 /* remove spaces at head */
10238 if (single_byte_optimizable(str)) {
10239 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10240 }
10241 else {
10242 while (s < e) {
10243 int n;
10244 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10245
10246 if (cc && !rb_isspace(cc)) break;
10247 s += n;
10248 }
10249 }
10250 return s - start;
10251}
10252
10253/*
10254 * call-seq:
10255 * lstrip! -> self or nil
10256 *
10257 * Like String#lstrip, except that any modifications are made in +self+;
10258 * returns +self+ if any modifications are made, +nil+ otherwise.
10259 *
10260 * Related: String#rstrip!, String#strip!.
10261 */
10262
10263static VALUE
10264rb_str_lstrip_bang(VALUE str)
10265{
10266 rb_encoding *enc;
10267 char *start, *s;
10268 long olen, loffset;
10269
10270 str_modify_keep_cr(str);
10271 enc = STR_ENC_GET(str);
10272 RSTRING_GETMEM(str, start, olen);
10273 loffset = lstrip_offset(str, start, start+olen, enc);
10274 if (loffset > 0) {
10275 long len = olen-loffset;
10276 s = start + loffset;
10277 memmove(start, s, len);
10278 STR_SET_LEN(str, len);
10279 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10280 return str;
10281 }
10282 return Qnil;
10283}
10284
10285
10286/*
10287 * call-seq:
10288 * lstrip -> new_string
10289 *
10290 * Returns a copy of +self+ with leading whitespace removed;
10291 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10292 *
10293 * whitespace = "\x00\t\n\v\f\r "
10294 * s = whitespace + 'abc' + whitespace
10295 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10296 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10297 *
10298 * Related: String#rstrip, String#strip.
10299 */
10300
10301static VALUE
10302rb_str_lstrip(VALUE str)
10303{
10304 char *start;
10305 long len, loffset;
10306 RSTRING_GETMEM(str, start, len);
10307 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10308 if (loffset <= 0) return str_duplicate(rb_cString, str);
10309 return rb_str_subseq(str, loffset, len - loffset);
10310}
10311
10312static long
10313rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10314{
10315 const char *t;
10316
10317 rb_str_check_dummy_enc(enc);
10319 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10320 }
10321 if (!s || s >= e) return 0;
10322 t = e;
10323
10324 /* remove trailing spaces or '\0's */
10325 if (single_byte_optimizable(str)) {
10326 unsigned char c;
10327 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10328 }
10329 else {
10330 char *tp;
10331
10332 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10333 unsigned int c = rb_enc_codepoint(tp, e, enc);
10334 if (c && !rb_isspace(c)) break;
10335 t = tp;
10336 }
10337 }
10338 return e - t;
10339}
10340
10341/*
10342 * call-seq:
10343 * rstrip! -> self or nil
10344 *
10345 * Like String#rstrip, except that any modifications are made in +self+;
10346 * returns +self+ if any modifications are made, +nil+ otherwise.
10347 *
10348 * Related: String#lstrip!, String#strip!.
10349 */
10350
10351static VALUE
10352rb_str_rstrip_bang(VALUE str)
10353{
10354 rb_encoding *enc;
10355 char *start;
10356 long olen, roffset;
10357
10358 str_modify_keep_cr(str);
10359 enc = STR_ENC_GET(str);
10360 RSTRING_GETMEM(str, start, olen);
10361 roffset = rstrip_offset(str, start, start+olen, enc);
10362 if (roffset > 0) {
10363 long len = olen - roffset;
10364
10365 STR_SET_LEN(str, len);
10366 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10367 return str;
10368 }
10369 return Qnil;
10370}
10371
10372
10373/*
10374 * call-seq:
10375 * rstrip -> new_string
10376 *
10377 * Returns a copy of the receiver with trailing whitespace removed;
10378 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10379 *
10380 * whitespace = "\x00\t\n\v\f\r "
10381 * s = whitespace + 'abc' + whitespace
10382 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10383 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10384 *
10385 * Related: String#lstrip, String#strip.
10386 */
10387
10388static VALUE
10389rb_str_rstrip(VALUE str)
10390{
10391 rb_encoding *enc;
10392 char *start;
10393 long olen, roffset;
10394
10395 enc = STR_ENC_GET(str);
10396 RSTRING_GETMEM(str, start, olen);
10397 roffset = rstrip_offset(str, start, start+olen, enc);
10398
10399 if (roffset <= 0) return str_duplicate(rb_cString, str);
10400 return rb_str_subseq(str, 0, olen-roffset);
10401}
10402
10403
10404/*
10405 * call-seq:
10406 * strip! -> self or nil
10407 *
10408 * Like String#strip, except that any modifications are made in +self+;
10409 * returns +self+ if any modifications are made, +nil+ otherwise.
10410 *
10411 * Related: String#lstrip!, String#rstrip!.
10412 */
10413
10414static VALUE
10415rb_str_strip_bang(VALUE str)
10416{
10417 char *start;
10418 long olen, loffset, roffset;
10419 rb_encoding *enc;
10420
10421 str_modify_keep_cr(str);
10422 enc = STR_ENC_GET(str);
10423 RSTRING_GETMEM(str, start, olen);
10424 loffset = lstrip_offset(str, start, start+olen, enc);
10425 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10426
10427 if (loffset > 0 || roffset > 0) {
10428 long len = olen-roffset;
10429 if (loffset > 0) {
10430 len -= loffset;
10431 memmove(start, start + loffset, len);
10432 }
10433 STR_SET_LEN(str, len);
10434 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10435 return str;
10436 }
10437 return Qnil;
10438}
10439
10440
10441/*
10442 * call-seq:
10443 * strip -> new_string
10444 *
10445 * Returns a copy of the receiver with leading and trailing whitespace removed;
10446 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10447 *
10448 * whitespace = "\x00\t\n\v\f\r "
10449 * s = whitespace + 'abc' + whitespace
10450 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10451 * s.strip # => "abc"
10452 *
10453 * Related: String#lstrip, String#rstrip.
10454 */
10455
10456static VALUE
10457rb_str_strip(VALUE str)
10458{
10459 char *start;
10460 long olen, loffset, roffset;
10461 rb_encoding *enc = STR_ENC_GET(str);
10462
10463 RSTRING_GETMEM(str, start, olen);
10464 loffset = lstrip_offset(str, start, start+olen, enc);
10465 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10466
10467 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10468 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10469}
10470
10471static VALUE
10472scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10473{
10474 VALUE result = Qnil;
10475 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10476 if (pos >= 0) {
10477 VALUE match;
10478 struct re_registers *regs;
10479 if (BUILTIN_TYPE(pat) == T_STRING) {
10480 regs = NULL;
10481 end = pos + RSTRING_LEN(pat);
10482 }
10483 else {
10484 match = rb_backref_get();
10485 regs = RMATCH_REGS(match);
10486 pos = BEG(0);
10487 end = END(0);
10488 }
10489
10490 if (pos == end) {
10491 rb_encoding *enc = STR_ENC_GET(str);
10492 /*
10493 * Always consume at least one character of the input string
10494 */
10495 if (RSTRING_LEN(str) > end)
10496 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10497 RSTRING_END(str), enc);
10498 else
10499 *start = end + 1;
10500 }
10501 else {
10502 *start = end;
10503 }
10504
10505 if (!regs || regs->num_regs == 1) {
10506 result = rb_str_subseq(str, pos, end - pos);
10507 return result;
10508 }
10509 else {
10510 result = rb_ary_new2(regs->num_regs);
10511 for (int i = 1; i < regs->num_regs; i++) {
10512 VALUE s = Qnil;
10513 if (BEG(i) >= 0) {
10514 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10515 }
10516
10517 rb_ary_push(result, s);
10518 }
10519 }
10520
10521 RB_GC_GUARD(match);
10522 }
10523
10524 return result;
10525}
10526
10527
10528/*
10529 * call-seq:
10530 * scan(string_or_regexp) -> array
10531 * scan(string_or_regexp) {|matches| ... } -> self
10532 *
10533 * Matches a pattern against +self+; the pattern is:
10534 *
10535 * - +string_or_regexp+ itself, if it is a Regexp.
10536 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10537 *
10538 * Iterates through +self+, generating a collection of matching results:
10539 *
10540 * - If the pattern contains no groups, each result is the
10541 * matched string, <code>$&</code>.
10542 * - If the pattern contains groups, each result is an array
10543 * containing one entry per group.
10544 *
10545 * With no block given, returns an array of the results:
10546 *
10547 * s = 'cruel world'
10548 * s.scan(/\w+/) # => ["cruel", "world"]
10549 * s.scan(/.../) # => ["cru", "el ", "wor"]
10550 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10551 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10552 *
10553 * With a block given, calls the block with each result; returns +self+:
10554 *
10555 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10556 * print "\n"
10557 * s.scan(/(.)(.)/) {|x,y| print y, x }
10558 * print "\n"
10559 *
10560 * Output:
10561 *
10562 * <<cruel>> <<world>>
10563 * rceu lowlr
10564 *
10565 */
10566
10567static VALUE
10568rb_str_scan(VALUE str, VALUE pat)
10569{
10570 VALUE result;
10571 long start = 0;
10572 long last = -1, prev = 0;
10573 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10574
10575 pat = get_pat_quoted(pat, 1);
10576 mustnot_broken(str);
10577 if (!rb_block_given_p()) {
10578 VALUE ary = rb_ary_new();
10579
10580 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10581 last = prev;
10582 prev = start;
10583 rb_ary_push(ary, result);
10584 }
10585 if (last >= 0) rb_pat_search(pat, str, last, 1);
10586 else rb_backref_set(Qnil);
10587 return ary;
10588 }
10589
10590 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10591 last = prev;
10592 prev = start;
10593 rb_yield(result);
10594 str_mod_check(str, p, len);
10595 }
10596 if (last >= 0) rb_pat_search(pat, str, last, 1);
10597 return str;
10598}
10599
10600
10601/*
10602 * call-seq:
10603 * hex -> integer
10604 *
10605 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10606 * (with an optional sign and an optional <code>0x</code>) and returns the
10607 * corresponding number;
10608 * returns zero if there is no such leading substring:
10609 *
10610 * '0x0a'.hex # => 10
10611 * '-1234'.hex # => -4660
10612 * '0'.hex # => 0
10613 * 'non-numeric'.hex # => 0
10614 *
10615 * Related: String#oct.
10616 *
10617 */
10618
10619static VALUE
10620rb_str_hex(VALUE str)
10621{
10622 return rb_str_to_inum(str, 16, FALSE);
10623}
10624
10625
10626/*
10627 * call-seq:
10628 * oct -> integer
10629 *
10630 * Interprets the leading substring of +self+ as a string of octal digits
10631 * (with an optional sign) and returns the corresponding number;
10632 * returns zero if there is no such leading substring:
10633 *
10634 * '123'.oct # => 83
10635 * '-377'.oct # => -255
10636 * '0377non-numeric'.oct # => 255
10637 * 'non-numeric'.oct # => 0
10638 *
10639 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10640 * see Kernel#Integer.
10641 *
10642 * Related: String#hex.
10643 *
10644 */
10645
10646static VALUE
10647rb_str_oct(VALUE str)
10648{
10649 return rb_str_to_inum(str, -8, FALSE);
10650}
10651
10652#ifndef HAVE_CRYPT_R
10653# include "ruby/thread_native.h"
10654# include "ruby/atomic.h"
10655
10656static struct {
10657 rb_nativethread_lock_t lock;
10658} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10659
10660static void
10661crypt_mutex_initialize(void)
10662{
10663}
10664#endif
10665
10666/*
10667 * call-seq:
10668 * crypt(salt_str) -> new_string
10669 *
10670 * Returns the string generated by calling <code>crypt(3)</code>
10671 * standard library function with <code>str</code> and
10672 * <code>salt_str</code>, in this order, as its arguments. Please do
10673 * not use this method any longer. It is legacy; provided only for
10674 * backward compatibility with ruby scripts in earlier days. It is
10675 * bad to use in contemporary programs for several reasons:
10676 *
10677 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10678 * run. The generated string lacks data portability.
10679 *
10680 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10681 * (i.e. silently ends up in unexpected results).
10682 *
10683 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10684 * thread safe.
10685 *
10686 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10687 * very very weak. According to its manpage, Linux's traditional
10688 * <code>crypt(3)</code> output has only 2**56 variations; too
10689 * easy to brute force today. And this is the default behaviour.
10690 *
10691 * * In order to make things robust some OSes implement so-called
10692 * "modular" usage. To go through, you have to do a complex
10693 * build-up of the <code>salt_str</code> parameter, by hand.
10694 * Failure in generation of a proper salt string tends not to
10695 * yield any errors; typos in parameters are normally not
10696 * detectable.
10697 *
10698 * * For instance, in the following example, the second invocation
10699 * of String#crypt is wrong; it has a typo in "round=" (lacks
10700 * "s"). However the call does not fail and something unexpected
10701 * is generated.
10702 *
10703 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10704 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10705 *
10706 * * Even in the "modular" mode, some hash functions are considered
10707 * archaic and no longer recommended at all; for instance module
10708 * <code>$1$</code> is officially abandoned by its author: see
10709 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10710 * instance module <code>$3$</code> is considered completely
10711 * broken: see the manpage of FreeBSD.
10712 *
10713 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10714 * written above, <code>crypt(3)</code> on Mac OS never fails.
10715 * This means even if you build up a proper salt string it
10716 * generates a traditional DES hash anyways, and there is no way
10717 * for you to be aware of.
10718 *
10719 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10720 *
10721 * If for some reason you cannot migrate to other secure contemporary
10722 * password hashing algorithms, install the string-crypt gem and
10723 * <code>require 'string/crypt'</code> to continue using it.
10724 */
10725
10726static VALUE
10727rb_str_crypt(VALUE str, VALUE salt)
10728{
10729#ifdef HAVE_CRYPT_R
10730 VALUE databuf;
10731 struct crypt_data *data;
10732# define CRYPT_END() ALLOCV_END(databuf)
10733#else
10734 extern char *crypt(const char *, const char *);
10735# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10736#endif
10737 VALUE result;
10738 const char *s, *saltp;
10739 char *res;
10740#ifdef BROKEN_CRYPT
10741 char salt_8bit_clean[3];
10742#endif
10743
10744 StringValue(salt);
10745 mustnot_wchar(str);
10746 mustnot_wchar(salt);
10747 s = StringValueCStr(str);
10748 saltp = RSTRING_PTR(salt);
10749 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10750 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10751 }
10752
10753#ifdef BROKEN_CRYPT
10754 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10755 salt_8bit_clean[0] = saltp[0] & 0x7f;
10756 salt_8bit_clean[1] = saltp[1] & 0x7f;
10757 salt_8bit_clean[2] = '\0';
10758 saltp = salt_8bit_clean;
10759 }
10760#endif
10761#ifdef HAVE_CRYPT_R
10762 data = ALLOCV(databuf, sizeof(struct crypt_data));
10763# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10764 data->initialized = 0;
10765# endif
10766 res = crypt_r(s, saltp, data);
10767#else
10768 crypt_mutex_initialize();
10769 rb_nativethread_lock_lock(&crypt_mutex.lock);
10770 res = crypt(s, saltp);
10771#endif
10772 if (!res) {
10773 int err = errno;
10774 CRYPT_END();
10775 rb_syserr_fail(err, "crypt");
10776 }
10777 result = rb_str_new_cstr(res);
10778 CRYPT_END();
10779 return result;
10780}
10781
10782
10783/*
10784 * call-seq:
10785 * ord -> integer
10786 *
10787 * :include: doc/string/ord.rdoc
10788 *
10789 */
10790
10791static VALUE
10792rb_str_ord(VALUE s)
10793{
10794 unsigned int c;
10795
10796 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10797 return UINT2NUM(c);
10798}
10799/*
10800 * call-seq:
10801 * sum(n = 16) -> integer
10802 *
10803 * :include: doc/string/sum.rdoc
10804 *
10805 */
10806
10807static VALUE
10808rb_str_sum(int argc, VALUE *argv, VALUE str)
10809{
10810 int bits = 16;
10811 char *ptr, *p, *pend;
10812 long len;
10813 VALUE sum = INT2FIX(0);
10814 unsigned long sum0 = 0;
10815
10816 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10817 bits = 0;
10818 }
10819 ptr = p = RSTRING_PTR(str);
10820 len = RSTRING_LEN(str);
10821 pend = p + len;
10822
10823 while (p < pend) {
10824 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10825 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10826 str_mod_check(str, ptr, len);
10827 sum0 = 0;
10828 }
10829 sum0 += (unsigned char)*p;
10830 p++;
10831 }
10832
10833 if (bits == 0) {
10834 if (sum0) {
10835 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10836 }
10837 }
10838 else {
10839 if (sum == INT2FIX(0)) {
10840 if (bits < (int)sizeof(long)*CHAR_BIT) {
10841 sum0 &= (((unsigned long)1)<<bits)-1;
10842 }
10843 sum = LONG2FIX(sum0);
10844 }
10845 else {
10846 VALUE mod;
10847
10848 if (sum0) {
10849 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10850 }
10851
10852 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10853 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10854 sum = rb_funcall(sum, '&', 1, mod);
10855 }
10856 }
10857 return sum;
10858}
10859
10860static VALUE
10861rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10862{
10863 rb_encoding *enc;
10864 VALUE w;
10865 long width, len, flen = 1, fclen = 1;
10866 VALUE res;
10867 char *p;
10868 const char *f = " ";
10869 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10870 VALUE pad;
10871 int singlebyte = 1, cr;
10872 int termlen;
10873
10874 rb_scan_args(argc, argv, "11", &w, &pad);
10875 enc = STR_ENC_GET(str);
10876 termlen = rb_enc_mbminlen(enc);
10877 width = NUM2LONG(w);
10878 if (argc == 2) {
10879 StringValue(pad);
10880 enc = rb_enc_check(str, pad);
10881 f = RSTRING_PTR(pad);
10882 flen = RSTRING_LEN(pad);
10883 fclen = str_strlen(pad, enc); /* rb_enc_check */
10884 singlebyte = single_byte_optimizable(pad);
10885 if (flen == 0 || fclen == 0) {
10886 rb_raise(rb_eArgError, "zero width padding");
10887 }
10888 }
10889 len = str_strlen(str, enc); /* rb_enc_check */
10890 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10891 n = width - len;
10892 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10893 rlen = n - llen;
10894 cr = ENC_CODERANGE(str);
10895 if (flen > 1) {
10896 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10897 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10898 }
10899 size = RSTRING_LEN(str);
10900 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10901 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10902 (len += llen2 + rlen2) >= LONG_MAX - size) {
10903 rb_raise(rb_eArgError, "argument too big");
10904 }
10905 len += size;
10906 res = str_enc_new(rb_cString, 0, len, enc);
10907 p = RSTRING_PTR(res);
10908 if (flen <= 1) {
10909 memset(p, *f, llen);
10910 p += llen;
10911 }
10912 else {
10913 while (llen >= fclen) {
10914 memcpy(p,f,flen);
10915 p += flen;
10916 llen -= fclen;
10917 }
10918 if (llen > 0) {
10919 memcpy(p, f, llen2);
10920 p += llen2;
10921 }
10922 }
10923 memcpy(p, RSTRING_PTR(str), size);
10924 p += size;
10925 if (flen <= 1) {
10926 memset(p, *f, rlen);
10927 p += rlen;
10928 }
10929 else {
10930 while (rlen >= fclen) {
10931 memcpy(p,f,flen);
10932 p += flen;
10933 rlen -= fclen;
10934 }
10935 if (rlen > 0) {
10936 memcpy(p, f, rlen2);
10937 p += rlen2;
10938 }
10939 }
10940 TERM_FILL(p, termlen);
10941 STR_SET_LEN(res, p-RSTRING_PTR(res));
10942
10943 if (argc == 2)
10944 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10945 if (cr != ENC_CODERANGE_BROKEN)
10946 ENC_CODERANGE_SET(res, cr);
10947
10948 RB_GC_GUARD(pad);
10949 return res;
10950}
10951
10952
10953/*
10954 * call-seq:
10955 * ljust(size, pad_string = ' ') -> new_string
10956 *
10957 * :include: doc/string/ljust.rdoc
10958 *
10959 * Related: String#rjust, String#center.
10960 *
10961 */
10962
10963static VALUE
10964rb_str_ljust(int argc, VALUE *argv, VALUE str)
10965{
10966 return rb_str_justify(argc, argv, str, 'l');
10967}
10968
10969/*
10970 * call-seq:
10971 * rjust(size, pad_string = ' ') -> new_string
10972 *
10973 * :include: doc/string/rjust.rdoc
10974 *
10975 * Related: String#ljust, String#center.
10976 *
10977 */
10978
10979static VALUE
10980rb_str_rjust(int argc, VALUE *argv, VALUE str)
10981{
10982 return rb_str_justify(argc, argv, str, 'r');
10983}
10984
10985
10986/*
10987 * call-seq:
10988 * center(size, pad_string = ' ') -> new_string
10989 *
10990 * :include: doc/string/center.rdoc
10991 *
10992 * Related: String#ljust, String#rjust.
10993 *
10994 */
10995
10996static VALUE
10997rb_str_center(int argc, VALUE *argv, VALUE str)
10998{
10999 return rb_str_justify(argc, argv, str, 'c');
11000}
11001
11002/*
11003 * call-seq:
11004 * partition(string_or_regexp) -> [head, match, tail]
11005 *
11006 * :include: doc/string/partition.rdoc
11007 *
11008 */
11009
11010static VALUE
11011rb_str_partition(VALUE str, VALUE sep)
11012{
11013 long pos;
11014
11015 sep = get_pat_quoted(sep, 0);
11016 if (RB_TYPE_P(sep, T_REGEXP)) {
11017 if (rb_reg_search(sep, str, 0, 0) < 0) {
11018 goto failed;
11019 }
11020 VALUE match = rb_backref_get();
11021 struct re_registers *regs = RMATCH_REGS(match);
11022
11023 pos = BEG(0);
11024 sep = rb_str_subseq(str, pos, END(0) - pos);
11025 }
11026 else {
11027 pos = rb_str_index(str, sep, 0);
11028 if (pos < 0) goto failed;
11029 }
11030 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11031 sep,
11032 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11033 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11034
11035 failed:
11036 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11037}
11038
11039/*
11040 * call-seq:
11041 * rpartition(sep) -> [head, match, tail]
11042 *
11043 * :include: doc/string/rpartition.rdoc
11044 *
11045 */
11046
11047static VALUE
11048rb_str_rpartition(VALUE str, VALUE sep)
11049{
11050 long pos = RSTRING_LEN(str);
11051
11052 sep = get_pat_quoted(sep, 0);
11053 if (RB_TYPE_P(sep, T_REGEXP)) {
11054 if (rb_reg_search(sep, str, pos, 1) < 0) {
11055 goto failed;
11056 }
11057 VALUE match = rb_backref_get();
11058 struct re_registers *regs = RMATCH_REGS(match);
11059
11060 pos = BEG(0);
11061 sep = rb_str_subseq(str, pos, END(0) - pos);
11062 }
11063 else {
11064 pos = rb_str_sublen(str, pos);
11065 pos = rb_str_rindex(str, sep, pos);
11066 if (pos < 0) {
11067 goto failed;
11068 }
11069 }
11070
11071 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11072 sep,
11073 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11074 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11075 failed:
11076 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11077}
11078
11079/*
11080 * call-seq:
11081 * start_with?(*string_or_regexp) -> true or false
11082 *
11083 * :include: doc/string/start_with_p.rdoc
11084 *
11085 */
11086
11087static VALUE
11088rb_str_start_with(int argc, VALUE *argv, VALUE str)
11089{
11090 int i;
11091
11092 for (i=0; i<argc; i++) {
11093 VALUE tmp = argv[i];
11094 if (RB_TYPE_P(tmp, T_REGEXP)) {
11095 if (rb_reg_start_with_p(tmp, str))
11096 return Qtrue;
11097 }
11098 else {
11099 const char *p, *s, *e;
11100 long slen, tlen;
11101 rb_encoding *enc;
11102
11103 StringValue(tmp);
11104 enc = rb_enc_check(str, tmp);
11105 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11106 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11107 p = RSTRING_PTR(str);
11108 e = p + slen;
11109 s = p + tlen;
11110 if (!at_char_right_boundary(p, s, e, enc))
11111 continue;
11112 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11113 return Qtrue;
11114 }
11115 }
11116 return Qfalse;
11117}
11118
11119/*
11120 * call-seq:
11121 * end_with?(*strings) -> true or false
11122 *
11123 * :include: doc/string/end_with_p.rdoc
11124 *
11125 */
11126
11127static VALUE
11128rb_str_end_with(int argc, VALUE *argv, VALUE str)
11129{
11130 int i;
11131
11132 for (i=0; i<argc; i++) {
11133 VALUE tmp = argv[i];
11134 const char *p, *s, *e;
11135 long slen, tlen;
11136 rb_encoding *enc;
11137
11138 StringValue(tmp);
11139 enc = rb_enc_check(str, tmp);
11140 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11141 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11142 p = RSTRING_PTR(str);
11143 e = p + slen;
11144 s = e - tlen;
11145 if (!at_char_boundary(p, s, e, enc))
11146 continue;
11147 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11148 return Qtrue;
11149 }
11150 return Qfalse;
11151}
11152
11162static long
11163deleted_prefix_length(VALUE str, VALUE prefix)
11164{
11165 const char *strptr, *prefixptr;
11166 long olen, prefixlen;
11167 rb_encoding *enc = rb_enc_get(str);
11168
11169 StringValue(prefix);
11170
11171 if (!is_broken_string(prefix) ||
11172 !rb_enc_asciicompat(enc) ||
11173 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11174 enc = rb_enc_check(str, prefix);
11175 }
11176
11177 /* return 0 if not start with prefix */
11178 prefixlen = RSTRING_LEN(prefix);
11179 if (prefixlen <= 0) return 0;
11180 olen = RSTRING_LEN(str);
11181 if (olen < prefixlen) return 0;
11182 strptr = RSTRING_PTR(str);
11183 prefixptr = RSTRING_PTR(prefix);
11184 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11185 if (is_broken_string(prefix)) {
11186 if (!is_broken_string(str)) {
11187 /* prefix in a valid string cannot be broken */
11188 return 0;
11189 }
11190 const char *strend = strptr + olen;
11191 const char *after_prefix = strptr + prefixlen;
11192 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11193 /* prefix does not end at char-boundary */
11194 return 0;
11195 }
11196 }
11197 /* prefix part in `str` also should be valid. */
11198
11199 return prefixlen;
11200}
11201
11202/*
11203 * call-seq:
11204 * delete_prefix!(prefix) -> self or nil
11205 *
11206 * Like String#delete_prefix, except that +self+ is modified in place.
11207 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11208 *
11209 */
11210
11211static VALUE
11212rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11213{
11214 long prefixlen;
11215 str_modify_keep_cr(str);
11216
11217 prefixlen = deleted_prefix_length(str, prefix);
11218 if (prefixlen <= 0) return Qnil;
11219
11220 return rb_str_drop_bytes(str, prefixlen);
11221}
11222
11223/*
11224 * call-seq:
11225 * delete_prefix(prefix) -> new_string
11226 *
11227 * :include: doc/string/delete_prefix.rdoc
11228 *
11229 */
11230
11231static VALUE
11232rb_str_delete_prefix(VALUE str, VALUE prefix)
11233{
11234 long prefixlen;
11235
11236 prefixlen = deleted_prefix_length(str, prefix);
11237 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11238
11239 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11240}
11241
11251static long
11252deleted_suffix_length(VALUE str, VALUE suffix)
11253{
11254 const char *strptr, *suffixptr;
11255 long olen, suffixlen;
11256 rb_encoding *enc;
11257
11258 StringValue(suffix);
11259 if (is_broken_string(suffix)) return 0;
11260 enc = rb_enc_check(str, suffix);
11261
11262 /* return 0 if not start with suffix */
11263 suffixlen = RSTRING_LEN(suffix);
11264 if (suffixlen <= 0) return 0;
11265 olen = RSTRING_LEN(str);
11266 if (olen < suffixlen) return 0;
11267 strptr = RSTRING_PTR(str);
11268 suffixptr = RSTRING_PTR(suffix);
11269 const char *strend = strptr + olen;
11270 const char *before_suffix = strend - suffixlen;
11271 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11272 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11273
11274 return suffixlen;
11275}
11276
11277/*
11278 * call-seq:
11279 * delete_suffix!(suffix) -> self or nil
11280 *
11281 * Like String#delete_suffix, except that +self+ is modified in place.
11282 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11283 *
11284 */
11285
11286static VALUE
11287rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11288{
11289 long olen, suffixlen, len;
11290 str_modifiable(str);
11291
11292 suffixlen = deleted_suffix_length(str, suffix);
11293 if (suffixlen <= 0) return Qnil;
11294
11295 olen = RSTRING_LEN(str);
11296 str_modify_keep_cr(str);
11297 len = olen - suffixlen;
11298 STR_SET_LEN(str, len);
11299 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11300 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11302 }
11303 return str;
11304}
11305
11306/*
11307 * call-seq:
11308 * delete_suffix(suffix) -> new_string
11309 *
11310 * :include: doc/string/delete_suffix.rdoc
11311 *
11312 */
11313
11314static VALUE
11315rb_str_delete_suffix(VALUE str, VALUE suffix)
11316{
11317 long suffixlen;
11318
11319 suffixlen = deleted_suffix_length(str, suffix);
11320 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11321
11322 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11323}
11324
11325void
11326rb_str_setter(VALUE val, ID id, VALUE *var)
11327{
11328 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11329 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11330 }
11331 *var = val;
11332}
11333
11334static void
11335rb_fs_setter(VALUE val, ID id, VALUE *var)
11336{
11337 val = rb_fs_check(val);
11338 if (!val) {
11339 rb_raise(rb_eTypeError,
11340 "value of %"PRIsVALUE" must be String or Regexp",
11341 rb_id2str(id));
11342 }
11343 if (!NIL_P(val)) {
11344 rb_warn_deprecated("'$;'", NULL);
11345 }
11346 *var = val;
11347}
11348
11349
11350/*
11351 * call-seq:
11352 * force_encoding(encoding) -> self
11353 *
11354 * :include: doc/string/force_encoding.rdoc
11355 *
11356 */
11357
11358static VALUE
11359rb_str_force_encoding(VALUE str, VALUE enc)
11360{
11361 str_modifiable(str);
11362
11363 rb_encoding *encoding = rb_to_encoding(enc);
11364 int idx = rb_enc_to_index(encoding);
11365
11366 // If the encoding is unchanged, we do nothing.
11367 if (ENCODING_GET(str) == idx) {
11368 return str;
11369 }
11370
11371 rb_enc_associate_index(str, idx);
11372
11373 // If the coderange was 7bit and the new encoding is ASCII-compatible
11374 // we can keep the coderange.
11375 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11376 return str;
11377 }
11378
11380 return str;
11381}
11382
11383/*
11384 * call-seq:
11385 * b -> string
11386 *
11387 * :include: doc/string/b.rdoc
11388 *
11389 */
11390
11391static VALUE
11392rb_str_b(VALUE str)
11393{
11394 VALUE str2;
11395 if (STR_EMBED_P(str)) {
11396 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11397 }
11398 else {
11399 str2 = str_alloc_heap(rb_cString);
11400 }
11401 str_replace_shared_without_enc(str2, str);
11402
11403 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11404 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11405 // If we know the receiver's code range then we know the result's code range.
11406 int cr = ENC_CODERANGE(str);
11407 switch (cr) {
11408 case ENC_CODERANGE_7BIT:
11410 break;
11414 break;
11415 default:
11416 ENC_CODERANGE_CLEAR(str2);
11417 break;
11418 }
11419 }
11420
11421 return str2;
11422}
11423
11424/*
11425 * call-seq:
11426 * valid_encoding? -> true or false
11427 *
11428 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11429 *
11430 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11431 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11432 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11433 */
11434
11435static VALUE
11436rb_str_valid_encoding_p(VALUE str)
11437{
11438 int cr = rb_enc_str_coderange(str);
11439
11440 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11441}
11442
11443/*
11444 * call-seq:
11445 * ascii_only? -> true or false
11446 *
11447 * Returns +true+ if +self+ contains only ASCII characters,
11448 * +false+ otherwise:
11449 *
11450 * 'abc'.ascii_only? # => true
11451 * "abc\u{6666}".ascii_only? # => false
11452 *
11453 */
11454
11455static VALUE
11456rb_str_is_ascii_only_p(VALUE str)
11457{
11458 int cr = rb_enc_str_coderange(str);
11459
11460 return RBOOL(cr == ENC_CODERANGE_7BIT);
11461}
11462
11463VALUE
11465{
11466 static const char ellipsis[] = "...";
11467 const long ellipsislen = sizeof(ellipsis) - 1;
11468 rb_encoding *const enc = rb_enc_get(str);
11469 const long blen = RSTRING_LEN(str);
11470 const char *const p = RSTRING_PTR(str), *e = p + blen;
11471 VALUE estr, ret = 0;
11472
11473 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11474 if (len * rb_enc_mbminlen(enc) >= blen ||
11475 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11476 ret = str;
11477 }
11478 else if (len <= ellipsislen ||
11479 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11480 if (rb_enc_asciicompat(enc)) {
11481 ret = rb_str_new(ellipsis, len);
11482 rb_enc_associate(ret, enc);
11483 }
11484 else {
11485 estr = rb_usascii_str_new(ellipsis, len);
11486 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11487 }
11488 }
11489 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11490 rb_str_cat(ret, ellipsis, ellipsislen);
11491 }
11492 else {
11493 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11494 rb_enc_from_encoding(enc), 0, Qnil);
11495 rb_str_append(ret, estr);
11496 }
11497 return ret;
11498}
11499
11500static VALUE
11501str_compat_and_valid(VALUE str, rb_encoding *enc)
11502{
11503 int cr;
11504 str = StringValue(str);
11505 cr = rb_enc_str_coderange(str);
11506 if (cr == ENC_CODERANGE_BROKEN) {
11507 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11508 }
11509 else {
11510 rb_encoding *e = STR_ENC_GET(str);
11511 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11512 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11513 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11514 }
11515 }
11516 return str;
11517}
11518
11519static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11520
11521VALUE
11523{
11524 rb_encoding *enc = STR_ENC_GET(str);
11525 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11526}
11527
11528VALUE
11529rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11530{
11531 int cr = ENC_CODERANGE_UNKNOWN;
11532 if (enc == STR_ENC_GET(str)) {
11533 /* cached coderange makes sense only when enc equals the
11534 * actual encoding of str */
11535 cr = ENC_CODERANGE(str);
11536 }
11537 return enc_str_scrub(enc, str, repl, cr);
11538}
11539
11540static VALUE
11541enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11542{
11543 int encidx;
11544 VALUE buf = Qnil;
11545 const char *rep, *p, *e, *p1, *sp;
11546 long replen = -1;
11547 long slen;
11548
11549 if (rb_block_given_p()) {
11550 if (!NIL_P(repl))
11551 rb_raise(rb_eArgError, "both of block and replacement given");
11552 replen = 0;
11553 }
11554
11555 if (ENC_CODERANGE_CLEAN_P(cr))
11556 return Qnil;
11557
11558 if (!NIL_P(repl)) {
11559 repl = str_compat_and_valid(repl, enc);
11560 }
11561
11562 if (rb_enc_dummy_p(enc)) {
11563 return Qnil;
11564 }
11565 encidx = rb_enc_to_index(enc);
11566
11567#define DEFAULT_REPLACE_CHAR(str) do { \
11568 static const char replace[sizeof(str)-1] = str; \
11569 rep = replace; replen = (int)sizeof(replace); \
11570 } while (0)
11571
11572 slen = RSTRING_LEN(str);
11573 p = RSTRING_PTR(str);
11574 e = RSTRING_END(str);
11575 p1 = p;
11576 sp = p;
11577
11578 if (rb_enc_asciicompat(enc)) {
11579 int rep7bit_p;
11580 if (!replen) {
11581 rep = NULL;
11582 rep7bit_p = FALSE;
11583 }
11584 else if (!NIL_P(repl)) {
11585 rep = RSTRING_PTR(repl);
11586 replen = RSTRING_LEN(repl);
11587 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11588 }
11589 else if (encidx == rb_utf8_encindex()) {
11590 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11591 rep7bit_p = FALSE;
11592 }
11593 else {
11594 DEFAULT_REPLACE_CHAR("?");
11595 rep7bit_p = TRUE;
11596 }
11597 cr = ENC_CODERANGE_7BIT;
11598
11599 p = search_nonascii(p, e);
11600 if (!p) {
11601 p = e;
11602 }
11603 while (p < e) {
11604 int ret = rb_enc_precise_mbclen(p, e, enc);
11605 if (MBCLEN_NEEDMORE_P(ret)) {
11606 break;
11607 }
11608 else if (MBCLEN_CHARFOUND_P(ret)) {
11610 p += MBCLEN_CHARFOUND_LEN(ret);
11611 }
11612 else if (MBCLEN_INVALID_P(ret)) {
11613 /*
11614 * p1~p: valid ascii/multibyte chars
11615 * p ~e: invalid bytes + unknown bytes
11616 */
11617 long clen = rb_enc_mbmaxlen(enc);
11618 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11619 if (p > p1) {
11620 rb_str_buf_cat(buf, p1, p - p1);
11621 }
11622
11623 if (e - p < clen) clen = e - p;
11624 if (clen <= 2) {
11625 clen = 1;
11626 }
11627 else {
11628 const char *q = p;
11629 clen--;
11630 for (; clen > 1; clen--) {
11631 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11632 if (MBCLEN_NEEDMORE_P(ret)) break;
11633 if (MBCLEN_INVALID_P(ret)) continue;
11635 }
11636 }
11637 if (rep) {
11638 rb_str_buf_cat(buf, rep, replen);
11639 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11640 }
11641 else {
11642 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11643 str_mod_check(str, sp, slen);
11644 repl = str_compat_and_valid(repl, enc);
11645 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11648 }
11649 p += clen;
11650 p1 = p;
11651 p = search_nonascii(p, e);
11652 if (!p) {
11653 p = e;
11654 break;
11655 }
11656 }
11657 else {
11659 }
11660 }
11661 if (NIL_P(buf)) {
11662 if (p == e) {
11663 ENC_CODERANGE_SET(str, cr);
11664 return Qnil;
11665 }
11666 buf = rb_str_buf_new(RSTRING_LEN(str));
11667 }
11668 if (p1 < p) {
11669 rb_str_buf_cat(buf, p1, p - p1);
11670 }
11671 if (p < e) {
11672 if (rep) {
11673 rb_str_buf_cat(buf, rep, replen);
11674 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11675 }
11676 else {
11677 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11678 str_mod_check(str, sp, slen);
11679 repl = str_compat_and_valid(repl, enc);
11680 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11683 }
11684 }
11685 }
11686 else {
11687 /* ASCII incompatible */
11688 long mbminlen = rb_enc_mbminlen(enc);
11689 if (!replen) {
11690 rep = NULL;
11691 }
11692 else if (!NIL_P(repl)) {
11693 rep = RSTRING_PTR(repl);
11694 replen = RSTRING_LEN(repl);
11695 }
11696 else if (encidx == ENCINDEX_UTF_16BE) {
11697 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11698 }
11699 else if (encidx == ENCINDEX_UTF_16LE) {
11700 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11701 }
11702 else if (encidx == ENCINDEX_UTF_32BE) {
11703 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11704 }
11705 else if (encidx == ENCINDEX_UTF_32LE) {
11706 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11707 }
11708 else {
11709 DEFAULT_REPLACE_CHAR("?");
11710 }
11711
11712 while (p < e) {
11713 int ret = rb_enc_precise_mbclen(p, e, enc);
11714 if (MBCLEN_NEEDMORE_P(ret)) {
11715 break;
11716 }
11717 else if (MBCLEN_CHARFOUND_P(ret)) {
11718 p += MBCLEN_CHARFOUND_LEN(ret);
11719 }
11720 else if (MBCLEN_INVALID_P(ret)) {
11721 const char *q = p;
11722 long clen = rb_enc_mbmaxlen(enc);
11723 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11724 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11725
11726 if (e - p < clen) clen = e - p;
11727 if (clen <= mbminlen * 2) {
11728 clen = mbminlen;
11729 }
11730 else {
11731 clen -= mbminlen;
11732 for (; clen > mbminlen; clen-=mbminlen) {
11733 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11734 if (MBCLEN_NEEDMORE_P(ret)) break;
11735 if (MBCLEN_INVALID_P(ret)) continue;
11737 }
11738 }
11739 if (rep) {
11740 rb_str_buf_cat(buf, rep, replen);
11741 }
11742 else {
11743 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11744 str_mod_check(str, sp, slen);
11745 repl = str_compat_and_valid(repl, enc);
11746 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11747 }
11748 p += clen;
11749 p1 = p;
11750 }
11751 else {
11753 }
11754 }
11755 if (NIL_P(buf)) {
11756 if (p == e) {
11758 return Qnil;
11759 }
11760 buf = rb_str_buf_new(RSTRING_LEN(str));
11761 }
11762 if (p1 < p) {
11763 rb_str_buf_cat(buf, p1, p - p1);
11764 }
11765 if (p < e) {
11766 if (rep) {
11767 rb_str_buf_cat(buf, rep, replen);
11768 }
11769 else {
11770 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11771 str_mod_check(str, sp, slen);
11772 repl = str_compat_and_valid(repl, enc);
11773 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11774 }
11775 }
11777 }
11778 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11779 return buf;
11780}
11781
11782/*
11783 * call-seq:
11784 * scrub(replacement_string = default_replacement) -> new_string
11785 * scrub{|bytes| ... } -> new_string
11786 *
11787 * :include: doc/string/scrub.rdoc
11788 *
11789 */
11790static VALUE
11791str_scrub(int argc, VALUE *argv, VALUE str)
11792{
11793 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11794 VALUE new = rb_str_scrub(str, repl);
11795 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11796}
11797
11798/*
11799 * call-seq:
11800 * scrub! -> self
11801 * scrub!(replacement_string = default_replacement) -> self
11802 * scrub!{|bytes| ... } -> self
11803 *
11804 * Like String#scrub, except that any replacements are made in +self+.
11805 *
11806 */
11807static VALUE
11808str_scrub_bang(int argc, VALUE *argv, VALUE str)
11809{
11810 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11811 VALUE new = rb_str_scrub(str, repl);
11812 if (!NIL_P(new)) rb_str_replace(str, new);
11813 return str;
11814}
11815
11816static ID id_normalize;
11817static ID id_normalized_p;
11818static VALUE mUnicodeNormalize;
11819
11820static VALUE
11821unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11822{
11823 static int UnicodeNormalizeRequired = 0;
11824 VALUE argv2[2];
11825
11826 if (!UnicodeNormalizeRequired) {
11827 rb_require("unicode_normalize/normalize.rb");
11828 UnicodeNormalizeRequired = 1;
11829 }
11830 argv2[0] = str;
11831 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11832 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11833}
11834
11835/*
11836 * call-seq:
11837 * unicode_normalize(form = :nfc) -> string
11838 *
11839 * Returns a copy of +self+ with
11840 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11841 *
11842 * Argument +form+ must be one of the following symbols
11843 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11844 *
11845 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11846 * - +:nfd+: Canonical decomposition.
11847 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11848 * - +:nfkd+: Compatibility decomposition.
11849 *
11850 * The encoding of +self+ must be one of:
11851 *
11852 * - Encoding::UTF_8
11853 * - Encoding::UTF_16BE
11854 * - Encoding::UTF_16LE
11855 * - Encoding::UTF_32BE
11856 * - Encoding::UTF_32LE
11857 * - Encoding::GB18030
11858 * - Encoding::UCS_2BE
11859 * - Encoding::UCS_4BE
11860 *
11861 * Examples:
11862 *
11863 * "a\u0300".unicode_normalize # => "\u00E0"
11864 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11865 *
11866 * Related: String#unicode_normalize!, String#unicode_normalized?.
11867 */
11868static VALUE
11869rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11870{
11871 return unicode_normalize_common(argc, argv, str, id_normalize);
11872}
11873
11874/*
11875 * call-seq:
11876 * unicode_normalize!(form = :nfc) -> self
11877 *
11878 * Like String#unicode_normalize, except that the normalization
11879 * is performed on +self+.
11880 *
11881 * Related: String#unicode_normalized?.
11882 *
11883 */
11884static VALUE
11885rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11886{
11887 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11888}
11889
11890/* call-seq:
11891 * unicode_normalized?(form = :nfc) -> true or false
11892 *
11893 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11894 * +false+ otherwise.
11895 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11896 *
11897 * Examples:
11898 *
11899 * "a\u0300".unicode_normalized? # => false
11900 * "a\u0300".unicode_normalized?(:nfd) # => true
11901 * "\u00E0".unicode_normalized? # => true
11902 * "\u00E0".unicode_normalized?(:nfd) # => false
11903 *
11904 *
11905 * Raises an exception if +self+ is not in a Unicode encoding:
11906 *
11907 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11908 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11909 *
11910 * Related: String#unicode_normalize, String#unicode_normalize!.
11911 *
11912 */
11913static VALUE
11914rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11915{
11916 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11917}
11918
11919/**********************************************************************
11920 * Document-class: Symbol
11921 *
11922 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11923 *
11924 * You can create a +Symbol+ object explicitly with:
11925 *
11926 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11927 *
11928 * The same +Symbol+ object will be
11929 * created for a given name or string for the duration of a program's
11930 * execution, regardless of the context or meaning of that name. Thus
11931 * if <code>Fred</code> is a constant in one context, a method in
11932 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11933 * will be the same object in all three contexts.
11934 *
11935 * module One
11936 * class Fred
11937 * end
11938 * $f1 = :Fred
11939 * end
11940 * module Two
11941 * Fred = 1
11942 * $f2 = :Fred
11943 * end
11944 * def Fred()
11945 * end
11946 * $f3 = :Fred
11947 * $f1.object_id #=> 2514190
11948 * $f2.object_id #=> 2514190
11949 * $f3.object_id #=> 2514190
11950 *
11951 * Constant, method, and variable names are returned as symbols:
11952 *
11953 * module One
11954 * Two = 2
11955 * def three; 3 end
11956 * @four = 4
11957 * @@five = 5
11958 * $six = 6
11959 * end
11960 * seven = 7
11961 *
11962 * One.constants
11963 * # => [:Two]
11964 * One.instance_methods(true)
11965 * # => [:three]
11966 * One.instance_variables
11967 * # => [:@four]
11968 * One.class_variables
11969 * # => [:@@five]
11970 * global_variables.grep(/six/)
11971 * # => [:$six]
11972 * local_variables
11973 * # => [:seven]
11974 *
11975 * A +Symbol+ object differs from a String object in that
11976 * a +Symbol+ object represents an identifier, while a String object
11977 * represents text or data.
11978 *
11979 * == What's Here
11980 *
11981 * First, what's elsewhere. Class +Symbol+:
11982 *
11983 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11984 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11985 *
11986 * Here, class +Symbol+ provides methods that are useful for:
11987 *
11988 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11989 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11990 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11991 *
11992 * === Methods for Querying
11993 *
11994 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11995 * - #=~: Returns the index of the first substring in symbol that matches a
11996 * given Regexp or other object; returns +nil+ if no match is found.
11997 * - #[], #slice : Returns a substring of symbol
11998 * determined by a given index, start/length, or range, or string.
11999 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12000 * - #encoding: Returns the Encoding object that represents the encoding
12001 * of symbol.
12002 * - #end_with?: Returns +true+ if symbol ends with
12003 * any of the given strings.
12004 * - #match: Returns a MatchData object if symbol
12005 * matches a given Regexp; +nil+ otherwise.
12006 * - #match?: Returns +true+ if symbol
12007 * matches a given Regexp; +false+ otherwise.
12008 * - #length, #size: Returns the number of characters in symbol.
12009 * - #start_with?: Returns +true+ if symbol starts with
12010 * any of the given strings.
12011 *
12012 * === Methods for Comparing
12013 *
12014 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12015 * or larger than symbol.
12016 * - #==, #===: Returns +true+ if a given symbol has the same content and
12017 * encoding.
12018 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12019 * symbol is smaller than, equal to, or larger than symbol.
12020 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12021 * after Unicode case folding; +false+ otherwise.
12022 *
12023 * === Methods for Converting
12024 *
12025 * - #capitalize: Returns symbol with the first character upcased
12026 * and all other characters downcased.
12027 * - #downcase: Returns symbol with all characters downcased.
12028 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12029 * - #name: Returns the frozen string corresponding to symbol.
12030 * - #succ, #next: Returns the symbol that is the successor to symbol.
12031 * - #swapcase: Returns symbol with all upcase characters downcased
12032 * and all downcase characters upcased.
12033 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12034 * - #to_s, #id2name: Returns the string corresponding to +self+.
12035 * - #to_sym, #intern: Returns +self+.
12036 * - #upcase: Returns symbol with all characters upcased.
12037 *
12038 */
12039
12040
12041/*
12042 * call-seq:
12043 * symbol == object -> true or false
12044 *
12045 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12046 */
12047
12048#define sym_equal rb_obj_equal
12049
12050static int
12051sym_printable(const char *s, const char *send, rb_encoding *enc)
12052{
12053 while (s < send) {
12054 int n;
12055 int c = rb_enc_precise_mbclen(s, send, enc);
12056
12057 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12058 n = MBCLEN_CHARFOUND_LEN(c);
12059 c = rb_enc_mbc_to_codepoint(s, send, enc);
12060 if (!rb_enc_isprint(c, enc)) return FALSE;
12061 s += n;
12062 }
12063 return TRUE;
12064}
12065
12066int
12067rb_str_symname_p(VALUE sym)
12068{
12069 rb_encoding *enc;
12070 const char *ptr;
12071 long len;
12072 rb_encoding *resenc = rb_default_internal_encoding();
12073
12074 if (resenc == NULL) resenc = rb_default_external_encoding();
12075 enc = STR_ENC_GET(sym);
12076 ptr = RSTRING_PTR(sym);
12077 len = RSTRING_LEN(sym);
12078 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12079 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12080 return FALSE;
12081 }
12082 return TRUE;
12083}
12084
12085VALUE
12086rb_str_quote_unprintable(VALUE str)
12087{
12088 rb_encoding *enc;
12089 const char *ptr;
12090 long len;
12091 rb_encoding *resenc;
12092
12093 Check_Type(str, T_STRING);
12094 resenc = rb_default_internal_encoding();
12095 if (resenc == NULL) resenc = rb_default_external_encoding();
12096 enc = STR_ENC_GET(str);
12097 ptr = RSTRING_PTR(str);
12098 len = RSTRING_LEN(str);
12099 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12100 !sym_printable(ptr, ptr + len, enc)) {
12101 return rb_str_escape(str);
12102 }
12103 return str;
12104}
12105
12106VALUE
12107rb_id_quote_unprintable(ID id)
12108{
12109 VALUE str = rb_id2str(id);
12110 if (!rb_str_symname_p(str)) {
12111 return rb_str_escape(str);
12112 }
12113 return str;
12114}
12115
12116/*
12117 * call-seq:
12118 * inspect -> string
12119 *
12120 * Returns a string representation of +self+ (including the leading colon):
12121 *
12122 * :foo.inspect # => ":foo"
12123 *
12124 * Related: Symbol#to_s, Symbol#name.
12125 *
12126 */
12127
12128static VALUE
12129sym_inspect(VALUE sym)
12130{
12131 VALUE str = rb_sym2str(sym);
12132 const char *ptr;
12133 long len;
12134 char *dest;
12135
12136 if (!rb_str_symname_p(str)) {
12137 str = rb_str_inspect(str);
12138 len = RSTRING_LEN(str);
12139 rb_str_resize(str, len + 1);
12140 dest = RSTRING_PTR(str);
12141 memmove(dest + 1, dest, len);
12142 }
12143 else {
12144 rb_encoding *enc = STR_ENC_GET(str);
12145 VALUE orig_str = str;
12146
12147 len = RSTRING_LEN(orig_str);
12148 str = rb_enc_str_new(0, len + 1, enc);
12149
12150 // Get data pointer after allocation
12151 ptr = RSTRING_PTR(orig_str);
12152 dest = RSTRING_PTR(str);
12153 memcpy(dest + 1, ptr, len);
12154
12155 RB_GC_GUARD(orig_str);
12156 }
12157 dest[0] = ':';
12158
12160
12161 return str;
12162}
12163
12164VALUE
12166{
12167 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12168 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12169 return str;
12170}
12171
12172VALUE
12173rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12174{
12175 VALUE obj;
12176
12177 if (argc < 1) {
12178 rb_raise(rb_eArgError, "no receiver given");
12179 }
12180 obj = argv[0];
12181 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12182}
12183
12184/*
12185 * call-seq:
12186 * succ
12187 *
12188 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12189 *
12190 * :foo.succ # => :fop
12191 *
12192 * Related: String#succ.
12193 */
12194
12195static VALUE
12196sym_succ(VALUE sym)
12197{
12198 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12199}
12200
12201/*
12202 * call-seq:
12203 * symbol <=> object -> -1, 0, +1, or nil
12204 *
12205 * If +object+ is a symbol,
12206 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12207 *
12208 * :bar <=> :foo # => -1
12209 * :foo <=> :foo # => 0
12210 * :foo <=> :bar # => 1
12211 *
12212 * Otherwise, returns +nil+:
12213 *
12214 * :foo <=> 'bar' # => nil
12215 *
12216 * Related: String#<=>.
12217 */
12218
12219static VALUE
12220sym_cmp(VALUE sym, VALUE other)
12221{
12222 if (!SYMBOL_P(other)) {
12223 return Qnil;
12224 }
12225 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12226}
12227
12228/*
12229 * call-seq:
12230 * casecmp(object) -> -1, 0, 1, or nil
12231 *
12232 * :include: doc/symbol/casecmp.rdoc
12233 *
12234 */
12235
12236static VALUE
12237sym_casecmp(VALUE sym, VALUE other)
12238{
12239 if (!SYMBOL_P(other)) {
12240 return Qnil;
12241 }
12242 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12243}
12244
12245/*
12246 * call-seq:
12247 * casecmp?(object) -> true, false, or nil
12248 *
12249 * :include: doc/symbol/casecmp_p.rdoc
12250 *
12251 */
12252
12253static VALUE
12254sym_casecmp_p(VALUE sym, VALUE other)
12255{
12256 if (!SYMBOL_P(other)) {
12257 return Qnil;
12258 }
12259 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12260}
12261
12262/*
12263 * call-seq:
12264 * symbol =~ object -> integer or nil
12265 *
12266 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12267 * including possible updates to global variables;
12268 * see String#=~.
12269 *
12270 */
12271
12272static VALUE
12273sym_match(VALUE sym, VALUE other)
12274{
12275 return rb_str_match(rb_sym2str(sym), other);
12276}
12277
12278/*
12279 * call-seq:
12280 * match(pattern, offset = 0) -> matchdata or nil
12281 * match(pattern, offset = 0) {|matchdata| } -> object
12282 *
12283 * Equivalent to <tt>self.to_s.match</tt>,
12284 * including possible updates to global variables;
12285 * see String#match.
12286 *
12287 */
12288
12289static VALUE
12290sym_match_m(int argc, VALUE *argv, VALUE sym)
12291{
12292 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12293}
12294
12295/*
12296 * call-seq:
12297 * match?(pattern, offset = 0) -> true or false
12298 *
12299 * Equivalent to <tt>sym.to_s.match?</tt>;
12300 * see String#match?.
12301 *
12302 */
12303
12304static VALUE
12305sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12306{
12307 return rb_str_match_m_p(argc, argv, sym);
12308}
12309
12310/*
12311 * call-seq:
12312 * symbol[index] -> string or nil
12313 * symbol[start, length] -> string or nil
12314 * symbol[range] -> string or nil
12315 * symbol[regexp, capture = 0] -> string or nil
12316 * symbol[substring] -> string or nil
12317 *
12318 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12319 *
12320 */
12321
12322static VALUE
12323sym_aref(int argc, VALUE *argv, VALUE sym)
12324{
12325 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12326}
12327
12328/*
12329 * call-seq:
12330 * length -> integer
12331 *
12332 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12333 */
12334
12335static VALUE
12336sym_length(VALUE sym)
12337{
12338 return rb_str_length(rb_sym2str(sym));
12339}
12340
12341/*
12342 * call-seq:
12343 * empty? -> true or false
12344 *
12345 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12346 *
12347 */
12348
12349static VALUE
12350sym_empty(VALUE sym)
12351{
12352 return rb_str_empty(rb_sym2str(sym));
12353}
12354
12355/*
12356 * call-seq:
12357 * upcase(*options) -> symbol
12358 *
12359 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12360 *
12361 * See String#upcase.
12362 *
12363 */
12364
12365static VALUE
12366sym_upcase(int argc, VALUE *argv, VALUE sym)
12367{
12368 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12369}
12370
12371/*
12372 * call-seq:
12373 * downcase(*options) -> symbol
12374 *
12375 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12376 *
12377 * See String#downcase.
12378 *
12379 * Related: Symbol#upcase.
12380 *
12381 */
12382
12383static VALUE
12384sym_downcase(int argc, VALUE *argv, VALUE sym)
12385{
12386 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12387}
12388
12389/*
12390 * call-seq:
12391 * capitalize(*options) -> symbol
12392 *
12393 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12394 *
12395 * See String#capitalize.
12396 *
12397 */
12398
12399static VALUE
12400sym_capitalize(int argc, VALUE *argv, VALUE sym)
12401{
12402 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12403}
12404
12405/*
12406 * call-seq:
12407 * swapcase(*options) -> symbol
12408 *
12409 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12410 *
12411 * See String#swapcase.
12412 *
12413 */
12414
12415static VALUE
12416sym_swapcase(int argc, VALUE *argv, VALUE sym)
12417{
12418 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12419}
12420
12421/*
12422 * call-seq:
12423 * start_with?(*string_or_regexp) -> true or false
12424 *
12425 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12426 *
12427 */
12428
12429static VALUE
12430sym_start_with(int argc, VALUE *argv, VALUE sym)
12431{
12432 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12433}
12434
12435/*
12436 * call-seq:
12437 * end_with?(*strings) -> true or false
12438 *
12439 *
12440 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12441 *
12442 */
12443
12444static VALUE
12445sym_end_with(int argc, VALUE *argv, VALUE sym)
12446{
12447 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12448}
12449
12450/*
12451 * call-seq:
12452 * encoding -> encoding
12453 *
12454 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12455 *
12456 */
12457
12458static VALUE
12459sym_encoding(VALUE sym)
12460{
12461 return rb_obj_encoding(rb_sym2str(sym));
12462}
12463
12464static VALUE
12465string_for_symbol(VALUE name)
12466{
12467 if (!RB_TYPE_P(name, T_STRING)) {
12468 VALUE tmp = rb_check_string_type(name);
12469 if (NIL_P(tmp)) {
12470 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12471 name);
12472 }
12473 name = tmp;
12474 }
12475 return name;
12476}
12477
12478ID
12480{
12481 if (SYMBOL_P(name)) {
12482 return SYM2ID(name);
12483 }
12484 name = string_for_symbol(name);
12485 return rb_intern_str(name);
12486}
12487
12488VALUE
12490{
12491 if (SYMBOL_P(name)) {
12492 return name;
12493 }
12494 name = string_for_symbol(name);
12495 return rb_str_intern(name);
12496}
12497
12498/*
12499 * call-seq:
12500 * Symbol.all_symbols -> array_of_symbols
12501 *
12502 * Returns an array of all symbols currently in Ruby's symbol table:
12503 *
12504 * Symbol.all_symbols.size # => 9334
12505 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12506 *
12507 */
12508
12509static VALUE
12510sym_all_symbols(VALUE _)
12511{
12512 return rb_sym_all_symbols();
12513}
12514
12515VALUE
12516rb_str_to_interned_str(VALUE str)
12517{
12518 return rb_fstring(str);
12519}
12520
12521VALUE
12522rb_interned_str(const char *ptr, long len)
12523{
12524 struct RString fake_str;
12525 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12526}
12527
12528VALUE
12530{
12531 return rb_interned_str(ptr, strlen(ptr));
12532}
12533
12534VALUE
12535rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12536{
12537 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12538 rb_enc_autoload(enc);
12539 }
12540
12541 struct RString fake_str;
12542 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12543}
12544
12545VALUE
12546rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12547{
12548 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12549 rb_enc_autoload(enc);
12550 }
12551
12552 struct RString fake_str;
12553 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12554}
12555
12556VALUE
12558{
12559 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12560}
12561
#if USE_YJIT
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    /*
     * Fast path: the receiver is ASCII-8BIT and the code is in 0..0xfe,
     * so it can be appended as a single byte.  Everything else (0xff
     * included) goes through the generic rb_str_concat().
     */
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t byte = RB_NUM2SSIZE(codepoint);
        const int single_byte_p = (byte >= 0 && byte < 0xff);

        if (RB_LIKELY(single_byte_p)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12578
/*
 * Defines the String and Symbol classes and registers their C-implemented
 * methods, plus the $; / $-F hooked variables.
 *
 * In the registrations below, a non-negative arity is the fixed number of
 * Ruby-level arguments; -1 is used for functions that take
 * (int argc, VALUE *argv, VALUE self), as the sym_* wrappers in this file do.
 *
 * NOTE(review): the embedded line numbers skip in several places
 * (e.g. 12584 -> 12586, 12591 -> 12594), so some registrations may be
 * absent from this listing -- confirm against the upstream file.
 */
12579void
12580Init_String(void)
12581{
12582 rb_cString = rb_define_class("String", rb_cObject);
/* Walk the VM fstring table with fstring_set_class_i, passing the new class
 * (presumably to re-class strings created before String existed -- confirm). */
12583 RUBY_ASSERT(rb_vm_fstring_table());
12584 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12586 rb_define_alloc_func(rb_cString, empty_str_alloc);
12587 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12588 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12589 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12590 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12591 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12594 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12595 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12596 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12597 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12600 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12601 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12602 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12603 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12606 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12607 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12608 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12609 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12610 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12612 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12614 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12615 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12616 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12617 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12618 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12619 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12621 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12622 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12623 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12624 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12625 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12626 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12627 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12628 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12630 rb_define_method(rb_cString, "+@", str_uplus, 0);
12631 rb_define_method(rb_cString, "-@", str_uminus, 0);
12632 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12633 rb_define_alias(rb_cString, "dedup", "-@");
12634
12635 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12636 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12637 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12638 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12641 rb_define_method(rb_cString, "undump", str_undump, 0);
12642
/* Cache the symbols :ascii, :turkic, :lithuanian and :fold in file-level variables. */
12643 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12644 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12645 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12646 sym_fold = ID2SYM(rb_intern_const("fold"));
12647
12648 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12649 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12650 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12651 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12652
12653 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12654 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12655 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12656 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12657
12658 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12659 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12660 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12661 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12662 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12663 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12664 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12665 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12666 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12667 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12668 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12669 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12671 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12672 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12673 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12674 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12675 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12676
12677 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12678 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12679 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12680
12681 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12682
12683 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12684 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12685 rb_define_method(rb_cString, "center", rb_str_center, -1);
12686
12687 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12688 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12689 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12690 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12691 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12692 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12693 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12694 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12695 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12696
12697 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12698 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12699 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12700 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12701 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12702 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12703 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12704 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12705 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12706
12707 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12708 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12709 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12710 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12711 rb_define_method(rb_cString, "count", rb_str_count, -1);
12712
12713 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12714 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12715 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12716 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12717
12718 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12719 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12720 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12721 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12722 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12723
12724 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12725
/* "slice" is registered with the same C function as "[]". */
12726 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12727 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12728
12729 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12730 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12731
12732 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12733 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12734 rb_define_method(rb_cString, "b", rb_str_b, 0);
12735 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12736 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12737
12738 /* define UnicodeNormalize module here so that we don't have to look it up */
12739 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12740 id_normalize = rb_intern_const("normalize");
12741 id_normalized_p = rb_intern_const("normalized?");
12742
12743 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12744 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12745 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12746
/* $; and $-F are both backed by rb_fs, start out as nil, use rb_fs_setter
 * as their setter hook, and rb_fs is registered with the GC. */
12747 rb_fs = Qnil;
12748 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12749 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12750 rb_gc_register_address(&rb_fs);
12751
/* Symbol: most instance methods are the thin sym_* wrappers defined above. */
12752 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12756 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12757
12758 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12759 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12760 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12761 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12762 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12763 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12764
12765 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12766 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12767 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12768 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12769
12770 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12771 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12772 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12773 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12774 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12775 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12776 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12777
12778 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12779 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12780 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12781 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12782
12783 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12784 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12785
12786 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12787}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2635
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:936
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:675
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3483
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1286
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:901
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1151
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2927
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1170
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12535
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2250
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3612
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1099
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1391
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1292
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:920
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12557
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:785
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:430
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:670
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1836
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1058
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1842
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1892
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1905
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1678
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1456
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2401
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3677
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1367
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12165
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2473
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1343
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1672
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2955
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5269
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4046
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3052
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11464
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1714
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1133
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:955
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1462
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1917
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4032
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3445
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2339
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1935
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6491
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3060
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12529
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1373
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3643
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3002
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4148
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3269
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7212
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2693
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12522
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4102
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3919
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4077
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3619
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3177
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5779
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11522
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1628
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2851
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3149
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3252
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1145
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2649
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7326
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1355
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1644
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2353
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5697
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9419
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1139
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver is an instance of RString.
Definition symbol.c:894
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1776
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1844
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1861
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2953
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1284
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:986
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12489
ID rb_to_id(VALUE str)
Definition string.c:12479
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1385
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2828
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2712
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1379
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2723
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1705
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:8284
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:295
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113