Ruby 3.5.0dev (2025-10-10 revision 83d0b064c88df718e13bb8d6b4182ec635f7b03b)
string.c (83d0b064c88df718e13bb8d6b4182ec635f7b03b)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
/* Shorthand for the begin/end offset of capture group `no`.  Both expand to
 * a lookup through a variable named `regs`, which must be in scope at the
 * point of use. */
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
/* Drop any macro definitions of the following names so that the identifiers
 * used below in this file refer to ordinary functions.
 * NOTE(review): presumably the public headers provide macro wrappers for
 * these -- confirm against the headers. */
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * in a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
/* NOTE(review): presumably an upper bound on the byte length of a single
 * character; it is not used in the visible part of this file -- confirm. */
130#define RUBY_MAX_CHAR_LEN 16
/* String flag bits defined in this file.  The FL_USERn numbers correspond to
 * the entries of the "Flags of RString" table in the preceding comment. */
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
137
/* Mark `str` as heap-allocated: sets STR_NOEMBED and clears the sharing
 * related flags (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
141} while (0)
/* Mark `str` as embedded: clears STR_NOEMBED, STR_SHARED and STR_NOFREE.
 * Only flags are touched; the contents are not moved. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
143
/* Store `n` as the byte length of `str`.  No terminator is written and no
 * capacity check is made. */
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
146} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
/* Terminator width in bytes for `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a `termlen`-byte zero terminator at `ptr`.  The first byte is always
 * cleared directly; memset is used only for multi-byte terminators.  Both
 * arguments are evaluated exactly once. */
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
175} while (0)
176
/* Resize the buffer of `str` so that it can hold `capacity` bytes plus the
 * terminator appropriate for the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string whose slot is too small: a heap buffer is allocated, the
 *   current contents are copied and the string is switched to NOEMBED.
 * - Embedded string whose slot is large enough: nothing happens.
 * - Heap string: the buffer is reallocated; it must not be shared.
 *
 * `capacity` and `termlen` are parenthesized at every use so that expression
 * arguments (e.g. a conditional expression) keep their intended grouping. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Does nothing for fake strings (STR_FAKESTR).  Otherwise it asserts that
 * the pointer of `str` lies within the buffer of `shared_str`, records
 * `shared_str` in aux.shared through the write barrier, sets STR_SHARED on
 * `str` and STR_SHARED_ROOT on `shared_str`.  A root whose class is 0 is
 * additionally flagged STR_BORROWED. */
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
210 } \
211} while (0)
212
/* Heap buffer pointer of a non-embedded string. */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: the recorded capacity plus the terminator,
 * which `capa` itself does not include. */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
215/* TODO: include the terminator size in capa. */
216
/* Encoding object of `str`. */
217#define STR_ENC_GET(str) get_encoding(str)
218
/* SHARABLE_SUBSTRING_P(beg, len, end) decides whether the substring
 * [beg, beg+len) may share its source buffer.  By default only substrings
 * that reach exactly to `end` qualify; building with a non-zero
 * SHARABLE_MIDDLE_SUBSTRING makes every substring qualify. */
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
221#endif
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#else
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* Can a string of `len` bytes plus a `termlen`-byte terminator be allocated
 * as an embedded object? */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
275
/* Forward declarations of file-local helpers that are used before their
 * definitions appear. */
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Diagnostic hook: print to stderr that `func` is about to return a NULL
 * string pointer, so that a debugger can be attached at this point. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
/* Interns `str` in the fstring table; defined further down. */
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
/* When `long` and pointers have the same size, the hash of a fake string is
 * cached in its `capa` field (see register_fstring and
 * fstring_concurrent_set_hash).  The macro is left undefined otherwise. */
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
/* The concurrent set holding all interned strings; created and registered
 * with the GC in Init_fstring_table(). */
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Options passed from register_fstring() to fstring_concurrent_set_create()
 * through the `data` argument. */
struct fstr_create_arg {
    /* For fake strings: copy the bytes into a new string (true) or refer to
     * them as static memory (false). */
    bool copy;
    /* When copying a fake string, prefer an embedded allocation with room
     * for the precomputed hash. */
    bool force_precompute_hash;
};
482
/* Create callback of the fstring table: build the string object that will
 * be stored in the table for the lookup key `str`.
 *
 * `data` points to a struct fstr_create_arg.  A fake key is materialized as
 * a real String (copied or static, depending on arg->copy); a real key is
 * replaced by a frozen, unshared, bare String where necessary.  The result
 * keeps the key's coderange and is flagged RSTRING_FSTR.
 *
 * NOTE(review): the embedded source numbering skips lines 531, 541-542 and
 * 545; confirm against the original whether statements were lost. */
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
/* capacity that leaves room for the hash behind the contents */
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
/* embedded copy with the hash stored after the terminator */
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
/* reuse the hash cached in the fake string's capa when it fits */
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
/* no copy: refer to the caller's bytes as static memory */
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
/* restore the coderange captured from the key and mark as interned */
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
/* Callback table handed to rb_concurrent_set_new() for the fstring table.
 * No free callback is installed. */
551static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555 .free = NULL,
556};
557
558void
559Init_fstring_table(void)
560{
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
563}
564
565static VALUE
566register_fstring(VALUE str, bool copy, bool force_precompute_hash)
567{
568 struct fstr_create_arg args = {
569 .copy = copy,
570 .force_precompute_hash = force_precompute_hash
571 };
572
573#if SIZEOF_VOIDP == SIZEOF_LONG
574 if (FL_TEST_RAW(str, STR_FAKESTR)) {
575 // if the string hasn't been interned, we'll need the hash twice, so we
576 // compute it once and store it in capa
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
578 }
579#endif
580
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
582
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
585 RUBY_ASSERT(OBJ_FROZEN(result));
586 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
588
589 return result;
590}
591
592bool
593rb_obj_is_fstring_table(VALUE obj)
594{
595 ASSERT_vm_locking();
596
597 return obj == fstring_table_obj;
598}
599
600void
601rb_gc_free_fstring(VALUE obj)
602{
603 // Assume locking and barrier (which there is no assert for)
604 ASSERT_vm_locking();
605
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
607
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
609
610 FL_UNSET(obj, RSTRING_FSTR);
611}
612
613void
614rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
615{
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
618 }
619}
620
621static VALUE
622setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
623{
624 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
625 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
626
627 if (!name) {
629 name = "";
630 }
631
632 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
633
634 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
635 fake_str->len = len;
636 fake_str->as.heap.ptr = (char *)name;
637 fake_str->as.heap.aux.capa = len;
638 return (VALUE)fake_str;
639}
640
641/*
642 * set up a fake string which refers a static string literal.
643 */
644VALUE
645rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
646{
647 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
648}
649
650/*
651 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
652 * shared string which refers a static string literal. `ptr` must
653 * point a constant string.
654 */
655VALUE
656rb_fstring_new(const char *ptr, long len)
657{
658 struct RString fake_str = {RBASIC_INIT};
659 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
660}
661
662VALUE
663rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
664{
665 struct RString fake_str = {RBASIC_INIT};
666 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
667}
668
669VALUE
670rb_fstring_cstr(const char *ptr)
671{
672 return rb_fstring_new(ptr, strlen(ptr));
673}
674
675static inline bool
676single_byte_optimizable(VALUE str)
677{
678 int encindex = ENCODING_GET(str);
679 switch (encindex) {
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
682 return true;
683 case ENCINDEX_UTF_8:
684 // For UTF-8 it's worth scanning the string coderange when unknown.
686 }
687 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
688 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
689 return true;
690 }
691
692 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
693 return true;
694 }
695
696 /* Conservative. Possibly single byte.
697 * "\xa1" in Shift_JIS for example. */
698 return false;
699}
700
702
/* Return a pointer to the first byte in [p, e) that has its high bit set,
 * or NULL when every byte is 7-bit.
 *
 * The middle of the range is tested one uintptr_t word at a time against
 * NONASCII_MASK (0x80 in every byte); leading bytes up to word alignment
 * (when unaligned access is not allowed) and the trailing bytes are tested
 * individually by switches that deliberately fall through from case to
 * case. */
703static inline const char *
704search_nonascii(const char *p, const char *e)
705{
706 const uintptr_t *s, *t;
707
/* NONASCII_MASK: a word with 0x80 in each byte, sized to uintptr_t. */
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
713# else
714# error "don't know what to do."
715# endif
716#else
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL /* or...? */
721# else
722# error "don't know what to do."
723# endif
724#endif
725
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking the skipped bytes one by
 * one; each case intentionally falls through to the next. */
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
730 p += l;
731 switch (l) {
732 default: UNREACHABLE;
733#if SIZEOF_VOIDP > 4
734 case 7: if (p[-7]&0x80) return p-7;
735 case 6: if (p[-6]&0x80) return p-6;
736 case 5: if (p[-5]&0x80) return p-5;
737 case 4: if (p[-4]&0x80) return p-4;
738#endif
739 case 3: if (p[-3]&0x80) return p-3;
740 case 2: if (p[-2]&0x80) return p-2;
741 case 1: if (p[-1]&0x80) return p-1;
742 case 0: break;
743 }
744 }
745#endif
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#else
750#define aligned_ptr(value) (uintptr_t *)(value)
751#endif
752 s = aligned_ptr(p);
/* t bounds the word loop so that only whole words inside [p, e) are read */
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
754#undef aligned_ptr
755 for (;s < t; s++) {
756 if (*s & NONASCII_MASK) {
/* Locate the first flagged byte within the word: count leading zero bits on
 * big-endian targets, trailing zero bits otherwise, then convert bits to
 * bytes. */
757#ifdef WORDS_BIGENDIAN
758 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759#else
760 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
761#endif
762 }
763 }
764 p = (const char *)s;
765 }
766
/* Remaining tail bytes, checked one by one with intentional fallthrough. */
767 switch (e - p) {
768 default: UNREACHABLE;
769#if SIZEOF_VOIDP > 4
770 case 7: if (e[-7]&0x80) return e-7;
771 case 6: if (e[-6]&0x80) return e-6;
772 case 5: if (e[-5]&0x80) return e-5;
773 case 4: if (e[-4]&0x80) return e-4;
774#endif
775 case 3: if (e[-3]&0x80) return e-3;
776 case 2: if (e[-2]&0x80) return e-2;
777 case 1: if (e[-1]&0x80) return e-1;
778 case 0: return NULL;
779 }
780}
781
782static int
783coderange_scan(const char *p, long len, rb_encoding *enc)
784{
785 const char *e = p + len;
786
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
789 p = search_nonascii(p, e);
791 }
792
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
795 if (!p) return ENC_CODERANGE_7BIT;
796 for (;;) {
797 int ret = rb_enc_precise_mbclen(p, e, enc);
799 p += MBCLEN_CHARFOUND_LEN(ret);
800 if (p == e) break;
801 p = search_nonascii(p, e);
802 if (!p) break;
803 }
804 }
805 else {
806 while (p < e) {
807 int ret = rb_enc_precise_mbclen(p, e, enc);
809 p += MBCLEN_CHARFOUND_LEN(ret);
810 }
811 }
812 return ENC_CODERANGE_VALID;
813}
814
815long
816rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
817{
818 const char *p = s;
819
820 if (*cr == ENC_CODERANGE_BROKEN)
821 return e - s;
822
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
824 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
825 if (*cr == ENC_CODERANGE_VALID) return e - s;
826 p = search_nonascii(p, e);
828 return e - s;
829 }
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
832 if (!p) {
833 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
834 return e - s;
835 }
836 for (;;) {
837 int ret = rb_enc_precise_mbclen(p, e, enc);
838 if (!MBCLEN_CHARFOUND_P(ret)) {
840 return p - s;
841 }
842 p += MBCLEN_CHARFOUND_LEN(ret);
843 if (p == e) break;
844 p = search_nonascii(p, e);
845 if (!p) break;
846 }
847 }
848 else {
849 while (p < e) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 }
857 }
859 return e - s;
860}
861
862static inline void
863str_enc_copy(VALUE str1, VALUE str2)
864{
865 rb_enc_set_index(str1, ENCODING_GET(str2));
866}
867
868/* Like str_enc_copy, but does not check frozen status of str1.
869 * You should use this only if you're certain that str1 is not frozen. */
870static inline void
871str_enc_copy_direct(VALUE str1, VALUE str2)
872{
873 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
874 if (inlined_encoding == ENCODING_INLINE_MAX) {
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
876 }
877 else {
878 ENCODING_SET_INLINED(str1, inlined_encoding);
879 }
880}
881
882static void
883rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
884{
885 /* this function is designed for copying encoding and coderange
886 * from src to new string "dest" which is made from the part of src.
887 */
888 str_enc_copy(dest, src);
889 if (RSTRING_LEN(dest) == 0) {
890 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
892 else
894 return;
895 }
896 switch (ENC_CODERANGE(src)) {
899 break;
901 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
902 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
904 else
906 break;
907 default:
908 break;
909 }
910}
911
912static void
913rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
914{
915 str_enc_copy(dest, src);
917}
918
919static int
920enc_coderange_scan(VALUE str, rb_encoding *enc)
921{
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
923}
924
925int
926rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
927{
928 return enc_coderange_scan(str, enc);
929}
930
931int
933{
934 int cr = ENC_CODERANGE(str);
935
936 if (cr == ENC_CODERANGE_UNKNOWN) {
937 cr = enc_coderange_scan(str, get_encoding(str));
938 ENC_CODERANGE_SET(str, cr);
939 }
940 return cr;
941}
942
943static inline bool
944rb_enc_str_asciicompat(VALUE str)
945{
946 int encindex = ENCODING_GET_INLINED(str);
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
948}
949
950int
952{
953 switch(ENC_CODERANGE(str)) {
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
957 return true;
958 default:
959 return false;
960 }
961}
962
963static inline void
964str_mod_check(VALUE s, const char *p, long len)
965{
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
967 rb_raise(rb_eRuntimeError, "string modified");
968 }
969}
970
971static size_t
972str_capacity(VALUE str, const int termlen)
973{
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
976 }
977 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
978 return RSTRING(str)->len;
979 }
980 else {
981 return RSTRING(str)->as.heap.aux.capa;
982 }
983}
984
985size_t
987{
988 return str_capacity(str, TERM_LEN(str));
989}
990
991static inline void
992must_not_null(const char *ptr)
993{
994 if (!ptr) {
995 rb_raise(rb_eArgError, "NULL pointer given");
996 }
997}
998
999static inline VALUE
1000str_alloc_embed(VALUE klass, size_t capa)
1001{
1002 size_t size = rb_str_embed_size(capa);
1003 RUBY_ASSERT(size > 0);
1004 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1005
1006 NEWOBJ_OF(str, struct RString, klass,
1008
1009 str->len = 0;
1010 str->as.embed.ary[0] = 0;
1011
1012 return (VALUE)str;
1013}
1014
1015static inline VALUE
1016str_alloc_heap(VALUE klass)
1017{
1018 NEWOBJ_OF(str, struct RString, klass,
1019 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1020
1021 str->len = 0;
1022 str->as.heap.aux.capa = 0;
1023 str->as.heap.ptr = NULL;
1024
1025 return (VALUE)str;
1026}
1027
1028static inline VALUE
1029empty_str_alloc(VALUE klass)
1030{
1031 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1032 VALUE str = str_alloc_embed(klass, 0);
1033 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1035 return str;
1036}
1037
1038static VALUE
1039str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1040{
1041 VALUE str;
1042
1043 if (len < 0) {
1044 rb_raise(rb_eArgError, "negative string size (or size too big)");
1045 }
1046
1047 if (enc == NULL) {
1048 enc = rb_ascii8bit_encoding();
1049 }
1050
1051 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1052
1053 int termlen = rb_enc_mbminlen(enc);
1054
1055 if (STR_EMBEDDABLE_P(len, termlen)) {
1056 str = str_alloc_embed(klass, len + termlen);
1057 if (len == 0) {
1058 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1059 }
1060 }
1061 else {
1062 str = str_alloc_heap(klass);
1063 RSTRING(str)->as.heap.aux.capa = len;
1064 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1065 * integer overflow. If we can STATIC_ASSERT that, the following
1066 * mul_add_mul can be reverted to a simple ALLOC_N. */
1067 RSTRING(str)->as.heap.ptr =
1068 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1069 }
1070
1071 rb_enc_raw_set(str, enc);
1072
1073 if (ptr) {
1074 memcpy(RSTRING_PTR(str), ptr, len);
1075 }
1076 else {
1077 memset(RSTRING_PTR(str), 0, len);
1078 }
1079
1080 STR_SET_LEN(str, len);
1081 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1082 return str;
1083}
1084
1085static VALUE
1086str_new(VALUE klass, const char *ptr, long len)
1087{
1088 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1089}
1090
1091VALUE
1092rb_str_new(const char *ptr, long len)
1093{
1094 return str_new(rb_cString, ptr, len);
1095}
1096
1097VALUE
1098rb_usascii_str_new(const char *ptr, long len)
1099{
1100 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1101}
1102
1103VALUE
1104rb_utf8_str_new(const char *ptr, long len)
1105{
1106 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1107}
1108
1109VALUE
1110rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1111{
1112 return str_enc_new(rb_cString, ptr, len, enc);
1113}
1114
1115VALUE
1117{
1118 must_not_null(ptr);
1119 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1120 * memory regions, and that cannot be detected by the MSAN. Just
1121 * trust the programmer that the argument passed here is a sane C
1122 * string. */
1123 __msan_unpoison_string(ptr);
1124 return rb_str_new(ptr, strlen(ptr));
1125}
1126
1127VALUE
1129{
1130 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1131}
1132
1133VALUE
1135{
1136 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1137}
1138
1139VALUE
1141{
1142 must_not_null(ptr);
1143 if (rb_enc_mbminlen(enc) != 1) {
1144 rb_raise(rb_eArgError, "wchar encoding given");
1145 }
1146 return rb_enc_str_new(ptr, strlen(ptr), enc);
1147}
1148
1149static VALUE
1150str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1151{
1152 VALUE str;
1153
1154 if (len < 0) {
1155 rb_raise(rb_eArgError, "negative string size (or size too big)");
1156 }
1157
1158 if (!ptr) {
1159 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1160 }
1161 else {
1162 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1163 str = str_alloc_heap(klass);
1164 RSTRING(str)->len = len;
1165 RSTRING(str)->as.heap.ptr = (char *)ptr;
1166 RSTRING(str)->as.heap.aux.capa = len;
1167 RBASIC(str)->flags |= STR_NOFREE;
1168 rb_enc_associate_index(str, encindex);
1169 }
1170 return str;
1171}
1172
1173VALUE
1174rb_str_new_static(const char *ptr, long len)
1175{
1176 return str_new_static(rb_cString, ptr, len, 0);
1177}
1178
1179VALUE
1181{
1182 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1183}
1184
1185VALUE
1187{
1188 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1189}
1190
1191VALUE
1193{
1194 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1195}
1196
/* Forward declaration; the definition follows rb_str_initialize(). */
1197static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1198 rb_encoding *from, rb_encoding *to,
1199 int ecflags, VALUE ecopts);
1200
1201static inline bool
1202is_enc_ascii_string(VALUE str, rb_encoding *enc)
1203{
1204 int encidx = rb_enc_to_index(enc);
1205 if (rb_enc_get_index(str) == encidx)
1206 return is_ascii_string(str);
1207 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1208}
1209
1210VALUE
1211rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1212{
1213 long len;
1214 const char *ptr;
1215 VALUE newstr;
1216
1217 if (!to) return str;
1218 if (!from) from = rb_enc_get(str);
1219 if (from == to) return str;
1220 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1221 rb_is_ascii8bit_enc(to)) {
1222 if (STR_ENC_GET(str) != to) {
1223 str = rb_str_dup(str);
1224 rb_enc_associate(str, to);
1225 }
1226 return str;
1227 }
1228
1229 RSTRING_GETMEM(str, ptr, len);
1230 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1231 from, to, ecflags, ecopts);
1232 if (NIL_P(newstr)) {
1233 /* some error, return original */
1234 return str;
1235 }
1236 return newstr;
1237}
1238
1239VALUE
1240rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1241 rb_encoding *from, int ecflags, VALUE ecopts)
1242{
1243 long olen;
1244
1245 olen = RSTRING_LEN(newstr);
1246 if (ofs < -olen || olen < ofs)
1247 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1248 if (ofs < 0) ofs += olen;
1249 if (!from) {
1250 STR_SET_LEN(newstr, ofs);
1251 return rb_str_cat(newstr, ptr, len);
1252 }
1253
1254 rb_str_modify(newstr);
1255 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1256 rb_enc_get(newstr),
1257 ecflags, ecopts);
1258}
1259
1260VALUE
1261rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1262{
1263 STR_SET_LEN(str, 0);
1264 rb_enc_associate(str, enc);
1265 rb_str_cat(str, ptr, len);
1266 return str;
1267}
1268
1269static VALUE
1270str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1271 rb_encoding *from, rb_encoding *to,
1272 int ecflags, VALUE ecopts)
1273{
1274 rb_econv_t *ec;
1276 long olen;
1277 VALUE econv_wrapper;
1278 const unsigned char *start, *sp;
1279 unsigned char *dest, *dp;
1280 size_t converted_output = (size_t)ofs;
1281
1282 olen = rb_str_capacity(newstr);
1283
1284 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1285 RBASIC_CLEAR_CLASS(econv_wrapper);
1286 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1287 if (!ec) return Qnil;
1288 DATA_PTR(econv_wrapper) = ec;
1289
1290 sp = (unsigned char*)ptr;
1291 start = sp;
1292 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1293 (dp = dest + converted_output),
1294 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1296 /* destination buffer short */
1297 size_t converted_input = sp - start;
1298 size_t rest = len - converted_input;
1299 converted_output = dp - dest;
1300 rb_str_set_len(newstr, converted_output);
1301 if (converted_input && converted_output &&
1302 rest < (LONG_MAX / converted_output)) {
1303 rest = (rest * converted_output) / converted_input;
1304 }
1305 else {
1306 rest = olen;
1307 }
1308 olen += rest < 2 ? 2 : rest;
1309 rb_str_resize(newstr, olen);
1310 }
1311 DATA_PTR(econv_wrapper) = 0;
1312 RB_GC_GUARD(econv_wrapper);
1313 rb_econv_close(ec);
1314 switch (ret) {
1315 case econv_finished:
1316 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1317 rb_str_set_len(newstr, len);
1318 rb_enc_associate(newstr, to);
1319 return newstr;
1320
1321 default:
1322 return Qnil;
1323 }
1324}
1325
1326VALUE
1328{
1329 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1330}
1331
1332VALUE
1334{
1335 rb_encoding *ienc;
1336 VALUE str;
1337 const int eidx = rb_enc_to_index(eenc);
1338
1339 if (!ptr) {
1340 return rb_enc_str_new(ptr, len, eenc);
1341 }
1342
1343 /* ASCII-8BIT case, no conversion */
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1346 return rb_str_new(ptr, len);
1347 }
1348 /* no default_internal or same encoding, no conversion */
1349 ienc = rb_default_internal_encoding();
1350 if (!ienc || eenc == ienc) {
1351 return rb_enc_str_new(ptr, len, eenc);
1352 }
1353 /* ASCII compatible, and ASCII only string, no conversion in
1354 * default_internal */
1355 if ((eidx == rb_ascii8bit_encindex()) ||
1356 (eidx == rb_usascii_encindex()) ||
1357 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1358 return rb_enc_str_new(ptr, len, ienc);
1359 }
1360 /* convert from the given encoding to default_internal */
1361 str = rb_enc_str_new(NULL, 0, ienc);
1362 /* when the conversion failed for some reason, just ignore the
1363 * default_internal and result in the given encoding as-is. */
1364 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1365 rb_str_initialize(str, ptr, len, eenc);
1366 }
1367 return str;
1368}
1369
1370VALUE
1371rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1372{
1373 int eidx = rb_enc_to_index(eenc);
1374 if (eidx == rb_usascii_encindex() &&
1375 !is_ascii_string(str)) {
1376 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1377 return str;
1378 }
1379 rb_enc_associate_index(str, eidx);
1380 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1381}
1382
1383VALUE
1384rb_external_str_new(const char *ptr, long len)
1385{
1386 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1387}
1388
1389VALUE
1391{
1392 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1393}
1394
1395VALUE
1396rb_locale_str_new(const char *ptr, long len)
1397{
1398 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1399}
1400
1401VALUE
1403{
1404 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1405}
1406
1407VALUE
1409{
1410 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1411}
1412
1413VALUE
1415{
1416 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1417}
1418
1419VALUE
1421{
1422 return rb_str_export_to_enc(str, rb_default_external_encoding());
1423}
1424
1425VALUE
1427{
1428 return rb_str_export_to_enc(str, rb_locale_encoding());
1429}
1430
1431VALUE
1433{
1434 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1435}
1436
1437static VALUE
1438str_replace_shared_without_enc(VALUE str2, VALUE str)
1439{
1440 const int termlen = TERM_LEN(str);
1441 char *ptr;
1442 long len;
1443
1444 RSTRING_GETMEM(str, ptr, len);
1445 if (str_embed_capa(str2) >= len + termlen) {
1446 char *ptr2 = RSTRING(str2)->as.embed.ary;
1447 STR_SET_EMBED(str2);
1448 memcpy(ptr2, RSTRING_PTR(str), len);
1449 TERM_FILL(ptr2+len, termlen);
1450 }
1451 else {
1452 VALUE root;
1453 if (STR_SHARED_P(str)) {
1454 root = RSTRING(str)->as.heap.aux.shared;
1455 RSTRING_GETMEM(str, ptr, len);
1456 }
1457 else {
1458 root = rb_str_new_frozen(str);
1459 RSTRING_GETMEM(root, ptr, len);
1460 }
1461 RUBY_ASSERT(OBJ_FROZEN(root));
1462
1463 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1464 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1465 rb_fatal("about to free a possible shared root");
1466 }
1467 char *ptr2 = STR_HEAP_PTR(str2);
1468 if (ptr2 != ptr) {
1469 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1470 }
1471 }
1472 FL_SET(str2, STR_NOEMBED);
1473 RSTRING(str2)->as.heap.ptr = ptr;
1474 STR_SET_SHARED(str2, root);
1475 }
1476
1477 STR_SET_LEN(str2, len);
1478
1479 return str2;
1480}
1481
1482static VALUE
1483str_replace_shared(VALUE str2, VALUE str)
1484{
1485 str_replace_shared_without_enc(str2, str);
1486 rb_enc_cr_str_exact_copy(str2, str);
1487 return str2;
1488}
1489
1490static VALUE
1491str_new_shared(VALUE klass, VALUE str)
1492{
1493 return str_replace_shared(str_alloc_heap(klass), str);
1494}
1495
1496VALUE
1498{
1499 return str_new_shared(rb_obj_class(str), str);
1500}
1501
1502VALUE
1504{
1505 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1506 return str_new_frozen(rb_obj_class(orig), orig);
1507}
1508
1509static VALUE
1510rb_str_new_frozen_String(VALUE orig)
1511{
1512 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1513 return str_new_frozen(rb_cString, orig);
1514}
1515
1516
1517VALUE
1518rb_str_frozen_bare_string(VALUE orig)
1519{
1520 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1521 return str_new_frozen(rb_cString, orig);
1522}
1523
1524VALUE
1525rb_str_tmp_frozen_acquire(VALUE orig)
1526{
1527 if (OBJ_FROZEN_RAW(orig)) return orig;
1528 return str_new_frozen_buffer(0, orig, FALSE);
1529}
1530
1531VALUE
1532rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1533{
1534 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1535 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1536
1537 VALUE str = str_alloc_heap(0);
1538 OBJ_FREEZE(str);
1539 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1540 FL_SET(str, STR_SHARED_ROOT);
1541
1542 size_t capa = str_capacity(orig, TERM_LEN(orig));
1543
1544 /* If the string is embedded then we want to create a copy that is heap
1545 * allocated. If the string is shared then the shared root must be
1546 * embedded, so we want to create a copy. If the string is a shared root
1547 * then it must be embedded, so we want to create a copy. */
1548 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1549 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1550 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1551 }
1552 else {
1553 /* orig must be heap allocated and not shared, so we can safely transfer
1554 * the pointer to str. */
1555 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1556 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1557 RBASIC(orig)->flags &= ~STR_NOFREE;
1558 STR_SET_SHARED(orig, str);
1559 }
1560
1561 RSTRING(str)->len = RSTRING(orig)->len;
1562 RSTRING(str)->as.heap.aux.capa = capa;
1563
1564 return str;
1565}
1566
1567void
1568rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1569{
1570 if (RBASIC_CLASS(tmp) != 0)
1571 return;
1572
1573 if (STR_EMBED_P(tmp)) {
1575 }
1576 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1577 !OBJ_FROZEN_RAW(orig)) {
1578 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1579
1580 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1581 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1582 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1583
1584 /* Unshare orig since the root (tmp) only has this one child. */
1585 FL_UNSET_RAW(orig, STR_SHARED);
1586 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1587 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1589
1590 /* Make tmp embedded and empty so it is safe for sweeping. */
1591 STR_SET_EMBED(tmp);
1592 STR_SET_LEN(tmp, 0);
1593 }
1594 }
1595}
1596
1597static VALUE
1598str_new_frozen(VALUE klass, VALUE orig)
1599{
1600 return str_new_frozen_buffer(klass, orig, TRUE);
1601}
1602
1603static VALUE
1604heap_str_make_shared(VALUE klass, VALUE orig)
1605{
1606 RUBY_ASSERT(!STR_EMBED_P(orig));
1607 RUBY_ASSERT(!STR_SHARED_P(orig));
1608
1609 VALUE str = str_alloc_heap(klass);
1610 STR_SET_LEN(str, RSTRING_LEN(orig));
1611 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1612 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1613 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1614 RBASIC(orig)->flags &= ~STR_NOFREE;
1615 STR_SET_SHARED(orig, str);
1616 if (klass == 0)
1617 FL_UNSET_RAW(str, STR_BORROWED);
1618 return str;
1619}
1620
1621static VALUE
1622str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1623{
1624 VALUE str;
1625
1626 long len = RSTRING_LEN(orig);
1627 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1628 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1629
1630 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1631 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1632 RUBY_ASSERT(STR_EMBED_P(str));
1633 }
1634 else {
1635 if (FL_TEST_RAW(orig, STR_SHARED)) {
1636 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1637 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1638 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1639 RUBY_ASSERT(ofs >= 0);
1640 RUBY_ASSERT(rest >= 0);
1641 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1643
1644 if ((ofs > 0) || (rest > 0) ||
1645 (klass != RBASIC(shared)->klass) ||
1646 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1647 str = str_new_shared(klass, shared);
1648 RUBY_ASSERT(!STR_EMBED_P(str));
1649 RSTRING(str)->as.heap.ptr += ofs;
1650 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1651 }
1652 else {
1653 if (RBASIC_CLASS(shared) == 0)
1654 FL_SET_RAW(shared, STR_BORROWED);
1655 return shared;
1656 }
1657 }
1658 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1659 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1660 STR_SET_EMBED(str);
1661 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1662 STR_SET_LEN(str, RSTRING_LEN(orig));
1663 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1664 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1665 }
1666 else {
1667 str = heap_str_make_shared(klass, orig);
1668 }
1669 }
1670
1671 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1672 OBJ_FREEZE(str);
1673 return str;
1674}
1675
1676VALUE
1677rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1678{
1679 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1680}
1681
1682static VALUE
1683str_new_empty_String(VALUE str)
1684{
1685 VALUE v = rb_str_new(0, 0);
1686 rb_enc_copy(v, str);
1687 return v;
1688}
1689
1690#define STR_BUF_MIN_SIZE 63
1691
1692VALUE
1694{
1695 if (STR_EMBEDDABLE_P(capa, 1)) {
1696 return str_alloc_embed(rb_cString, capa + 1);
1697 }
1698
1699 VALUE str = str_alloc_heap(rb_cString);
1700
1701 RSTRING(str)->as.heap.aux.capa = capa;
1702 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1703 RSTRING(str)->as.heap.ptr[0] = '\0';
1704
1705 return str;
1706}
1707
1708VALUE
1710{
1711 VALUE str;
1712 long len = strlen(ptr);
1713
1714 str = rb_str_buf_new(len);
1715 rb_str_buf_cat(str, ptr, len);
1716
1717 return str;
1718}
1719
1720VALUE
1722{
1723 return str_new(0, 0, len);
1724}
1725
1726void
1728{
1729 if (STR_EMBED_P(str)) {
1730 RB_DEBUG_COUNTER_INC(obj_str_embed);
1731 }
1732 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1733 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1734 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1735 }
1736 else {
1737 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1738 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1739 }
1740}
1741
1742size_t
1743rb_str_memsize(VALUE str)
1744{
1745 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1746 return STR_HEAP_SIZE(str);
1747 }
1748 else {
1749 return 0;
1750 }
1751}
1752
1753VALUE
1755{
1756 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1757}
1758
1759static inline void str_discard(VALUE str);
1760static void str_shared_replace(VALUE str, VALUE str2);
1761
1762void
1764{
1765 if (str != str2) str_shared_replace(str, str2);
1766}
1767
1768static void
1769str_shared_replace(VALUE str, VALUE str2)
1770{
1771 rb_encoding *enc;
1772 int cr;
1773 int termlen;
1774
1775 RUBY_ASSERT(str2 != str);
1776 enc = STR_ENC_GET(str2);
1777 cr = ENC_CODERANGE(str2);
1778 str_discard(str);
1779 termlen = rb_enc_mbminlen(enc);
1780
1781 STR_SET_LEN(str, RSTRING_LEN(str2));
1782
1783 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1784 STR_SET_EMBED(str);
1785 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1786 rb_enc_associate(str, enc);
1787 ENC_CODERANGE_SET(str, cr);
1788 }
1789 else {
1790 if (STR_EMBED_P(str2)) {
1791 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1792 long len = RSTRING_LEN(str2);
1793 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1794
1795 char *new_ptr = ALLOC_N(char, len + termlen);
1796 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1797 RSTRING(str2)->as.heap.ptr = new_ptr;
1798 STR_SET_LEN(str2, len);
1799 RSTRING(str2)->as.heap.aux.capa = len;
1800 STR_SET_NOEMBED(str2);
1801 }
1802
1803 STR_SET_NOEMBED(str);
1804 FL_UNSET(str, STR_SHARED);
1805 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1806
1807 if (FL_TEST(str2, STR_SHARED)) {
1808 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1809 STR_SET_SHARED(str, shared);
1810 }
1811 else {
1812 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1813 }
1814
1815 /* abandon str2 */
1816 STR_SET_EMBED(str2);
1817 RSTRING_PTR(str2)[0] = 0;
1818 STR_SET_LEN(str2, 0);
1819 rb_enc_associate(str, enc);
1820 ENC_CODERANGE_SET(str, cr);
1821 }
1822}
1823
1824VALUE
1826{
1827 VALUE str;
1828
1829 if (RB_TYPE_P(obj, T_STRING)) {
1830 return obj;
1831 }
1832 str = rb_funcall(obj, idTo_s, 0);
1833 return rb_obj_as_string_result(str, obj);
1834}
1835
1836VALUE
1837rb_obj_as_string_result(VALUE str, VALUE obj)
1838{
1839 if (!RB_TYPE_P(str, T_STRING))
1840 return rb_any_to_s(obj);
1841 return str;
1842}
1843
1844static VALUE
1845str_replace(VALUE str, VALUE str2)
1846{
1847 long len;
1848
1849 len = RSTRING_LEN(str2);
1850 if (STR_SHARED_P(str2)) {
1851 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1853 STR_SET_NOEMBED(str);
1854 STR_SET_LEN(str, len);
1855 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1856 STR_SET_SHARED(str, shared);
1857 rb_enc_cr_str_exact_copy(str, str2);
1858 }
1859 else {
1860 str_replace_shared(str, str2);
1861 }
1862
1863 return str;
1864}
1865
1866static inline VALUE
1867ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1868{
1869 size_t size = rb_str_embed_size(capa);
1870 RUBY_ASSERT(size > 0);
1871 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1872
1873 NEWOBJ_OF(str, struct RString, klass,
1875
1876 str->len = 0;
1877
1878 return (VALUE)str;
1879}
1880
1881static inline VALUE
1882ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1883{
1884 NEWOBJ_OF(str, struct RString, klass,
1885 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1886
1887 str->as.heap.aux.capa = 0;
1888 str->as.heap.ptr = NULL;
1889
1890 return (VALUE)str;
1891}
1892
1893static inline VALUE
1894str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1895{
1896 int encidx = 0;
1897 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1898 encidx = rb_enc_get_index(str);
1899 flags &= ~ENCODING_MASK;
1900 }
1901 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1902 if (encidx) rb_enc_associate_index(dup, encidx);
1903 return dup;
1904}
1905
1906static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1907
1908static inline VALUE
1909str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1910{
1911 VALUE flags = FL_TEST_RAW(str, flag_mask);
1912 long len = RSTRING_LEN(str);
1913
1914 RUBY_ASSERT(STR_EMBED_P(dup));
1915 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1916 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1917 STR_SET_LEN(dup, RSTRING_LEN(str));
1918 return str_duplicate_setup_encoding(str, dup, flags);
1919}
1920
1921static inline VALUE
1922str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1923{
1924 VALUE flags = FL_TEST_RAW(str, flag_mask);
1925 VALUE root = str;
1926 if (FL_TEST_RAW(str, STR_SHARED)) {
1927 root = RSTRING(str)->as.heap.aux.shared;
1928 }
1929 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1930 root = str = str_new_frozen(klass, str);
1931 flags = FL_TEST_RAW(str, flag_mask);
1932 }
1933 RUBY_ASSERT(!STR_SHARED_P(root));
1935
1936 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1937 FL_SET(root, STR_SHARED_ROOT);
1938 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1939 flags |= RSTRING_NOEMBED | STR_SHARED;
1940
1941 STR_SET_LEN(dup, RSTRING_LEN(str));
1942 return str_duplicate_setup_encoding(str, dup, flags);
1943}
1944
1945static inline VALUE
1946str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1947{
1948 if (STR_EMBED_P(str)) {
1949 return str_duplicate_setup_embed(klass, str, dup);
1950 }
1951 else {
1952 return str_duplicate_setup_heap(klass, str, dup);
1953 }
1954}
1955
1956static inline VALUE
1957str_duplicate(VALUE klass, VALUE str)
1958{
1959 VALUE dup;
1960 if (STR_EMBED_P(str)) {
1961 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1962 }
1963 else {
1964 dup = str_alloc_heap(klass);
1965 }
1966
1967 return str_duplicate_setup(klass, str, dup);
1968}
1969
1970VALUE
1972{
1973 return str_duplicate(rb_obj_class(str), str);
1974}
1975
1976/* :nodoc: */
1977VALUE
1978rb_str_dup_m(VALUE str)
1979{
1980 if (LIKELY(BARE_STRING_P(str))) {
1981 return str_duplicate(rb_cString, str);
1982 }
1983 else {
1984 return rb_obj_dup(str);
1985 }
1986}
1987
1988VALUE
1990{
1991 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1992 return str_duplicate(rb_cString, str);
1993}
1994
1995VALUE
1996rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1997{
1998 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1999 VALUE new_str, klass = rb_cString;
2000
2001 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2002 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2003 str_duplicate_setup_embed(klass, str, new_str);
2004 }
2005 else {
2006 new_str = ec_str_alloc_heap(ec, klass);
2007 str_duplicate_setup_heap(klass, str, new_str);
2008 }
2009 if (chilled) {
2010 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2011 }
2012 return new_str;
2013}
2014
2015VALUE
2016rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2017{
2018 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2019 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2020 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2021 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2022 return rb_str_freeze(str);
2023}
2024
2025/*
2026 * The documentation block below uses an include (instead of inline text)
2027 * because the included text has non-ASCII characters (which are not allowed in a C file).
2028 */
2029
2030/*
2031 *
2032 * call-seq:
2033 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2034 *
2035 * :include: doc/string/new.rdoc
2036 *
2037 */
2038
2039static VALUE
2040rb_str_init(int argc, VALUE *argv, VALUE str)
2041{
2042 static ID keyword_ids[2];
2043 VALUE orig, opt, venc, vcapa;
2044 VALUE kwargs[2];
2045 rb_encoding *enc = 0;
2046 int n;
2047
2048 if (!keyword_ids[0]) {
2049 keyword_ids[0] = rb_id_encoding();
2050 CONST_ID(keyword_ids[1], "capacity");
2051 }
2052
2053 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2054 if (!NIL_P(opt)) {
2055 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2056 venc = kwargs[0];
2057 vcapa = kwargs[1];
2058 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2059 enc = rb_to_encoding(venc);
2060 }
2061 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2062 long capa = NUM2LONG(vcapa);
2063 long len = 0;
2064 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2065
2066 if (capa < STR_BUF_MIN_SIZE) {
2067 capa = STR_BUF_MIN_SIZE;
2068 }
2069 if (n == 1) {
2070 StringValue(orig);
2071 len = RSTRING_LEN(orig);
2072 if (capa < len) {
2073 capa = len;
2074 }
2075 if (orig == str) n = 0;
2076 }
2077 str_modifiable(str);
2078 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2079 /* make noembed always */
2080 const size_t size = (size_t)capa + termlen;
2081 const char *const old_ptr = RSTRING_PTR(str);
2082 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2083 char *new_ptr = ALLOC_N(char, size);
2084 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2085 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2086 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2087 RSTRING(str)->as.heap.ptr = new_ptr;
2088 }
2089 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2090 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2091 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2092 }
2093 STR_SET_LEN(str, len);
2094 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2095 if (n == 1) {
2096 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2097 rb_enc_cr_str_exact_copy(str, orig);
2098 }
2099 FL_SET(str, STR_NOEMBED);
2100 RSTRING(str)->as.heap.aux.capa = capa;
2101 }
2102 else if (n == 1) {
2103 rb_str_replace(str, orig);
2104 }
2105 if (enc) {
2106 rb_enc_associate(str, enc);
2108 }
2109 }
2110 else if (n == 1) {
2111 rb_str_replace(str, orig);
2112 }
2113 return str;
2114}
2115
2116/* :nodoc: */
2117static VALUE
2118rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2119{
2120 if (klass != rb_cString) {
2121 return rb_class_new_instance_pass_kw(argc, argv, klass);
2122 }
2123
2124 static ID keyword_ids[2];
2125 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2126 VALUE kwargs[2];
2127 rb_encoding *enc = NULL;
2128
2129 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2130 if (NIL_P(opt)) {
2131 return rb_class_new_instance_pass_kw(argc, argv, klass);
2132 }
2133
2134 keyword_ids[0] = rb_id_encoding();
2135 CONST_ID(keyword_ids[1], "capacity");
2136 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2137 encoding = kwargs[0];
2138 capacity = kwargs[1];
2139
2140 if (n == 1) {
2141 orig = StringValue(orig);
2142 }
2143 else {
2144 orig = Qnil;
2145 }
2146
2147 if (UNDEF_P(encoding)) {
2148 if (!NIL_P(orig)) {
2149 encoding = rb_obj_encoding(orig);
2150 }
2151 }
2152
2153 if (!UNDEF_P(encoding)) {
2154 enc = rb_to_encoding(encoding);
2155 }
2156
2157 // If capacity is nil, we're basically just duping `orig`.
2158 if (UNDEF_P(capacity)) {
2159 if (NIL_P(orig)) {
2160 VALUE empty_str = str_new(klass, "", 0);
2161 if (enc) {
2162 rb_enc_associate(empty_str, enc);
2163 }
2164 return empty_str;
2165 }
2166 VALUE copy = str_duplicate(klass, orig);
2167 rb_enc_associate(copy, enc);
2168 ENC_CODERANGE_CLEAR(copy);
2169 return copy;
2170 }
2171
2172 long capa = 0;
2173 capa = NUM2LONG(capacity);
2174 if (capa < 0) {
2175 capa = 0;
2176 }
2177
2178 if (!NIL_P(orig)) {
2179 long orig_capa = rb_str_capacity(orig);
2180 if (orig_capa > capa) {
2181 capa = orig_capa;
2182 }
2183 }
2184
2185 VALUE str = str_enc_new(klass, NULL, capa, enc);
2186 STR_SET_LEN(str, 0);
2187 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2188
2189 if (!NIL_P(orig)) {
2190 rb_str_buf_append(str, orig);
2191 }
2192
2193 return str;
2194}
2195
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes in the machine word at +s+.
 *
 * A UTF-8 leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so for one byte the test is:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 represents whether this byte is leading
 *
 * The function applies that logic to every byte of the word at once and
 * then sums the per-byte results.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lead = *s;

    /* Transform so that bit0 of each byte says whether it is a UTF-8
     * leading byte. */
    lead = (lead>>6) | (~lead>>7);
    lead &= NONASCII_MASK >> 7;

    /* Add the per-byte flags together. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead);
#else
    lead += (lead>>8);
    lead += (lead>>16);
# if SIZEOF_VOIDP == 8
    lead += (lead>>32);
# endif
    return (lead&0xF);
#endif
}
#endif
2235
2236static inline long
2237enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2238{
2239 long c;
2240 const char *q;
2241
2242 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2243 long diff = (long)(e - p);
2244 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2245 }
2246#ifdef NONASCII_MASK
2247 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2248 uintptr_t len = 0;
2249 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2250 const uintptr_t *s, *t;
2251 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2252 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2253 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2254 while (p < (const char *)s) {
2255 if (is_utf8_lead_byte(*p)) len++;
2256 p++;
2257 }
2258 while (s < t) {
2259 len += count_utf8_lead_bytes_with_word(s);
2260 s++;
2261 }
2262 p = (const char *)s;
2263 }
2264 while (p < e) {
2265 if (is_utf8_lead_byte(*p)) len++;
2266 p++;
2267 }
2268 return (long)len;
2269 }
2270#endif
2271 else if (rb_enc_asciicompat(enc)) {
2272 c = 0;
2273 if (ENC_CODERANGE_CLEAN_P(cr)) {
2274 while (p < e) {
2275 if (ISASCII(*p)) {
2276 q = search_nonascii(p, e);
2277 if (!q)
2278 return c + (e - p);
2279 c += q - p;
2280 p = q;
2281 }
2282 p += rb_enc_fast_mbclen(p, e, enc);
2283 c++;
2284 }
2285 }
2286 else {
2287 while (p < e) {
2288 if (ISASCII(*p)) {
2289 q = search_nonascii(p, e);
2290 if (!q)
2291 return c + (e - p);
2292 c += q - p;
2293 p = q;
2294 }
2295 p += rb_enc_mbclen(p, e, enc);
2296 c++;
2297 }
2298 }
2299 return c;
2300 }
2301
2302 for (c=0; p<e; c++) {
2303 p += rb_enc_mbclen(p, e, enc);
2304 }
2305 return c;
2306}
2307
2308long
2309rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2310{
2311 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2312}
2313
2314/* To get strlen with cr
2315 * Note that given cr is not used.
2316 */
2317long
2318rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2319{
2320 long c;
2321 const char *q;
2322 int ret;
2323
2324 *cr = 0;
2325 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2326 long diff = (long)(e - p);
2327 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2328 }
2329 else if (rb_enc_asciicompat(enc)) {
2330 c = 0;
2331 while (p < e) {
2332 if (ISASCII(*p)) {
2333 q = search_nonascii(p, e);
2334 if (!q) {
2335 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2336 return c + (e - p);
2337 }
2338 c += q - p;
2339 p = q;
2340 }
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2342 if (MBCLEN_CHARFOUND_P(ret)) {
2343 *cr |= ENC_CODERANGE_VALID;
2344 p += MBCLEN_CHARFOUND_LEN(ret);
2345 }
2346 else {
2348 p++;
2349 }
2350 c++;
2351 }
2352 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2353 return c;
2354 }
2355
2356 for (c=0; p<e; c++) {
2357 ret = rb_enc_precise_mbclen(p, e, enc);
2358 if (MBCLEN_CHARFOUND_P(ret)) {
2359 *cr |= ENC_CODERANGE_VALID;
2360 p += MBCLEN_CHARFOUND_LEN(ret);
2361 }
2362 else {
2364 if (p + rb_enc_mbminlen(enc) <= e)
2365 p += rb_enc_mbminlen(enc);
2366 else
2367 p = e;
2368 }
2369 }
2370 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2371 return c;
2372}
2373
2374/* enc must be str's enc or rb_enc_check(str, str2) */
2375static long
2376str_strlen(VALUE str, rb_encoding *enc)
2377{
2378 const char *p, *e;
2379 int cr;
2380
2381 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2382 if (!enc) enc = STR_ENC_GET(str);
2383 p = RSTRING_PTR(str);
2384 e = RSTRING_END(str);
2385 cr = ENC_CODERANGE(str);
2386
2387 if (cr == ENC_CODERANGE_UNKNOWN) {
2388 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2389 if (cr) ENC_CODERANGE_SET(str, cr);
2390 return n;
2391 }
2392 else {
2393 return enc_strlen(p, e, enc, cr);
2394 }
2395}
2396
2397long
2399{
2400 return str_strlen(str, NULL);
2401}
2402
2403/*
2404 * call-seq:
2405 * length -> integer
2406 *
2407 * :include: doc/string/length.rdoc
2408 *
2409 */
2410
2411VALUE
2413{
2414 return LONG2NUM(str_strlen(str, NULL));
2415}
2416
2417/*
2418 * call-seq:
2419 * bytesize -> integer
2420 *
2421 * :include: doc/string/bytesize.rdoc
2422 *
2423 */
2424
2425VALUE
2426rb_str_bytesize(VALUE str)
2427{
2428 return LONG2NUM(RSTRING_LEN(str));
2429}
2430
2431/*
2432 * call-seq:
2433 * empty? -> true or false
2434 *
2435 * Returns whether the length of +self+ is zero:
2436 *
2437 * 'hello'.empty? # => false
2438 * ' '.empty? # => false
2439 * ''.empty? # => true
2440 *
2441 * Related: see {Querying}[rdoc-ref:String@Querying].
2442 */
2443
2444static VALUE
2445rb_str_empty(VALUE str)
2446{
2447 return RBOOL(RSTRING_LEN(str) == 0);
2448}
2449
2450/*
2451 * call-seq:
2452 * self + other_string -> new_string
2453 *
2454 * Returns a new string containing +other_string+ concatenated to +self+:
2455 *
2456 * 'Hello from ' + self.to_s # => "Hello from main"
2457 *
2458 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2459 */
2460
2461VALUE
2463{
2464 VALUE str3;
2465 rb_encoding *enc;
2466 char *ptr1, *ptr2, *ptr3;
2467 long len1, len2;
2468 int termlen;
2469
2470 StringValue(str2);
2471 enc = rb_enc_check_str(str1, str2);
2472 RSTRING_GETMEM(str1, ptr1, len1);
2473 RSTRING_GETMEM(str2, ptr2, len2);
2474 termlen = rb_enc_mbminlen(enc);
2475 if (len1 > LONG_MAX - len2) {
2476 rb_raise(rb_eArgError, "string size too big");
2477 }
2478 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2479 ptr3 = RSTRING_PTR(str3);
2480 memcpy(ptr3, ptr1, len1);
2481 memcpy(ptr3+len1, ptr2, len2);
2482 TERM_FILL(&ptr3[len1+len2], termlen);
2483
2484 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2486 RB_GC_GUARD(str1);
2487 RB_GC_GUARD(str2);
2488 return str3;
2489}
2490
2491/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2492VALUE
2493rb_str_opt_plus(VALUE str1, VALUE str2)
2494{
2497 long len1, len2;
2498 MAYBE_UNUSED(char) *ptr1, *ptr2;
2499 RSTRING_GETMEM(str1, ptr1, len1);
2500 RSTRING_GETMEM(str2, ptr2, len2);
2501 int enc1 = rb_enc_get_index(str1);
2502 int enc2 = rb_enc_get_index(str2);
2503
2504 if (enc1 < 0) {
2505 return Qundef;
2506 }
2507 else if (enc2 < 0) {
2508 return Qundef;
2509 }
2510 else if (enc1 != enc2) {
2511 return Qundef;
2512 }
2513 else if (len1 > LONG_MAX - len2) {
2514 return Qundef;
2515 }
2516 else {
2517 return rb_str_plus(str1, str2);
2518 }
2519
2520}
2521
2522/*
2523 * call-seq:
2524 * self * n -> new_string
2525 *
2526 * Returns a new string containing +n+ copies of +self+:
2527 *
2528 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2529 * 'No!' * 0 # => ""
2530 *
2531 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2532 */
2533
2534VALUE
2536{
2537 VALUE str2;
2538 long n, len;
2539 char *ptr2;
2540 int termlen;
2541
2542 if (times == INT2FIX(1)) {
2543 return str_duplicate(rb_cString, str);
2544 }
2545 if (times == INT2FIX(0)) {
2546 str2 = str_alloc_embed(rb_cString, 0);
2547 rb_enc_copy(str2, str);
2548 return str2;
2549 }
2550 len = NUM2LONG(times);
2551 if (len < 0) {
2552 rb_raise(rb_eArgError, "negative argument");
2553 }
2554 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2555 if (STR_EMBEDDABLE_P(len, 1)) {
2556 str2 = str_alloc_embed(rb_cString, len + 1);
2557 memset(RSTRING_PTR(str2), 0, len + 1);
2558 }
2559 else {
2560 str2 = str_alloc_heap(rb_cString);
2561 RSTRING(str2)->as.heap.aux.capa = len;
2562 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2563 }
2564 STR_SET_LEN(str2, len);
2565 rb_enc_copy(str2, str);
2566 return str2;
2567 }
2568 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2569 rb_raise(rb_eArgError, "argument too big");
2570 }
2571
2572 len *= RSTRING_LEN(str);
2573 termlen = TERM_LEN(str);
2574 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2575 ptr2 = RSTRING_PTR(str2);
2576 if (len) {
2577 n = RSTRING_LEN(str);
2578 memcpy(ptr2, RSTRING_PTR(str), n);
2579 while (n <= len/2) {
2580 memcpy(ptr2 + n, ptr2, n);
2581 n *= 2;
2582 }
2583 memcpy(ptr2 + n, ptr2, len-n);
2584 }
2585 STR_SET_LEN(str2, len);
2586 TERM_FILL(&ptr2[len], termlen);
2587 rb_enc_cr_str_copy_for_substr(str2, str);
2588
2589 return str2;
2590}
2591
2592/*
2593 * call-seq:
2594 * self % object -> new_string
2595 *
2596 * Returns the result of formatting +object+ into the format specifications
2597 * contained in +self+
2598 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2599 *
2600 * '%05d' % 123 # => "00123"
2601 *
2602 * If +self+ contains multiple format specifications,
2603 * +object+ must be an array or hash containing the objects to be formatted:
2604 *
2605 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2606 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2607 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2608 *
2609 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2610 */
2611
2612static VALUE
2613rb_str_format_m(VALUE str, VALUE arg)
2614{
2615 VALUE tmp = rb_check_array_type(arg);
2616
2617 if (!NIL_P(tmp)) {
2618 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2619 }
2620 return rb_str_format(1, &arg, str);
2621}
2622
2623static inline void
2624rb_check_lockedtmp(VALUE str)
2625{
2626 if (FL_TEST(str, STR_TMPLOCK)) {
2627 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2628 }
2629}
2630
// If none of these flags are set, we know we have a modifiable string.
2632// If any is set, we need to do more detailed checks.
2633#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2634static inline void
2635str_modifiable(VALUE str)
2636{
2637 RUBY_ASSERT(ruby_thread_has_gvl_p());
2638
2639 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2640 if (CHILLED_STRING_P(str)) {
2641 CHILLED_STRING_MUTATED(str);
2642 }
2643 rb_check_lockedtmp(str);
2644 rb_check_frozen(str);
2645 }
2646}
2647
2648static inline int
2649str_dependent_p(VALUE str)
2650{
2651 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2652 return FALSE;
2653 }
2654 else {
2655 return TRUE;
2656 }
2657}
2658
2659// If none of these flags are set, we know we have an independent string.
2660// If any is set, we need to do more detailed checks.
2661#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2662static inline int
2663str_independent(VALUE str)
2664{
2665 RUBY_ASSERT(ruby_thread_has_gvl_p());
2666
2667 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2668 str_modifiable(str);
2669 return !str_dependent_p(str);
2670 }
2671 return TRUE;
2672}
2673
2674static void
2675str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2676{
2677 RUBY_ASSERT(ruby_thread_has_gvl_p());
2678
2679 char *ptr;
2680 char *oldptr;
2681 long capa = len + expand;
2682
2683 if (len > capa) len = capa;
2684
2685 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2686 ptr = RSTRING(str)->as.heap.ptr;
2687 STR_SET_EMBED(str);
2688 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2689 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2690 STR_SET_LEN(str, len);
2691 return;
2692 }
2693
2694 ptr = ALLOC_N(char, (size_t)capa + termlen);
2695 oldptr = RSTRING_PTR(str);
2696 if (oldptr) {
2697 memcpy(ptr, oldptr, len);
2698 }
2699 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2700 xfree(oldptr);
2701 }
2702 STR_SET_NOEMBED(str);
2703 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2704 TERM_FILL(ptr + len, termlen);
2705 RSTRING(str)->as.heap.ptr = ptr;
2706 STR_SET_LEN(str, len);
2707 RSTRING(str)->as.heap.aux.capa = capa;
2708}
2709
2710void
2711rb_str_modify(VALUE str)
2712{
2713 if (!str_independent(str))
2714 str_make_independent(str);
2716}
2717
2718void
2720{
2721 RUBY_ASSERT(ruby_thread_has_gvl_p());
2722
2723 int termlen = TERM_LEN(str);
2724 long len = RSTRING_LEN(str);
2725
2726 if (expand < 0) {
2727 rb_raise(rb_eArgError, "negative expanding string size");
2728 }
2729 if (expand >= LONG_MAX - len) {
2730 rb_raise(rb_eArgError, "string size too big");
2731 }
2732
2733 if (!str_independent(str)) {
2734 str_make_independent_expand(str, len, expand, termlen);
2735 }
2736 else if (expand > 0) {
2737 RESIZE_CAPA_TERM(str, len + expand, termlen);
2738 }
2740}
2741
2742/* As rb_str_modify(), but don't clear coderange */
2743static void
2744str_modify_keep_cr(VALUE str)
2745{
2746 if (!str_independent(str))
2747 str_make_independent(str);
2749 /* Force re-scan later */
2751}
2752
2753static inline void
2754str_discard(VALUE str)
2755{
2756 str_modifiable(str);
2757 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2758 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2759 RSTRING(str)->as.heap.ptr = 0;
2760 STR_SET_LEN(str, 0);
2761 }
2762}
2763
2764void
2766{
2767 int encindex = rb_enc_get_index(str);
2768
2769 if (RB_UNLIKELY(encindex == -1)) {
2770 rb_raise(rb_eTypeError, "not encoding capable object");
2771 }
2772
2773 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2774 return;
2775 }
2776
2777 rb_encoding *enc = rb_enc_from_index(encindex);
2778 if (!rb_enc_asciicompat(enc)) {
2779 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2780 }
2781}
2782
2783VALUE
2785{
2786 RUBY_ASSERT(ruby_thread_has_gvl_p());
2787
2788 VALUE s = *ptr;
2789 if (!RB_TYPE_P(s, T_STRING)) {
2790 s = rb_str_to_str(s);
2791 *ptr = s;
2792 }
2793 return s;
2794}
2795
2796char *
2798{
2799 VALUE str = rb_string_value(ptr);
2800 return RSTRING_PTR(str);
2801}
2802
/* Returns 1 when the first +n+ bytes at +s+ are all zero (trivially so
 * for n <= 0), and 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') {
            return 0;
        }
        s++;
        n--;
    }
    return 1;
}
2811
2812static const char *
2813str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2814{
2815 const char *e = s + len;
2816
2817 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2818 if (zero_filled(s, minlen)) return s;
2819 }
2820 return 0;
2821}
2822
2823static char *
2824str_fill_term(VALUE str, char *s, long len, int termlen)
2825{
2826 /* This function assumes that (capa + termlen) bytes of memory
2827 * is allocated, like many other functions in this file.
2828 */
2829 if (str_dependent_p(str)) {
2830 if (!zero_filled(s + len, termlen))
2831 str_make_independent_expand(str, len, 0L, termlen);
2832 }
2833 else {
2834 TERM_FILL(s + len, termlen);
2835 return s;
2836 }
2837 return RSTRING_PTR(str);
2838}
2839
2840void
2841rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2842{
2843 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2844 long len = RSTRING_LEN(str);
2845
2846 RUBY_ASSERT(capa >= len);
2847 if (capa - len < termlen) {
2848 rb_check_lockedtmp(str);
2849 str_make_independent_expand(str, len, 0L, termlen);
2850 }
2851 else if (str_dependent_p(str)) {
2852 if (termlen > oldtermlen)
2853 str_make_independent_expand(str, len, 0L, termlen);
2854 }
2855 else {
2856 if (!STR_EMBED_P(str)) {
2857 /* modify capa instead of realloc */
2858 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2859 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2860 }
2861 if (termlen > oldtermlen) {
2862 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2863 }
2864 }
2865
2866 return;
2867}
2868
2869static char *
2870str_null_check(VALUE str, int *w)
2871{
2872 char *s = RSTRING_PTR(str);
2873 long len = RSTRING_LEN(str);
2874 rb_encoding *enc = rb_enc_get(str);
2875 const int minlen = rb_enc_mbminlen(enc);
2876
2877 if (minlen > 1) {
2878 *w = 1;
2879 if (str_null_char(s, len, minlen, enc)) {
2880 return NULL;
2881 }
2882 return str_fill_term(str, s, len, minlen);
2883 }
2884 *w = 0;
2885 if (!s || memchr(s, 0, len)) {
2886 return NULL;
2887 }
2888 if (s[len]) {
2889 s = str_fill_term(str, s, len, minlen);
2890 }
2891 return s;
2892}
2893
2894char *
2895rb_str_to_cstr(VALUE str)
2896{
2897 int w;
2898 return str_null_check(str, &w);
2899}
2900
2901char *
2903{
2904 VALUE str = rb_string_value(ptr);
2905 int w;
2906 char *s = str_null_check(str, &w);
2907 if (!s) {
2908 if (w) {
2909 rb_raise(rb_eArgError, "string contains null char");
2910 }
2911 rb_raise(rb_eArgError, "string contains null byte");
2912 }
2913 return s;
2914}
2915
2916char *
2917rb_str_fill_terminator(VALUE str, const int newminlen)
2918{
2919 char *s = RSTRING_PTR(str);
2920 long len = RSTRING_LEN(str);
2921 return str_fill_term(str, s, len, newminlen);
2922}
2923
2924VALUE
2926{
2927 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2928 return str;
2929}
2930
2931/*
2932 * call-seq:
2933 * String.try_convert(object) -> object, new_string, or nil
2934 *
2935 * Attempts to convert the given +object+ to a string.
2936 *
2937 * If +object+ is already a string, returns +object+, unmodified.
2938 *
2939 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2940 * calls <tt>object.to_str</tt> and returns the result.
2941 *
2942 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2943 *
2944 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2945 */
2946static VALUE
2947rb_str_s_try_convert(VALUE dummy, VALUE str)
2948{
2949 return rb_check_string_type(str);
2950}
2951
2952static char*
2953str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2954{
2955 long nth = *nthp;
2956 if (rb_enc_mbmaxlen(enc) == 1) {
2957 p += nth;
2958 }
2959 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2960 p += nth * rb_enc_mbmaxlen(enc);
2961 }
2962 else if (rb_enc_asciicompat(enc)) {
2963 const char *p2, *e2;
2964 int n;
2965
2966 while (p < e && 0 < nth) {
2967 e2 = p + nth;
2968 if (e < e2) {
2969 *nthp = nth;
2970 return (char *)e;
2971 }
2972 if (ISASCII(*p)) {
2973 p2 = search_nonascii(p, e2);
2974 if (!p2) {
2975 nth -= e2 - p;
2976 *nthp = nth;
2977 return (char *)e2;
2978 }
2979 nth -= p2 - p;
2980 p = p2;
2981 }
2982 n = rb_enc_mbclen(p, e, enc);
2983 p += n;
2984 nth--;
2985 }
2986 *nthp = nth;
2987 if (nth != 0) {
2988 return (char *)e;
2989 }
2990 return (char *)p;
2991 }
2992 else {
2993 while (p < e && nth--) {
2994 p += rb_enc_mbclen(p, e, enc);
2995 }
2996 }
2997 if (p > e) p = e;
2998 *nthp = nth;
2999 return (char*)p;
3000}
3001
3002char*
3003rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3004{
3005 return str_nth_len(p, e, &nth, enc);
3006}
3007
3008static char*
3009str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3010{
3011 if (singlebyte)
3012 p += nth;
3013 else {
3014 p = str_nth_len(p, e, &nth, enc);
3015 }
3016 if (!p) return 0;
3017 if (p > e) p = e;
3018 return (char *)p;
3019}
3020
3021/* char offset to byte offset */
3022static long
3023str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3024{
3025 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3026 if (!pp) return e - p;
3027 return pp - p;
3028}
3029
3030long
3031rb_str_offset(VALUE str, long pos)
3032{
3033 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3034 STR_ENC_GET(str), single_byte_optimizable(str));
3035}
3036
3037#ifdef NONASCII_MASK
3038static char *
3039str_utf8_nth(const char *p, const char *e, long *nthp)
3040{
3041 long nth = *nthp;
3042 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3043 const uintptr_t *s, *t;
3044 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3045 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3046 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3047 while (p < (const char *)s) {
3048 if (is_utf8_lead_byte(*p)) nth--;
3049 p++;
3050 }
3051 do {
3052 nth -= count_utf8_lead_bytes_with_word(s);
3053 s++;
3054 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3055 p = (char *)s;
3056 }
3057 while (p < e) {
3058 if (is_utf8_lead_byte(*p)) {
3059 if (nth == 0) break;
3060 nth--;
3061 }
3062 p++;
3063 }
3064 *nthp = nth;
3065 return (char *)p;
3066}
3067
/* Distance from +p+ to the position str_utf8_nth() reaches for +nth+. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
3074#endif
3075
3076/* byte offset to char offset */
3077long
3078rb_str_sublen(VALUE str, long pos)
3079{
3080 if (single_byte_optimizable(str) || pos < 0)
3081 return pos;
3082 else {
3083 char *p = RSTRING_PTR(str);
3084 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3085 }
3086}
3087
3088static VALUE
3089str_subseq(VALUE str, long beg, long len)
3090{
3091 VALUE str2;
3092
3093 RUBY_ASSERT(beg >= 0);
3094 RUBY_ASSERT(len >= 0);
3095 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3096
3097 const int termlen = TERM_LEN(str);
3098 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3099 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3100 RB_GC_GUARD(str);
3101 return str2;
3102 }
3103
3104 str2 = str_alloc_heap(rb_cString);
3105 if (str_embed_capa(str2) >= len + termlen) {
3106 char *ptr2 = RSTRING(str2)->as.embed.ary;
3107 STR_SET_EMBED(str2);
3108 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3109 TERM_FILL(ptr2+len, termlen);
3110
3111 STR_SET_LEN(str2, len);
3112 RB_GC_GUARD(str);
3113 }
3114 else {
3115 str_replace_shared(str2, str);
3116 RUBY_ASSERT(!STR_EMBED_P(str2));
3117 ENC_CODERANGE_CLEAR(str2);
3118 RSTRING(str2)->as.heap.ptr += beg;
3119 if (RSTRING_LEN(str2) > len) {
3120 STR_SET_LEN(str2, len);
3121 }
3122 }
3123
3124 return str2;
3125}
3126
3127VALUE
3128rb_str_subseq(VALUE str, long beg, long len)
3129{
3130 VALUE str2 = str_subseq(str, beg, len);
3131 rb_enc_cr_str_copy_for_substr(str2, str);
3132 return str2;
3133}
3134
3135char *
3136rb_str_subpos(VALUE str, long beg, long *lenp)
3137{
3138 long len = *lenp;
3139 long slen = -1L;
3140 const long blen = RSTRING_LEN(str);
3141 rb_encoding *enc = STR_ENC_GET(str);
3142 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3143
3144 if (len < 0) return 0;
3145 if (beg < 0 && -beg < 0) return 0;
3146 if (!blen) {
3147 len = 0;
3148 }
3149 if (single_byte_optimizable(str)) {
3150 if (beg > blen) return 0;
3151 if (beg < 0) {
3152 beg += blen;
3153 if (beg < 0) return 0;
3154 }
3155 if (len > blen - beg)
3156 len = blen - beg;
3157 if (len < 0) return 0;
3158 p = s + beg;
3159 goto end;
3160 }
3161 if (beg < 0) {
3162 if (len > -beg) len = -beg;
3163 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3164 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3165 beg = -beg;
3166 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3167 p = e;
3168 if (!p) return 0;
3169 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3170 if (!p) return 0;
3171 len = e - p;
3172 goto end;
3173 }
3174 else {
3175 slen = str_strlen(str, enc);
3176 beg += slen;
3177 if (beg < 0) return 0;
3178 p = s + beg;
3179 if (len == 0) goto end;
3180 }
3181 }
3182 else if (beg > 0 && beg > blen) {
3183 return 0;
3184 }
3185 if (len == 0) {
3186 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3187 p = s + beg;
3188 }
3189#ifdef NONASCII_MASK
3190 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3191 enc == rb_utf8_encoding()) {
3192 p = str_utf8_nth(s, e, &beg);
3193 if (beg > 0) return 0;
3194 len = str_utf8_offset(p, e, len);
3195 }
3196#endif
3197 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3198 int char_sz = rb_enc_mbmaxlen(enc);
3199
3200 p = s + beg * char_sz;
3201 if (p > e) {
3202 return 0;
3203 }
3204 else if (len * char_sz > e - p)
3205 len = e - p;
3206 else
3207 len *= char_sz;
3208 }
3209 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3210 if (beg > 0) return 0;
3211 len = 0;
3212 }
3213 else {
3214 len = str_offset(p, e, len, enc, 0);
3215 }
3216 end:
3217 *lenp = len;
3218 RB_GC_GUARD(str);
3219 return p;
3220}
3221
3222static VALUE str_substr(VALUE str, long beg, long len, int empty);
3223
3224VALUE
3225rb_str_substr(VALUE str, long beg, long len)
3226{
3227 return str_substr(str, beg, len, TRUE);
3228}
3229
3230VALUE
3231rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3232{
3233 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3234}
3235
3236static VALUE
3237str_substr(VALUE str, long beg, long len, int empty)
3238{
3239 char *p = rb_str_subpos(str, beg, &len);
3240
3241 if (!p) return Qnil;
3242 if (!len && !empty) return Qnil;
3243
3244 beg = p - RSTRING_PTR(str);
3245
3246 VALUE str2 = str_subseq(str, beg, len);
3247 rb_enc_cr_str_copy_for_substr(str2, str);
3248 return str2;
3249}
3250
3251/* :nodoc: */
3252VALUE
3254{
3255 if (CHILLED_STRING_P(str)) {
3256 FL_UNSET_RAW(str, STR_CHILLED);
3257 }
3258
3259 if (OBJ_FROZEN(str)) return str;
3260 rb_str_resize(str, RSTRING_LEN(str));
3261 return rb_obj_freeze(str);
3262}
3263
3264/*
3265 * call-seq:
3266 * +string -> new_string or self
3267 *
3268 * Returns +self+ if +self+ is not frozen and can be mutated
3269 * without warning issuance.
3270 *
3271 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3272 *
3273 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3274 */
3275static VALUE
3276str_uplus(VALUE str)
3277{
3278 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3279 return rb_str_dup(str);
3280 }
3281 else {
3282 return str;
3283 }
3284}
3285
3286/*
3287 * call-seq:
3288 * -self -> frozen_string
3289 *
3290 * Returns a frozen string equal to +self+.
3291 *
3292 * The returned string is +self+ if and only if all of the following are true:
3293 *
3294 * - +self+ is already frozen.
3295 * - +self+ is an instance of \String (rather than of a subclass of \String)
3296 * - +self+ has no instance variables set on it.
3297 *
3298 * Otherwise, the returned string is a frozen copy of +self+.
3299 *
3300 * Returning +self+, when possible, saves duplicating +self+;
3301 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3302 *
3303 * It may also save duplicating other, already-existing, strings:
3304 *
3305 * s0 = 'foo'
3306 * s1 = 'foo'
3307 * s0.object_id == s1.object_id # => false
3308 * (-s0).object_id == (-s1).object_id # => true
3309 *
3310 * Note that method #-@ is convenient for defining a constant:
3311 *
3312 * FileName = -'config/database.yml'
3313 *
3314 * While its alias #dedup is better suited for chaining:
3315 *
3316 * 'foo'.dedup.gsub!('o')
3317 *
3318 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3319 */
3320static VALUE
3321str_uminus(VALUE str)
3322{
3323 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3324 str = rb_str_dup(str);
3325 }
3326 return rb_fstring(str);
3327}
3328
3329RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3330#define rb_str_dup_frozen rb_str_new_frozen
3331
3332VALUE
3334{
3335 rb_check_frozen(str);
3336 if (FL_TEST(str, STR_TMPLOCK)) {
3337 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3338 }
3339 FL_SET(str, STR_TMPLOCK);
3340 return str;
3341}
3342
3343VALUE
3345{
3346 rb_check_frozen(str);
3347 if (!FL_TEST(str, STR_TMPLOCK)) {
3348 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3349 }
3350 FL_UNSET(str, STR_TMPLOCK);
3351 return str;
3352}
3353
3354VALUE
3355rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3356{
3357 rb_str_locktmp(str);
3358 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3359}
3360
3361void
3363{
3364 RUBY_ASSERT(ruby_thread_has_gvl_p());
3365
3366 long capa;
3367 const int termlen = TERM_LEN(str);
3368
3369 str_modifiable(str);
3370 if (STR_SHARED_P(str)) {
3371 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3372 }
3373 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3374 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3375 }
3376
3377 int cr = ENC_CODERANGE(str);
3378 if (len == 0) {
3379 /* Empty string does not contain non-ASCII */
3381 }
3382 else if (cr == ENC_CODERANGE_UNKNOWN) {
3383 /* Leave unknown. */
3384 }
3385 else if (len > RSTRING_LEN(str)) {
3386 if (ENC_CODERANGE_CLEAN_P(cr)) {
3387 /* Update the coderange regarding the extended part. */
3388 const char *const prev_end = RSTRING_END(str);
3389 const char *const new_end = RSTRING_PTR(str) + len;
3390 rb_encoding *enc = rb_enc_get(str);
3391 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3392 ENC_CODERANGE_SET(str, cr);
3393 }
3394 else if (cr == ENC_CODERANGE_BROKEN) {
3395 /* May be valid now, by appended part. */
3397 }
3398 }
3399 else if (len < RSTRING_LEN(str)) {
3400 if (cr != ENC_CODERANGE_7BIT) {
3401 /* ASCII-only string is keeping after truncated. Valid
3402 * and broken may be invalid or valid, leave unknown. */
3404 }
3405 }
3406
3407 STR_SET_LEN(str, len);
3408 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3409}
3410
3411VALUE
3412rb_str_resize(VALUE str, long len)
3413{
3414 if (len < 0) {
3415 rb_raise(rb_eArgError, "negative string size (or size too big)");
3416 }
3417
3418 int independent = str_independent(str);
3419 long slen = RSTRING_LEN(str);
3420 const int termlen = TERM_LEN(str);
3421
3422 if (slen > len || (termlen != 1 && slen < len)) {
3424 }
3425
3426 {
3427 long capa;
3428 if (STR_EMBED_P(str)) {
3429 if (len == slen) return str;
3430 if (str_embed_capa(str) >= len + termlen) {
3431 STR_SET_LEN(str, len);
3432 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3433 return str;
3434 }
3435 str_make_independent_expand(str, slen, len - slen, termlen);
3436 }
3437 else if (str_embed_capa(str) >= len + termlen) {
3438 char *ptr = STR_HEAP_PTR(str);
3439 STR_SET_EMBED(str);
3440 if (slen > len) slen = len;
3441 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3442 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3443 STR_SET_LEN(str, len);
3444 if (independent) ruby_xfree(ptr);
3445 return str;
3446 }
3447 else if (!independent) {
3448 if (len == slen) return str;
3449 str_make_independent_expand(str, slen, len - slen, termlen);
3450 }
3451 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3452 (capa - len) > (len < 1024 ? len : 1024)) {
3453 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3454 (size_t)len + termlen, STR_HEAP_SIZE(str));
3455 RSTRING(str)->as.heap.aux.capa = len;
3456 }
3457 else if (len == slen) return str;
3458 STR_SET_LEN(str, len);
3459 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3460 }
3461 return str;
3462}
3463
3464static void
3465str_ensure_available_capa(VALUE str, long len)
3466{
3467 str_modify_keep_cr(str);
3468
3469 const int termlen = TERM_LEN(str);
3470 long olen = RSTRING_LEN(str);
3471
3472 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3473 rb_raise(rb_eArgError, "string sizes too big");
3474 }
3475
3476 long total = olen + len;
3477 long capa = str_capacity(str, termlen);
3478
3479 if (capa < total) {
3480 if (total >= LONG_MAX / 2) {
3481 capa = total;
3482 }
3483 while (total > capa) {
3484 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3485 }
3486 RESIZE_CAPA_TERM(str, capa, termlen);
3487 }
3488}
3489
3490static VALUE
3491str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3492{
3493 if (keep_cr) {
3494 str_modify_keep_cr(str);
3495 }
3496 else {
3497 rb_str_modify(str);
3498 }
3499 if (len == 0) return 0;
3500
3501 long total, olen, off = -1;
3502 char *sptr;
3503 const int termlen = TERM_LEN(str);
3504
3505 RSTRING_GETMEM(str, sptr, olen);
3506 if (ptr >= sptr && ptr <= sptr + olen) {
3507 off = ptr - sptr;
3508 }
3509
3510 long capa = str_capacity(str, termlen);
3511
3512 if (olen > LONG_MAX - len) {
3513 rb_raise(rb_eArgError, "string sizes too big");
3514 }
3515 total = olen + len;
3516 if (capa < total) {
3517 if (total >= LONG_MAX / 2) {
3518 capa = total;
3519 }
3520 while (total > capa) {
3521 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3522 }
3523 RESIZE_CAPA_TERM(str, capa, termlen);
3524 sptr = RSTRING_PTR(str);
3525 }
3526 if (off != -1) {
3527 ptr = sptr + off;
3528 }
3529 memcpy(sptr + olen, ptr, len);
3530 STR_SET_LEN(str, total);
3531 TERM_FILL(sptr + total, termlen); /* sentinel */
3532
3533 return str;
3534}
3535
3536#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3537#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3538
3539VALUE
3540rb_str_cat(VALUE str, const char *ptr, long len)
3541{
3542 if (len == 0) return str;
3543 if (len < 0) {
3544 rb_raise(rb_eArgError, "negative string size (or size too big)");
3545 }
3546 return str_buf_cat(str, ptr, len);
3547}
3548
3549VALUE
3550rb_str_cat_cstr(VALUE str, const char *ptr)
3551{
3552 must_not_null(ptr);
3553 return rb_str_buf_cat(str, ptr, strlen(ptr));
3554}
3555
3556static void
3557rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3558{
3559 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3560
3561 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3562 if (UNLIKELY(!str_independent(str))) {
3563 str_make_independent(str);
3564 }
3565
3566 long string_length = -1;
3567 const int null_terminator_length = 1;
3568 char *sptr;
3569 RSTRING_GETMEM(str, sptr, string_length);
3570
3571 // Ensure the resulting string wouldn't be too long.
3572 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3573 rb_raise(rb_eArgError, "string sizes too big");
3574 }
3575
3576 long string_capacity = str_capacity(str, null_terminator_length);
3577
3578 // Get the code range before any modifications since those might clear the code range.
3579 int cr = ENC_CODERANGE(str);
3580
3581 // Check if the string has spare string_capacity to write the new byte.
3582 if (LIKELY(string_capacity >= string_length + 1)) {
3583 // In fast path we can write the new byte and note the string's new length.
3584 sptr[string_length] = byte;
3585 STR_SET_LEN(str, string_length + 1);
3586 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3587 }
3588 else {
3589 // If there's not enough string_capacity, make a call into the general string concatenation function.
3590 str_buf_cat(str, (char *)&byte, 1);
3591 }
3592
3593 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3594 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3595 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3596 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3597 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3598 if (ISASCII(byte)) {
3600 }
3601 else {
3603
3604 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3605 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3606 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3607 }
3608 }
3609 }
3610}
3611
3612RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3613RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3614RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3615
3616static VALUE
3617rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3618 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3619{
3620 int str_encindex = ENCODING_GET(str);
3621 int res_encindex;
3622 int str_cr, res_cr;
3623 rb_encoding *str_enc, *ptr_enc;
3624
3625 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3626
3627 if (str_encindex == ptr_encindex) {
3628 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3629 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3630 }
3631 }
3632 else {
3633 str_enc = rb_enc_from_index(str_encindex);
3634 ptr_enc = rb_enc_from_index(ptr_encindex);
3635 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3636 if (len == 0)
3637 return str;
3638 if (RSTRING_LEN(str) == 0) {
3639 rb_str_buf_cat(str, ptr, len);
3640 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3641 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3642 return str;
3643 }
3644 goto incompatible;
3645 }
3646 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3647 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3648 }
3649 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3650 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3651 str_cr = rb_enc_str_coderange(str);
3652 }
3653 }
3654 }
3655 if (ptr_cr_ret)
3656 *ptr_cr_ret = ptr_cr;
3657
3658 if (str_encindex != ptr_encindex &&
3659 str_cr != ENC_CODERANGE_7BIT &&
3660 ptr_cr != ENC_CODERANGE_7BIT) {
3661 str_enc = rb_enc_from_index(str_encindex);
3662 ptr_enc = rb_enc_from_index(ptr_encindex);
3663 goto incompatible;
3664 }
3665
3666 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3667 res_encindex = str_encindex;
3668 res_cr = ENC_CODERANGE_UNKNOWN;
3669 }
3670 else if (str_cr == ENC_CODERANGE_7BIT) {
3671 if (ptr_cr == ENC_CODERANGE_7BIT) {
3672 res_encindex = str_encindex;
3673 res_cr = ENC_CODERANGE_7BIT;
3674 }
3675 else {
3676 res_encindex = ptr_encindex;
3677 res_cr = ptr_cr;
3678 }
3679 }
3680 else if (str_cr == ENC_CODERANGE_VALID) {
3681 res_encindex = str_encindex;
3682 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3683 res_cr = str_cr;
3684 else
3685 res_cr = ptr_cr;
3686 }
3687 else { /* str_cr == ENC_CODERANGE_BROKEN */
3688 res_encindex = str_encindex;
3689 res_cr = str_cr;
3690 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3691 }
3692
3693 if (len < 0) {
3694 rb_raise(rb_eArgError, "negative string size (or size too big)");
3695 }
3696 str_buf_cat(str, ptr, len);
3697 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3698 return str;
3699
3700 incompatible:
3701 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3702 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3704}
3705
3706VALUE
3707rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3708{
3709 return rb_enc_cr_str_buf_cat(str, ptr, len,
3710 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3711}
3712
3713VALUE
3715{
3716 /* ptr must reference NUL terminated ASCII string. */
3717 int encindex = ENCODING_GET(str);
3718 rb_encoding *enc = rb_enc_from_index(encindex);
3719 if (rb_enc_asciicompat(enc)) {
3720 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3721 encindex, ENC_CODERANGE_7BIT, 0);
3722 }
3723 else {
3724 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3725 while (*ptr) {
3726 unsigned int c = (unsigned char)*ptr;
3727 int len = rb_enc_codelen(c, enc);
3728 rb_enc_mbcput(c, buf, enc);
3729 rb_enc_cr_str_buf_cat(str, buf, len,
3730 encindex, ENC_CODERANGE_VALID, 0);
3731 ptr++;
3732 }
3733 return str;
3734 }
3735}
3736
3737VALUE
3739{
3740 int str2_cr = rb_enc_str_coderange(str2);
3741
3742 if (str_enc_fastpath(str)) {
3743 switch (str2_cr) {
3744 case ENC_CODERANGE_7BIT:
3745 // If RHS is 7bit we can do simple concatenation
3746 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3747 RB_GC_GUARD(str2);
3748 return str;
3750 // If RHS is valid, we can do simple concatenation if encodings are the same
3751 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3752 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3753 int str_cr = ENC_CODERANGE(str);
3754 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3755 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3756 }
3757 RB_GC_GUARD(str2);
3758 return str;
3759 }
3760 }
3761 }
3762
3763 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3764 ENCODING_GET(str2), str2_cr, &str2_cr);
3765
3766 ENC_CODERANGE_SET(str2, str2_cr);
3767
3768 return str;
3769}
3770
3771VALUE
3773{
3774 StringValue(str2);
3775 return rb_str_buf_append(str, str2);
3776}
3777
3778VALUE
3779rb_str_concat_literals(size_t num, const VALUE *strary)
3780{
3781 VALUE str;
3782 size_t i, s = 0;
3783 unsigned long len = 1;
3784
3785 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3786 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3787
3788 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3789 str = rb_str_buf_new(len);
3790 str_enc_copy_direct(str, strary[0]);
3791
3792 for (i = s; i < num; ++i) {
3793 const VALUE v = strary[i];
3794 int encidx = ENCODING_GET(v);
3795
3796 rb_str_buf_append(str, v);
3797 if (encidx != ENCINDEX_US_ASCII) {
3798 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3799 rb_enc_set_index(str, encidx);
3800 }
3801 }
3802 return str;
3803}
3804
3805/*
3806 * call-seq:
3807 * concat(*objects) -> string
3808 *
3809 * :include: doc/string/concat.rdoc
3810 */
3811static VALUE
3812rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3813{
3814 str_modifiable(str);
3815
3816 if (argc == 1) {
3817 return rb_str_concat(str, argv[0]);
3818 }
3819 else if (argc > 1) {
3820 int i;
3821 VALUE arg_str = rb_str_tmp_new(0);
3822 rb_enc_copy(arg_str, str);
3823 for (i = 0; i < argc; i++) {
3824 rb_str_concat(arg_str, argv[i]);
3825 }
3826 rb_str_buf_append(str, arg_str);
3827 }
3828
3829 return str;
3830}
3831
3832/*
3833 * call-seq:
3834 * append_as_bytes(*objects) -> self
3835 *
3836 * Concatenates each object in +objects+ into +self+; returns +self+;
3837 * performs no encoding validation or conversion:
3838 *
3839 * s = 'foo'
3840 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3841 * s.valid_encoding? # => false
3842 * s.append_as_bytes("\xAC 12")
3843 * s.valid_encoding? # => true
3844 *
3845 * When a given object is an integer,
3846 * the value is considered an 8-bit byte;
3847 * if the integer occupies more than one byte (i.e., is greater than 255),
3848 * appends only the low-order byte (similar to String#setbyte):
3849 *
3850 * s = ""
3851 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3852 * s.bytesize # => 2
3853 *
3854 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3855 */
3856
3857VALUE
3858rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3859{
3860 long needed_capacity = 0;
3861 volatile VALUE t0;
3862 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3863
3864 for (int index = 0; index < argc; index++) {
3865 VALUE obj = argv[index];
3866 enum ruby_value_type type = types[index] = rb_type(obj);
3867 switch (type) {
3868 case T_FIXNUM:
3869 case T_BIGNUM:
3870 needed_capacity++;
3871 break;
3872 case T_STRING:
3873 needed_capacity += RSTRING_LEN(obj);
3874 break;
3875 default:
3876 rb_raise(
3878 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3879 rb_obj_class(obj)
3880 );
3881 break;
3882 }
3883 }
3884
3885 str_ensure_available_capa(str, needed_capacity);
3886 char *sptr = RSTRING_END(str);
3887
3888 for (int index = 0; index < argc; index++) {
3889 VALUE obj = argv[index];
3890 enum ruby_value_type type = types[index];
3891 switch (type) {
3892 case T_FIXNUM:
3893 case T_BIGNUM: {
3894 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3895 char byte = (char)(NUM2INT(obj) & 0xFF);
3896 *sptr = byte;
3897 sptr++;
3898 break;
3899 }
3900 case T_STRING: {
3901 const char *ptr;
3902 long len;
3903 RSTRING_GETMEM(obj, ptr, len);
3904 memcpy(sptr, ptr, len);
3905 sptr += len;
3906 break;
3907 }
3908 default:
3909 rb_bug("append_as_bytes arguments should have been validated");
3910 }
3911 }
3912
3913 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3914 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3915
3916 int cr = ENC_CODERANGE(str);
3917 switch (cr) {
3918 case ENC_CODERANGE_7BIT: {
3919 for (int index = 0; index < argc; index++) {
3920 VALUE obj = argv[index];
3921 enum ruby_value_type type = types[index];
3922 switch (type) {
3923 case T_FIXNUM:
3924 case T_BIGNUM: {
3925 if (!ISASCII(NUM2INT(obj))) {
3926 goto clear_cr;
3927 }
3928 break;
3929 }
3930 case T_STRING: {
3931 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3932 goto clear_cr;
3933 }
3934 break;
3935 }
3936 default:
3937 rb_bug("append_as_bytes arguments should have been validated");
3938 }
3939 }
3940 break;
3941 }
3943 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3944 goto keep_cr;
3945 }
3946 else {
3947 goto clear_cr;
3948 }
3949 break;
3950 default:
3951 goto clear_cr;
3952 break;
3953 }
3954
3955 RB_GC_GUARD(t0);
3956
3957 clear_cr:
3958 // If no fast path was hit, we clear the coderange.
3959 // append_as_bytes is predominently meant to be used in
3960 // buffering situation, hence it's likely the coderange
3961 // will never be scanned, so it's not worth spending time
3962 // precomputing the coderange except for simple and common
3963 // situations.
3965 keep_cr:
3966 return str;
3967}
3968
3969/*
3970 * call-seq:
3971 * self << object -> self
3972 *
3973 * Appends a string representation of +object+ to +self+;
3974 * returns +self+.
3975 *
3976 * If +object+ is a string, appends it to +self+:
3977 *
3978 * s = 'foo'
3979 * s << 'bar' # => "foobar"
3980 * s # => "foobar"
3981 *
3982 * If +object+ is an integer,
3983 * its value is considered a codepoint;
3984 * converts the value to a character before concatenating:
3985 *
3986 * s = 'foo'
3987 * s << 33 # => "foo!"
3988 *
3989 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3990 * and the encoding of +self+ is Encoding::US_ASCII,
3991 * changes the encoding to Encoding::ASCII_8BIT:
3992 *
3993 * s = 'foo'.encode(Encoding::US_ASCII)
3994 * s.encoding # => #<Encoding:US-ASCII>
3995 * s << 0xff # => "foo\xFF"
3996 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3997 *
3998 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3999 *
4000 * s = 'foo'
4001 * s.encoding # => #<Encoding:UTF-8>
4002 * s << 0x00110000 # 1114112 out of char range (RangeError)
4003 * s = 'foo'.encode(Encoding::EUC_JP)
4004 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4005 *
4006 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4007 */
4008VALUE
4010{
4011 unsigned int code;
4012 rb_encoding *enc = STR_ENC_GET(str1);
4013 int encidx;
4014
4015 if (RB_INTEGER_TYPE_P(str2)) {
4016 if (rb_num_to_uint(str2, &code) == 0) {
4017 }
4018 else if (FIXNUM_P(str2)) {
4019 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4020 }
4021 else {
4022 rb_raise(rb_eRangeError, "bignum out of char range");
4023 }
4024 }
4025 else {
4026 return rb_str_append(str1, str2);
4027 }
4028
4029 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4030
4031 if (encidx >= 0) {
4032 rb_str_buf_cat_byte(str1, (unsigned char)code);
4033 }
4034 else {
4035 long pos = RSTRING_LEN(str1);
4036 int cr = ENC_CODERANGE(str1);
4037 int len;
4038 char *buf;
4039
4040 switch (len = rb_enc_codelen(code, enc)) {
4041 case ONIGERR_INVALID_CODE_POINT_VALUE:
4042 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4043 break;
4044 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4045 case 0:
4046 rb_raise(rb_eRangeError, "%u out of char range", code);
4047 break;
4048 }
4049 buf = ALLOCA_N(char, len + 1);
4050 rb_enc_mbcput(code, buf, enc);
4051 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4052 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4053 }
4054 rb_str_resize(str1, pos+len);
4055 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4056 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4058 }
4059 else if (cr == ENC_CODERANGE_BROKEN) {
4061 }
4062 ENC_CODERANGE_SET(str1, cr);
4063 }
4064 return str1;
4065}
4066
4067int
4068rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4069{
4070 int encidx = rb_enc_to_index(enc);
4071
4072 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4073 /* US-ASCII automatically extended to ASCII-8BIT */
4074 if (code > 0xFF) {
4075 rb_raise(rb_eRangeError, "%u out of char range", code);
4076 }
4077 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4078 return ENCINDEX_ASCII_8BIT;
4079 }
4080 return encidx;
4081 }
4082 else {
4083 return -1;
4084 }
4085}
4086
4087/*
4088 * call-seq:
4089 * prepend(*other_strings) -> self
4090 *
4091 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4092 *
4093 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4094 *
4095 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4096 *
4097 */
4098
4099static VALUE
4100rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4101{
4102 str_modifiable(str);
4103
4104 if (argc == 1) {
4105 rb_str_update(str, 0L, 0L, argv[0]);
4106 }
4107 else if (argc > 1) {
4108 int i;
4109 VALUE arg_str = rb_str_tmp_new(0);
4110 rb_enc_copy(arg_str, str);
4111 for (i = 0; i < argc; i++) {
4112 rb_str_append(arg_str, argv[i]);
4113 }
4114 rb_str_update(str, 0L, 0L, arg_str);
4115 }
4116
4117 return str;
4118}
4119
4120st_index_t
4122{
4123 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4124 st_index_t precomputed_hash;
4125 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4126
4127 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4128 return precomputed_hash;
4129 }
4130
4131 return str_do_hash(str);
4132}
4133
4134int
4136{
4137 long len1, len2;
4138 const char *ptr1, *ptr2;
4139 RSTRING_GETMEM(str1, ptr1, len1);
4140 RSTRING_GETMEM(str2, ptr2, len2);
4141 return (len1 != len2 ||
4142 !rb_str_comparable(str1, str2) ||
4143 memcmp(ptr1, ptr2, len1) != 0);
4144}
4145
4146/*
4147 * call-seq:
4148 * hash -> integer
4149 *
4150 * :include: doc/string/hash.rdoc
4151 *
4152 */
4153
4154static VALUE
4155rb_str_hash_m(VALUE str)
4156{
4157 st_index_t hval = rb_str_hash(str);
4158 return ST2FIX(hval);
4159}
4160
/* Smaller of a and b.  Both arguments may be evaluated twice, so they must
 * be free of side effects. */
#define lesser(a,b) (((a) > (b)) ? (b) : (a))
4162
4163int
4165{
4166 int idx1, idx2;
4167 int rc1, rc2;
4168
4169 if (RSTRING_LEN(str1) == 0) return TRUE;
4170 if (RSTRING_LEN(str2) == 0) return TRUE;
4171 idx1 = ENCODING_GET(str1);
4172 idx2 = ENCODING_GET(str2);
4173 if (idx1 == idx2) return TRUE;
4174 rc1 = rb_enc_str_coderange(str1);
4175 rc2 = rb_enc_str_coderange(str2);
4176 if (rc1 == ENC_CODERANGE_7BIT) {
4177 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4178 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4179 return TRUE;
4180 }
4181 if (rc2 == ENC_CODERANGE_7BIT) {
4182 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4183 return TRUE;
4184 }
4185 return FALSE;
4186}
4187
4188int
4190{
4191 long len1, len2;
4192 const char *ptr1, *ptr2;
4193 int retval;
4194
4195 if (str1 == str2) return 0;
4196 RSTRING_GETMEM(str1, ptr1, len1);
4197 RSTRING_GETMEM(str2, ptr2, len2);
4198 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4199 if (len1 == len2) {
4200 if (!rb_str_comparable(str1, str2)) {
4201 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4202 return 1;
4203 return -1;
4204 }
4205 return 0;
4206 }
4207 if (len1 > len2) return 1;
4208 return -1;
4209 }
4210 if (retval > 0) return 1;
4211 return -1;
4212}
4213
4214/*
4215 * call-seq:
4216 * self == object -> true or false
4217 *
4218 * Returns whether +object+ is equal to +self+.
4219 *
4220 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4221 *
4222 * s = 'foo'
4223 * s == 'foo' # => true
4224 * s == 'food' # => false
4225 * s == 'FOO' # => false
4226 *
4227 * Returns +false+ if the two strings' encodings are not compatible:
4228 *
4229 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4230 *
4231 * When +object+ is not a string:
4232 *
4233 * - If +object+ responds to method <tt>to_str</tt>,
4234 * <tt>object == self</tt> is called and its return value is returned.
4235 * - If +object+ does not respond to <tt>to_str</tt>,
4236 * +false+ is returned.
4237 *
4238 * Related: {Comparing}[rdoc-ref:String@Comparing].
4239 */
4240
4241VALUE
4243{
4244 if (str1 == str2) return Qtrue;
4245 if (!RB_TYPE_P(str2, T_STRING)) {
4246 if (!rb_respond_to(str2, idTo_str)) {
4247 return Qfalse;
4248 }
4249 return rb_equal(str2, str1);
4250 }
4251 return rb_str_eql_internal(str1, str2);
4252}
4253
4254/*
4255 * call-seq:
4256 * eql?(object) -> true or false
4257 *
4258 * :include: doc/string/eql_p.rdoc
4259 *
4260 */
4261
4262VALUE
4263rb_str_eql(VALUE str1, VALUE str2)
4264{
4265 if (str1 == str2) return Qtrue;
4266 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4267 return rb_str_eql_internal(str1, str2);
4268}
4269
4270/*
4271 * call-seq:
4272 * self <=> other_string -> -1, 0, 1, or nil
4273 *
4274 * Compares +self+ and +other_string+, returning:
4275 *
4276 * - -1 if +other_string+ is larger.
4277 * - 0 if the two are equal.
4278 * - 1 if +other_string+ is smaller.
4279 * - +nil+ if the two are incomparable.
4280 *
4281 * Examples:
4282 *
4283 * 'foo' <=> 'foo' # => 0
4284 * 'foo' <=> 'food' # => -1
4285 * 'food' <=> 'foo' # => 1
4286 * 'FOO' <=> 'foo' # => -1
4287 * 'foo' <=> 'FOO' # => 1
4288 * 'foo' <=> 1 # => nil
4289 *
4290 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4291 */
4292
4293static VALUE
4294rb_str_cmp_m(VALUE str1, VALUE str2)
4295{
4296 int result;
4297 VALUE s = rb_check_string_type(str2);
4298 if (NIL_P(s)) {
4299 return rb_invcmp(str1, str2);
4300 }
4301 result = rb_str_cmp(str1, s);
4302 return INT2FIX(result);
4303}
4304
4305static VALUE str_casecmp(VALUE str1, VALUE str2);
4306static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4307
4308/*
4309 * call-seq:
4310 * casecmp(other_string) -> -1, 0, 1, or nil
4311 *
4312 * Ignoring case, compares +self+ and +other_string+; returns:
4313 *
4314 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4315 * - 0 if the two are equal.
4316 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4317 * - +nil+ if the two are incomparable.
4318 *
4319 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4320 *
4321 * Examples:
4322 *
4323 * 'foo'.casecmp('goo') # => -1
4324 * 'goo'.casecmp('foo') # => 1
4325 * 'foo'.casecmp('food') # => -1
4326 * 'food'.casecmp('foo') # => 1
4327 * 'FOO'.casecmp('foo') # => 0
4328 * 'foo'.casecmp('FOO') # => 0
4329 * 'foo'.casecmp(1) # => nil
4330 *
4331 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4332 */
4333
4334static VALUE
4335rb_str_casecmp(VALUE str1, VALUE str2)
4336{
4337 VALUE s = rb_check_string_type(str2);
4338 if (NIL_P(s)) {
4339 return Qnil;
4340 }
4341 return str_casecmp(str1, s);
4342}
4343
4344static VALUE
4345str_casecmp(VALUE str1, VALUE str2)
4346{
4347 long len;
4348 rb_encoding *enc;
4349 const char *p1, *p1end, *p2, *p2end;
4350
4351 enc = rb_enc_compatible(str1, str2);
4352 if (!enc) {
4353 return Qnil;
4354 }
4355
4356 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4357 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4358 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4359 while (p1 < p1end && p2 < p2end) {
4360 if (*p1 != *p2) {
4361 unsigned int c1 = TOLOWER(*p1 & 0xff);
4362 unsigned int c2 = TOLOWER(*p2 & 0xff);
4363 if (c1 != c2)
4364 return INT2FIX(c1 < c2 ? -1 : 1);
4365 }
4366 p1++;
4367 p2++;
4368 }
4369 }
4370 else {
4371 while (p1 < p1end && p2 < p2end) {
4372 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4373 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4374
4375 if (0 <= c1 && 0 <= c2) {
4376 c1 = TOLOWER(c1);
4377 c2 = TOLOWER(c2);
4378 if (c1 != c2)
4379 return INT2FIX(c1 < c2 ? -1 : 1);
4380 }
4381 else {
4382 int r;
4383 l1 = rb_enc_mbclen(p1, p1end, enc);
4384 l2 = rb_enc_mbclen(p2, p2end, enc);
4385 len = l1 < l2 ? l1 : l2;
4386 r = memcmp(p1, p2, len);
4387 if (r != 0)
4388 return INT2FIX(r < 0 ? -1 : 1);
4389 if (l1 != l2)
4390 return INT2FIX(l1 < l2 ? -1 : 1);
4391 }
4392 p1 += l1;
4393 p2 += l2;
4394 }
4395 }
4396 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4397 if (p1 == p1end) return INT2FIX(-1);
4398 return INT2FIX(1);
4399}
4400
4401/*
4402 * call-seq:
4403 * casecmp?(other_string) -> true, false, or nil
4404 *
4405 * Returns +true+ if +self+ and +other_string+ are equal after
4406 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4407 *
4408 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4409 *
4410 * Examples:
4411 *
4412 * 'foo'.casecmp?('goo') # => false
4413 * 'goo'.casecmp?('foo') # => false
4414 * 'foo'.casecmp?('food') # => false
4415 * 'food'.casecmp?('foo') # => false
4416 * 'FOO'.casecmp?('foo') # => true
4417 * 'foo'.casecmp?('FOO') # => true
4418 * 'foo'.casecmp?(1) # => nil
4419 *
4420 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4421 */
4422
4423static VALUE
4424rb_str_casecmp_p(VALUE str1, VALUE str2)
4425{
4426 VALUE s = rb_check_string_type(str2);
4427 if (NIL_P(s)) {
4428 return Qnil;
4429 }
4430 return str_casecmp_p(str1, s);
4431}
4432
4433static VALUE
4434str_casecmp_p(VALUE str1, VALUE str2)
4435{
4436 rb_encoding *enc;
4437 VALUE folded_str1, folded_str2;
4438 VALUE fold_opt = sym_fold;
4439
4440 enc = rb_enc_compatible(str1, str2);
4441 if (!enc) {
4442 return Qnil;
4443 }
4444
4445 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4446 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4447
4448 return rb_str_eql(folded_str1, folded_str2);
4449}
4450
4451static long
4452strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4453 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4454{
4455 const char *search_start = str_ptr;
4456 long pos, search_len = str_len - offset;
4457
4458 for (;;) {
4459 const char *t;
4460 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4461 if (pos < 0) return pos;
4462 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4463 if (t == search_start + pos) break;
4464 search_len -= t - search_start;
4465 if (search_len <= 0) return -1;
4466 offset += t - search_start;
4467 search_start = t;
4468 }
4469 return pos + offset;
4470}
4471
/* Both return the found index in bytes (negative when there is no hit).
 * They differ in how offset is read by rb_strseq_index: as a character
 * offset for rb_str_index, as a byte offset for rb_str_byteindex. */
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4475
4476static long
4477rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4478{
4479 const char *str_ptr, *str_ptr_end, *sub_ptr;
4480 long str_len, sub_len;
4481 rb_encoding *enc;
4482
4483 enc = rb_enc_check(str, sub);
4484 if (is_broken_string(sub)) return -1;
4485
4486 str_ptr = RSTRING_PTR(str);
4487 str_ptr_end = RSTRING_END(str);
4488 str_len = RSTRING_LEN(str);
4489 sub_ptr = RSTRING_PTR(sub);
4490 sub_len = RSTRING_LEN(sub);
4491
4492 if (str_len < sub_len) return -1;
4493
4494 if (offset != 0) {
4495 long str_len_char, sub_len_char;
4496 int single_byte = single_byte_optimizable(str);
4497 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4498 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4499 if (offset < 0) {
4500 offset += str_len_char;
4501 if (offset < 0) return -1;
4502 }
4503 if (str_len_char - offset < sub_len_char) return -1;
4504 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4505 str_ptr += offset;
4506 }
4507 if (sub_len == 0) return offset;
4508
4509 /* need proceed one character at a time */
4510 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4511}
4512
4513
4514/*
4515 * call-seq:
4516 * index(pattern, offset = 0) -> integer or nil
4517 *
4518 * :include: doc/string/index.rdoc
4519 *
4520 */
4521
4522static VALUE
4523rb_str_index_m(int argc, VALUE *argv, VALUE str)
4524{
4525 VALUE sub;
4526 VALUE initpos;
4527 rb_encoding *enc = STR_ENC_GET(str);
4528 long pos;
4529
4530 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4531 long slen = str_strlen(str, enc); /* str's enc */
4532 pos = NUM2LONG(initpos);
4533 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4534 if (RB_TYPE_P(sub, T_REGEXP)) {
4536 }
4537 return Qnil;
4538 }
4539 }
4540 else {
4541 pos = 0;
4542 }
4543
4544 if (RB_TYPE_P(sub, T_REGEXP)) {
4545 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4546 enc, single_byte_optimizable(str));
4547
4548 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4549 VALUE match = rb_backref_get();
4550 struct re_registers *regs = RMATCH_REGS(match);
4551 pos = rb_str_sublen(str, BEG(0));
4552 return LONG2NUM(pos);
4553 }
4554 }
4555 else {
4556 StringValue(sub);
4557 pos = rb_str_index(str, sub, pos);
4558 if (pos >= 0) {
4559 pos = rb_str_sublen(str, pos);
4560 return LONG2NUM(pos);
4561 }
4562 }
4563 return Qnil;
4564}
4565
4566/* Ensure that the given pos is a valid character boundary.
4567 * Note that in this function, "character" means a code point
4568 * (Unicode scalar value), not a grapheme cluster.
4569 */
4570static void
4571str_ensure_byte_pos(VALUE str, long pos)
4572{
4573 if (!single_byte_optimizable(str)) {
4574 const char *s = RSTRING_PTR(str);
4575 const char *e = RSTRING_END(str);
4576 const char *p = s + pos;
4577 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4578 rb_raise(rb_eIndexError,
4579 "offset %ld does not land on character boundary", pos);
4580 }
4581 }
4582}
4583
4584/*
4585 * call-seq:
4586 * byteindex(object, offset = 0) -> integer or nil
4587 *
4588 * Returns the 0-based integer index of a substring of +self+
4589 * specified by +object+ (a string or Regexp) and +offset+,
4590 * or +nil+ if there is no such substring;
4591 * the returned index is the count of _bytes_ (not characters).
4592 *
4593 * When +object+ is a string,
4594 * returns the index of the first found substring equal to +object+:
4595 *
4596 * s = 'foo' # => "foo"
4597 * s.size # => 3 # Three 1-byte characters.
4598 * s.bytesize # => 3 # Three bytes.
4599 * s.byteindex('f') # => 0
4600 * s.byteindex('o') # => 1
4601 * s.byteindex('oo') # => 1
4602 * s.byteindex('ooo') # => nil
4603 *
4604 * When +object+ is a Regexp,
4605 * returns the index of the first found substring matching +object+;
4606 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4607 *
4608 * s = 'foo'
4609 * s.byteindex(/f/) # => 0
4610 * $~ # => #<MatchData "f">
4611 * s.byteindex(/o/) # => 1
4612 * s.byteindex(/oo/) # => 1
4613 * s.byteindex(/ooo/) # => nil
4614 * $~ # => nil
4615 *
4616 * \Integer argument +offset+, if given, specifies the 0-based index
4617 * of the byte where searching is to begin.
4618 *
4619 * When +offset+ is non-negative,
4620 * searching begins at byte position +offset+:
4621 *
4622 * s = 'foo'
4623 * s.byteindex('o', 1) # => 1
4624 * s.byteindex('o', 2) # => 2
4625 * s.byteindex('o', 3) # => nil
4626 *
4627 * When +offset+ is negative, counts backward from the end of +self+:
4628 *
4629 * s = 'foo'
4630 * s.byteindex('o', -1) # => 2
4631 * s.byteindex('o', -2) # => 1
4632 * s.byteindex('o', -3) # => 1
4633 * s.byteindex('o', -4) # => nil
4634 *
4635 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4636 *
4637 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4638 * s.size # => 2 # Two 3-byte characters.
4639 * s.bytesize # => 6 # Six bytes.
4640 * s.byteindex("\uFFFF") # => 0
4641 * s.byteindex("\uFFFF", 1) # Raises IndexError
4642 * s.byteindex("\uFFFF", 2) # Raises IndexError
4643 * s.byteindex("\uFFFF", 3) # => 3
4644 * s.byteindex("\uFFFF", 4) # Raises IndexError
4645 * s.byteindex("\uFFFF", 5) # Raises IndexError
4646 * s.byteindex("\uFFFF", 6) # => nil
4647 *
4648 * Related: see {Querying}[rdoc-ref:String@Querying].
4649 */
4650
4651static VALUE
4652rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4653{
4654 VALUE sub;
4655 VALUE initpos;
4656 long pos;
4657
4658 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4659 long slen = RSTRING_LEN(str);
4660 pos = NUM2LONG(initpos);
4661 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4662 if (RB_TYPE_P(sub, T_REGEXP)) {
4664 }
4665 return Qnil;
4666 }
4667 }
4668 else {
4669 pos = 0;
4670 }
4671
4672 str_ensure_byte_pos(str, pos);
4673
4674 if (RB_TYPE_P(sub, T_REGEXP)) {
4675 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4676 VALUE match = rb_backref_get();
4677 struct re_registers *regs = RMATCH_REGS(match);
4678 pos = BEG(0);
4679 return LONG2NUM(pos);
4680 }
4681 }
4682 else {
4683 StringValue(sub);
4684 pos = rb_str_byteindex(str, sub, pos);
4685 if (pos >= 0) return LONG2NUM(pos);
4686 }
4687 return Qnil;
4688}
4689
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform does not provide memrchr.
 *
 * Scans the search_len bytes starting at search_str backwards and returns a
 * pointer to the last byte equal to chr, or NULL when there is none.  As
 * with the library function, chr is converted to unsigned char before the
 * comparison, so values outside 0..255 still match on their low byte.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4702
4703static long
4704str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4705{
4706 char *hit, *adjusted;
4707 int c;
4708 long slen, searchlen;
4709 char *sbeg, *e, *t;
4710
4711 sbeg = RSTRING_PTR(str);
4712 slen = RSTRING_LEN(sub);
4713 if (slen == 0) return s - sbeg;
4714 e = RSTRING_END(str);
4715 t = RSTRING_PTR(sub);
4716 c = *t & 0xff;
4717 searchlen = s - sbeg + 1;
4718
4719 if (memcmp(s, t, slen) == 0) {
4720 return s - sbeg;
4721 }
4722
4723 do {
4724 hit = memrchr(sbeg, c, searchlen);
4725 if (!hit) break;
4726 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4727 if (hit != adjusted) {
4728 searchlen = adjusted - sbeg;
4729 continue;
4730 }
4731 if (memcmp(hit, t, slen) == 0)
4732 return hit - sbeg;
4733 searchlen = adjusted - sbeg;
4734 } while (searchlen > 0);
4735
4736 return -1;
4737}
4738
4739/* found index in byte */
4740static long
4741rb_str_rindex(VALUE str, VALUE sub, long pos)
4742{
4743 long len, slen;
4744 char *sbeg, *s;
4745 rb_encoding *enc;
4746 int singlebyte;
4747
4748 enc = rb_enc_check(str, sub);
4749 if (is_broken_string(sub)) return -1;
4750 singlebyte = single_byte_optimizable(str);
4751 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4752 slen = str_strlen(sub, enc); /* rb_enc_check */
4753
4754 /* substring longer than string */
4755 if (len < slen) return -1;
4756 if (len - pos < slen) pos = len - slen;
4757 if (len == 0) return pos;
4758
4759 sbeg = RSTRING_PTR(str);
4760
4761 if (pos == 0) {
4762 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4763 return 0;
4764 else
4765 return -1;
4766 }
4767
4768 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4769 return str_rindex(str, sub, s, enc);
4770}
4771
4772/*
4773 * call-seq:
4774 * rindex(substring, offset = self.length) -> integer or nil
4775 * rindex(regexp, offset = self.length) -> integer or nil
4776 *
4777 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4778 * or +nil+ if none found:
4779 *
4780 * 'foo'.rindex('f') # => 0
4781 * 'foo'.rindex('o') # => 2
4782 * 'foo'.rindex('oo') # => 1
4783 * 'foo'.rindex('ooo') # => nil
4784 *
4785 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4786 * or +nil+ if none found:
4787 *
4788 * 'foo'.rindex(/f/) # => 0
4789 * 'foo'.rindex(/o/) # => 2
4790 * 'foo'.rindex(/oo/) # => 1
4791 * 'foo'.rindex(/ooo/) # => nil
4792 *
4793 * The _last_ match means starting at the possible last position, not
4794 * the last of longest matches.
4795 *
4796 * 'foo'.rindex(/o+/) # => 2
4797 * $~ #=> #<MatchData "o">
4798 *
4799 * To get the last longest match, needs to combine with negative
4800 * lookbehind.
4801 *
4802 * 'foo'.rindex(/(?<!o)o+/) # => 1
4803 * $~ #=> #<MatchData "oo">
4804 *
4805 * Or String#index with negative lookahead.
4806 *
4807 * 'foo'.index(/o+(?!.*o)/) # => 1
4808 * $~ #=> #<MatchData "oo">
4809 *
4810 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4811 * string to _end_ the search:
4812 *
4813 * 'foo'.rindex('o', 0) # => nil
4814 * 'foo'.rindex('o', 1) # => 1
4815 * 'foo'.rindex('o', 2) # => 2
4816 * 'foo'.rindex('o', 3) # => 2
4817 *
4818 * If +offset+ is a negative Integer, the maximum starting position in the
4819 * string to _end_ the search is the sum of the string's length and +offset+:
4820 *
4821 * 'foo'.rindex('o', -1) # => 2
4822 * 'foo'.rindex('o', -2) # => 1
4823 * 'foo'.rindex('o', -3) # => nil
4824 * 'foo'.rindex('o', -4) # => nil
4825 *
4826 * Related: String#index.
4827 */
4828
4829static VALUE
4830rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4831{
4832 VALUE sub;
4833 VALUE initpos;
4834 rb_encoding *enc = STR_ENC_GET(str);
4835 long pos, len = str_strlen(str, enc); /* str's enc */
4836
4837 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4838 pos = NUM2LONG(initpos);
4839 if (pos < 0 && (pos += len) < 0) {
4840 if (RB_TYPE_P(sub, T_REGEXP)) {
4842 }
4843 return Qnil;
4844 }
4845 if (pos > len) pos = len;
4846 }
4847 else {
4848 pos = len;
4849 }
4850
4851 if (RB_TYPE_P(sub, T_REGEXP)) {
4852 /* enc = rb_enc_check(str, sub); */
4853 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4854 enc, single_byte_optimizable(str));
4855
4856 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4857 VALUE match = rb_backref_get();
4858 struct re_registers *regs = RMATCH_REGS(match);
4859 pos = rb_str_sublen(str, BEG(0));
4860 return LONG2NUM(pos);
4861 }
4862 }
4863 else {
4864 StringValue(sub);
4865 pos = rb_str_rindex(str, sub, pos);
4866 if (pos >= 0) {
4867 pos = rb_str_sublen(str, pos);
4868 return LONG2NUM(pos);
4869 }
4870 }
4871 return Qnil;
4872}
4873
4874static long
4875rb_str_byterindex(VALUE str, VALUE sub, long pos)
4876{
4877 long len, slen;
4878 char *sbeg, *s;
4879 rb_encoding *enc;
4880
4881 enc = rb_enc_check(str, sub);
4882 if (is_broken_string(sub)) return -1;
4883 len = RSTRING_LEN(str);
4884 slen = RSTRING_LEN(sub);
4885
4886 /* substring longer than string */
4887 if (len < slen) return -1;
4888 if (len - pos < slen) pos = len - slen;
4889 if (len == 0) return pos;
4890
4891 sbeg = RSTRING_PTR(str);
4892
4893 if (pos == 0) {
4894 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4895 return 0;
4896 else
4897 return -1;
4898 }
4899
4900 s = sbeg + pos;
4901 return str_rindex(str, sub, s, enc);
4902}
4903
4904/*
4905 * call-seq:
4906 * byterindex(object, offset = self.bytesize) -> integer or nil
4907 *
4908 * Returns the 0-based integer index of a substring of +self+
4909 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4910 * or +nil+ if there is no such substring;
4911 * the returned index is the count of _bytes_ (not characters).
4912 *
4913 * When +object+ is a string,
4914 * returns the index of the _last_ found substring equal to +object+:
4915 *
4916 * s = 'foo' # => "foo"
4917 * s.size # => 3 # Three 1-byte characters.
4918 * s.bytesize # => 3 # Three bytes.
4919 * s.byterindex('f') # => 0
4920 * s.byterindex('o') # => 2
4921 * s.byterindex('oo') # => 1
4922 * s.byterindex('ooo') # => nil
4923 *
4924 * When +object+ is a Regexp,
4925 * returns the index of the last found substring matching +object+;
4926 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4927 *
4928 * s = 'foo'
4929 * s.byterindex(/f/) # => 0
4930 * $~ # => #<MatchData "f">
4931 * s.byterindex(/o/) # => 2
4932 * s.byterindex(/oo/) # => 1
4933 * s.byterindex(/ooo/) # => nil
4934 * $~ # => nil
4935 *
4936 * The last match means starting at the possible last position,
4937 * not the last of the longest matches:
4938 *
4939 * s = 'foo'
4940 * s.byterindex(/o+/) # => 2
4941 * $~ #=> #<MatchData "o">
4942 *
4943 * To get the last longest match, use a negative lookbehind:
4944 *
4945 * s = 'foo'
4946 * s.byterindex(/(?<!o)o+/) # => 1
4947 * $~ # => #<MatchData "oo">
4948 *
4949 * Or use method #byteindex with negative lookahead:
4950 *
4951 * s = 'foo'
4952 * s.byteindex(/o+(?!.*o)/) # => 1
4953 * $~ #=> #<MatchData "oo">
4954 *
4955 * \Integer argument +offset+, if given, specifies the 0-based index
4956 * of the byte where searching is to end.
4957 *
4958 * When +offset+ is non-negative,
4959 * searching ends at byte position +offset+:
4960 *
4961 * s = 'foo'
4962 * s.byterindex('o', 0) # => nil
4963 * s.byterindex('o', 1) # => 1
4964 * s.byterindex('o', 2) # => 2
4965 * s.byterindex('o', 3) # => 2
4966 *
4967 * When +offset+ is negative, counts backward from the end of +self+:
4968 *
4969 * s = 'foo'
4970 * s.byterindex('o', -1) # => 2
4971 * s.byterindex('o', -2) # => 1
4972 * s.byterindex('o', -3) # => nil
4973 *
4974 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4975 *
4976 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4977 * s.size # => 2 # Two 3-byte characters.
4978 * s.bytesize # => 6 # Six bytes.
4979 * s.byterindex("\uFFFF") # => 3
4980 * s.byterindex("\uFFFF", 1) # Raises IndexError
4981 * s.byterindex("\uFFFF", 2) # Raises IndexError
4982 * s.byterindex("\uFFFF", 3) # => 3
4983 * s.byterindex("\uFFFF", 4) # Raises IndexError
4984 * s.byterindex("\uFFFF", 5) # Raises IndexError
4985 * s.byterindex("\uFFFF", 6) # => nil
4986 *
4987 * Related: see {Querying}[rdoc-ref:String@Querying].
4988 */
4989
4990static VALUE
4991rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4992{
4993 VALUE sub;
4994 VALUE initpos;
4995 long pos, len = RSTRING_LEN(str);
4996
4997 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4998 pos = NUM2LONG(initpos);
4999 if (pos < 0 && (pos += len) < 0) {
5000 if (RB_TYPE_P(sub, T_REGEXP)) {
5002 }
5003 return Qnil;
5004 }
5005 if (pos > len) pos = len;
5006 }
5007 else {
5008 pos = len;
5009 }
5010
5011 str_ensure_byte_pos(str, pos);
5012
5013 if (RB_TYPE_P(sub, T_REGEXP)) {
5014 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5015 VALUE match = rb_backref_get();
5016 struct re_registers *regs = RMATCH_REGS(match);
5017 pos = BEG(0);
5018 return LONG2NUM(pos);
5019 }
5020 }
5021 else {
5022 StringValue(sub);
5023 pos = rb_str_byterindex(str, sub, pos);
5024 if (pos >= 0) return LONG2NUM(pos);
5025 }
5026 return Qnil;
5027}
5028
5029/*
5030 * call-seq:
5031 * self =~ object -> integer or nil
5032 *
5033 * When +object+ is a Regexp, returns the index of the first substring in +self+
5034 * matched by +object+,
5035 * or +nil+ if no match is found;
5036 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5037 *
5038 * 'foo' =~ /f/ # => 0
5039 * $~ # => #<MatchData "f">
5040 * 'foo' =~ /o/ # => 1
5041 * $~ # => #<MatchData "o">
5042 * 'foo' =~ /x/ # => nil
5043 * $~ # => nil
5044 *
5045 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5046 * (see Regexp#=~):
5047 *
5048 * number = nil
5049 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5050 * number # => nil # Not assigned.
5051 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5052 * number # => "9" # Assigned.
5053 *
5054 * If +object+ is not a Regexp, returns the value
5055 * returned by <tt>object =~ self</tt>.
5056 *
5057 * Related: see {Querying}[rdoc-ref:String@Querying].
5058 */
5059
5060static VALUE
5061rb_str_match(VALUE x, VALUE y)
5062{
5063 switch (OBJ_BUILTIN_TYPE(y)) {
5064 case T_STRING:
5065 rb_raise(rb_eTypeError, "type mismatch: String given");
5066
5067 case T_REGEXP:
5068 return rb_reg_match(y, x);
5069
5070 default:
5071 return rb_funcall(y, idEqTilde, 1, x);
5072 }
5073}
5074
5075
5076static VALUE get_pat(VALUE);
5077
5078
5079/*
5080 * call-seq:
5081 * match(pattern, offset = 0) -> matchdata or nil
5082 * match(pattern, offset = 0) {|matchdata| ... } -> object
5083 *
5084 * Creates a MatchData object based on +self+ and the given arguments;
5085 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5086 *
5087 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5088 *
5089 * regexp = Regexp.new(pattern)
5090 *
5091 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5092 * (see Regexp#match):
5093 *
5094 * matchdata = regexp.match(self[offset..])
5095 *
5096 * With no block given, returns the computed +matchdata+ or +nil+:
5097 *
5098 * 'foo'.match('f') # => #<MatchData "f">
5099 * 'foo'.match('o') # => #<MatchData "o">
5100 * 'foo'.match('x') # => nil
5101 * 'foo'.match('f', 1) # => nil
5102 * 'foo'.match('o', 1) # => #<MatchData "o">
5103 *
5104 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5105 * returns the block's return value:
5106 *
5107 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5108 *
5109 * With a block given and +nil+ +matchdata+, does not call the block:
5110 *
5111 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5112 *
5113 * Related: see {Querying}[rdoc-ref:String@Querying].
5114 */
5115
5116static VALUE
5117rb_str_match_m(int argc, VALUE *argv, VALUE str)
5118{
5119 VALUE re, result;
5120 if (argc < 1)
5121 rb_check_arity(argc, 1, 2);
5122 re = argv[0];
5123 argv[0] = str;
5124 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5125 if (!NIL_P(result) && rb_block_given_p()) {
5126 return rb_yield(result);
5127 }
5128 return result;
5129}
5130
5131/*
5132 * call-seq:
5133 * match?(pattern, offset = 0) -> true or false
5134 *
5135 * Returns whether a match is found for +self+ and the given arguments;
5136 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5137 *
5138 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5139 *
5140 * regexp = Regexp.new(pattern)
5141 *
5142 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5143 * +false+ otherwise:
5144 *
5145 * 'foo'.match?(/o/) # => true
5146 * 'foo'.match?('o') # => true
5147 * 'foo'.match?(/x/) # => false
5148 * 'foo'.match?('f', 1) # => false
5149 * 'foo'.match?('o', 1) # => true
5150 *
5151 * Related: see {Querying}[rdoc-ref:String@Querying].
5152 */
5153
5154static VALUE
5155rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5156{
5157 VALUE re;
5158 rb_check_arity(argc, 1, 2);
5159 re = get_pat(argv[0]);
5160 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5161}
5162
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no valid neighboring character */
    NEIGHBOR_FOUND = 1,     /* the buffer now holds the neighbor */
    NEIGHBOR_WRAPPED = 2    /* stepping ran past the end of the range */
};
5168
5169static enum neighbor_char
5170enc_succ_char(char *p, long len, rb_encoding *enc)
5171{
5172 long i;
5173 int l;
5174
5175 if (rb_enc_mbminlen(enc) > 1) {
5176 /* wchar, trivial case */
5177 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5178 if (!MBCLEN_CHARFOUND_P(r)) {
5179 return NEIGHBOR_NOT_CHAR;
5180 }
5181 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5182 l = rb_enc_code_to_mbclen(c, enc);
5183 if (!l) return NEIGHBOR_NOT_CHAR;
5184 if (l != len) return NEIGHBOR_WRAPPED;
5185 rb_enc_mbcput(c, p, enc);
5186 r = rb_enc_precise_mbclen(p, p + len, enc);
5187 if (!MBCLEN_CHARFOUND_P(r)) {
5188 return NEIGHBOR_NOT_CHAR;
5189 }
5190 return NEIGHBOR_FOUND;
5191 }
5192 while (1) {
5193 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5194 p[i] = '\0';
5195 if (i < 0)
5196 return NEIGHBOR_WRAPPED;
5197 ++((unsigned char*)p)[i];
5198 l = rb_enc_precise_mbclen(p, p+len, enc);
5199 if (MBCLEN_CHARFOUND_P(l)) {
5200 l = MBCLEN_CHARFOUND_LEN(l);
5201 if (l == len) {
5202 return NEIGHBOR_FOUND;
5203 }
5204 else {
5205 memset(p+l, 0xff, len-l);
5206 }
5207 }
5208 if (MBCLEN_INVALID_P(l) && i < len-1) {
5209 long len2;
5210 int l2;
5211 for (len2 = len-1; 0 < len2; len2--) {
5212 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5213 if (!MBCLEN_INVALID_P(l2))
5214 break;
5215 }
5216 memset(p+len2+1, 0xff, len-(len2+1));
5217 }
5218 }
5219}
5220
5221static enum neighbor_char
5222enc_pred_char(char *p, long len, rb_encoding *enc)
5223{
5224 long i;
5225 int l;
5226 if (rb_enc_mbminlen(enc) > 1) {
5227 /* wchar, trivial case */
5228 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5229 if (!MBCLEN_CHARFOUND_P(r)) {
5230 return NEIGHBOR_NOT_CHAR;
5231 }
5232 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5233 if (!c) return NEIGHBOR_NOT_CHAR;
5234 --c;
5235 l = rb_enc_code_to_mbclen(c, enc);
5236 if (!l) return NEIGHBOR_NOT_CHAR;
5237 if (l != len) return NEIGHBOR_WRAPPED;
5238 rb_enc_mbcput(c, p, enc);
5239 r = rb_enc_precise_mbclen(p, p + len, enc);
5240 if (!MBCLEN_CHARFOUND_P(r)) {
5241 return NEIGHBOR_NOT_CHAR;
5242 }
5243 return NEIGHBOR_FOUND;
5244 }
5245 while (1) {
5246 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5247 p[i] = '\xff';
5248 if (i < 0)
5249 return NEIGHBOR_WRAPPED;
5250 --((unsigned char*)p)[i];
5251 l = rb_enc_precise_mbclen(p, p+len, enc);
5252 if (MBCLEN_CHARFOUND_P(l)) {
5253 l = MBCLEN_CHARFOUND_LEN(l);
5254 if (l == len) {
5255 return NEIGHBOR_FOUND;
5256 }
5257 else {
5258 memset(p+l, 0, len-l);
5259 }
5260 }
5261 if (MBCLEN_INVALID_P(l) && i < len-1) {
5262 long len2;
5263 int l2;
5264 for (len2 = len-1; 0 < len2; len2--) {
5265 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5266 if (!MBCLEN_INVALID_P(l2))
5267 break;
5268 }
5269 memset(p+len2+1, 0, len-(len2+1));
5270 }
5271 }
5272}
5273
5274/*
5275 overwrite +p+ by succeeding letter in +enc+ and returns
5276 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5277 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5278 assuming each ranges are successive, and mbclen
5279 never change in each ranges.
5280 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5281 character.
5282 */
5283static enum neighbor_char
5284enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5285{
5286 enum neighbor_char ret;
5287 unsigned int c;
5288 int ctype;
5289 int range;
5290 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5291
5292 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5293 int try;
5294 const int max_gaps = 1;
5295
5296 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5297 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5298 ctype = ONIGENC_CTYPE_DIGIT;
5299 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5300 ctype = ONIGENC_CTYPE_ALPHA;
5301 else
5302 return NEIGHBOR_NOT_CHAR;
5303
5304 MEMCPY(save, p, char, len);
5305 for (try = 0; try <= max_gaps; ++try) {
5306 ret = enc_succ_char(p, len, enc);
5307 if (ret == NEIGHBOR_FOUND) {
5308 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5309 if (rb_enc_isctype(c, ctype, enc))
5310 return NEIGHBOR_FOUND;
5311 }
5312 }
5313 MEMCPY(p, save, char, len);
5314 range = 1;
5315 while (1) {
5316 MEMCPY(save, p, char, len);
5317 ret = enc_pred_char(p, len, enc);
5318 if (ret == NEIGHBOR_FOUND) {
5319 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5320 if (!rb_enc_isctype(c, ctype, enc)) {
5321 MEMCPY(p, save, char, len);
5322 break;
5323 }
5324 }
5325 else {
5326 MEMCPY(p, save, char, len);
5327 break;
5328 }
5329 range++;
5330 }
5331 if (range == 1) {
5332 return NEIGHBOR_NOT_CHAR;
5333 }
5334
5335 if (ctype != ONIGENC_CTYPE_DIGIT) {
5336 MEMCPY(carry, p, char, len);
5337 return NEIGHBOR_WRAPPED;
5338 }
5339
5340 MEMCPY(carry, p, char, len);
5341 enc_succ_char(carry, len, enc);
5342 return NEIGHBOR_WRAPPED;
5343}
5344
5345
5346static VALUE str_succ(VALUE str);
5347
5348/*
5349 * call-seq:
5350 * succ -> new_str
5351 *
5352 * Returns the successor to +self+. The successor is calculated by
5353 * incrementing characters.
5354 *
5355 * The first character to be incremented is the rightmost alphanumeric,
5356 * or, if there are no alphanumerics, the rightmost character:
5357 *
5358 * 'THX1138'.succ # => "THX1139"
5359 * '<<koala>>'.succ # => "<<koalb>>"
5360 * '***'.succ # => '**+'
5361 *
5362 * The successor to a digit is another digit, "carrying" to the next-left
5363 * character for a "rollover" from 9 to 0, and prepending another digit
5364 * if necessary:
5365 *
5366 * '00'.succ # => "01"
5367 * '09'.succ # => "10"
5368 * '99'.succ # => "100"
5369 *
5370 * The successor to a letter is another letter of the same case,
5371 * carrying to the next-left character for a rollover,
5372 * and prepending another same-case letter if necessary:
5373 *
5374 * 'aa'.succ # => "ab"
5375 * 'az'.succ # => "ba"
5376 * 'zz'.succ # => "aaa"
5377 * 'AA'.succ # => "AB"
5378 * 'AZ'.succ # => "BA"
5379 * 'ZZ'.succ # => "AAA"
5380 *
5381 * The successor to a non-alphanumeric character is the next character
5382 * in the underlying character set's collating sequence,
5383 * carrying to the next-left character for a rollover,
5384 * and prepending another character if necessary:
5385 *
5386 * s = 0.chr * 3
5387 * s # => "\x00\x00\x00"
5388 * s.succ # => "\x00\x00\x01"
5389 * s = 255.chr * 3
5390 * s # => "\xFF\xFF\xFF"
5391 * s.succ # => "\x01\x00\x00\x00"
5392 *
5393 * Carrying can occur between and among mixtures of alphanumeric characters:
5394 *
5395 * s = 'zz99zz99'
5396 * s.succ # => "aaa00aa00"
5397 * s = '99zz99zz'
5398 * s.succ # => "100aa00aa"
5399 *
5400 * The successor to an empty +String+ is a new empty +String+:
5401 *
5402 * ''.succ # => ""
5403 *
5404 */
5405
5406VALUE
5408{
5409 VALUE str;
5410 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5411 rb_enc_cr_str_copy_for_substr(str, orig);
5412 return str_succ(str);
5413}
5414
5415static VALUE
5416str_succ(VALUE str)
5417{
5418 rb_encoding *enc;
5419 char *sbeg, *s, *e, *last_alnum = 0;
5420 int found_alnum = 0;
5421 long l, slen;
5422 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5423 long carry_pos = 0, carry_len = 1;
5424 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5425
5426 slen = RSTRING_LEN(str);
5427 if (slen == 0) return str;
5428
5429 enc = STR_ENC_GET(str);
5430 sbeg = RSTRING_PTR(str);
5431 s = e = sbeg + slen;
5432
5433 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5434 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5435 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5436 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5437 break;
5438 }
5439 }
5440 l = rb_enc_precise_mbclen(s, e, enc);
5441 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5442 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5443 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5444 switch (neighbor) {
5445 case NEIGHBOR_NOT_CHAR:
5446 continue;
5447 case NEIGHBOR_FOUND:
5448 return str;
5449 case NEIGHBOR_WRAPPED:
5450 last_alnum = s;
5451 break;
5452 }
5453 found_alnum = 1;
5454 carry_pos = s - sbeg;
5455 carry_len = l;
5456 }
5457 if (!found_alnum) { /* str contains no alnum */
5458 s = e;
5459 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5460 enum neighbor_char neighbor;
5461 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5462 l = rb_enc_precise_mbclen(s, e, enc);
5463 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5464 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5465 MEMCPY(tmp, s, char, l);
5466 neighbor = enc_succ_char(tmp, l, enc);
5467 switch (neighbor) {
5468 case NEIGHBOR_FOUND:
5469 MEMCPY(s, tmp, char, l);
5470 return str;
5471 break;
5472 case NEIGHBOR_WRAPPED:
5473 MEMCPY(s, tmp, char, l);
5474 break;
5475 case NEIGHBOR_NOT_CHAR:
5476 break;
5477 }
5478 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5479 /* wrapped to \0...\0. search next valid char. */
5480 enc_succ_char(s, l, enc);
5481 }
5482 if (!rb_enc_asciicompat(enc)) {
5483 MEMCPY(carry, s, char, l);
5484 carry_len = l;
5485 }
5486 carry_pos = s - sbeg;
5487 }
5489 }
5490 RESIZE_CAPA(str, slen + carry_len);
5491 sbeg = RSTRING_PTR(str);
5492 s = sbeg + carry_pos;
5493 memmove(s + carry_len, s, slen - carry_pos);
5494 memmove(s, carry, carry_len);
5495 slen += carry_len;
5496 STR_SET_LEN(str, slen);
5497 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5499 return str;
5500}
5501
5502
5503/*
5504 * call-seq:
5505 * succ! -> self
5506 *
5507 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5508 */
5509
5510static VALUE
5511rb_str_succ_bang(VALUE str)
5512{
5513 rb_str_modify(str);
5514 str_succ(str);
5515 return str;
5516}
5517
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (trivially 1 when +len+ is not positive), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5527
5528static int
5529str_upto_i(VALUE str, VALUE arg)
5530{
5531 rb_yield(str);
5532 return 0;
5533}
5534
5535/*
5536 * call-seq:
5537 * upto(other_string, exclusive = false) {|string| ... } -> self
5538 * upto(other_string, exclusive = false) -> new_enumerator
5539 *
5540 * With a block given, calls the block with each +String+ value
5541 * returned by successive calls to String#succ;
5542 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5543 * the sequence terminates when value +other_string+ is reached;
5544 * returns +self+:
5545 *
5546 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5547 * Output:
5548 *
5549 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5550 *
5551 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5552 *
5553 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5554 *
5555 * Output:
5556 *
5557 * a8 a9 b0 b1 b2 b3 b4 b5
5558 *
5559 * If +other_string+ would not be reached, does not call the block:
5560 *
5561 * '25'.upto('5') {|s| fail s }
5562 * 'aa'.upto('a') {|s| fail s }
5563 *
5564 * With no block given, returns a new Enumerator:
5565 *
5566 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5567 *
5568 */
5569
5570static VALUE
5571rb_str_upto(int argc, VALUE *argv, VALUE beg)
5572{
5573 VALUE end, exclusive;
5574
5575 rb_scan_args(argc, argv, "11", &end, &exclusive);
5576 RETURN_ENUMERATOR(beg, argc, argv);
5577 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5578}
5579
5580VALUE
5581rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5582{
5583 VALUE current, after_end;
5584 ID succ;
5585 int n, ascii;
5586 rb_encoding *enc;
5587
5588 CONST_ID(succ, "succ");
5589 StringValue(end);
5590 enc = rb_enc_check(beg, end);
5591 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5592 /* single character */
5593 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5594 char c = RSTRING_PTR(beg)[0];
5595 char e = RSTRING_PTR(end)[0];
5596
5597 if (c > e || (excl && c == e)) return beg;
5598 for (;;) {
5599 VALUE str = rb_enc_str_new(&c, 1, enc);
5601 if ((*each)(str, arg)) break;
5602 if (!excl && c == e) break;
5603 c++;
5604 if (excl && c == e) break;
5605 }
5606 return beg;
5607 }
5608 /* both edges are all digits */
5609 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5610 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5611 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5612 VALUE b, e;
5613 int width;
5614
5615 width = RSTRING_LENINT(beg);
5616 b = rb_str_to_inum(beg, 10, FALSE);
5617 e = rb_str_to_inum(end, 10, FALSE);
5618 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5619 long bi = FIX2LONG(b);
5620 long ei = FIX2LONG(e);
5621 rb_encoding *usascii = rb_usascii_encoding();
5622
5623 while (bi <= ei) {
5624 if (excl && bi == ei) break;
5625 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5626 bi++;
5627 }
5628 }
5629 else {
5630 ID op = excl ? '<' : idLE;
5631 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5632
5633 args[0] = INT2FIX(width);
5634 while (rb_funcall(b, op, 1, e)) {
5635 args[1] = b;
5636 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5637 b = rb_funcallv(b, succ, 0, 0);
5638 }
5639 }
5640 return beg;
5641 }
5642 /* normal case */
5643 n = rb_str_cmp(beg, end);
5644 if (n > 0 || (excl && n == 0)) return beg;
5645
5646 after_end = rb_funcallv(end, succ, 0, 0);
5647 current = str_duplicate(rb_cString, beg);
5648 while (!rb_str_equal(current, after_end)) {
5649 VALUE next = Qnil;
5650 if (excl || !rb_str_equal(current, end))
5651 next = rb_funcallv(current, succ, 0, 0);
5652 if ((*each)(current, arg)) break;
5653 if (NIL_P(next)) break;
5654 current = next;
5655 StringValue(current);
5656 if (excl && rb_str_equal(current, end)) break;
5657 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5658 break;
5659 }
5660
5661 return beg;
5662}
5663
5664VALUE
5665rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5666{
5667 VALUE current;
5668 ID succ;
5669
5670 CONST_ID(succ, "succ");
5671 /* both edges are all digits */
5672 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5673 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5674 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5675 int width = RSTRING_LENINT(beg);
5676 b = rb_str_to_inum(beg, 10, FALSE);
5677 if (FIXNUM_P(b)) {
5678 long bi = FIX2LONG(b);
5679 rb_encoding *usascii = rb_usascii_encoding();
5680
5681 while (FIXABLE(bi)) {
5682 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5683 bi++;
5684 }
5685 b = LONG2NUM(bi);
5686 }
5687 args[0] = INT2FIX(width);
5688 while (1) {
5689 args[1] = b;
5690 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5691 b = rb_funcallv(b, succ, 0, 0);
5692 }
5693 }
5694 /* normal case */
5695 current = str_duplicate(rb_cString, beg);
5696 while (1) {
5697 VALUE next = rb_funcallv(current, succ, 0, 0);
5698 if ((*each)(current, arg)) break;
5699 current = next;
5700 StringValue(current);
5701 if (RSTRING_LEN(current) == 0)
5702 break;
5703 }
5704
5705 return beg;
5706}
5707
5708static int
5709include_range_i(VALUE str, VALUE arg)
5710{
5711 VALUE *argp = (VALUE *)arg;
5712 if (!rb_equal(str, *argp)) return 0;
5713 *argp = Qnil;
5714 return 1;
5715}
5716
5717VALUE
5718rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5719{
5720 beg = rb_str_new_frozen(beg);
5721 StringValue(end);
5722 end = rb_str_new_frozen(end);
5723 if (NIL_P(val)) return Qfalse;
5724 val = rb_check_string_type(val);
5725 if (NIL_P(val)) return Qfalse;
5726 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5727 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5728 rb_enc_asciicompat(STR_ENC_GET(val))) {
5729 const char *bp = RSTRING_PTR(beg);
5730 const char *ep = RSTRING_PTR(end);
5731 const char *vp = RSTRING_PTR(val);
5732 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5733 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5734 return Qfalse;
5735 else {
5736 char b = *bp;
5737 char e = *ep;
5738 char v = *vp;
5739
5740 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5741 if (b <= v && v < e) return Qtrue;
5742 return RBOOL(!RTEST(exclusive) && v == e);
5743 }
5744 }
5745 }
5746#if 0
5747 /* both edges are all digits */
5748 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5749 all_digits_p(bp, RSTRING_LEN(beg)) &&
5750 all_digits_p(ep, RSTRING_LEN(end))) {
5751 /* TODO */
5752 }
5753#endif
5754 }
5755 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5756
5757 return RBOOL(NIL_P(val));
5758}
5759
5760static VALUE
5761rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5762{
5763 if (rb_reg_search(re, str, 0, 0) >= 0) {
5764 VALUE match = rb_backref_get();
5765 int nth = rb_reg_backref_number(match, backref);
5766 return rb_reg_nth_match(nth, match);
5767 }
5768 return Qnil;
5769}
5770
5771static VALUE
5772rb_str_aref(VALUE str, VALUE indx)
5773{
5774 long idx;
5775
5776 if (FIXNUM_P(indx)) {
5777 idx = FIX2LONG(indx);
5778 }
5779 else if (RB_TYPE_P(indx, T_REGEXP)) {
5780 return rb_str_subpat(str, indx, INT2FIX(0));
5781 }
5782 else if (RB_TYPE_P(indx, T_STRING)) {
5783 if (rb_str_index(str, indx, 0) != -1)
5784 return str_duplicate(rb_cString, indx);
5785 return Qnil;
5786 }
5787 else {
5788 /* check if indx is Range */
5789 long beg, len = str_strlen(str, NULL);
5790 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5791 case Qfalse:
5792 break;
5793 case Qnil:
5794 return Qnil;
5795 default:
5796 return rb_str_substr(str, beg, len);
5797 }
5798 idx = NUM2LONG(indx);
5799 }
5800
5801 return str_substr(str, idx, 1, FALSE);
5802}
5803
5804
5805/*
5806 * call-seq:
5807 * self[index] -> new_string or nil
5808 * self[start, length] -> new_string or nil
5809 * self[range] -> new_string or nil
5810 * self[regexp, capture = 0] -> new_string or nil
5811 * self[substring] -> new_string or nil
5812 *
5813 * Returns the substring of +self+ specified by the arguments.
5814 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5815 *
5816 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5817 */
5818
5819static VALUE
5820rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5821{
5822 if (argc == 2) {
5823 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5824 return rb_str_subpat(str, argv[0], argv[1]);
5825 }
5826 else {
5827 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5828 }
5829 }
5830 rb_check_arity(argc, 1, 2);
5831 return rb_str_aref(str, argv[0]);
5832}
5833
5834VALUE
5836{
5837 char *ptr = RSTRING_PTR(str);
5838 long olen = RSTRING_LEN(str), nlen;
5839
5840 str_modifiable(str);
5841 if (len > olen) len = olen;
5842 nlen = olen - len;
5843 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5844 char *oldptr = ptr;
5845 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5846 STR_SET_EMBED(str);
5847 ptr = RSTRING(str)->as.embed.ary;
5848 memmove(ptr, oldptr + len, nlen);
5849 if (fl == STR_NOEMBED) xfree(oldptr);
5850 }
5851 else {
5852 if (!STR_SHARED_P(str)) {
5853 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5854 rb_enc_cr_str_exact_copy(shared, str);
5855 OBJ_FREEZE(shared);
5856 }
5857 ptr = RSTRING(str)->as.heap.ptr += len;
5858 }
5859 STR_SET_LEN(str, nlen);
5860
5861 if (!SHARABLE_MIDDLE_SUBSTRING) {
5862 TERM_FILL(ptr + nlen, TERM_LEN(str));
5863 }
5865 return str;
5866}
5867
5868static void
5869rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5870{
5871 char *sptr;
5872 long slen;
5873 int cr;
5874
5875 if (beg == 0 && vlen == 0) {
5876 rb_str_drop_bytes(str, len);
5877 return;
5878 }
5879
5880 str_modify_keep_cr(str);
5881 RSTRING_GETMEM(str, sptr, slen);
5882 if (len < vlen) {
5883 /* expand string */
5884 RESIZE_CAPA(str, slen + vlen - len);
5885 sptr = RSTRING_PTR(str);
5886 }
5887
5889 cr = rb_enc_str_coderange(val);
5890 else
5892
5893 if (vlen != len) {
5894 memmove(sptr + beg + vlen,
5895 sptr + beg + len,
5896 slen - (beg + len));
5897 }
5898 if (vlen < beg && len < 0) {
5899 MEMZERO(sptr + slen, char, -len);
5900 }
5901 if (vlen > 0) {
5902 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5903 }
5904 slen += vlen - len;
5905 STR_SET_LEN(str, slen);
5906 TERM_FILL(&sptr[slen], TERM_LEN(str));
5907 ENC_CODERANGE_SET(str, cr);
5908}
5909
5910static inline void
5911rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5912{
5913 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5914}
5915
5916void
5917rb_str_update(VALUE str, long beg, long len, VALUE val)
5918{
5919 long slen;
5920 char *p, *e;
5921 rb_encoding *enc;
5922 int singlebyte = single_byte_optimizable(str);
5923 int cr;
5924
5925 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5926
5927 StringValue(val);
5928 enc = rb_enc_check(str, val);
5929 slen = str_strlen(str, enc); /* rb_enc_check */
5930
5931 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5932 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5933 }
5934 if (beg < 0) {
5935 beg += slen;
5936 }
5937 RUBY_ASSERT(beg >= 0);
5938 RUBY_ASSERT(beg <= slen);
5939
5940 if (len > slen - beg) {
5941 len = slen - beg;
5942 }
5943 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5944 if (!p) p = RSTRING_END(str);
5945 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5946 if (!e) e = RSTRING_END(str);
5947 /* error check */
5948 beg = p - RSTRING_PTR(str); /* physical position */
5949 len = e - p; /* physical length */
5950 rb_str_update_0(str, beg, len, val);
5951 rb_enc_associate(str, enc);
5953 if (cr != ENC_CODERANGE_BROKEN)
5954 ENC_CODERANGE_SET(str, cr);
5955}
5956
5957static void
5958rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5959{
5960 int nth;
5961 VALUE match;
5962 long start, end, len;
5963 rb_encoding *enc;
5964 struct re_registers *regs;
5965
5966 if (rb_reg_search(re, str, 0, 0) < 0) {
5967 rb_raise(rb_eIndexError, "regexp not matched");
5968 }
5969 match = rb_backref_get();
5970 nth = rb_reg_backref_number(match, backref);
5971 regs = RMATCH_REGS(match);
5972 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5973 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5974 }
5975 if (nth < 0) {
5976 nth += regs->num_regs;
5977 }
5978
5979 start = BEG(nth);
5980 if (start == -1) {
5981 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5982 }
5983 end = END(nth);
5984 len = end - start;
5985 StringValue(val);
5986 enc = rb_enc_check_str(str, val);
5987 rb_str_update_0(str, start, len, val);
5988 rb_enc_associate(str, enc);
5989}
5990
5991static VALUE
5992rb_str_aset(VALUE str, VALUE indx, VALUE val)
5993{
5994 long idx, beg;
5995
5996 switch (TYPE(indx)) {
5997 case T_REGEXP:
5998 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5999 return val;
6000
6001 case T_STRING:
6002 beg = rb_str_index(str, indx, 0);
6003 if (beg < 0) {
6004 rb_raise(rb_eIndexError, "string not matched");
6005 }
6006 beg = rb_str_sublen(str, beg);
6007 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6008 return val;
6009
6010 default:
6011 /* check if indx is Range */
6012 {
6013 long beg, len;
6014 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6015 rb_str_update(str, beg, len, val);
6016 return val;
6017 }
6018 }
6019 /* FALLTHROUGH */
6020
6021 case T_FIXNUM:
6022 idx = NUM2LONG(indx);
6023 rb_str_update(str, idx, 1, val);
6024 return val;
6025 }
6026}
6027
6028/*
6029 * call-seq:
6030 * self[index] = new_string
6031 * self[start, length] = new_string
6032 * self[range] = new_string
6033 * self[regexp, capture = 0] = new_string
6034 * self[substring] = new_string
6035 *
6036 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6037 * See {String Slices}[rdoc-ref:String@String+Slices].
6038 *
6039 * A few examples:
6040 *
6041 * s = 'foo'
6042 * s[2] = 'rtune' # => "rtune"
6043 * s # => "fortune"
6044 * s[1, 5] = 'init' # => "init"
6045 * s # => "finite"
6046 * s[3..4] = 'al' # => "al"
6047 * s # => "finale"
6048 * s[/e$/] = 'ly' # => "ly"
6049 * s # => "finally"
6050 * s['lly'] = 'ncial' # => "ncial"
6051 * s # => "financial"
6052 *
6053 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6054 */
6055
6056static VALUE
6057rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6058{
6059 if (argc == 3) {
6060 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6061 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6062 }
6063 else {
6064 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6065 }
6066 return argv[2];
6067 }
6068 rb_check_arity(argc, 2, 3);
6069 return rb_str_aset(str, argv[0], argv[1]);
6070}
6071
6072/*
6073 * call-seq:
6074 * insert(offset, other_string) -> self
6075 *
6076 * :include: doc/string/insert.rdoc
6077 *
6078 */
6079
6080static VALUE
6081rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6082{
6083 long pos = NUM2LONG(idx);
6084
6085 if (pos == -1) {
6086 return rb_str_append(str, str2);
6087 }
6088 else if (pos < 0) {
6089 pos++;
6090 }
6091 rb_str_update(str, pos, 0, str2);
6092 return str;
6093}
6094
6095
6096/*
6097 * call-seq:
6098 * slice!(index) -> new_string or nil
6099 * slice!(start, length) -> new_string or nil
6100 * slice!(range) -> new_string or nil
6101 * slice!(regexp, capture = 0) -> new_string or nil
6102 * slice!(substring) -> new_string or nil
6103 *
6104 * Removes and returns the substring of +self+ specified by the arguments.
6105 * See {String Slices}[rdoc-ref:String@String+Slices].
6106 *
6107 * A few examples:
6108 *
6109 * string = "This is a string"
6110 * string.slice!(2) #=> "i"
6111 * string.slice!(3..6) #=> " is "
6112 * string.slice!(/s.*t/) #=> "sa st"
6113 * string.slice!("r") #=> "r"
6114 * string #=> "Thing"
6115 *
6116 */
6117
6118static VALUE
6119rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6120{
6121 VALUE result = Qnil;
6122 VALUE indx;
6123 long beg, len = 1;
6124 char *p;
6125
6126 rb_check_arity(argc, 1, 2);
6127 str_modify_keep_cr(str);
6128 indx = argv[0];
6129 if (RB_TYPE_P(indx, T_REGEXP)) {
6130 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6131 VALUE match = rb_backref_get();
6132 struct re_registers *regs = RMATCH_REGS(match);
6133 int nth = 0;
6134 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6135 if ((nth += regs->num_regs) <= 0) return Qnil;
6136 }
6137 else if (nth >= regs->num_regs) return Qnil;
6138 beg = BEG(nth);
6139 len = END(nth) - beg;
6140 goto subseq;
6141 }
6142 else if (argc == 2) {
6143 beg = NUM2LONG(indx);
6144 len = NUM2LONG(argv[1]);
6145 goto num_index;
6146 }
6147 else if (FIXNUM_P(indx)) {
6148 beg = FIX2LONG(indx);
6149 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6150 if (!len) return Qnil;
6151 beg = p - RSTRING_PTR(str);
6152 goto subseq;
6153 }
6154 else if (RB_TYPE_P(indx, T_STRING)) {
6155 beg = rb_str_index(str, indx, 0);
6156 if (beg == -1) return Qnil;
6157 len = RSTRING_LEN(indx);
6158 result = str_duplicate(rb_cString, indx);
6159 goto squash;
6160 }
6161 else {
6162 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6163 case Qnil:
6164 return Qnil;
6165 case Qfalse:
6166 beg = NUM2LONG(indx);
6167 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6168 if (!len) return Qnil;
6169 beg = p - RSTRING_PTR(str);
6170 goto subseq;
6171 default:
6172 goto num_index;
6173 }
6174 }
6175
6176 num_index:
6177 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6178 beg = p - RSTRING_PTR(str);
6179
6180 subseq:
6181 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6182 rb_enc_cr_str_copy_for_substr(result, str);
6183
6184 squash:
6185 if (len > 0) {
6186 if (beg == 0) {
6187 rb_str_drop_bytes(str, len);
6188 }
6189 else {
6190 char *sptr = RSTRING_PTR(str);
6191 long slen = RSTRING_LEN(str);
6192 if (beg + len > slen) /* pathological check */
6193 len = slen - beg;
6194 memmove(sptr + beg,
6195 sptr + beg + len,
6196 slen - (beg + len));
6197 slen -= len;
6198 STR_SET_LEN(str, slen);
6199 TERM_FILL(&sptr[slen], TERM_LEN(str));
6200 }
6201 }
6202 return result;
6203}
6204
6205static VALUE
6206get_pat(VALUE pat)
6207{
6208 VALUE val;
6209
6210 switch (OBJ_BUILTIN_TYPE(pat)) {
6211 case T_REGEXP:
6212 return pat;
6213
6214 case T_STRING:
6215 break;
6216
6217 default:
6218 val = rb_check_string_type(pat);
6219 if (NIL_P(val)) {
6220 Check_Type(pat, T_REGEXP);
6221 }
6222 pat = val;
6223 }
6224
6225 return rb_reg_regcomp(pat);
6226}
6227
6228static VALUE
6229get_pat_quoted(VALUE pat, int check)
6230{
6231 VALUE val;
6232
6233 switch (OBJ_BUILTIN_TYPE(pat)) {
6234 case T_REGEXP:
6235 return pat;
6236
6237 case T_STRING:
6238 break;
6239
6240 default:
6241 val = rb_check_string_type(pat);
6242 if (NIL_P(val)) {
6243 Check_Type(pat, T_REGEXP);
6244 }
6245 pat = val;
6246 }
6247 if (check && is_broken_string(pat)) {
6248 rb_exc_raise(rb_reg_check_preprocess(pat));
6249 }
6250 return pat;
6251}
6252
6253static long
6254rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6255{
6256 if (BUILTIN_TYPE(pat) == T_STRING) {
6257 pos = rb_str_byteindex(str, pat, pos);
6258 if (set_backref_str) {
6259 if (pos >= 0) {
6260 str = rb_str_new_frozen_String(str);
6261 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6262 if (match) {
6263 *match = match_data;
6264 }
6265 }
6266 else {
6268 }
6269 }
6270 return pos;
6271 }
6272 else {
6273 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6274 }
6275}
6276
6277static long
6278rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6279{
6280 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6281}
6282
6283
6284/*
6285 * call-seq:
6286 * sub!(pattern, replacement) -> self or nil
6287 * sub!(pattern) {|match| ... } -> self or nil
6288 *
6289 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6290 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6291 *
6292 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6293 *
6294 * Related: String#sub, String#gsub, String#gsub!.
6295 *
6296 */
6297
6298static VALUE
6299rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6300{
6301 VALUE pat, repl, hash = Qnil;
6302 int iter = 0;
6303 long plen;
6304 int min_arity = rb_block_given_p() ? 1 : 2;
6305 long beg;
6306
6307 rb_check_arity(argc, min_arity, 2);
6308 if (argc == 1) {
6309 iter = 1;
6310 }
6311 else {
6312 repl = argv[1];
6313 hash = rb_check_hash_type(argv[1]);
6314 if (NIL_P(hash)) {
6315 StringValue(repl);
6316 }
6317 }
6318
6319 pat = get_pat_quoted(argv[0], 1);
6320
6321 str_modifiable(str);
6322 beg = rb_pat_search(pat, str, 0, 1);
6323 if (beg >= 0) {
6324 rb_encoding *enc;
6325 int cr = ENC_CODERANGE(str);
6326 long beg0, end0;
6327 VALUE match, match0 = Qnil;
6328 struct re_registers *regs;
6329 char *p, *rp;
6330 long len, rlen;
6331
6332 match = rb_backref_get();
6333 regs = RMATCH_REGS(match);
6334 if (RB_TYPE_P(pat, T_STRING)) {
6335 beg0 = beg;
6336 end0 = beg0 + RSTRING_LEN(pat);
6337 match0 = pat;
6338 }
6339 else {
6340 beg0 = BEG(0);
6341 end0 = END(0);
6342 if (iter) match0 = rb_reg_nth_match(0, match);
6343 }
6344
6345 if (iter || !NIL_P(hash)) {
6346 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6347
6348 if (iter) {
6349 repl = rb_obj_as_string(rb_yield(match0));
6350 }
6351 else {
6352 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6353 repl = rb_obj_as_string(repl);
6354 }
6355 str_mod_check(str, p, len);
6356 rb_check_frozen(str);
6357 }
6358 else {
6359 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6360 }
6361
6362 enc = rb_enc_compatible(str, repl);
6363 if (!enc) {
6364 rb_encoding *str_enc = STR_ENC_GET(str);
6365 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6366 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6367 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6368 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6369 rb_enc_inspect_name(str_enc),
6370 rb_enc_inspect_name(STR_ENC_GET(repl)));
6371 }
6372 enc = STR_ENC_GET(repl);
6373 }
6374 rb_str_modify(str);
6375 rb_enc_associate(str, enc);
6377 int cr2 = ENC_CODERANGE(repl);
6378 if (cr2 == ENC_CODERANGE_BROKEN ||
6379 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6381 else
6382 cr = cr2;
6383 }
6384 plen = end0 - beg0;
6385 rlen = RSTRING_LEN(repl);
6386 len = RSTRING_LEN(str);
6387 if (rlen > plen) {
6388 RESIZE_CAPA(str, len + rlen - plen);
6389 }
6390 p = RSTRING_PTR(str);
6391 if (rlen != plen) {
6392 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6393 }
6394 rp = RSTRING_PTR(repl);
6395 memmove(p + beg0, rp, rlen);
6396 len += rlen - plen;
6397 STR_SET_LEN(str, len);
6398 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6399 ENC_CODERANGE_SET(str, cr);
6400
6401 RB_GC_GUARD(match);
6402
6403 return str;
6404 }
6405 return Qnil;
6406}
6407
6408
6409/*
6410 * call-seq:
6411 * sub(pattern, replacement) -> new_string
6412 * sub(pattern) {|match| ... } -> new_string
6413 *
6414 * Returns a copy of +self+ with only the first occurrence
6415 * (not all occurrences) of the given +pattern+ replaced.
6416 *
6417 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6418 *
6419 * Related: String#sub!, String#gsub, String#gsub!.
6420 *
6421 */
6422
6423static VALUE
6424rb_str_sub(int argc, VALUE *argv, VALUE str)
6425{
6426 str = str_duplicate(rb_cString, str);
6427 rb_str_sub_bang(argc, argv, str);
6428 return str;
6429}
6430
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * argc == 1: block form (an enumerator is returned via RETURN_ENUMERATOR
 *            when applicable); each match is yielded to the block.
 * argc == 2: argv[1] is either a Hash (MAP, or FAST_MAP when the hash
 *            default is unredefined and it has no default proc) or is
 *            converted to a String replacement (STR).
 *
 * The result is assembled in a fresh buffer +dest+.  When nothing matches,
 * nil is returned for the bang form and a duplicate of +str+ otherwise.
 * When something matched, the bang form replaces the contents of +str+
 * with +dest+ and returns +str+; the non-bang form returns +dest+.
 */
6431static VALUE
6432str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6433{
6434 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6435 long beg, beg0, end0;
6436 long offset, blen, slen, len, last;
6437 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6438 char *sp, *cp;
 /* -1: not decided yet; set to 0/1 after the first rb_reg_regsub call below */
6439 int need_backref_str = -1;
6440 rb_encoding *str_enc;
6441
6442 switch (argc) {
6443 case 1:
6444 RETURN_ENUMERATOR(str, argc, argv);
6445 mode = ITER;
6446 break;
6447 case 2:
6448 repl = argv[1];
6449 hash = rb_check_hash_type(argv[1]);
6450 if (NIL_P(hash)) {
6451 StringValue(repl);
6452 }
6453 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6454 mode = FAST_MAP;
6455 }
6456 else {
6457 mode = MAP;
6458 }
6459 break;
6460 default:
6461 rb_error_arity(argc, 1, 2);
6462 }
6463
6464 pat = get_pat_quoted(argv[0], 1);
6465 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6466
6467 if (beg < 0) {
6468 if (bang) return Qnil; /* no match, no substitution */
6469 return str_duplicate(rb_cString, str);
6470 }
6471
6472 offset = 0;
6473 blen = RSTRING_LEN(str) + 30; /* len + margin */
6474 dest = rb_str_buf_new(blen);
 /* sp/slen snapshot the source buffer for the str_mod_check call in the loop */
6475 sp = RSTRING_PTR(str);
6476 slen = RSTRING_LEN(str);
6477 cp = sp;
6478 str_enc = STR_ENC_GET(str);
6479 rb_enc_associate(dest, str_enc);
6480 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6481
 /* one iteration per match: append the text before it, then its replacement */
6482 do {
6483 struct re_registers *regs = RMATCH_REGS(match);
6484 if (RB_TYPE_P(pat, T_STRING)) {
6485 beg0 = beg;
6486 end0 = beg0 + RSTRING_LEN(pat);
6487 match0 = pat;
6488 }
6489 else {
6490 beg0 = BEG(0);
6491 end0 = END(0);
6492 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6493 }
6494
6495 if (mode != STR) {
6496 if (mode == ITER) {
6497 val = rb_obj_as_string(rb_yield(match0));
6498 }
6499 else {
6500 struct RString fake_str = {RBASIC_INIT};
6501 VALUE key;
6502 if (mode == FAST_MAP) {
6503 // It is safe to use a fake_str here because we established that it won't escape,
6504 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6505 // default proc.
6506 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6507 }
6508 else {
6509 key = rb_str_subseq(str, beg0, end0 - beg0);
6510 }
6511 val = rb_hash_aref(hash, key);
6512 val = rb_obj_as_string(val);
6513 }
 /* block / hash lookup ran arbitrary code: check str against the snapshot */
6514 str_mod_check(str, sp, slen);
6515 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6516 rb_raise(rb_eRuntimeError, "block should not cheat");
6517 }
6518 }
6519 else if (need_backref_str) {
6520 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6521 if (need_backref_str < 0) {
 /* rb_reg_regsub handed repl back unchanged: later matches take the val = repl branch */
6522 need_backref_str = val != repl;
6523 }
6524 }
6525 else {
6526 val = repl;
6527 }
6528
6529 len = beg0 - offset; /* copy pre-match substr */
6530 if (len) {
6531 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6532 }
6533
6534 rb_str_buf_append(dest, val);
6535
 /* last: offset before this match; used for the final search after the loop */
6536 last = offset;
6537 offset = end0;
6538 if (beg0 == end0) {
6539 /*
6540 * Always consume at least one character of the input string
6541 * in order to prevent infinite loops.
6542 */
6543 if (RSTRING_LEN(str) <= end0) break;
6544 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6545 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6546 offset = end0 + len;
6547 }
6548 cp = RSTRING_PTR(str) + offset;
6549 if (offset > RSTRING_LEN(str)) break;
6550
6551 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6552 if (mode != FAST_MAP && mode != STR) {
6553 match = Qnil;
6554 }
6555 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6556
6557 RB_GC_GUARD(match);
6558 } while (beg >= 0);
6559
 /* append whatever follows the last match */
6560 if (RSTRING_LEN(str) > offset) {
6561 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6562 }
 /* search once more from +last+ with backref setting forced on */
6563 rb_pat_search0(pat, str, last, 1, &match);
6564 if (bang) {
6565 str_shared_replace(str, dest);
6566 }
6567 else {
6568 str = dest;
6569 }
6570
6571 return str;
6572}
6573
6574
6575/*
6576 * call-seq:
6577 * gsub!(pattern, replacement) -> self or nil
6578 * gsub!(pattern) {|match| ... } -> self or nil
6579 * gsub!(pattern) -> an_enumerator
6580 *
6581 * Like String#gsub, except that:
6582 *
6583 * - Performs substitutions in +self+ (not in a copy of +self+).
6584 * - Returns +self+ if any substitutions were performed, +nil+ otherwise.
6585 *
6586 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6587 */
6588
6589static VALUE
6590rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6591{
6592 str_modify_keep_cr(str);
6593 return str_gsub(argc, argv, str, 1);
6594}
6595
6596
6597/*
6598 * call-seq:
6599 * gsub(pattern, replacement) -> new_string
6600 * gsub(pattern) {|match| ... } -> new_string
6601 * gsub(pattern) -> enumerator
6602 *
6603 * Returns a copy of +self+ with zero or more substrings replaced.
6604 *
6605 * Argument +pattern+ may be a string or a Regexp;
6606 * argument +replacement+ may be a string or a Hash.
6607 * Varying types for the argument values makes this method very versatile.
6608 *
6609 * Below are some simple examples;
6610 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6611 *
6612 * With arguments +pattern+ and string +replacement+ given,
6613 * replaces each matching substring with the given +replacement+ string:
6614 *
6615 * s = 'abracadabra'
6616 * s.gsub('ab', 'AB') # => "ABracadABra"
6617 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6618 *
6619 * With arguments +pattern+ and hash +replacement+ given,
6620 * replaces each matching substring with a value from the given +replacement+ hash,
6621 * or removes it:
6622 *
6623 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6624 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6625 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6626 *
6627 * With argument +pattern+ and a block given,
6628 * calls the block with each matching substring;
6629 * replaces that substring with the block's return value:
6630 *
6631 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6632 * # => "ABrACADABrA"
6633 *
6634 * With argument +pattern+ and no block given,
6635 * returns a new Enumerator.
6636 *
6637 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6638 */
6639
6640static VALUE
6641rb_str_gsub(int argc, VALUE *argv, VALUE str)
6642{
6643 return str_gsub(argc, argv, str, 0);
6644}
6645
6646
6647/*
6648 * call-seq:
6649 * replace(other_string) -> self
6650 *
6651 * Replaces the contents of +self+ with the contents of +other_string+;
6652 * returns +self+:
6653 *
6654 * s = 'foo' # => "foo"
6655 * s.replace('bar') # => "bar"
6656 *
6657 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6658 */
6659
6660VALUE
6662{
6663 str_modifiable(str);
6664 if (str == str2) return str;
6665
6666 StringValue(str2);
6667 str_discard(str);
6668 return str_replace(str, str2);
6669}
6670
6671/*
6672 * call-seq:
6673 * clear -> self
6674 *
6675 * Removes the contents of +self+:
6676 *
6677 * s = 'foo'
6678 * s.clear # => ""
6679 * s # => ""
6680 *
6681 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6682 */
6683
6684static VALUE
6685rb_str_clear(VALUE str)
6686{
6687 str_discard(str);
6688 STR_SET_EMBED(str);
6689 STR_SET_LEN(str, 0);
6690 RSTRING_PTR(str)[0] = 0;
6691 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6693 else
6695 return str;
6696}
6697
6698/*
6699 * call-seq:
6700 * chr -> string
6701 *
6702 * :include: doc/string/chr.rdoc
6703 *
6704 */
6705
6706static VALUE
6707rb_str_chr(VALUE str)
6708{
6709 return rb_str_substr(str, 0, 1);
6710}
6711
6712/*
6713 * call-seq:
6714 * getbyte(index) -> integer or nil
6715 *
6716 * :include: doc/string/getbyte.rdoc
6717 *
6718 */
6719VALUE
6720rb_str_getbyte(VALUE str, VALUE index)
6721{
6722 long pos = NUM2LONG(index);
6723
6724 if (pos < 0)
6725 pos += RSTRING_LEN(str);
6726 if (pos < 0 || RSTRING_LEN(str) <= pos)
6727 return Qnil;
6728
6729 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6730}
6731
6732/*
6733 * call-seq:
6734 * setbyte(index, integer) -> integer
6735 *
6736 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6737 *
6738 * s = 'abcde' # => "abcde"
6739 * s.setbyte(0, 98) # => 98
6740 * s # => "bbcde"
6741 *
6742 * Related: String#getbyte.
6743 */
6744VALUE
6745rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6746{
6747 long pos = NUM2LONG(index);
6748 long len = RSTRING_LEN(str);
6749 char *ptr, *head, *left = 0;
6750 rb_encoding *enc;
6751 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6752
6753 if (pos < -len || len <= pos)
6754 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6755 if (pos < 0)
6756 pos += len;
6757
6758 VALUE v = rb_to_int(value);
6759 VALUE w = rb_int_and(v, INT2FIX(0xff));
6760 char byte = (char)(NUM2INT(w) & 0xFF);
6761
6762 if (!str_independent(str))
6763 str_make_independent(str);
6764 enc = STR_ENC_GET(str);
6765 head = RSTRING_PTR(str);
6766 ptr = &head[pos];
6767 if (!STR_EMBED_P(str)) {
6768 cr = ENC_CODERANGE(str);
6769 switch (cr) {
6770 case ENC_CODERANGE_7BIT:
6771 left = ptr;
6772 *ptr = byte;
6773 if (ISASCII(byte)) goto end;
6774 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6775 if (!MBCLEN_CHARFOUND_P(nlen))
6777 else
6779 goto end;
6781 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6782 width = rb_enc_precise_mbclen(left, head+len, enc);
6783 *ptr = byte;
6784 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6785 if (!MBCLEN_CHARFOUND_P(nlen))
6787 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6789 goto end;
6790 }
6791 }
6793 *ptr = byte;
6794
6795 end:
6796 return value;
6797}
6798
6799static VALUE
6800str_byte_substr(VALUE str, long beg, long len, int empty)
6801{
6802 long n = RSTRING_LEN(str);
6803
6804 if (beg > n || len < 0) return Qnil;
6805 if (beg < 0) {
6806 beg += n;
6807 if (beg < 0) return Qnil;
6808 }
6809 if (len > n - beg)
6810 len = n - beg;
6811 if (len <= 0) {
6812 if (!empty) return Qnil;
6813 len = 0;
6814 }
6815
6816 VALUE str2 = str_subseq(str, beg, len);
6817
6818 str_enc_copy_direct(str2, str);
6819
6820 if (RSTRING_LEN(str2) == 0) {
6821 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6823 else
6825 }
6826 else {
6827 switch (ENC_CODERANGE(str)) {
6828 case ENC_CODERANGE_7BIT:
6830 break;
6831 default:
6833 break;
6834 }
6835 }
6836
6837 return str2;
6838}
6839
6840VALUE
6841rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6842{
6843 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6844}
6845
6846static VALUE
6847str_byte_aref(VALUE str, VALUE indx)
6848{
6849 long idx;
6850 if (FIXNUM_P(indx)) {
6851 idx = FIX2LONG(indx);
6852 }
6853 else {
6854 /* check if indx is Range */
6855 long beg, len = RSTRING_LEN(str);
6856
6857 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6858 case Qfalse:
6859 break;
6860 case Qnil:
6861 return Qnil;
6862 default:
6863 return str_byte_substr(str, beg, len, TRUE);
6864 }
6865
6866 idx = NUM2LONG(indx);
6867 }
6868 return str_byte_substr(str, idx, 1, FALSE);
6869}
6870
6871/*
6872 * call-seq:
6873 * byteslice(offset, length = 1) -> string or nil
6874 * byteslice(range) -> string or nil
6875 *
6876 * :include: doc/string/byteslice.rdoc
6877 */
6878
6879static VALUE
6880rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6881{
6882 if (argc == 2) {
6883 long beg = NUM2LONG(argv[0]);
6884 long len = NUM2LONG(argv[1]);
6885 return str_byte_substr(str, beg, len, TRUE);
6886 }
6887 rb_check_arity(argc, 1, 2);
6888 return str_byte_aref(str, argv[0]);
6889}
6890
6891static void
6892str_check_beg_len(VALUE str, long *beg, long *len)
6893{
6894 long end, slen = RSTRING_LEN(str);
6895
6896 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6897 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6898 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6899 }
6900 if (*beg < 0) {
6901 *beg += slen;
6902 }
6903 RUBY_ASSERT(*beg >= 0);
6904 RUBY_ASSERT(*beg <= slen);
6905
6906 if (*len > slen - *beg) {
6907 *len = slen - *beg;
6908 }
6909 end = *beg + *len;
6910 str_ensure_byte_pos(str, *beg);
6911 str_ensure_byte_pos(str, end);
6912}
6913
6914/*
6915 * call-seq:
6916 * bytesplice(offset, length, str) -> self
6917 * bytesplice(offset, length, str, str_offset, str_length) -> self
6918 * bytesplice(range, str) -> self
6919 * bytesplice(range, str, str_range) -> self
6920 *
6921 * :include: doc/string/bytesplice.rdoc
6922 */
6923
6924static VALUE
6925rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6926{
6927 long beg, len, vbeg, vlen;
6928 VALUE val;
6929 int cr;
6930
6931 rb_check_arity(argc, 2, 5);
6932 if (!(argc == 2 || argc == 3 || argc == 5)) {
6933 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6934 }
6935 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6936 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6937 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6938 rb_builtin_class_name(argv[0]));
6939 }
6940 val = argv[1];
6941 StringValue(val);
6942 if (argc == 2) {
6943 /* bytesplice(range, str) */
6944 vbeg = 0;
6945 vlen = RSTRING_LEN(val);
6946 }
6947 else {
6948 /* bytesplice(range, str, str_range) */
6949 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6950 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6951 rb_builtin_class_name(argv[2]));
6952 }
6953 }
6954 }
6955 else {
6956 beg = NUM2LONG(argv[0]);
6957 len = NUM2LONG(argv[1]);
6958 val = argv[2];
6959 StringValue(val);
6960 if (argc == 3) {
6961 /* bytesplice(index, length, str) */
6962 vbeg = 0;
6963 vlen = RSTRING_LEN(val);
6964 }
6965 else {
6966 /* bytesplice(index, length, str, str_index, str_length) */
6967 vbeg = NUM2LONG(argv[3]);
6968 vlen = NUM2LONG(argv[4]);
6969 }
6970 }
6971 str_check_beg_len(str, &beg, &len);
6972 str_check_beg_len(val, &vbeg, &vlen);
6973 str_modify_keep_cr(str);
6974
6975 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6976 rb_enc_associate(str, rb_enc_check(str, val));
6977 }
6978
6979 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6981 if (cr != ENC_CODERANGE_BROKEN)
6982 ENC_CODERANGE_SET(str, cr);
6983 return str;
6984}
6985
6986/*
6987 * call-seq:
6988 * reverse -> new_string
6989 *
6990 * Returns a new string with the characters from +self+ in reverse order.
6991 *
6992 * 'drawer'.reverse # => "reward"
6993 * 'reviled'.reverse # => "deliver"
6994 * 'stressed'.reverse # => "desserts"
6995 * 'semordnilaps'.reverse # => "spalindromes"
6996 *
6997 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6998 */
6999
7000static VALUE
7001rb_str_reverse(VALUE str)
7002{
7003 rb_encoding *enc;
7004 VALUE rev;
7005 char *s, *e, *p;
7006 int cr;
7007
7008 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7009 enc = STR_ENC_GET(str);
7010 rev = rb_str_new(0, RSTRING_LEN(str));
7011 s = RSTRING_PTR(str); e = RSTRING_END(str);
7012 p = RSTRING_END(rev);
7013 cr = ENC_CODERANGE(str);
7014
7015 if (RSTRING_LEN(str) > 1) {
7016 if (single_byte_optimizable(str)) {
7017 while (s < e) {
7018 *--p = *s++;
7019 }
7020 }
7021 else if (cr == ENC_CODERANGE_VALID) {
7022 while (s < e) {
7023 int clen = rb_enc_fast_mbclen(s, e, enc);
7024
7025 p -= clen;
7026 memcpy(p, s, clen);
7027 s += clen;
7028 }
7029 }
7030 else {
7031 cr = rb_enc_asciicompat(enc) ?
7033 while (s < e) {
7034 int clen = rb_enc_mbclen(s, e, enc);
7035
7036 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7037 p -= clen;
7038 memcpy(p, s, clen);
7039 s += clen;
7040 }
7041 }
7042 }
7043 STR_SET_LEN(rev, RSTRING_LEN(str));
7044 str_enc_copy_direct(rev, str);
7045 ENC_CODERANGE_SET(rev, cr);
7046
7047 return rev;
7048}
7049
7050
7051/*
7052 * call-seq:
7053 * reverse! -> self
7054 *
7055 * Returns +self+ with its characters reversed:
7056 *
7057 * 'drawer'.reverse! # => "reward"
7058 * 'reviled'.reverse! # => "deliver"
7059 * 'stressed'.reverse! # => "desserts"
7060 * 'semordnilaps'.reverse! # => "spalindromes"
7061 *
7062 * Related: see {Modifying}[rdoc-ref:String@Modifying].
7063 */
7064
7065static VALUE
7066rb_str_reverse_bang(VALUE str)
7067{
7068 if (RSTRING_LEN(str) > 1) {
7069 if (single_byte_optimizable(str)) {
7070 char *s, *e, c;
7071
7072 str_modify_keep_cr(str);
7073 s = RSTRING_PTR(str);
7074 e = RSTRING_END(str) - 1;
7075 while (s < e) {
7076 c = *s;
7077 *s++ = *e;
7078 *e-- = c;
7079 }
7080 }
7081 else {
7082 str_shared_replace(str, rb_str_reverse(str));
7083 }
7084 }
7085 else {
7086 str_modify_keep_cr(str);
7087 }
7088 return str;
7089}
7090
7091
7092/*
7093 * call-seq:
7094 * include?(other_string) -> true or false
7095 *
7096 * Returns whether +self+ contains +other_string+:
7097 *
7098 * s = 'bar'
7099 * s.include?('ba') # => true
7100 * s.include?('ar') # => true
7101 * s.include?('bar') # => true
7102 * s.include?('a') # => true
7103 * s.include?('') # => true
7104 * s.include?('foo') # => false
7105 *
7106 * Related: see {Querying}[rdoc-ref:String@Querying].
7107 */
7108
7109VALUE
7110rb_str_include(VALUE str, VALUE arg)
7111{
7112 long i;
7113
7114 StringValue(arg);
7115 i = rb_str_index(str, arg, 0);
7116
7117 return RBOOL(i != -1);
7118}
7119
7120
7121/*
7122 * call-seq:
7123 * to_i(base = 10) -> integer
7124 *
7125 * Returns the result of interpreting leading characters in +self+
7126 * as an integer in the given +base+ (which must be in (0, 2..36)):
7127 *
7128 * '123456'.to_i # => 123456
7129 * '123def'.to_i(16) # => 1195503
7130 *
7131 * With +base+ zero, +self+ may contain leading characters
7132 * to specify the actual base:
7133 *
7134 * '123def'.to_i(0) # => 123
7135 * '0123def'.to_i(0) # => 83
7136 * '0b123def'.to_i(0) # => 1
7137 * '0o123def'.to_i(0) # => 83
7138 * '0d123def'.to_i(0) # => 123
7139 * '0x123def'.to_i(0) # => 1195503
7140 *
7141 * Characters past a leading valid number (in the given +base+) are ignored:
7142 *
7143 * '12.345'.to_i # => 12
7144 * '12345'.to_i(2) # => 1
7145 *
7146 * Returns zero if there is no leading valid number:
7147 *
7148 * 'abcdef'.to_i # => 0
7149 * '2'.to_i(2) # => 0
7150 *
7151 */
7152
7153static VALUE
7154rb_str_to_i(int argc, VALUE *argv, VALUE str)
7155{
7156 int base = 10;
7157
7158 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7159 rb_raise(rb_eArgError, "invalid radix %d", base);
7160 }
7161 return rb_str_to_inum(str, base, FALSE);
7162}
7163
7164
7165/*
7166 * call-seq:
7167 * to_f -> float
7168 *
7169 * Returns the result of interpreting leading characters in +self+ as a Float:
7170 *
7171 * '3.14159'.to_f # => 3.14159
7172 * '1.234e-2'.to_f # => 0.01234
7173 *
7174 * Characters past a leading valid number are ignored:
7175 *
7176 * '3.14 (pi to two places)'.to_f # => 3.14
7177 *
7178 * Returns zero if there is no leading valid number:
7179 *
7180 * 'abcdef'.to_f # => 0.0
7181 *
7182 */
7183
7184static VALUE
7185rb_str_to_f(VALUE str)
7186{
7187 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7188}
7189
7190
7191/*
7192 * call-seq:
7193 * to_s -> self or string
7194 *
7195 * Returns +self+ if +self+ is a +String+,
7196 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7197 */
7198
7199static VALUE
7200rb_str_to_s(VALUE str)
7201{
7202 if (rb_obj_class(str) != rb_cString) {
7203 return str_duplicate(rb_cString, str);
7204 }
7205 return str;
7206}
7207
#if 0
/* Compiled out: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7219
7220#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7221
7222int
7223rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7224{
7225 char buf[CHAR_ESC_LEN + 1];
7226 int l;
7227
7228#if SIZEOF_INT > 4
7229 c &= 0xffffffff;
7230#endif
7231 if (unicode_p) {
7232 if (c < 0x7F && ISPRINT(c)) {
7233 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7234 }
7235 else if (c < 0x10000) {
7236 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7237 }
7238 else {
7239 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7240 }
7241 }
7242 else {
7243 if (c < 0x100) {
7244 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7245 }
7246 else {
7247 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7248 }
7249 }
7250 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7251 rb_str_buf_cat(result, buf, l);
7252 return l;
7253}
7254
/*
 * Returns the backslash escape used for character +c+ ("\\n", "\\t", ...)
 * as a static string, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].text;
    }
    return NULL;
}
7272
7273VALUE
7274rb_str_escape(VALUE str)
7275{
7276 int encidx = ENCODING_GET(str);
7277 rb_encoding *enc = rb_enc_from_index(encidx);
7278 const char *p = RSTRING_PTR(str);
7279 const char *pend = RSTRING_END(str);
7280 const char *prev = p;
7281 char buf[CHAR_ESC_LEN + 1];
7282 VALUE result = rb_str_buf_new(0);
7283 int unicode_p = rb_enc_unicode_p(enc);
7284 int asciicompat = rb_enc_asciicompat(enc);
7285
7286 while (p < pend) {
7287 unsigned int c;
7288 const char *cc;
7289 int n = rb_enc_precise_mbclen(p, pend, enc);
7290 if (!MBCLEN_CHARFOUND_P(n)) {
7291 if (p > prev) str_buf_cat(result, prev, p - prev);
7292 n = rb_enc_mbminlen(enc);
7293 if (pend < p + n)
7294 n = (int)(pend - p);
7295 while (n--) {
7296 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7297 str_buf_cat(result, buf, strlen(buf));
7298 prev = ++p;
7299 }
7300 continue;
7301 }
7302 n = MBCLEN_CHARFOUND_LEN(n);
7303 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7304 p += n;
7305 cc = ruby_escaped_char(c);
7306 if (cc) {
7307 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7308 str_buf_cat(result, cc, strlen(cc));
7309 prev = p;
7310 }
7311 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7312 }
7313 else {
7314 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7315 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7316 prev = p;
7317 }
7318 }
7319 if (p > prev) str_buf_cat(result, prev, p - prev);
7320 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7321
7322 return result;
7323}
7324
7325/*
7326 * call-seq:
7327 * inspect -> string
7328 *
7329 * :include: doc/string/inspect.rdoc
7330 *
7331 */
7332
7333VALUE
7335{
7336 int encidx = ENCODING_GET(str);
7337 rb_encoding *enc = rb_enc_from_index(encidx);
7338 const char *p, *pend, *prev;
7339 char buf[CHAR_ESC_LEN + 1];
7340 VALUE result = rb_str_buf_new(0);
7341 rb_encoding *resenc = rb_default_internal_encoding();
7342 int unicode_p = rb_enc_unicode_p(enc);
7343 int asciicompat = rb_enc_asciicompat(enc);
7344
7345 if (resenc == NULL) resenc = rb_default_external_encoding();
7346 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7347 rb_enc_associate(result, resenc);
7348 str_buf_cat2(result, "\"");
7349
7350 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7351 prev = p;
7352 while (p < pend) {
7353 unsigned int c, cc;
7354 int n;
7355
7356 n = rb_enc_precise_mbclen(p, pend, enc);
7357 if (!MBCLEN_CHARFOUND_P(n)) {
7358 if (p > prev) str_buf_cat(result, prev, p - prev);
7359 n = rb_enc_mbminlen(enc);
7360 if (pend < p + n)
7361 n = (int)(pend - p);
7362 while (n--) {
7363 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7364 str_buf_cat(result, buf, strlen(buf));
7365 prev = ++p;
7366 }
7367 continue;
7368 }
7369 n = MBCLEN_CHARFOUND_LEN(n);
7370 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7371 p += n;
7372 if ((asciicompat || unicode_p) &&
7373 (c == '"'|| c == '\\' ||
7374 (c == '#' &&
7375 p < pend &&
7376 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7377 (cc = rb_enc_codepoint(p,pend,enc),
7378 (cc == '$' || cc == '@' || cc == '{'))))) {
7379 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7380 str_buf_cat2(result, "\\");
7381 if (asciicompat || enc == resenc) {
7382 prev = p - n;
7383 continue;
7384 }
7385 }
7386 switch (c) {
7387 case '\n': cc = 'n'; break;
7388 case '\r': cc = 'r'; break;
7389 case '\t': cc = 't'; break;
7390 case '\f': cc = 'f'; break;
7391 case '\013': cc = 'v'; break;
7392 case '\010': cc = 'b'; break;
7393 case '\007': cc = 'a'; break;
7394 case 033: cc = 'e'; break;
7395 default: cc = 0; break;
7396 }
7397 if (cc) {
7398 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7399 buf[0] = '\\';
7400 buf[1] = (char)cc;
7401 str_buf_cat(result, buf, 2);
7402 prev = p;
7403 continue;
7404 }
7405 /* The special casing of 0x85 (NEXT_LINE) here is because
7406 * Oniguruma historically treats it as printable, but it
7407 * doesn't match the print POSIX bracket class or character
7408 * property in regexps.
7409 *
7410 * See Ruby Bug #16842 for details:
7411 * https://bugs.ruby-lang.org/issues/16842
7412 */
7413 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7414 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7415 continue;
7416 }
7417 else {
7418 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7419 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7420 prev = p;
7421 continue;
7422 }
7423 }
7424 if (p > prev) str_buf_cat(result, prev, p - prev);
7425 str_buf_cat2(result, "\"");
7426
7427 return result;
7428}
7429
7430#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7431
7432/*
7433 * call-seq:
7434 * dump -> new_string
7435 *
7436 * :include: doc/string/dump.rdoc
7437 *
7438 */
7439
7440VALUE
7442{
7443 int encidx = rb_enc_get_index(str);
7444 rb_encoding *enc = rb_enc_from_index(encidx);
7445 long len;
7446 const char *p, *pend;
7447 char *q, *qend;
7448 VALUE result;
7449 int u8 = (encidx == rb_utf8_encindex());
7450 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7451
7452 len = 2; /* "" */
7453 if (!rb_enc_asciicompat(enc)) {
7454 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7455 len += strlen(enc->name);
7456 }
7457
7458 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7459 while (p < pend) {
7460 int clen;
7461 unsigned char c = *p++;
7462
7463 switch (c) {
7464 case '"': case '\\':
7465 case '\n': case '\r':
7466 case '\t': case '\f':
7467 case '\013': case '\010': case '\007': case '\033':
7468 clen = 2;
7469 break;
7470
7471 case '#':
7472 clen = IS_EVSTR(p, pend) ? 2 : 1;
7473 break;
7474
7475 default:
7476 if (ISPRINT(c)) {
7477 clen = 1;
7478 }
7479 else {
7480 if (u8 && c > 0x7F) { /* \u notation */
7481 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7482 if (MBCLEN_CHARFOUND_P(n)) {
7483 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7484 if (cc <= 0xFFFF)
7485 clen = 6; /* \uXXXX */
7486 else if (cc <= 0xFFFFF)
7487 clen = 9; /* \u{XXXXX} */
7488 else
7489 clen = 10; /* \u{XXXXXX} */
7490 p += MBCLEN_CHARFOUND_LEN(n)-1;
7491 break;
7492 }
7493 }
7494 clen = 4; /* \xNN */
7495 }
7496 break;
7497 }
7498
7499 if (clen > LONG_MAX - len) {
7500 rb_raise(rb_eRuntimeError, "string size too big");
7501 }
7502 len += clen;
7503 }
7504
7505 result = rb_str_new(0, len);
7506 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7507 q = RSTRING_PTR(result); qend = q + len + 1;
7508
7509 *q++ = '"';
7510 while (p < pend) {
7511 unsigned char c = *p++;
7512
7513 if (c == '"' || c == '\\') {
7514 *q++ = '\\';
7515 *q++ = c;
7516 }
7517 else if (c == '#') {
7518 if (IS_EVSTR(p, pend)) *q++ = '\\';
7519 *q++ = '#';
7520 }
7521 else if (c == '\n') {
7522 *q++ = '\\';
7523 *q++ = 'n';
7524 }
7525 else if (c == '\r') {
7526 *q++ = '\\';
7527 *q++ = 'r';
7528 }
7529 else if (c == '\t') {
7530 *q++ = '\\';
7531 *q++ = 't';
7532 }
7533 else if (c == '\f') {
7534 *q++ = '\\';
7535 *q++ = 'f';
7536 }
7537 else if (c == '\013') {
7538 *q++ = '\\';
7539 *q++ = 'v';
7540 }
7541 else if (c == '\010') {
7542 *q++ = '\\';
7543 *q++ = 'b';
7544 }
7545 else if (c == '\007') {
7546 *q++ = '\\';
7547 *q++ = 'a';
7548 }
7549 else if (c == '\033') {
7550 *q++ = '\\';
7551 *q++ = 'e';
7552 }
7553 else if (ISPRINT(c)) {
7554 *q++ = c;
7555 }
7556 else {
7557 *q++ = '\\';
7558 if (u8) {
7559 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7560 if (MBCLEN_CHARFOUND_P(n)) {
7561 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7562 p += n;
7563 if (cc <= 0xFFFF)
7564 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7565 else
7566 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7567 q += strlen(q);
7568 continue;
7569 }
7570 }
7571 snprintf(q, qend-q, "x%02X", c);
7572 q += 3;
7573 }
7574 }
7575 *q++ = '"';
7576 *q = '\0';
7577 if (!rb_enc_asciicompat(enc)) {
7578 snprintf(q, qend-q, nonascii_suffix, enc->name);
7579 encidx = rb_ascii8bit_encindex();
7580 }
7581 /* result from dump is ASCII */
7582 rb_enc_associate_index(result, encidx);
7584 return result;
7585}
7586
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it denotes ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns -1 for any other letter, so that the function always returns
 * a defined value instead of falling off its end.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1; /* not an escape letter */
}
7610
7611static void
7612undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7613{
7614 const char *s = *ss;
7615 unsigned int c;
7616 int codelen;
7617 size_t hexlen;
7618 unsigned char buf[6];
7619 static rb_encoding *enc_utf8 = NULL;
7620
7621 switch (*s) {
7622 case '\\':
7623 case '"':
7624 case '#':
7625 rb_str_cat(undumped, s, 1); /* cat itself */
7626 s++;
7627 break;
7628 case 'n':
7629 case 'r':
7630 case 't':
7631 case 'f':
7632 case 'v':
7633 case 'b':
7634 case 'a':
7635 case 'e':
7636 *buf = unescape_ascii(*s);
7637 rb_str_cat(undumped, (char *)buf, 1);
7638 s++;
7639 break;
7640 case 'u':
7641 if (*binary) {
7642 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7643 }
7644 *utf8 = true;
7645 if (++s >= s_end) {
7646 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7647 }
7648 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7649 if (*penc != enc_utf8) {
7650 *penc = enc_utf8;
7651 rb_enc_associate(undumped, enc_utf8);
7652 }
7653 if (*s == '{') { /* handle \u{...} form */
7654 s++;
7655 for (;;) {
7656 if (s >= s_end) {
7657 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7658 }
7659 if (*s == '}') {
7660 s++;
7661 break;
7662 }
7663 if (ISSPACE(*s)) {
7664 s++;
7665 continue;
7666 }
7667 c = scan_hex(s, s_end-s, &hexlen);
7668 if (hexlen == 0 || hexlen > 6) {
7669 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7670 }
7671 if (c > 0x10ffff) {
7672 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7673 }
7674 if (0xd800 <= c && c <= 0xdfff) {
7675 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7676 }
7677 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7678 rb_str_cat(undumped, (char *)buf, codelen);
7679 s += hexlen;
7680 }
7681 }
7682 else { /* handle \uXXXX form */
7683 c = scan_hex(s, 4, &hexlen);
7684 if (hexlen != 4) {
7685 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7686 }
7687 if (0xd800 <= c && c <= 0xdfff) {
7688 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7689 }
7690 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7691 rb_str_cat(undumped, (char *)buf, codelen);
7692 s += hexlen;
7693 }
7694 break;
7695 case 'x':
7696 if (*utf8) {
7697 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7698 }
7699 *binary = true;
7700 if (++s >= s_end) {
7701 rb_raise(rb_eRuntimeError, "invalid hex escape");
7702 }
7703 *buf = scan_hex(s, 2, &hexlen);
7704 if (hexlen != 2) {
7705 rb_raise(rb_eRuntimeError, "invalid hex escape");
7706 }
7707 rb_str_cat(undumped, (char *)buf, 1);
7708 s += hexlen;
7709 break;
7710 default:
7711 rb_str_cat(undumped, s-1, 2);
7712 s++;
7713 }
7714
7715 *ss = s;
7716}
7717
7718static VALUE rb_str_is_ascii_only_p(VALUE str);
7719
7720/*
7721 * call-seq:
7722 * undump -> string
7723 *
7724 * Returns an unescaped version of +self+:
7725 *
7726 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7727 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7728 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7729 * s_undumped == s_orig # => true
7730 *
7731 * Related: String#dump (inverse of String#undump).
7732 *
7733 */
7734
7735static VALUE
7736str_undump(VALUE str)
7737{
7738 const char *s = RSTRING_PTR(str);
7739 const char *s_end = RSTRING_END(str);
7740 rb_encoding *enc = rb_enc_get(str);
7741 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7742 bool utf8 = false;
7743 bool binary = false;
7744 int w;
7745
7747 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7748 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7749 }
7750 if (!str_null_check(str, &w)) {
7751 rb_raise(rb_eRuntimeError, "string contains null byte");
7752 }
7753 if (RSTRING_LEN(str) < 2) goto invalid_format;
7754 if (*s != '"') goto invalid_format;
7755
7756 /* strip '"' at the start */
7757 s++;
7758
7759 for (;;) {
7760 if (s >= s_end) {
7761 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7762 }
7763
7764 if (*s == '"') {
7765 /* epilogue */
7766 s++;
7767 if (s == s_end) {
7768 /* ascii compatible dumped string */
7769 break;
7770 }
7771 else {
7772 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7773 static const char dup_suffix[] = ".dup";
7774 const char *encname;
7775 int encidx;
7776 ptrdiff_t size;
7777
7778 /* check separately for strings dumped by older versions */
7779 size = sizeof(dup_suffix) - 1;
7780 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7781
7782 size = sizeof(force_encoding_suffix) - 1;
7783 if (s_end - s <= size) goto invalid_format;
7784 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7785 s += size;
7786
7787 if (utf8) {
7788 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7789 }
7790
7791 encname = s;
7792 s = memchr(s, '"', s_end-s);
7793 size = s - encname;
7794 if (!s) goto invalid_format;
7795 if (s_end - s != 2) goto invalid_format;
7796 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7797
7798 encidx = rb_enc_find_index2(encname, (long)size);
7799 if (encidx < 0) {
7800 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7801 }
7802 rb_enc_associate_index(undumped, encidx);
7803 }
7804 break;
7805 }
7806
7807 if (*s == '\\') {
7808 s++;
7809 if (s >= s_end) {
7810 rb_raise(rb_eRuntimeError, "invalid escape");
7811 }
7812 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7813 }
7814 else {
7815 rb_str_cat(undumped, s++, 1);
7816 }
7817 }
7818
7819 RB_GC_GUARD(str);
7820
7821 return undumped;
7822invalid_format:
7823 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7824}
7825
7826static void
7827rb_str_check_dummy_enc(rb_encoding *enc)
7828{
7829 if (rb_enc_dummy_p(enc)) {
7830 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7831 rb_enc_name(enc));
7832 }
7833}
7834
7835static rb_encoding *
7836str_true_enc(VALUE str)
7837{
7838 rb_encoding *enc = STR_ENC_GET(str);
7839 rb_str_check_dummy_enc(enc);
7840 return enc;
7841}
7842
7843static OnigCaseFoldType
7844check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7845{
7846 if (argc==0)
7847 return flags;
7848 if (argc>2)
7849 rb_raise(rb_eArgError, "too many options");
7850 if (argv[0]==sym_turkic) {
7851 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7852 if (argc==2) {
7853 if (argv[1]==sym_lithuanian)
7854 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7855 else
7856 rb_raise(rb_eArgError, "invalid second option");
7857 }
7858 }
7859 else if (argv[0]==sym_lithuanian) {
7860 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7861 if (argc==2) {
7862 if (argv[1]==sym_turkic)
7863 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7864 else
7865 rb_raise(rb_eArgError, "invalid second option");
7866 }
7867 }
7868 else if (argc>1)
7869 rb_raise(rb_eArgError, "too many options");
7870 else if (argv[0]==sym_ascii)
7871 flags |= ONIGENC_CASE_ASCII_ONLY;
7872 else if (argv[0]==sym_fold) {
7873 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7874 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7875 else
7876 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7877 }
7878 else
7879 rb_raise(rb_eArgError, "invalid option");
7880 return flags;
7881}
7882
7883static inline bool
7884case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7885{
7886 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7887 return true;
7888 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7889}
7890
/* 16 should be long enough to absorb any kind of single character length increase;
 * the value defined below (20) is larger than that estimate */
7892#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7893#ifndef CASEMAP_DEBUG
7894# define CASEMAP_DEBUG 0
7895#endif
7896
7897struct mapping_buffer;
7898typedef struct mapping_buffer {
7899 size_t capa;
7900 size_t used;
7901 struct mapping_buffer *next;
7902 OnigUChar space[FLEX_ARY_LEN];
7904
7905static void
7906mapping_buffer_free(void *p)
7907{
7908 mapping_buffer *previous_buffer;
7909 mapping_buffer *current_buffer = p;
7910 while (current_buffer) {
7911 previous_buffer = current_buffer;
7912 current_buffer = current_buffer->next;
7913 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7914 }
7915}
7916
7917static const rb_data_type_t mapping_buffer_type = {
7918 "mapping_buffer",
7919 {0, mapping_buffer_free,},
7920 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7921};
7922
7923static VALUE
7924rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7925{
7926 VALUE target;
7927
7928 const OnigUChar *source_current, *source_end;
7929 int target_length = 0;
7930 VALUE buffer_anchor;
7931 mapping_buffer *current_buffer = 0;
7932 mapping_buffer **pre_buffer;
7933 size_t buffer_count = 0;
7934 int buffer_length_or_invalid;
7935
7936 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7937
7938 source_current = (OnigUChar*)RSTRING_PTR(source);
7939 source_end = (OnigUChar*)RSTRING_END(source);
7940
7941 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7942 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7943 while (source_current < source_end) {
7944 /* increase multiplier using buffer count to converge quickly */
7945 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7946 if (CASEMAP_DEBUG) {
7947 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7948 }
7949 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7950 *pre_buffer = current_buffer;
7951 pre_buffer = &current_buffer->next;
7952 current_buffer->next = NULL;
7953 current_buffer->capa = capa;
7954 buffer_length_or_invalid = enc->case_map(flags,
7955 &source_current, source_end,
7956 current_buffer->space,
7957 current_buffer->space+current_buffer->capa,
7958 enc);
7959 if (buffer_length_or_invalid < 0) {
7960 current_buffer = DATA_PTR(buffer_anchor);
7961 DATA_PTR(buffer_anchor) = 0;
7962 mapping_buffer_free(current_buffer);
7963 rb_raise(rb_eArgError, "input string invalid");
7964 }
7965 target_length += current_buffer->used = buffer_length_or_invalid;
7966 }
7967 if (CASEMAP_DEBUG) {
7968 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7969 }
7970
7971 if (buffer_count==1) {
7972 target = rb_str_new((const char*)current_buffer->space, target_length);
7973 }
7974 else {
7975 char *target_current;
7976
7977 target = rb_str_new(0, target_length);
7978 target_current = RSTRING_PTR(target);
7979 current_buffer = DATA_PTR(buffer_anchor);
7980 while (current_buffer) {
7981 memcpy(target_current, current_buffer->space, current_buffer->used);
7982 target_current += current_buffer->used;
7983 current_buffer = current_buffer->next;
7984 }
7985 }
7986 current_buffer = DATA_PTR(buffer_anchor);
7987 DATA_PTR(buffer_anchor) = 0;
7988 mapping_buffer_free(current_buffer);
7989
7990 RB_GC_GUARD(buffer_anchor);
7991
7992 /* TODO: check about string terminator character */
7993 str_enc_copy_direct(target, source);
7994 /*ENC_CODERANGE_SET(mapped, cr);*/
7995
7996 return target;
7997}
7998
7999static VALUE
8000rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8001{
8002 const OnigUChar *source_current, *source_end;
8003 OnigUChar *target_current, *target_end;
8004 long old_length = RSTRING_LEN(source);
8005 int length_or_invalid;
8006
8007 if (old_length == 0) return Qnil;
8008
8009 source_current = (OnigUChar*)RSTRING_PTR(source);
8010 source_end = (OnigUChar*)RSTRING_END(source);
8011 if (source == target) {
8012 target_current = (OnigUChar*)source_current;
8013 target_end = (OnigUChar*)source_end;
8014 }
8015 else {
8016 target_current = (OnigUChar*)RSTRING_PTR(target);
8017 target_end = (OnigUChar*)RSTRING_END(target);
8018 }
8019
8020 length_or_invalid = onigenc_ascii_only_case_map(flags,
8021 &source_current, source_end,
8022 target_current, target_end, enc);
8023 if (length_or_invalid < 0)
8024 rb_raise(rb_eArgError, "input string invalid");
8025 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8026 fprintf(stderr, "problem with rb_str_ascii_casemap"
8027 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8028 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8029 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8030 }
8031
8032 str_enc_copy(target, source);
8033
8034 return target;
8035}
8036
8037static bool
8038upcase_single(VALUE str)
8039{
8040 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8041 bool modified = false;
8042
8043 while (s < send) {
8044 unsigned int c = *(unsigned char*)s;
8045
8046 if ('a' <= c && c <= 'z') {
8047 *s = 'A' + (c - 'a');
8048 modified = true;
8049 }
8050 s++;
8051 }
8052 return modified;
8053}
8054
8055/*
8056 * call-seq:
8057 * upcase!(mapping) -> self or nil
8058 *
8059 * Upcases the characters in +self+;
8060 * returns +self+ if any changes were made, +nil+ otherwise:
8061 *
8062 * s = 'Hello World!' # => "Hello World!"
8063 * s.upcase! # => "HELLO WORLD!"
8064 * s # => "HELLO WORLD!"
8065 * s.upcase! # => nil
8066 *
8067 * The casing may be affected by the given +mapping+;
8068 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8069 *
8070 * Related: String#upcase, String#downcase, String#downcase!.
8071 *
8072 */
8073
8074static VALUE
8075rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8076{
8077 rb_encoding *enc;
8078 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8079
8080 flags = check_case_options(argc, argv, flags);
8081 str_modify_keep_cr(str);
8082 enc = str_true_enc(str);
8083 if (case_option_single_p(flags, enc, str)) {
8084 if (upcase_single(str))
8085 flags |= ONIGENC_CASE_MODIFIED;
8086 }
8087 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8088 rb_str_ascii_casemap(str, str, &flags, enc);
8089 else
8090 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8091
8092 if (ONIGENC_CASE_MODIFIED&flags) return str;
8093 return Qnil;
8094}
8095
8096
8097/*
8098 * call-seq:
8099 * upcase(mapping) -> string
8100 *
8101 * Returns a string containing the upcased characters in +self+:
8102 *
8103 * s = 'Hello World!' # => "Hello World!"
8104 * s.upcase # => "HELLO WORLD!"
8105 *
8106 * The casing may be affected by the given +mapping+;
8107 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8108 *
8109 * Related: String#upcase!, String#downcase, String#downcase!.
8110 *
8111 */
8112
8113static VALUE
8114rb_str_upcase(int argc, VALUE *argv, VALUE str)
8115{
8116 rb_encoding *enc;
8117 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8118 VALUE ret;
8119
8120 flags = check_case_options(argc, argv, flags);
8121 enc = str_true_enc(str);
8122 if (case_option_single_p(flags, enc, str)) {
8123 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8124 str_enc_copy_direct(ret, str);
8125 upcase_single(ret);
8126 }
8127 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8128 ret = rb_str_new(0, RSTRING_LEN(str));
8129 rb_str_ascii_casemap(str, ret, &flags, enc);
8130 }
8131 else {
8132 ret = rb_str_casemap(str, &flags, enc);
8133 }
8134
8135 return ret;
8136}
8137
8138static bool
8139downcase_single(VALUE str)
8140{
8141 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8142 bool modified = false;
8143
8144 while (s < send) {
8145 unsigned int c = *(unsigned char*)s;
8146
8147 if ('A' <= c && c <= 'Z') {
8148 *s = 'a' + (c - 'A');
8149 modified = true;
8150 }
8151 s++;
8152 }
8153
8154 return modified;
8155}
8156
8157/*
8158 * call-seq:
8159 * downcase!(mapping) -> self or nil
8160 *
8161 * Like String#downcase, except that:
8162 *
8163 * - Changes character casings in +self+ (not in a copy of +self+).
8164 * - Returns +self+ if any changes are made, +nil+ otherwise.
8165 *
8166 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8167 */
8168
8169static VALUE
8170rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8171{
8172 rb_encoding *enc;
8173 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8174
8175 flags = check_case_options(argc, argv, flags);
8176 str_modify_keep_cr(str);
8177 enc = str_true_enc(str);
8178 if (case_option_single_p(flags, enc, str)) {
8179 if (downcase_single(str))
8180 flags |= ONIGENC_CASE_MODIFIED;
8181 }
8182 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8183 rb_str_ascii_casemap(str, str, &flags, enc);
8184 else
8185 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8186
8187 if (ONIGENC_CASE_MODIFIED&flags) return str;
8188 return Qnil;
8189}
8190
8191
8192/*
8193 * call-seq:
8194 * downcase(mapping) -> string
8195 *
8196 * :include: doc/string/downcase.rdoc
8197 *
8198 */
8199
8200static VALUE
8201rb_str_downcase(int argc, VALUE *argv, VALUE str)
8202{
8203 rb_encoding *enc;
8204 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8205 VALUE ret;
8206
8207 flags = check_case_options(argc, argv, flags);
8208 enc = str_true_enc(str);
8209 if (case_option_single_p(flags, enc, str)) {
8210 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8211 str_enc_copy_direct(ret, str);
8212 downcase_single(ret);
8213 }
8214 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8215 ret = rb_str_new(0, RSTRING_LEN(str));
8216 rb_str_ascii_casemap(str, ret, &flags, enc);
8217 }
8218 else {
8219 ret = rb_str_casemap(str, &flags, enc);
8220 }
8221
8222 return ret;
8223}
8224
8225
8226/*
8227 * call-seq:
8228 * capitalize!(mapping = :ascii) -> self or nil
8229 *
8230 * Like String#capitalize, except that:
8231 *
8232 * - Changes character casings in +self+ (not in a copy of +self+).
8233 * - Returns +self+ if any changes are made, +nil+ otherwise.
8234 *
8235 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8236 */
8237
8238static VALUE
8239rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8240{
8241 rb_encoding *enc;
8242 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8243
8244 flags = check_case_options(argc, argv, flags);
8245 str_modify_keep_cr(str);
8246 enc = str_true_enc(str);
8247 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8248 if (flags&ONIGENC_CASE_ASCII_ONLY)
8249 rb_str_ascii_casemap(str, str, &flags, enc);
8250 else
8251 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8252
8253 if (ONIGENC_CASE_MODIFIED&flags) return str;
8254 return Qnil;
8255}
8256
8257
8258/*
8259 * call-seq:
8260 * capitalize(mapping = :ascii) -> string
8261 *
8262 * Returns a string containing the characters in +self+,
8263 * each with possibly changed case:
8264 *
8265 * - The first character is upcased.
8266 * - All other characters are downcased.
8267 *
8268 * Examples:
8269 *
8270 * 'hello world'.capitalize # => "Hello world"
8271 * 'HELLO WORLD'.capitalize # => "Hello world"
8272 *
8273 * Some characters do not have upcase and downcase, and so are not changed;
8274 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8275 *
8276 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8277 *
8278 * The casing is affected by the given +mapping+,
8279 * which may be +:ascii+, +:fold+, or +:turkic+;
8280 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8281 *
8282 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8283 */
8284
8285static VALUE
8286rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8287{
8288 rb_encoding *enc;
8289 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8290 VALUE ret;
8291
8292 flags = check_case_options(argc, argv, flags);
8293 enc = str_true_enc(str);
8294 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8295 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8296 ret = rb_str_new(0, RSTRING_LEN(str));
8297 rb_str_ascii_casemap(str, ret, &flags, enc);
8298 }
8299 else {
8300 ret = rb_str_casemap(str, &flags, enc);
8301 }
8302 return ret;
8303}
8304
8305
8306/*
8307 * call-seq:
8308 * swapcase!(mapping) -> self or nil
8309 *
8310 * Upcases each lowercase character in +self+;
8311 * downcases uppercase character;
8312 * returns +self+ if any changes were made, +nil+ otherwise:
8313 *
8314 * s = 'Hello World!' # => "Hello World!"
8315 * s.swapcase! # => "hELLO wORLD!"
8316 * s # => "hELLO wORLD!"
8317 * ''.swapcase! # => nil
8318 *
8319 * The casing may be affected by the given +mapping+;
8320 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8321 *
8322 * Related: String#swapcase.
8323 *
8324 */
8325
8326static VALUE
8327rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8328{
8329 rb_encoding *enc;
8330 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8331
8332 flags = check_case_options(argc, argv, flags);
8333 str_modify_keep_cr(str);
8334 enc = str_true_enc(str);
8335 if (flags&ONIGENC_CASE_ASCII_ONLY)
8336 rb_str_ascii_casemap(str, str, &flags, enc);
8337 else
8338 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8339
8340 if (ONIGENC_CASE_MODIFIED&flags) return str;
8341 return Qnil;
8342}
8343
8344
8345/*
8346 * call-seq:
8347 * swapcase(mapping) -> string
8348 *
8349 * Returns a string containing the characters in +self+, with cases reversed;
8350 * each uppercase character is downcased;
8351 * each lowercase character is upcased:
8352 *
8353 * s = 'Hello World!' # => "Hello World!"
8354 * s.swapcase # => "hELLO wORLD!"
8355 *
8356 * The casing may be affected by the given +mapping+;
8357 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8358 *
8359 * Related: String#swapcase!.
8360 *
8361 */
8362
8363static VALUE
8364rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8365{
8366 rb_encoding *enc;
8367 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8368 VALUE ret;
8369
8370 flags = check_case_options(argc, argv, flags);
8371 enc = str_true_enc(str);
8372 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8373 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8374 ret = rb_str_new(0, RSTRING_LEN(str));
8375 rb_str_ascii_casemap(str, ret, &flags, enc);
8376 }
8377 else {
8378 ret = rb_str_casemap(str, &flags, enc);
8379 }
8380 return ret;
8381}
8382
8383typedef unsigned char *USTR;
8384
/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range (a-z) is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8390
8391static unsigned int
8392trnext(struct tr *t, rb_encoding *enc)
8393{
8394 int n;
8395
8396 for (;;) {
8397 nextpart:
8398 if (!t->gen) {
8399 if (t->p == t->pend) return -1;
8400 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8401 t->p += n;
8402 }
8403 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8404 t->p += n;
8405 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8406 t->p += n;
8407 if (t->p < t->pend) {
8408 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8409 t->p += n;
8410 if (t->now > c) {
8411 if (t->now < 0x80 && c < 0x80) {
8412 rb_raise(rb_eArgError,
8413 "invalid range \"%c-%c\" in string transliteration",
8414 t->now, c);
8415 }
8416 else {
8417 rb_raise(rb_eArgError, "invalid range in string transliteration");
8418 }
8419 continue; /* not reached */
8420 }
8421 else if (t->now < c) {
8422 t->gen = 1;
8423 t->max = c;
8424 }
8425 }
8426 }
8427 return t->now;
8428 }
8429 else {
8430 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8431 if (t->now == t->max) {
8432 t->gen = 0;
8433 goto nextpart;
8434 }
8435 }
8436 if (t->now < t->max) {
8437 return t->now;
8438 }
8439 else {
8440 t->gen = 0;
8441 return t->max;
8442 }
8443 }
8444 }
8445}
8446
8447static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8448
8449static VALUE
8450tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8451{
8452 const unsigned int errc = -1;
8453 unsigned int trans[256];
8454 rb_encoding *enc, *e1, *e2;
8455 struct tr trsrc, trrepl;
8456 int cflag = 0;
8457 unsigned int c, c0, last = 0;
8458 int modify = 0, i, l;
8459 unsigned char *s, *send;
8460 VALUE hash = 0;
8461 int singlebyte = single_byte_optimizable(str);
8462 int termlen;
8463 int cr;
8464
8465#define CHECK_IF_ASCII(c) \
8466 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8467 (cr = ENC_CODERANGE_VALID) : 0)
8468
8469 StringValue(src);
8470 StringValue(repl);
8471 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8472 if (RSTRING_LEN(repl) == 0) {
8473 return rb_str_delete_bang(1, &src, str);
8474 }
8475
8476 cr = ENC_CODERANGE(str);
8477 e1 = rb_enc_check(str, src);
8478 e2 = rb_enc_check(str, repl);
8479 if (e1 == e2) {
8480 enc = e1;
8481 }
8482 else {
8483 enc = rb_enc_check(src, repl);
8484 }
8485 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8486 if (RSTRING_LEN(src) > 1 &&
8487 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8488 trsrc.p + l < trsrc.pend) {
8489 cflag = 1;
8490 trsrc.p += l;
8491 }
8492 trrepl.p = RSTRING_PTR(repl);
8493 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8494 trsrc.gen = trrepl.gen = 0;
8495 trsrc.now = trrepl.now = 0;
8496 trsrc.max = trrepl.max = 0;
8497
8498 if (cflag) {
8499 for (i=0; i<256; i++) {
8500 trans[i] = 1;
8501 }
8502 while ((c = trnext(&trsrc, enc)) != errc) {
8503 if (c < 256) {
8504 trans[c] = errc;
8505 }
8506 else {
8507 if (!hash) hash = rb_hash_new();
8508 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8509 }
8510 }
8511 while ((c = trnext(&trrepl, enc)) != errc)
8512 /* retrieve last replacer */;
8513 last = trrepl.now;
8514 for (i=0; i<256; i++) {
8515 if (trans[i] != errc) {
8516 trans[i] = last;
8517 }
8518 }
8519 }
8520 else {
8521 unsigned int r;
8522
8523 for (i=0; i<256; i++) {
8524 trans[i] = errc;
8525 }
8526 while ((c = trnext(&trsrc, enc)) != errc) {
8527 r = trnext(&trrepl, enc);
8528 if (r == errc) r = trrepl.now;
8529 if (c < 256) {
8530 trans[c] = r;
8531 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8532 }
8533 else {
8534 if (!hash) hash = rb_hash_new();
8535 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8536 }
8537 }
8538 }
8539
8540 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8541 cr = ENC_CODERANGE_7BIT;
8542 str_modify_keep_cr(str);
8543 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8544 termlen = rb_enc_mbminlen(enc);
8545 if (sflag) {
8546 int clen, tlen;
8547 long offset, max = RSTRING_LEN(str);
8548 unsigned int save = -1;
8549 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8550
8551 while (s < send) {
8552 int may_modify = 0;
8553
8554 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8555 if (!MBCLEN_CHARFOUND_P(r)) {
8556 xfree(buf);
8557 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8558 }
8559 clen = MBCLEN_CHARFOUND_LEN(r);
8560 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8561
8562 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8563
8564 s += clen;
8565 if (c < 256) {
8566 c = trans[c];
8567 }
8568 else if (hash) {
8569 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8570 if (NIL_P(tmp)) {
8571 if (cflag) c = last;
8572 else c = errc;
8573 }
8574 else if (cflag) c = errc;
8575 else c = NUM2INT(tmp);
8576 }
8577 else {
8578 c = errc;
8579 }
8580 if (c != (unsigned int)-1) {
8581 if (save == c) {
8582 CHECK_IF_ASCII(c);
8583 continue;
8584 }
8585 save = c;
8586 tlen = rb_enc_codelen(c, enc);
8587 modify = 1;
8588 }
8589 else {
8590 save = -1;
8591 c = c0;
8592 if (enc != e1) may_modify = 1;
8593 }
8594 if ((offset = t - buf) + tlen > max) {
8595 size_t MAYBE_UNUSED(old) = max + termlen;
8596 max = offset + tlen + (send - s);
8597 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8598 t = buf + offset;
8599 }
8600 rb_enc_mbcput(c, t, enc);
8601 if (may_modify && memcmp(s, t, tlen) != 0) {
8602 modify = 1;
8603 }
8604 CHECK_IF_ASCII(c);
8605 t += tlen;
8606 }
8607 if (!STR_EMBED_P(str)) {
8608 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8609 }
8610 TERM_FILL((char *)t, termlen);
8611 RSTRING(str)->as.heap.ptr = (char *)buf;
8612 STR_SET_LEN(str, t - buf);
8613 STR_SET_NOEMBED(str);
8614 RSTRING(str)->as.heap.aux.capa = max;
8615 }
8616 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8617 while (s < send) {
8618 c = (unsigned char)*s;
8619 if (trans[c] != errc) {
8620 if (!cflag) {
8621 c = trans[c];
8622 *s = c;
8623 modify = 1;
8624 }
8625 else {
8626 *s = last;
8627 modify = 1;
8628 }
8629 }
8630 CHECK_IF_ASCII(c);
8631 s++;
8632 }
8633 }
8634 else {
8635 int clen, tlen;
8636 long offset, max = (long)((send - s) * 1.2);
8637 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8638
8639 while (s < send) {
8640 int may_modify = 0;
8641
8642 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8643 if (!MBCLEN_CHARFOUND_P(r)) {
8644 xfree(buf);
8645 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8646 }
8647 clen = MBCLEN_CHARFOUND_LEN(r);
8648 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8649
8650 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8651
8652 if (c < 256) {
8653 c = trans[c];
8654 }
8655 else if (hash) {
8656 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8657 if (NIL_P(tmp)) {
8658 if (cflag) c = last;
8659 else c = errc;
8660 }
8661 else if (cflag) c = errc;
8662 else c = NUM2INT(tmp);
8663 }
8664 else {
8665 c = cflag ? last : errc;
8666 }
8667 if (c != errc) {
8668 tlen = rb_enc_codelen(c, enc);
8669 modify = 1;
8670 }
8671 else {
8672 c = c0;
8673 if (enc != e1) may_modify = 1;
8674 }
8675 if ((offset = t - buf) + tlen > max) {
8676 size_t MAYBE_UNUSED(old) = max + termlen;
8677 max = offset + tlen + (long)((send - s) * 1.2);
8678 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8679 t = buf + offset;
8680 }
8681 if (s != t) {
8682 rb_enc_mbcput(c, t, enc);
8683 if (may_modify && memcmp(s, t, tlen) != 0) {
8684 modify = 1;
8685 }
8686 }
8687 CHECK_IF_ASCII(c);
8688 s += clen;
8689 t += tlen;
8690 }
8691 if (!STR_EMBED_P(str)) {
8692 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8693 }
8694 TERM_FILL((char *)t, termlen);
8695 RSTRING(str)->as.heap.ptr = (char *)buf;
8696 STR_SET_LEN(str, t - buf);
8697 STR_SET_NOEMBED(str);
8698 RSTRING(str)->as.heap.aux.capa = max;
8699 }
8700
8701 if (modify) {
8702 if (cr != ENC_CODERANGE_BROKEN)
8703 ENC_CODERANGE_SET(str, cr);
8704 rb_enc_associate(str, enc);
8705 return str;
8706 }
8707 return Qnil;
8708}
8709
8710
8711/*
8712 * call-seq:
8713 * tr!(selector, replacements) -> self or nil
8714 *
8715 * Like String#tr, but modifies +self+ in place.
8716 * Returns +self+ if any changes were made, +nil+ otherwise.
8717 *
8718 */
8719
8720static VALUE
8721rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8722{
8723 return tr_trans(str, src, repl, 0);
8724}
8725
8726
8727/*
8728 * call-seq:
8729 * tr(selector, replacements) -> new_string
8730 *
8731 * Returns a copy of +self+ with each character specified by string +selector+
8732 * translated to the corresponding character in string +replacements+.
8733 * The correspondence is _positional_:
8734 *
8735 * - Each occurrence of the first character specified by +selector+
8736 * is translated to the first character in +replacements+.
8737 * - Each occurrence of the second character specified by +selector+
8738 * is translated to the second character in +replacements+.
8739 * - And so on.
8740 *
8741 * Example:
8742 *
8743 * 'hello'.tr('el', 'ip') #=> "hippo"
8744 *
8745 * If +replacements+ is shorter than +selector+,
8746 * it is implicitly padded with its own last character:
8747 *
8748 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8749 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8750 *
8751 * Arguments +selector+ and +replacements+ must be valid character selectors
8752 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8753 * and may use any of its valid forms, including negation, ranges, and escaping:
8754 *
8755 * # Negation.
8756 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8757 * # Ranges.
8758 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8759 * # Escapes.
8760 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8761 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8762 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8763 *
8764 */
8765
8766static VALUE
8767rb_str_tr(VALUE str, VALUE src, VALUE repl)
8768{
8769 str = str_duplicate(rb_cString, str);
8770 tr_trans(str, src, repl, 0);
8771 return str;
8772}
8773
8774#define TR_TABLE_MAX (UCHAR_MAX+1)
8775#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8776static void
8777tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8778 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8779{
8780 const unsigned int errc = -1;
8781 char buf[TR_TABLE_MAX];
8782 struct tr tr;
8783 unsigned int c;
8784 VALUE table = 0, ptable = 0;
8785 int i, l, cflag = 0;
8786
8787 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8788 tr.gen = tr.now = tr.max = 0;
8789
8790 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8791 cflag = 1;
8792 tr.p += l;
8793 }
8794 if (first) {
8795 for (i=0; i<TR_TABLE_MAX; i++) {
8796 stable[i] = 1;
8797 }
8798 stable[TR_TABLE_MAX] = cflag;
8799 }
8800 else if (stable[TR_TABLE_MAX] && !cflag) {
8801 stable[TR_TABLE_MAX] = 0;
8802 }
8803 for (i=0; i<TR_TABLE_MAX; i++) {
8804 buf[i] = cflag;
8805 }
8806
8807 while ((c = trnext(&tr, enc)) != errc) {
8808 if (c < TR_TABLE_MAX) {
8809 buf[(unsigned char)c] = !cflag;
8810 }
8811 else {
8812 VALUE key = UINT2NUM(c);
8813
8814 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8815 if (cflag) {
8816 ptable = *ctablep;
8817 table = ptable ? ptable : rb_hash_new();
8818 *ctablep = table;
8819 }
8820 else {
8821 table = rb_hash_new();
8822 ptable = *tablep;
8823 *tablep = table;
8824 }
8825 }
8826 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8827 rb_hash_aset(table, key, Qtrue);
8828 }
8829 }
8830 }
8831 for (i=0; i<TR_TABLE_MAX; i++) {
8832 stable[i] = stable[i] && buf[i];
8833 }
8834 if (!table && !cflag) {
8835 *tablep = 0;
8836 }
8837}
8838
8839
8840static int
8841tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8842{
8843 if (c < TR_TABLE_MAX) {
8844 return table[c] != 0;
8845 }
8846 else {
8847 VALUE v = UINT2NUM(c);
8848
8849 if (del) {
8850 if (!NIL_P(rb_hash_lookup(del, v)) &&
8851 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8852 return TRUE;
8853 }
8854 }
8855 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8856 return FALSE;
8857 }
8858 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8859 }
8860}
8861
8862/*
8863 * call-seq:
8864 * delete!(*selectors) -> self or nil
8865 *
8866 * Like String#delete, but modifies +self+ in place;
8867 * returns +self+ if any characters were deleted, +nil+ otherwise.
8868 *
8869 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8870 */
8871
8872static VALUE
8873rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8874{
8875 char squeez[TR_TABLE_SIZE];
8876 rb_encoding *enc = 0;
8877 char *s, *send, *t;
8878 VALUE del = 0, nodel = 0;
8879 int modify = 0;
8880 int i, ascompat, cr;
8881
8882 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8884 for (i=0; i<argc; i++) {
8885 VALUE s = argv[i];
8886
8887 StringValue(s);
8888 enc = rb_enc_check(str, s);
8889 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8890 }
8891
8892 str_modify_keep_cr(str);
8893 ascompat = rb_enc_asciicompat(enc);
8894 s = t = RSTRING_PTR(str);
8895 send = RSTRING_END(str);
8896 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8897 while (s < send) {
8898 unsigned int c;
8899 int clen;
8900
8901 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8902 if (squeez[c]) {
8903 modify = 1;
8904 }
8905 else {
8906 if (t != s) *t = c;
8907 t++;
8908 }
8909 s++;
8910 }
8911 else {
8912 c = rb_enc_codepoint_len(s, send, &clen, enc);
8913
8914 if (tr_find(c, squeez, del, nodel)) {
8915 modify = 1;
8916 }
8917 else {
8918 if (t != s) rb_enc_mbcput(c, t, enc);
8919 t += clen;
8921 }
8922 s += clen;
8923 }
8924 }
8925 TERM_FILL(t, TERM_LEN(str));
8926 STR_SET_LEN(str, t - RSTRING_PTR(str));
8927 ENC_CODERANGE_SET(str, cr);
8928
8929 if (modify) return str;
8930 return Qnil;
8931}
8932
8933
8934/*
8935 * call-seq:
8936 * delete(*selectors) -> new_string
8937 *
8938 * :include: doc/string/delete.rdoc
8939 *
8940 */
8941
8942static VALUE
8943rb_str_delete(int argc, VALUE *argv, VALUE str)
8944{
8945 str = str_duplicate(rb_cString, str);
8946 rb_str_delete_bang(argc, argv, str);
8947 return str;
8948}
8949
8950
8951/*
8952 * call-seq:
8953 * squeeze!(*selectors) -> self or nil
8954 *
8955 * Like String#squeeze, but modifies +self+ in place.
8956 * Returns +self+ if any changes were made, +nil+ otherwise.
8957 */
8958
8959static VALUE
8960rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8961{
8962 char squeez[TR_TABLE_SIZE];
8963 rb_encoding *enc = 0;
8964 VALUE del = 0, nodel = 0;
8965 unsigned char *s, *send, *t;
8966 int i, modify = 0;
8967 int ascompat, singlebyte = single_byte_optimizable(str);
8968 unsigned int save;
8969
8970 if (argc == 0) {
8971 enc = STR_ENC_GET(str);
8972 }
8973 else {
8974 for (i=0; i<argc; i++) {
8975 VALUE s = argv[i];
8976
8977 StringValue(s);
8978 enc = rb_enc_check(str, s);
8979 if (singlebyte && !single_byte_optimizable(s))
8980 singlebyte = 0;
8981 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8982 }
8983 }
8984
8985 str_modify_keep_cr(str);
8986 s = t = (unsigned char *)RSTRING_PTR(str);
8987 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8988 send = (unsigned char *)RSTRING_END(str);
8989 save = -1;
8990 ascompat = rb_enc_asciicompat(enc);
8991
8992 if (singlebyte) {
8993 while (s < send) {
8994 unsigned int c = *s++;
8995 if (c != save || (argc > 0 && !squeez[c])) {
8996 *t++ = save = c;
8997 }
8998 }
8999 }
9000 else {
9001 while (s < send) {
9002 unsigned int c;
9003 int clen;
9004
9005 if (ascompat && (c = *s) < 0x80) {
9006 if (c != save || (argc > 0 && !squeez[c])) {
9007 *t++ = save = c;
9008 }
9009 s++;
9010 }
9011 else {
9012 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9013
9014 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9015 if (t != s) rb_enc_mbcput(c, t, enc);
9016 save = c;
9017 t += clen;
9018 }
9019 s += clen;
9020 }
9021 }
9022 }
9023
9024 TERM_FILL((char *)t, TERM_LEN(str));
9025 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9026 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9027 modify = 1;
9028 }
9029
9030 if (modify) return str;
9031 return Qnil;
9032}
9033
9034
9035/*
9036 * call-seq:
9037 * squeeze(*selectors) -> new_string
9038 *
9039 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9040 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9041 *
9042 * "Squeezed" means that each multiple-character run of a selected character
9043 * is squeezed down to a single character;
9044 * with no arguments given, squeezes all characters:
9045 *
9046 * "yellow moon".squeeze #=> "yelow mon"
9047 * " now is the".squeeze(" ") #=> " now is the"
9048 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9049 *
9050 */
9051
9052static VALUE
9053rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9054{
9055 str = str_duplicate(rb_cString, str);
9056 rb_str_squeeze_bang(argc, argv, str);
9057 return str;
9058}
9059
9060
9061/*
9062 * call-seq:
9063 * tr_s!(selector, replacements) -> self or nil
9064 *
9065 * Like String#tr_s, but modifies +self+ in place.
9066 * Returns +self+ if any changes were made, +nil+ otherwise.
9067 *
9068 * Related: String#squeeze!.
9069 */
9070
9071static VALUE
9072rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9073{
9074 return tr_trans(str, src, repl, 1);
9075}
9076
9077
9078/*
9079 * call-seq:
9080 * tr_s(selector, replacements) -> string
9081 *
9082 * Like String#tr, but also squeezes the modified portions of the translated string;
9083 * returns a new string (translated and squeezed).
9084 *
9085 * 'hello'.tr_s('l', 'r') #=> "hero"
9086 * 'hello'.tr_s('el', '-') #=> "h-o"
9087 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9088 *
9089 * Related: String#squeeze.
9090 *
9091 */
9092
9093static VALUE
9094rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9095{
9096 str = str_duplicate(rb_cString, str);
9097 tr_trans(str, src, repl, 1);
9098 return str;
9099}
9100
9101
9102/*
9103 * call-seq:
9104 * count(*selectors) -> integer
9105 *
9106 * :include: doc/string/count.rdoc
9107 */
9108
9109static VALUE
9110rb_str_count(int argc, VALUE *argv, VALUE str)
9111{
9112 char table[TR_TABLE_SIZE];
9113 rb_encoding *enc = 0;
9114 VALUE del = 0, nodel = 0, tstr;
9115 char *s, *send;
9116 int i;
9117 int ascompat;
9118 size_t n = 0;
9119
9121
9122 tstr = argv[0];
9123 StringValue(tstr);
9124 enc = rb_enc_check(str, tstr);
9125 if (argc == 1) {
9126 const char *ptstr;
9127 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9128 (ptstr = RSTRING_PTR(tstr),
9129 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9130 !is_broken_string(str)) {
9131 int clen;
9132 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9133
9134 s = RSTRING_PTR(str);
9135 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9136 send = RSTRING_END(str);
9137 while (s < send) {
9138 if (*(unsigned char*)s++ == c) n++;
9139 }
9140 return SIZET2NUM(n);
9141 }
9142 }
9143
9144 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9145 for (i=1; i<argc; i++) {
9146 tstr = argv[i];
9147 StringValue(tstr);
9148 enc = rb_enc_check(str, tstr);
9149 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9150 }
9151
9152 s = RSTRING_PTR(str);
9153 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9154 send = RSTRING_END(str);
9155 ascompat = rb_enc_asciicompat(enc);
9156 while (s < send) {
9157 unsigned int c;
9158
9159 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9160 if (table[c]) {
9161 n++;
9162 }
9163 s++;
9164 }
9165 else {
9166 int clen;
9167 c = rb_enc_codepoint_len(s, send, &clen, enc);
9168 if (tr_find(c, table, del, nodel)) {
9169 n++;
9170 }
9171 s += clen;
9172 }
9173 }
9174
9175 return SIZET2NUM(n);
9176}
9177
9178static VALUE
9179rb_fs_check(VALUE val)
9180{
9181 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9182 val = rb_check_string_type(val);
9183 if (NIL_P(val)) return 0;
9184 }
9185 return val;
9186}
9187
/*
 * Byte-indexed lookup table: 1 for bytes 0x09..0x0d and 0x20, 0 for every
 * other byte value (entries not listed are zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9208
9209static long
9210split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9211{
9212 if (empty_count >= 0 && len == 0) {
9213 return empty_count + 1;
9214 }
9215 if (empty_count > 0) {
9216 /* make different substrings */
9217 if (result) {
9218 do {
9219 rb_ary_push(result, str_new_empty_String(str));
9220 } while (--empty_count > 0);
9221 }
9222 else {
9223 do {
9224 rb_yield(str_new_empty_String(str));
9225 } while (--empty_count > 0);
9226 }
9227 }
9228 str = rb_str_subseq(str, beg, len);
9229 if (result) {
9230 rb_ary_push(result, str);
9231 }
9232 else {
9233 rb_yield(str);
9234 }
9235 return empty_count;
9236}
9237
/* Splitting strategies selected by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,
    SPLIT_TYPE_STRING,
    SPLIT_TYPE_REGEXP,
    SPLIT_TYPE_CHARS
} split_type_t;
9241
9242static split_type_t
9243literal_split_pattern(VALUE spat, split_type_t default_type)
9244{
9245 rb_encoding *enc = STR_ENC_GET(spat);
9246 const char *ptr;
9247 long len;
9248 RSTRING_GETMEM(spat, ptr, len);
9249 if (len == 0) {
9250 /* Special case - split into chars */
9251 return SPLIT_TYPE_CHARS;
9252 }
9253 else if (rb_enc_asciicompat(enc)) {
9254 if (len == 1 && ptr[0] == ' ') {
9255 return SPLIT_TYPE_AWK;
9256 }
9257 }
9258 else {
9259 int l;
9260 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9261 return SPLIT_TYPE_AWK;
9262 }
9263 }
9264 return default_type;
9265}
9266
9267/*
9268 * call-seq:
9269 * split(field_sep = $;, limit = 0) -> array
9270 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9271 *
9272 * :include: doc/string/split.rdoc
9273 *
9274 */
9275
9276static VALUE
9277rb_str_split_m(int argc, VALUE *argv, VALUE str)
9278{
9279 rb_encoding *enc;
9280 VALUE spat;
9281 VALUE limit;
9282 split_type_t split_type;
9283 long beg, end, i = 0, empty_count = -1;
9284 int lim = 0;
9285 VALUE result, tmp;
9286
9287 result = rb_block_given_p() ? Qfalse : Qnil;
9288 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9289 lim = NUM2INT(limit);
9290 if (lim <= 0) limit = Qnil;
9291 else if (lim == 1) {
9292 if (RSTRING_LEN(str) == 0)
9293 return result ? rb_ary_new2(0) : str;
9294 tmp = str_duplicate(rb_cString, str);
9295 if (!result) {
9296 rb_yield(tmp);
9297 return str;
9298 }
9299 return rb_ary_new3(1, tmp);
9300 }
9301 i = 1;
9302 }
9303 if (NIL_P(limit) && !lim) empty_count = 0;
9304
9305 enc = STR_ENC_GET(str);
9306 split_type = SPLIT_TYPE_REGEXP;
9307 if (!NIL_P(spat)) {
9308 spat = get_pat_quoted(spat, 0);
9309 }
9310 else if (NIL_P(spat = rb_fs)) {
9311 split_type = SPLIT_TYPE_AWK;
9312 }
9313 else if (!(spat = rb_fs_check(spat))) {
9314 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9315 }
9316 else {
9317 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9318 }
9319 if (split_type != SPLIT_TYPE_AWK) {
9320 switch (BUILTIN_TYPE(spat)) {
9321 case T_REGEXP:
9322 rb_reg_options(spat); /* check if uninitialized */
9323 tmp = RREGEXP_SRC(spat);
9324 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9325 if (split_type == SPLIT_TYPE_AWK) {
9326 spat = tmp;
9327 split_type = SPLIT_TYPE_STRING;
9328 }
9329 break;
9330
9331 case T_STRING:
9332 mustnot_broken(spat);
9333 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9334 break;
9335
9336 default:
9338 }
9339 }
9340
9341#define SPLIT_STR(beg, len) ( \
9342 empty_count = split_string(result, str, beg, len, empty_count), \
9343 str_mod_check(str, str_start, str_len))
9344
9345 beg = 0;
9346 char *ptr = RSTRING_PTR(str);
9347 char *const str_start = ptr;
9348 const long str_len = RSTRING_LEN(str);
9349 char *const eptr = str_start + str_len;
9350 if (split_type == SPLIT_TYPE_AWK) {
9351 char *bptr = ptr;
9352 int skip = 1;
9353 unsigned int c;
9354
9355 if (result) result = rb_ary_new();
9356 end = beg;
9357 if (is_ascii_string(str)) {
9358 while (ptr < eptr) {
9359 c = (unsigned char)*ptr++;
9360 if (skip) {
9361 if (ascii_isspace(c)) {
9362 beg = ptr - bptr;
9363 }
9364 else {
9365 end = ptr - bptr;
9366 skip = 0;
9367 if (!NIL_P(limit) && lim <= i) break;
9368 }
9369 }
9370 else if (ascii_isspace(c)) {
9371 SPLIT_STR(beg, end-beg);
9372 skip = 1;
9373 beg = ptr - bptr;
9374 if (!NIL_P(limit)) ++i;
9375 }
9376 else {
9377 end = ptr - bptr;
9378 }
9379 }
9380 }
9381 else {
9382 while (ptr < eptr) {
9383 int n;
9384
9385 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9386 ptr += n;
9387 if (skip) {
9388 if (rb_isspace(c)) {
9389 beg = ptr - bptr;
9390 }
9391 else {
9392 end = ptr - bptr;
9393 skip = 0;
9394 if (!NIL_P(limit) && lim <= i) break;
9395 }
9396 }
9397 else if (rb_isspace(c)) {
9398 SPLIT_STR(beg, end-beg);
9399 skip = 1;
9400 beg = ptr - bptr;
9401 if (!NIL_P(limit)) ++i;
9402 }
9403 else {
9404 end = ptr - bptr;
9405 }
9406 }
9407 }
9408 }
9409 else if (split_type == SPLIT_TYPE_STRING) {
9410 char *substr_start = ptr;
9411 char *sptr = RSTRING_PTR(spat);
9412 long slen = RSTRING_LEN(spat);
9413
9414 if (result) result = rb_ary_new();
9415 mustnot_broken(str);
9416 enc = rb_enc_check(str, spat);
9417 while (ptr < eptr &&
9418 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9419 /* Check we are at the start of a char */
9420 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9421 if (t != ptr + end) {
9422 ptr = t;
9423 continue;
9424 }
9425 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9426 str_mod_check(spat, sptr, slen);
9427 ptr += end + slen;
9428 substr_start = ptr;
9429 if (!NIL_P(limit) && lim <= ++i) break;
9430 }
9431 beg = ptr - str_start;
9432 }
9433 else if (split_type == SPLIT_TYPE_CHARS) {
9434 int n;
9435
9436 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9437 mustnot_broken(str);
9438 enc = rb_enc_get(str);
9439 while (ptr < eptr &&
9440 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9441 SPLIT_STR(ptr - str_start, n);
9442 ptr += n;
9443 if (!NIL_P(limit) && lim <= ++i) break;
9444 }
9445 beg = ptr - str_start;
9446 }
9447 else {
9448 if (result) result = rb_ary_new();
9449 long len = RSTRING_LEN(str);
9450 long start = beg;
9451 long idx;
9452 int last_null = 0;
9453 struct re_registers *regs;
9454 VALUE match = 0;
9455
9456 for (; rb_reg_search(spat, str, start, 0) >= 0;
9457 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9458 match = rb_backref_get();
9459 if (!result) rb_match_busy(match);
9460 regs = RMATCH_REGS(match);
9461 end = BEG(0);
9462 if (start == end && BEG(0) == END(0)) {
9463 if (!ptr) {
9464 SPLIT_STR(0, 0);
9465 break;
9466 }
9467 else if (last_null == 1) {
9468 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9469 beg = start;
9470 }
9471 else {
9472 if (start == len)
9473 start++;
9474 else
9475 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9476 last_null = 1;
9477 continue;
9478 }
9479 }
9480 else {
9481 SPLIT_STR(beg, end-beg);
9482 beg = start = END(0);
9483 }
9484 last_null = 0;
9485
9486 for (idx=1; idx < regs->num_regs; idx++) {
9487 if (BEG(idx) == -1) continue;
9488 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9489 }
9490 if (!NIL_P(limit) && lim <= ++i) break;
9491 }
9492 if (match) rb_match_unbusy(match);
9493 }
9494 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9495 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9496 }
9497
9498 return result ? result : str;
9499}
9500
9501VALUE
9502rb_str_split(VALUE str, const char *sep0)
9503{
9504 VALUE sep;
9505
9506 StringValue(str);
9507 sep = rb_str_new_cstr(sep0);
9508 return rb_str_split_m(1, &sep, str);
9509}
9510
9511#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9512
9513static inline int
9514enumerator_element(VALUE ary, VALUE e)
9515{
9516 if (ary) {
9517 rb_ary_push(ary, e);
9518 return 0;
9519 }
9520 else {
9521 rb_yield(e);
9522 return 1;
9523 }
9524}
9525
9526#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9527
9528static const char *
9529chomp_newline(const char *p, const char *e, rb_encoding *enc)
9530{
9531 const char *prev = rb_enc_prev_char(p, e, e, enc);
9532 if (rb_enc_is_newline(prev, e, enc)) {
9533 e = prev;
9534 prev = rb_enc_prev_char(p, e, e, enc);
9535 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9536 e = prev;
9537 }
9538 return e;
9539}
9540
9541static VALUE
9542get_rs(void)
9543{
9544 VALUE rs = rb_rs;
9545 if (!NIL_P(rs) &&
9546 (!RB_TYPE_P(rs, T_STRING) ||
9547 RSTRING_LEN(rs) != 1 ||
9548 RSTRING_PTR(rs)[0] != '\n')) {
9549 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9550 }
9551 return rs;
9552}
9553
9554#define rb_rs get_rs()
9555
9556static VALUE
9557rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9558{
9559 rb_encoding *enc;
9560 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9561 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9562 long pos, len, rslen;
9563 int rsnewline = 0;
9564
9565 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9566 rs = rb_rs;
9567 if (!NIL_P(opts)) {
9568 static ID keywords[1];
9569 if (!keywords[0]) {
9570 keywords[0] = rb_intern_const("chomp");
9571 }
9572 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9573 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9574 }
9575
9576 if (NIL_P(rs)) {
9577 if (!ENUM_ELEM(ary, str)) {
9578 return ary;
9579 }
9580 else {
9581 return orig;
9582 }
9583 }
9584
9585 if (!RSTRING_LEN(str)) goto end;
9586 str = rb_str_new_frozen(str);
9587 ptr = subptr = RSTRING_PTR(str);
9588 pend = RSTRING_END(str);
9589 len = RSTRING_LEN(str);
9590 StringValue(rs);
9591 rslen = RSTRING_LEN(rs);
9592
9593 if (rs == rb_default_rs)
9594 enc = rb_enc_get(str);
9595 else
9596 enc = rb_enc_check(str, rs);
9597
9598 if (rslen == 0) {
9599 /* paragraph mode */
9600 int n;
9601 const char *eol = NULL;
9602 subend = subptr;
9603 while (subend < pend) {
9604 long chomp_rslen = 0;
9605 do {
9606 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9607 n = 0;
9608 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9609 if (rb_enc_is_newline(subend + n, pend, enc)) {
9610 if (eol == subend) break;
9611 subend += rslen;
9612 if (subptr) {
9613 eol = subend;
9614 chomp_rslen = -rslen;
9615 }
9616 }
9617 else {
9618 if (!subptr) subptr = subend;
9619 subend += rslen;
9620 }
9621 rslen = 0;
9622 } while (subend < pend);
9623 if (!subptr) break;
9624 if (rslen == 0) chomp_rslen = 0;
9625 line = rb_str_subseq(str, subptr - ptr,
9626 subend - subptr + (chomp ? chomp_rslen : rslen));
9627 if (ENUM_ELEM(ary, line)) {
9628 str_mod_check(str, ptr, len);
9629 }
9630 subptr = eol = NULL;
9631 }
9632 goto end;
9633 }
9634 else {
9635 rsptr = RSTRING_PTR(rs);
9636 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9637 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9638 rsnewline = 1;
9639 }
9640 }
9641
9642 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9643 rs = rb_str_new(rsptr, rslen);
9644 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9645 rsptr = RSTRING_PTR(rs);
9646 rslen = RSTRING_LEN(rs);
9647 }
9648
9649 while (subptr < pend) {
9650 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9651 if (pos < 0) break;
9652 hit = subptr + pos;
9653 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9654 if (hit != adjusted) {
9655 subptr = adjusted;
9656 continue;
9657 }
9658 subend = hit += rslen;
9659 if (chomp) {
9660 if (rsnewline) {
9661 subend = chomp_newline(subptr, subend, enc);
9662 }
9663 else {
9664 subend -= rslen;
9665 }
9666 }
9667 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9668 if (ENUM_ELEM(ary, line)) {
9669 str_mod_check(str, ptr, len);
9670 }
9671 subptr = hit;
9672 }
9673
9674 if (subptr != pend) {
9675 if (chomp) {
9676 if (rsnewline) {
9677 pend = chomp_newline(subptr, pend, enc);
9678 }
9679 else if (pend - subptr >= rslen &&
9680 memcmp(pend - rslen, rsptr, rslen) == 0) {
9681 pend -= rslen;
9682 }
9683 }
9684 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9685 ENUM_ELEM(ary, line);
9686 RB_GC_GUARD(str);
9687 }
9688
9689 end:
9690 if (ary)
9691 return ary;
9692 else
9693 return orig;
9694}
9695
9696/*
9697 * call-seq:
9698 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9699 * each_line(record_separator = $/, chomp: false) -> enumerator
9700 *
9701 * :include: doc/string/each_line.rdoc
9702 *
9703 */
9704
9705static VALUE
9706rb_str_each_line(int argc, VALUE *argv, VALUE str)
9707{
9708 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9709 return rb_str_enumerate_lines(argc, argv, str, 0);
9710}
9711
9712/*
9713 * call-seq:
9714 * lines(record_separator = $/, chomp: false) -> array_of_strings
9715 *
9716 * Returns substrings ("lines") of +self+
9717 * according to the given arguments:
9718 *
9719 * s = <<~EOT
9720 * This is the first line.
9721 * This is line two.
9722 *
9723 * This is line four.
9724 * This is line five.
9725 * EOT
9726 *
9727 * With the default argument values:
9728 *
9729 * $/ # => "\n"
9730 * s.lines
9731 * # =>
9732 * ["This is the first line.\n",
9733 * "This is line two.\n",
9734 * "\n",
9735 * "This is line four.\n",
9736 * "This is line five.\n"]
9737 *
9738 * With a different +record_separator+:
9739 *
9740 * record_separator = ' is '
9741 * s.lines(record_separator)
9742 * # =>
9743 * ["This is ",
9744 * "the first line.\nThis is ",
9745 * "line two.\n\nThis is ",
9746 * "line four.\nThis is ",
9747 * "line five.\n"]
9748 *
9749 * With keyword argument +chomp+ as +true+,
9750 * removes the trailing newline from each line:
9751 *
9752 * s.lines(chomp: true)
9753 * # =>
9754 * ["This is the first line.",
9755 * "This is line two.",
9756 * "",
9757 * "This is line four.",
9758 * "This is line five."]
9759 *
9760 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9761 */
9762
9763static VALUE
9764rb_str_lines(int argc, VALUE *argv, VALUE str)
9765{
9766 VALUE ary = WANTARRAY("lines", 0);
9767 return rb_str_enumerate_lines(argc, argv, str, ary);
9768}
9769
9770static VALUE
9771rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9772{
9773 return LONG2FIX(RSTRING_LEN(str));
9774}
9775
9776static VALUE
9777rb_str_enumerate_bytes(VALUE str, VALUE ary)
9778{
9779 long i;
9780
9781 for (i=0; i<RSTRING_LEN(str); i++) {
9782 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9783 }
9784 if (ary)
9785 return ary;
9786 else
9787 return str;
9788}
9789
9790/*
9791 * call-seq:
9792 * each_byte {|byte| ... } -> self
9793 * each_byte -> enumerator
9794 *
9795 * :include: doc/string/each_byte.rdoc
9796 *
9797 */
9798
9799static VALUE
9800rb_str_each_byte(VALUE str)
9801{
9802 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9803 return rb_str_enumerate_bytes(str, 0);
9804}
9805
9806/*
9807 * call-seq:
9808 * bytes -> array_of_bytes
9809 *
9810 * :include: doc/string/bytes.rdoc
9811 *
9812 */
9813
9814static VALUE
9815rb_str_bytes(VALUE str)
9816{
9817 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9818 return rb_str_enumerate_bytes(str, ary);
9819}
9820
9821static VALUE
9822rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9823{
9824 return rb_str_length(str);
9825}
9826
9827static VALUE
9828rb_str_enumerate_chars(VALUE str, VALUE ary)
9829{
9830 VALUE orig = str;
9831 long i, len, n;
9832 const char *ptr;
9833 rb_encoding *enc;
9834
9835 str = rb_str_new_frozen(str);
9836 ptr = RSTRING_PTR(str);
9837 len = RSTRING_LEN(str);
9838 enc = rb_enc_get(str);
9839
9841 for (i = 0; i < len; i += n) {
9842 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9843 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9844 }
9845 }
9846 else {
9847 for (i = 0; i < len; i += n) {
9848 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9849 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9850 }
9851 }
9852 RB_GC_GUARD(str);
9853 if (ary)
9854 return ary;
9855 else
9856 return orig;
9857}
9858
9859/*
9860 * call-seq:
9861 * each_char {|char| ... } -> self
9862 * each_char -> enumerator
9863 *
9864 * :include: doc/string/each_char.rdoc
9865 *
9866 */
9867
9868static VALUE
9869rb_str_each_char(VALUE str)
9870{
9871 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9872 return rb_str_enumerate_chars(str, 0);
9873}
9874
9875/*
9876 * call-seq:
9877 * chars -> array_of_characters
9878 *
9879 * :include: doc/string/chars.rdoc
9880 *
9881 */
9882
9883static VALUE
9884rb_str_chars(VALUE str)
9885{
9886 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9887 return rb_str_enumerate_chars(str, ary);
9888}
9889
9890static VALUE
9891rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9892{
9893 VALUE orig = str;
9894 int n;
9895 unsigned int c;
9896 const char *ptr, *end;
9897 rb_encoding *enc;
9898
9899 if (single_byte_optimizable(str))
9900 return rb_str_enumerate_bytes(str, ary);
9901
9902 str = rb_str_new_frozen(str);
9903 ptr = RSTRING_PTR(str);
9904 end = RSTRING_END(str);
9905 enc = STR_ENC_GET(str);
9906
9907 while (ptr < end) {
9908 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9909 ENUM_ELEM(ary, UINT2NUM(c));
9910 ptr += n;
9911 }
9912 RB_GC_GUARD(str);
9913 if (ary)
9914 return ary;
9915 else
9916 return orig;
9917}
9918
9919/*
9920 * call-seq:
9921 * each_codepoint {|codepoint| ... } -> self
9922 * each_codepoint -> enumerator
9923 *
9924 * :include: doc/string/each_codepoint.rdoc
9925 *
9926 */
9927
9928static VALUE
9929rb_str_each_codepoint(VALUE str)
9930{
9931 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9932 return rb_str_enumerate_codepoints(str, 0);
9933}
9934
9935/*
9936 * call-seq:
9937 * codepoints -> array_of_integers
9938 *
9939 * :include: doc/string/codepoints.rdoc
9940 *
9941 */
9942
9943static VALUE
9944rb_str_codepoints(VALUE str)
9945{
9946 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9947 return rb_str_enumerate_codepoints(str, ary);
9948}
9949
9950static regex_t *
9951get_reg_grapheme_cluster(rb_encoding *enc)
9952{
9953 int encidx = rb_enc_to_index(enc);
9954
9955 const OnigUChar source_ascii[] = "\\X";
9956 const OnigUChar *source = source_ascii;
9957 size_t source_len = sizeof(source_ascii) - 1;
9958
9959 switch (encidx) {
9960#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9961#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9962#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9963#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9964#define CASE_UTF(e) \
9965 case ENCINDEX_UTF_##e: { \
9966 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9967 source = source_UTF_##e; \
9968 source_len = sizeof(source_UTF_##e); \
9969 break; \
9970 }
9971 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9972#undef CASE_UTF
9973#undef CHARS_16BE
9974#undef CHARS_16LE
9975#undef CHARS_32BE
9976#undef CHARS_32LE
9977 }
9978
9979 regex_t *reg_grapheme_cluster;
9980 OnigErrorInfo einfo;
9981 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9982 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9983 if (r) {
9984 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9985 onig_error_code_to_str(message, r, &einfo);
9986 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9987 }
9988
9989 return reg_grapheme_cluster;
9990}
9991
9992static regex_t *
9993get_cached_reg_grapheme_cluster(rb_encoding *enc)
9994{
9995 int encidx = rb_enc_to_index(enc);
9996 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9997
9998 if (encidx == rb_utf8_encindex()) {
9999 if (!reg_grapheme_cluster_utf8) {
10000 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
10001 }
10002
10003 return reg_grapheme_cluster_utf8;
10004 }
10005
10006 return NULL;
10007}
10008
10009static VALUE
10010rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10011{
10012 size_t grapheme_cluster_count = 0;
10013 rb_encoding *enc = get_encoding(str);
10014 const char *ptr, *end;
10015
10016 if (!rb_enc_unicode_p(enc)) {
10017 return rb_str_length(str);
10018 }
10019
10020 bool cached_reg_grapheme_cluster = true;
10021 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10022 if (!reg_grapheme_cluster) {
10023 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10024 cached_reg_grapheme_cluster = false;
10025 }
10026
10027 ptr = RSTRING_PTR(str);
10028 end = RSTRING_END(str);
10029
10030 while (ptr < end) {
10031 OnigPosition len = onig_match(reg_grapheme_cluster,
10032 (const OnigUChar *)ptr, (const OnigUChar *)end,
10033 (const OnigUChar *)ptr, NULL, 0);
10034 if (len <= 0) break;
10035 grapheme_cluster_count++;
10036 ptr += len;
10037 }
10038
10039 if (!cached_reg_grapheme_cluster) {
10040 onig_free(reg_grapheme_cluster);
10041 }
10042
10043 return SIZET2NUM(grapheme_cluster_count);
10044}
10045
10046static VALUE
10047rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10048{
10049 VALUE orig = str;
10050 rb_encoding *enc = get_encoding(str);
10051 const char *ptr0, *ptr, *end;
10052
10053 if (!rb_enc_unicode_p(enc)) {
10054 return rb_str_enumerate_chars(str, ary);
10055 }
10056
10057 if (!ary) str = rb_str_new_frozen(str);
10058
10059 bool cached_reg_grapheme_cluster = true;
10060 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10061 if (!reg_grapheme_cluster) {
10062 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10063 cached_reg_grapheme_cluster = false;
10064 }
10065
10066 ptr0 = ptr = RSTRING_PTR(str);
10067 end = RSTRING_END(str);
10068
10069 while (ptr < end) {
10070 OnigPosition len = onig_match(reg_grapheme_cluster,
10071 (const OnigUChar *)ptr, (const OnigUChar *)end,
10072 (const OnigUChar *)ptr, NULL, 0);
10073 if (len <= 0) break;
10074 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10075 ptr += len;
10076 }
10077
10078 if (!cached_reg_grapheme_cluster) {
10079 onig_free(reg_grapheme_cluster);
10080 }
10081
10082 RB_GC_GUARD(str);
10083 if (ary)
10084 return ary;
10085 else
10086 return orig;
10087}
10088
10089/*
10090 * call-seq:
10091 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
10092 * each_grapheme_cluster -> enumerator
10093 *
10094 * :include: doc/string/each_grapheme_cluster.rdoc
10095 *
10096 */
10097
10098static VALUE
10099rb_str_each_grapheme_cluster(VALUE str)
10100{
10101 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10102 return rb_str_enumerate_grapheme_clusters(str, 0);
10103}
10104
10105/*
10106 * call-seq:
10107 * grapheme_clusters -> array_of_grapheme_clusters
10108 *
10109 * :include: doc/string/grapheme_clusters.rdoc
10110 *
10111 */
10112
10113static VALUE
10114rb_str_grapheme_clusters(VALUE str)
10115{
10116 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10117 return rb_str_enumerate_grapheme_clusters(str, ary);
10118}
10119
10120static long
10121chopped_length(VALUE str)
10122{
10123 rb_encoding *enc = STR_ENC_GET(str);
10124 const char *p, *p2, *beg, *end;
10125
10126 beg = RSTRING_PTR(str);
10127 end = beg + RSTRING_LEN(str);
10128 if (beg >= end) return 0;
10129 p = rb_enc_prev_char(beg, end, end, enc);
10130 if (!p) return 0;
10131 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10132 p2 = rb_enc_prev_char(beg, p, end, enc);
10133 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10134 }
10135 return p - beg;
10136}
10137
10138/*
10139 * call-seq:
10140 * chop! -> self or nil
10141 *
10142 * Like String#chop, except that:
10143 *
10144 * - Removes trailing characters from +self+ (not from a copy of +self+).
10145 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10146 *
10147 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10148 */
10149
10150static VALUE
10151rb_str_chop_bang(VALUE str)
10152{
10153 str_modify_keep_cr(str);
10154 if (RSTRING_LEN(str) > 0) {
10155 long len;
10156 len = chopped_length(str);
10157 STR_SET_LEN(str, len);
10158 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10159 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10161 }
10162 return str;
10163 }
10164 return Qnil;
10165}
10166
10167
10168/*
10169 * call-seq:
10170 * chop -> new_string
10171 *
10172 * :include: doc/string/chop.rdoc
10173 *
10174 */
10175
10176static VALUE
10177rb_str_chop(VALUE str)
10178{
10179 return rb_str_subseq(str, 0, chopped_length(str));
10180}
10181
10182static long
10183smart_chomp(VALUE str, const char *e, const char *p)
10184{
10185 rb_encoding *enc = rb_enc_get(str);
10186 if (rb_enc_mbminlen(enc) > 1) {
10187 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10188 if (rb_enc_is_newline(pp, e, enc)) {
10189 e = pp;
10190 }
10191 pp = e - rb_enc_mbminlen(enc);
10192 if (pp >= p) {
10193 pp = rb_enc_left_char_head(p, pp, e, enc);
10194 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10195 e = pp;
10196 }
10197 }
10198 }
10199 else {
10200 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10201 case '\n':
10202 if (--e > p && *(e-1) == '\r') {
10203 --e;
10204 }
10205 break;
10206 case '\r':
10207 --e;
10208 break;
10209 }
10210 }
10211 return e - p;
10212}
10213
10214static long
10215chompped_length(VALUE str, VALUE rs)
10216{
10217 rb_encoding *enc;
10218 int newline;
10219 char *pp, *e, *rsptr;
10220 long rslen;
10221 char *const p = RSTRING_PTR(str);
10222 long len = RSTRING_LEN(str);
10223
10224 if (len == 0) return 0;
10225 e = p + len;
10226 if (rs == rb_default_rs) {
10227 return smart_chomp(str, e, p);
10228 }
10229
10230 enc = rb_enc_get(str);
10231 RSTRING_GETMEM(rs, rsptr, rslen);
10232 if (rslen == 0) {
10233 if (rb_enc_mbminlen(enc) > 1) {
10234 while (e > p) {
10235 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10236 if (!rb_enc_is_newline(pp, e, enc)) break;
10237 e = pp;
10238 pp -= rb_enc_mbminlen(enc);
10239 if (pp >= p) {
10240 pp = rb_enc_left_char_head(p, pp, e, enc);
10241 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10242 e = pp;
10243 }
10244 }
10245 }
10246 }
10247 else {
10248 while (e > p && *(e-1) == '\n') {
10249 --e;
10250 if (e > p && *(e-1) == '\r')
10251 --e;
10252 }
10253 }
10254 return e - p;
10255 }
10256 if (rslen > len) return len;
10257
10258 enc = rb_enc_get(rs);
10259 newline = rsptr[rslen-1];
10260 if (rslen == rb_enc_mbminlen(enc)) {
10261 if (rslen == 1) {
10262 if (newline == '\n')
10263 return smart_chomp(str, e, p);
10264 }
10265 else {
10266 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10267 return smart_chomp(str, e, p);
10268 }
10269 }
10270
10271 enc = rb_enc_check(str, rs);
10272 if (is_broken_string(rs)) {
10273 return len;
10274 }
10275 pp = e - rslen;
10276 if (p[len-1] == newline &&
10277 (rslen <= 1 ||
10278 memcmp(rsptr, pp, rslen) == 0)) {
10279 if (at_char_boundary(p, pp, e, enc))
10280 return len - rslen;
10281 RB_GC_GUARD(rs);
10282 }
10283 return len;
10284}
10285
10291static VALUE
10292chomp_rs(int argc, const VALUE *argv)
10293{
10294 rb_check_arity(argc, 0, 1);
10295 if (argc > 0) {
10296 VALUE rs = argv[0];
10297 if (!NIL_P(rs)) StringValue(rs);
10298 return rs;
10299 }
10300 else {
10301 return rb_rs;
10302 }
10303}
10304
10305VALUE
10306rb_str_chomp_string(VALUE str, VALUE rs)
10307{
10308 long olen = RSTRING_LEN(str);
10309 long len = chompped_length(str, rs);
10310 if (len >= olen) return Qnil;
10311 str_modify_keep_cr(str);
10312 STR_SET_LEN(str, len);
10313 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10314 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10316 }
10317 return str;
10318}
10319
10320/*
10321 * call-seq:
10322 * chomp!(line_sep = $/) -> self or nil
10323 *
10324 * Like String#chomp, except that:
10325 *
10326 * - Removes trailing characters from +self+ (not from a copy of +self+).
10327 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10328 *
10329 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10330 */
10331
10332static VALUE
10333rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10334{
10335 VALUE rs;
10336 str_modifiable(str);
10337 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10338 rs = chomp_rs(argc, argv);
10339 if (NIL_P(rs)) return Qnil;
10340 return rb_str_chomp_string(str, rs);
10341}
10342
10343
10344/*
10345 * call-seq:
10346 * chomp(line_sep = $/) -> new_string
10347 *
10348 * :include: doc/string/chomp.rdoc
10349 *
10350 */
10351
10352static VALUE
10353rb_str_chomp(int argc, VALUE *argv, VALUE str)
10354{
10355 VALUE rs = chomp_rs(argc, argv);
10356 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10357 return rb_str_subseq(str, 0, chompped_length(str, rs));
10358}
10359
10360static long
10361lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10362{
10363 const char *const start = s;
10364
10365 if (!s || s >= e) return 0;
10366
10367 /* remove spaces at head */
10368 if (single_byte_optimizable(str)) {
10369 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10370 }
10371 else {
10372 while (s < e) {
10373 int n;
10374 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10375
10376 if (cc && !rb_isspace(cc)) break;
10377 s += n;
10378 }
10379 }
10380 return s - start;
10381}
10382
10383/*
10384 * call-seq:
10385 * lstrip! -> self or nil
10386 *
10387 * Like String#lstrip, except that:
10388 *
10389 * - Performs stripping in +self+ (not in a copy of +self+).
10390 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10391 *
10392 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10393 */
10394
10395static VALUE
10396rb_str_lstrip_bang(VALUE str)
10397{
10398 rb_encoding *enc;
10399 char *start, *s;
10400 long olen, loffset;
10401
10402 str_modify_keep_cr(str);
10403 enc = STR_ENC_GET(str);
10404 RSTRING_GETMEM(str, start, olen);
10405 loffset = lstrip_offset(str, start, start+olen, enc);
10406 if (loffset > 0) {
10407 long len = olen-loffset;
10408 s = start + loffset;
10409 memmove(start, s, len);
10410 STR_SET_LEN(str, len);
10411 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10412 return str;
10413 }
10414 return Qnil;
10415}
10416
10417
10418/*
10419 * call-seq:
10420 * lstrip -> new_string
10421 *
10422 * Returns a copy of +self+ with leading whitespace removed;
10423 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10424 *
10425 * whitespace = "\x00\t\n\v\f\r "
10426 * s = whitespace + 'abc' + whitespace
10427 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10428 * s.lstrip
10429 * # => "abc\u0000\t\n\v\f\r "
10430 *
10431 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10432 */
10433
10434static VALUE
10435rb_str_lstrip(VALUE str)
10436{
10437 char *start;
10438 long len, loffset;
10439 RSTRING_GETMEM(str, start, len);
10440 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10441 if (loffset <= 0) return str_duplicate(rb_cString, str);
10442 return rb_str_subseq(str, loffset, len - loffset);
10443}
10444
10445static long
10446rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10447{
10448 const char *t;
10449
10450 rb_str_check_dummy_enc(enc);
10452 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10453 }
10454 if (!s || s >= e) return 0;
10455 t = e;
10456
10457 /* remove trailing spaces or '\0's */
10458 if (single_byte_optimizable(str)) {
10459 unsigned char c;
10460 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10461 }
10462 else {
10463 char *tp;
10464
10465 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10466 unsigned int c = rb_enc_codepoint(tp, e, enc);
10467 if (c && !rb_isspace(c)) break;
10468 t = tp;
10469 }
10470 }
10471 return e - t;
10472}
10473
10474/*
10475 * call-seq:
10476 * rstrip! -> self or nil
10477 *
10478 * Like String#rstrip, except that any modifications are made in +self+;
 * returns +self+ if any modifications are made, +nil+ otherwise.
10480 *
10481 * Related: String#lstrip!, String#strip!.
10482 */
10483
10484static VALUE
10485rb_str_rstrip_bang(VALUE str)
10486{
10487 rb_encoding *enc;
10488 char *start;
10489 long olen, roffset;
10490
10491 str_modify_keep_cr(str);
10492 enc = STR_ENC_GET(str);
10493 RSTRING_GETMEM(str, start, olen);
10494 roffset = rstrip_offset(str, start, start+olen, enc);
10495 if (roffset > 0) {
10496 long len = olen - roffset;
10497
10498 STR_SET_LEN(str, len);
10499 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10500 return str;
10501 }
10502 return Qnil;
10503}
10504
10505
10506/*
10507 * call-seq:
10508 * rstrip -> new_string
10509 *
10510 * Returns a copy of the receiver with trailing whitespace removed;
10511 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10512 *
10513 * whitespace = "\x00\t\n\v\f\r "
10514 * s = whitespace + 'abc' + whitespace
10515 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10516 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10517 *
10518 * Related: String#lstrip, String#strip.
10519 */
10520
10521static VALUE
10522rb_str_rstrip(VALUE str)
10523{
10524 rb_encoding *enc;
10525 char *start;
10526 long olen, roffset;
10527
10528 enc = STR_ENC_GET(str);
10529 RSTRING_GETMEM(str, start, olen);
10530 roffset = rstrip_offset(str, start, start+olen, enc);
10531
10532 if (roffset <= 0) return str_duplicate(rb_cString, str);
10533 return rb_str_subseq(str, 0, olen-roffset);
10534}
10535
10536
10537/*
10538 * call-seq:
10539 * strip! -> self or nil
10540 *
10541 * Like String#strip, except that any modifications are made in +self+;
 * returns +self+ if any modifications are made, +nil+ otherwise.
 *
 * Related: String#lstrip!, String#rstrip!.
10545 */
10546
10547static VALUE
10548rb_str_strip_bang(VALUE str)
10549{
10550 char *start;
10551 long olen, loffset, roffset;
10552 rb_encoding *enc;
10553
10554 str_modify_keep_cr(str);
10555 enc = STR_ENC_GET(str);
10556 RSTRING_GETMEM(str, start, olen);
10557 loffset = lstrip_offset(str, start, start+olen, enc);
10558 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10559
10560 if (loffset > 0 || roffset > 0) {
10561 long len = olen-roffset;
10562 if (loffset > 0) {
10563 len -= loffset;
10564 memmove(start, start + loffset, len);
10565 }
10566 STR_SET_LEN(str, len);
10567 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10568 return str;
10569 }
10570 return Qnil;
10571}
10572
10573
10574/*
10575 * call-seq:
10576 * strip -> new_string
10577 *
10578 * Returns a copy of the receiver with leading and trailing whitespace removed;
10579 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10580 *
10581 * whitespace = "\x00\t\n\v\f\r "
10582 * s = whitespace + 'abc' + whitespace
10583 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10584 * s.strip # => "abc"
10585 *
10586 * Related: String#lstrip, String#rstrip.
10587 */
10588
10589static VALUE
10590rb_str_strip(VALUE str)
10591{
10592 char *start;
10593 long olen, loffset, roffset;
10594 rb_encoding *enc = STR_ENC_GET(str);
10595
10596 RSTRING_GETMEM(str, start, olen);
10597 loffset = lstrip_offset(str, start, start+olen, enc);
10598 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10599
10600 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10601 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10602}
10603
10604static VALUE
10605scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10606{
10607 VALUE result = Qnil;
10608 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10609 if (pos >= 0) {
10610 VALUE match;
10611 struct re_registers *regs;
10612 if (BUILTIN_TYPE(pat) == T_STRING) {
10613 regs = NULL;
10614 end = pos + RSTRING_LEN(pat);
10615 }
10616 else {
10617 match = rb_backref_get();
10618 regs = RMATCH_REGS(match);
10619 pos = BEG(0);
10620 end = END(0);
10621 }
10622
10623 if (pos == end) {
10624 rb_encoding *enc = STR_ENC_GET(str);
10625 /*
10626 * Always consume at least one character of the input string
10627 */
10628 if (RSTRING_LEN(str) > end)
10629 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10630 RSTRING_END(str), enc);
10631 else
10632 *start = end + 1;
10633 }
10634 else {
10635 *start = end;
10636 }
10637
10638 if (!regs || regs->num_regs == 1) {
10639 result = rb_str_subseq(str, pos, end - pos);
10640 return result;
10641 }
10642 else {
10643 result = rb_ary_new2(regs->num_regs);
10644 for (int i = 1; i < regs->num_regs; i++) {
10645 VALUE s = Qnil;
10646 if (BEG(i) >= 0) {
10647 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10648 }
10649
10650 rb_ary_push(result, s);
10651 }
10652 }
10653
10654 RB_GC_GUARD(match);
10655 }
10656
10657 return result;
10658}
10659
10660
10661/*
10662 * call-seq:
10663 * scan(string_or_regexp) -> array
10664 * scan(string_or_regexp) {|matches| ... } -> self
10665 *
10666 * Matches a pattern against +self+; the pattern is:
10667 *
10668 * - +string_or_regexp+ itself, if it is a Regexp.
10669 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10670 *
10671 * Iterates through +self+, generating a collection of matching results:
10672 *
10673 * - If the pattern contains no groups, each result is the
10674 * matched string, <code>$&</code>.
10675 * - If the pattern contains groups, each result is an array
10676 * containing one entry per group.
10677 *
10678 * With no block given, returns an array of the results:
10679 *
10680 * s = 'cruel world'
10681 * s.scan(/\w+/) # => ["cruel", "world"]
10682 * s.scan(/.../) # => ["cru", "el ", "wor"]
10683 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10684 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10685 *
10686 * With a block given, calls the block with each result; returns +self+:
10687 *
10688 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10689 * print "\n"
10690 * s.scan(/(.)(.)/) {|x,y| print y, x }
10691 * print "\n"
10692 *
10693 * Output:
10694 *
10695 * <<cruel>> <<world>>
10696 * rceu lowlr
10697 *
10698 */
10699
10700static VALUE
10701rb_str_scan(VALUE str, VALUE pat)
10702{
10703 VALUE result;
10704 long start = 0;
10705 long last = -1, prev = 0;
10706 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10707
10708 pat = get_pat_quoted(pat, 1);
10709 mustnot_broken(str);
10710 if (!rb_block_given_p()) {
10711 VALUE ary = rb_ary_new();
10712
10713 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10714 last = prev;
10715 prev = start;
10716 rb_ary_push(ary, result);
10717 }
10718 if (last >= 0) rb_pat_search(pat, str, last, 1);
10719 else rb_backref_set(Qnil);
10720 return ary;
10721 }
10722
10723 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10724 last = prev;
10725 prev = start;
10726 rb_yield(result);
10727 str_mod_check(str, p, len);
10728 }
10729 if (last >= 0) rb_pat_search(pat, str, last, 1);
10730 return str;
10731}
10732
10733
10734/*
10735 * call-seq:
10736 * hex -> integer
10737 *
10738 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10739 * returns its value as an integer.
10740 *
10741 * The leading substring is interpreted as hexadecimal when it begins with:
10742 *
 * - One or more characters representing hexadecimal digits
10744 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10745 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10746 *
10747 * 'f'.hex # => 15
10748 * '11'.hex # => 17
10749 * 'FFF'.hex # => 4095
10750 * 'fffg'.hex # => 4095
10751 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10752 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10753 * 'deadbeef'.hex # => 3735928559
10754 *
10755 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10756 *
10757 * '0xfff'.hex # => 4095
10758 * '0xfffg'.hex # => 4095
10759 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10761 *
10762 * '-fff'.hex # => -4095
10763 * '-0xFFF'.hex # => -4095
10764 *
10765 * For any substring not described above, returns zero:
10766 *
10767 * 'xxx'.hex # => 0
10768 * ''.hex # => 0
10769 *
10770 * Note that, unlike #oct, this method interprets only hexadecimal,
10771 * and not binary, octal, or decimal notations:
10772 *
10773 * '0b111'.hex # => 45329
10774 * '0o777'.hex # => 0
10775 * '0d999'.hex # => 55705
10776 *
10777 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10778 */
10779
10780static VALUE
10781rb_str_hex(VALUE str)
10782{
10783 return rb_str_to_inum(str, 16, FALSE);
10784}
10785
10786
10787/*
10788 * call-seq:
10789 * oct -> integer
10790 *
10791 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
 * returns its value as an integer.
10793 *
10794 * In brief:
10795 *
10796 * # Interpreted as octal.
10797 * '777'.oct # => 511
10798 * '777x'.oct # => 511
10799 * '0777'.oct # => 511
10800 * '0o777'.oct # => 511
10801 * '-777'.oct # => -511
10802 * # Not interpreted as octal.
10803 * '0b111'.oct # => 7 # Interpreted as binary.
10804 * '0d999'.oct # => 999 # Interpreted as decimal.
10805 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10806 *
10807 * The leading substring is interpreted as octal when it begins with:
10808 *
 * - One or more characters representing octal digits
10810 * (each in the range <tt>'0'..'7'</tt>);
10811 * the string to be interpreted ends at the first character that does not represent an octal digit:
10812 *
 * '7'.oct # => 7
10814 * '11'.oct # => 9
10815 * '777'.oct # => 511
10816 * '0777'.oct # => 511
10817 * '7778'.oct # => 511
10818 * '777x'.oct # => 511
10819 *
10820 * - <tt>'0o'</tt>, followed by one or more octal digits:
10821 *
10822 * '0o777'.oct # => 511
10823 * '0o7778'.oct # => 511
10824 *
10825 * The leading substring is _not_ interpreted as octal when it begins with:
10826 *
10827 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10828 * (each in the range <tt>'0'..'1'</tt>);
10829 * the string to be interpreted ends at the first character that does not represent a binary digit.
10830 * the string is interpreted as binary digits (base 2):
10831 *
10832 * '0b111'.oct # => 7
10833 * '0b1112'.oct # => 7
10834 *
10835 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10836 * (each in the range <tt>'0'..'9'</tt>);
10837 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10838 * the string is interpreted as decimal digits (base 10):
10839 *
10840 * '0d999'.oct # => 999
10841 * '0d999x'.oct # => 999
10842 *
10843 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10844 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10845 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10846 * the string is interpreted as hexadecimal digits (base 16):
10847 *
10848 * '0xfff'.oct # => 4095
10849 * '0xfffg'.oct # => 4095
10850 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10852 *
10853 * '-777'.oct # => -511
10854 * '-0777'.oct # => -511
10855 * '-0b111'.oct # => -7
10856 * '-0xfff'.oct # => -4095
10857 *
10858 * For any substring not described above, returns zero:
10859 *
10860 * 'foo'.oct # => 0
10861 * ''.oct # => 0
10862 *
10863 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10864 */
10865
10866static VALUE
10867rb_str_oct(VALUE str)
10868{
10869 return rb_str_to_inum(str, -8, FALSE);
10870}
10871
10872#ifndef HAVE_CRYPT_R
10873# include "ruby/thread_native.h"
10874# include "ruby/atomic.h"
10875
10876static struct {
10877 rb_nativethread_lock_t lock;
10878} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10879#endif
10880
10881/*
10882 * call-seq:
10883 * crypt(salt_str) -> new_string
10884 *
10885 * Returns the string generated by calling <code>crypt(3)</code>
10886 * standard library function with <code>str</code> and
10887 * <code>salt_str</code>, in this order, as its arguments. Please do
10888 * not use this method any longer. It is legacy; provided only for
10889 * backward compatibility with ruby scripts in earlier days. It is
10890 * bad to use in contemporary programs for several reasons:
10891 *
10892 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10893 * run. The generated string lacks data portability.
10894 *
10895 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10896 * (i.e. silently ends up in unexpected results).
10897 *
10898 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10899 * thread safe.
10900 *
10901 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10902 * very very weak. According to its manpage, Linux's traditional
10903 * <code>crypt(3)</code> output has only 2**56 variations; too
10904 * easy to brute force today. And this is the default behaviour.
10905 *
10906 * * In order to make things robust some OSes implement so-called
10907 * "modular" usage. To go through, you have to do a complex
10908 * build-up of the <code>salt_str</code> parameter, by hand.
10909 * Failure in generation of a proper salt string tends not to
10910 * yield any errors; typos in parameters are normally not
10911 * detectable.
10912 *
10913 * * For instance, in the following example, the second invocation
10914 * of String#crypt is wrong; it has a typo in "round=" (lacks
10915 * "s"). However the call does not fail and something unexpected
10916 * is generated.
10917 *
10918 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10919 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10920 *
10921 * * Even in the "modular" mode, some hash functions are considered
10922 * archaic and no longer recommended at all; for instance module
10923 * <code>$1$</code> is officially abandoned by its author: see
10924 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10925 * instance module <code>$3$</code> is considered completely
10926 * broken: see the manpage of FreeBSD.
10927 *
10928 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10929 * written above, <code>crypt(3)</code> on Mac OS never fails.
10930 * This means even if you build up a proper salt string it
10931 * generates a traditional DES hash anyways, and there is no way
10932 * for you to be aware of.
10933 *
10934 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10935 *
10936 * If for some reason you cannot migrate to other secure contemporary
10937 * password hashing algorithms, install the string-crypt gem and
10938 * <code>require 'string/crypt'</code> to continue using it.
10939 */
10940
10941static VALUE
10942rb_str_crypt(VALUE str, VALUE salt)
10943{
10944#ifdef HAVE_CRYPT_R
10945 VALUE databuf;
10946 struct crypt_data *data;
10947# define CRYPT_END() ALLOCV_END(databuf)
10948#else
10949 char *tmp_buf;
10950 extern char *crypt(const char *, const char *);
10951# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10952#endif
10953 VALUE result;
10954 const char *s, *saltp;
10955 char *res;
10956#ifdef BROKEN_CRYPT
10957 char salt_8bit_clean[3];
10958#endif
10959
10960 StringValue(salt);
10961 mustnot_wchar(str);
10962 mustnot_wchar(salt);
10963 s = StringValueCStr(str);
10964 saltp = RSTRING_PTR(salt);
10965 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10966 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10967 }
10968
10969#ifdef BROKEN_CRYPT
10970 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10971 salt_8bit_clean[0] = saltp[0] & 0x7f;
10972 salt_8bit_clean[1] = saltp[1] & 0x7f;
10973 salt_8bit_clean[2] = '\0';
10974 saltp = salt_8bit_clean;
10975 }
10976#endif
10977#ifdef HAVE_CRYPT_R
10978 data = ALLOCV(databuf, sizeof(struct crypt_data));
10979# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10980 data->initialized = 0;
10981# endif
10982 res = crypt_r(s, saltp, data);
10983#else
10984 rb_nativethread_lock_lock(&crypt_mutex.lock);
10985 res = crypt(s, saltp);
10986#endif
10987 if (!res) {
10988 int err = errno;
10989 CRYPT_END();
10990 rb_syserr_fail(err, "crypt");
10991 }
10992#ifdef HAVE_CRYPT_R
10993 result = rb_str_new_cstr(res);
10994 CRYPT_END();
10995#else
10996 // We need to copy this buffer because it's static and we need to unlock the mutex
10997 // before allocating a new object (the string to be returned). If we allocate while
10998 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10999 // if other ractors are waiting on this lock.
11000 size_t res_size = strlen(res)+1;
11001 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
11002 memcpy(tmp_buf, res, res_size);
11003 res = tmp_buf;
11004 CRYPT_END();
11005 result = rb_str_new_cstr(res);
11006#endif
11007 return result;
11008}
11009
11010
11011/*
11012 * call-seq:
11013 * ord -> integer
11014 *
11015 * :include: doc/string/ord.rdoc
11016 *
11017 */
11018
11019static VALUE
11020rb_str_ord(VALUE s)
11021{
11022 unsigned int c;
11023
11024 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11025 return UINT2NUM(c);
11026}
11027/*
11028 * call-seq:
11029 * sum(n = 16) -> integer
11030 *
11031 * :include: doc/string/sum.rdoc
11032 *
11033 */
11034
11035static VALUE
11036rb_str_sum(int argc, VALUE *argv, VALUE str)
11037{
11038 int bits = 16;
11039 char *ptr, *p, *pend;
11040 long len;
11041 VALUE sum = INT2FIX(0);
11042 unsigned long sum0 = 0;
11043
11044 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11045 bits = 0;
11046 }
11047 ptr = p = RSTRING_PTR(str);
11048 len = RSTRING_LEN(str);
11049 pend = p + len;
11050
11051 while (p < pend) {
11052 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11053 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11054 str_mod_check(str, ptr, len);
11055 sum0 = 0;
11056 }
11057 sum0 += (unsigned char)*p;
11058 p++;
11059 }
11060
11061 if (bits == 0) {
11062 if (sum0) {
11063 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11064 }
11065 }
11066 else {
11067 if (sum == INT2FIX(0)) {
11068 if (bits < (int)sizeof(long)*CHAR_BIT) {
11069 sum0 &= (((unsigned long)1)<<bits)-1;
11070 }
11071 sum = LONG2FIX(sum0);
11072 }
11073 else {
11074 VALUE mod;
11075
11076 if (sum0) {
11077 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11078 }
11079
11080 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11081 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11082 sum = rb_funcall(sum, '&', 1, mod);
11083 }
11084 }
11085 return sum;
11086}
11087
11088static VALUE
11089rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11090{
11091 rb_encoding *enc;
11092 VALUE w;
11093 long width, len, flen = 1, fclen = 1;
11094 VALUE res;
11095 char *p;
11096 const char *f = " ";
11097 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11098 VALUE pad;
11099 int singlebyte = 1, cr;
11100 int termlen;
11101
11102 rb_scan_args(argc, argv, "11", &w, &pad);
11103 enc = STR_ENC_GET(str);
11104 termlen = rb_enc_mbminlen(enc);
11105 width = NUM2LONG(w);
11106 if (argc == 2) {
11107 StringValue(pad);
11108 enc = rb_enc_check(str, pad);
11109 f = RSTRING_PTR(pad);
11110 flen = RSTRING_LEN(pad);
11111 fclen = str_strlen(pad, enc); /* rb_enc_check */
11112 singlebyte = single_byte_optimizable(pad);
11113 if (flen == 0 || fclen == 0) {
11114 rb_raise(rb_eArgError, "zero width padding");
11115 }
11116 }
11117 len = str_strlen(str, enc); /* rb_enc_check */
11118 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11119 n = width - len;
11120 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11121 rlen = n - llen;
11122 cr = ENC_CODERANGE(str);
11123 if (flen > 1) {
11124 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11125 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11126 }
11127 size = RSTRING_LEN(str);
11128 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11129 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11130 (len += llen2 + rlen2) >= LONG_MAX - size) {
11131 rb_raise(rb_eArgError, "argument too big");
11132 }
11133 len += size;
11134 res = str_enc_new(rb_cString, 0, len, enc);
11135 p = RSTRING_PTR(res);
11136 if (flen <= 1) {
11137 memset(p, *f, llen);
11138 p += llen;
11139 }
11140 else {
11141 while (llen >= fclen) {
11142 memcpy(p,f,flen);
11143 p += flen;
11144 llen -= fclen;
11145 }
11146 if (llen > 0) {
11147 memcpy(p, f, llen2);
11148 p += llen2;
11149 }
11150 }
11151 memcpy(p, RSTRING_PTR(str), size);
11152 p += size;
11153 if (flen <= 1) {
11154 memset(p, *f, rlen);
11155 p += rlen;
11156 }
11157 else {
11158 while (rlen >= fclen) {
11159 memcpy(p,f,flen);
11160 p += flen;
11161 rlen -= fclen;
11162 }
11163 if (rlen > 0) {
11164 memcpy(p, f, rlen2);
11165 p += rlen2;
11166 }
11167 }
11168 TERM_FILL(p, termlen);
11169 STR_SET_LEN(res, p-RSTRING_PTR(res));
11170
11171 if (argc == 2)
11172 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11173 if (cr != ENC_CODERANGE_BROKEN)
11174 ENC_CODERANGE_SET(res, cr);
11175
11176 RB_GC_GUARD(pad);
11177 return res;
11178}
11179
11180
11181/*
11182 * call-seq:
11183 * ljust(width, pad_string = ' ') -> new_string
11184 *
11185 * :include: doc/string/ljust.rdoc
11186 *
11187 */
11188
11189static VALUE
11190rb_str_ljust(int argc, VALUE *argv, VALUE str)
11191{
11192 return rb_str_justify(argc, argv, str, 'l');
11193}
11194
11195/*
11196 * call-seq:
11197 * rjust(size, pad_string = ' ') -> new_string
11198 *
11199 * :include: doc/string/rjust.rdoc
11200 *
11201 * Related: String#ljust, String#center.
11202 *
11203 */
11204
11205static VALUE
11206rb_str_rjust(int argc, VALUE *argv, VALUE str)
11207{
11208 return rb_str_justify(argc, argv, str, 'r');
11209}
11210
11211
11212/*
11213 * call-seq:
11214 * center(size, pad_string = ' ') -> new_string
11215 *
11216 * :include: doc/string/center.rdoc
11217 *
11218 */
11219
11220static VALUE
11221rb_str_center(int argc, VALUE *argv, VALUE str)
11222{
11223 return rb_str_justify(argc, argv, str, 'c');
11224}
11225
11226/*
11227 * call-seq:
11228 * partition(pattern) -> [pre_match, first_match, post_match]
11229 *
11230 * :include: doc/string/partition.rdoc
11231 *
11232 */
11233
11234static VALUE
11235rb_str_partition(VALUE str, VALUE sep)
11236{
11237 long pos;
11238
11239 sep = get_pat_quoted(sep, 0);
11240 if (RB_TYPE_P(sep, T_REGEXP)) {
11241 if (rb_reg_search(sep, str, 0, 0) < 0) {
11242 goto failed;
11243 }
11244 VALUE match = rb_backref_get();
11245 struct re_registers *regs = RMATCH_REGS(match);
11246
11247 pos = BEG(0);
11248 sep = rb_str_subseq(str, pos, END(0) - pos);
11249 }
11250 else {
11251 pos = rb_str_index(str, sep, 0);
11252 if (pos < 0) goto failed;
11253 }
11254 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11255 sep,
11256 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11257 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11258
11259 failed:
11260 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11261}
11262
11263/*
11264 * call-seq:
11265 * rpartition(sep) -> [head, match, tail]
11266 *
11267 * :include: doc/string/rpartition.rdoc
11268 *
11269 */
11270
11271static VALUE
11272rb_str_rpartition(VALUE str, VALUE sep)
11273{
11274 long pos = RSTRING_LEN(str);
11275
11276 sep = get_pat_quoted(sep, 0);
11277 if (RB_TYPE_P(sep, T_REGEXP)) {
11278 if (rb_reg_search(sep, str, pos, 1) < 0) {
11279 goto failed;
11280 }
11281 VALUE match = rb_backref_get();
11282 struct re_registers *regs = RMATCH_REGS(match);
11283
11284 pos = BEG(0);
11285 sep = rb_str_subseq(str, pos, END(0) - pos);
11286 }
11287 else {
11288 pos = rb_str_sublen(str, pos);
11289 pos = rb_str_rindex(str, sep, pos);
11290 if (pos < 0) {
11291 goto failed;
11292 }
11293 }
11294
11295 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11296 sep,
11297 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11298 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11299 failed:
11300 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11301}
11302
11303/*
11304 * call-seq:
11305 * start_with?(*string_or_regexp) -> true or false
11306 *
11307 * :include: doc/string/start_with_p.rdoc
11308 *
11309 */
11310
11311static VALUE
11312rb_str_start_with(int argc, VALUE *argv, VALUE str)
11313{
11314 int i;
11315
11316 for (i=0; i<argc; i++) {
11317 VALUE tmp = argv[i];
11318 if (RB_TYPE_P(tmp, T_REGEXP)) {
11319 if (rb_reg_start_with_p(tmp, str))
11320 return Qtrue;
11321 }
11322 else {
11323 const char *p, *s, *e;
11324 long slen, tlen;
11325 rb_encoding *enc;
11326
11327 StringValue(tmp);
11328 enc = rb_enc_check(str, tmp);
11329 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11330 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11331 p = RSTRING_PTR(str);
11332 e = p + slen;
11333 s = p + tlen;
11334 if (!at_char_right_boundary(p, s, e, enc))
11335 continue;
11336 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11337 return Qtrue;
11338 }
11339 }
11340 return Qfalse;
11341}
11342
11343/*
11344 * call-seq:
11345 * end_with?(*strings) -> true or false
11346 *
11347 * :include: doc/string/end_with_p.rdoc
11348 *
11349 */
11350
11351static VALUE
11352rb_str_end_with(int argc, VALUE *argv, VALUE str)
11353{
11354 int i;
11355
11356 for (i=0; i<argc; i++) {
11357 VALUE tmp = argv[i];
11358 const char *p, *s, *e;
11359 long slen, tlen;
11360 rb_encoding *enc;
11361
11362 StringValue(tmp);
11363 enc = rb_enc_check(str, tmp);
11364 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11365 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11366 p = RSTRING_PTR(str);
11367 e = p + slen;
11368 s = e - tlen;
11369 if (!at_char_boundary(p, s, e, enc))
11370 continue;
11371 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11372 return Qtrue;
11373 }
11374 return Qfalse;
11375}
11376
11386static long
11387deleted_prefix_length(VALUE str, VALUE prefix)
11388{
11389 const char *strptr, *prefixptr;
11390 long olen, prefixlen;
11391 rb_encoding *enc = rb_enc_get(str);
11392
11393 StringValue(prefix);
11394
11395 if (!is_broken_string(prefix) ||
11396 !rb_enc_asciicompat(enc) ||
11397 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11398 enc = rb_enc_check(str, prefix);
11399 }
11400
11401 /* return 0 if not start with prefix */
11402 prefixlen = RSTRING_LEN(prefix);
11403 if (prefixlen <= 0) return 0;
11404 olen = RSTRING_LEN(str);
11405 if (olen < prefixlen) return 0;
11406 strptr = RSTRING_PTR(str);
11407 prefixptr = RSTRING_PTR(prefix);
11408 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11409 if (is_broken_string(prefix)) {
11410 if (!is_broken_string(str)) {
11411 /* prefix in a valid string cannot be broken */
11412 return 0;
11413 }
11414 const char *strend = strptr + olen;
11415 const char *after_prefix = strptr + prefixlen;
11416 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11417 /* prefix does not end at char-boundary */
11418 return 0;
11419 }
11420 }
11421 /* prefix part in `str` also should be valid. */
11422
11423 return prefixlen;
11424}
11425
11426/*
11427 * call-seq:
11428 * delete_prefix!(prefix) -> self or nil
11429 *
11430 * Like String#delete_prefix, except that +self+ is modified in place;
11431 * returns +self+ if the prefix is removed, +nil+ otherwise.
11432 *
11433 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11434 */
11435
11436static VALUE
11437rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11438{
11439 long prefixlen;
11440 str_modify_keep_cr(str);
11441
11442 prefixlen = deleted_prefix_length(str, prefix);
11443 if (prefixlen <= 0) return Qnil;
11444
11445 return rb_str_drop_bytes(str, prefixlen);
11446}
11447
11448/*
11449 * call-seq:
11450 * delete_prefix(prefix) -> new_string
11451 *
11452 * :include: doc/string/delete_prefix.rdoc
11453 *
11454 */
11455
11456static VALUE
11457rb_str_delete_prefix(VALUE str, VALUE prefix)
11458{
11459 long prefixlen;
11460
11461 prefixlen = deleted_prefix_length(str, prefix);
11462 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11463
11464 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11465}
11466
11476static long
11477deleted_suffix_length(VALUE str, VALUE suffix)
11478{
11479 const char *strptr, *suffixptr;
11480 long olen, suffixlen;
11481 rb_encoding *enc;
11482
11483 StringValue(suffix);
11484 if (is_broken_string(suffix)) return 0;
11485 enc = rb_enc_check(str, suffix);
11486
11487 /* return 0 if not start with suffix */
11488 suffixlen = RSTRING_LEN(suffix);
11489 if (suffixlen <= 0) return 0;
11490 olen = RSTRING_LEN(str);
11491 if (olen < suffixlen) return 0;
11492 strptr = RSTRING_PTR(str);
11493 suffixptr = RSTRING_PTR(suffix);
11494 const char *strend = strptr + olen;
11495 const char *before_suffix = strend - suffixlen;
11496 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11497 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11498
11499 return suffixlen;
11500}
11501
11502/*
11503 * call-seq:
11504 * delete_suffix!(suffix) -> self or nil
11505 *
11506 * Like String#delete_suffix, except that +self+ is modified in place;
11507 * returns +self+ if the suffix is removed, +nil+ otherwise.
11508 *
11509 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11510 */
11511
11512static VALUE
11513rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11514{
11515 long olen, suffixlen, len;
11516 str_modifiable(str);
11517
11518 suffixlen = deleted_suffix_length(str, suffix);
11519 if (suffixlen <= 0) return Qnil;
11520
11521 olen = RSTRING_LEN(str);
11522 str_modify_keep_cr(str);
11523 len = olen - suffixlen;
11524 STR_SET_LEN(str, len);
11525 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11526 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11528 }
11529 return str;
11530}
11531
11532/*
11533 * call-seq:
11534 * delete_suffix(suffix) -> new_string
11535 *
11536 * :include: doc/string/delete_suffix.rdoc
11537 *
11538 */
11539
11540static VALUE
11541rb_str_delete_suffix(VALUE str, VALUE suffix)
11542{
11543 long suffixlen;
11544
11545 suffixlen = deleted_suffix_length(str, suffix);
11546 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11547
11548 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11549}
11550
11551void
11552rb_str_setter(VALUE val, ID id, VALUE *var)
11553{
11554 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11555 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11556 }
11557 *var = val;
11558}
11559
11560static void
11561rb_fs_setter(VALUE val, ID id, VALUE *var)
11562{
11563 val = rb_fs_check(val);
11564 if (!val) {
11565 rb_raise(rb_eTypeError,
11566 "value of %"PRIsVALUE" must be String or Regexp",
11567 rb_id2str(id));
11568 }
11569 if (!NIL_P(val)) {
11570 rb_warn_deprecated("'$;'", NULL);
11571 }
11572 *var = val;
11573}
11574
11575
11576/*
11577 * call-seq:
11578 * force_encoding(encoding) -> self
11579 *
11580 * :include: doc/string/force_encoding.rdoc
11581 *
11582 */
11583
11584static VALUE
11585rb_str_force_encoding(VALUE str, VALUE enc)
11586{
11587 str_modifiable(str);
11588
11589 rb_encoding *encoding = rb_to_encoding(enc);
11590 int idx = rb_enc_to_index(encoding);
11591
11592 // If the encoding is unchanged, we do nothing.
11593 if (ENCODING_GET(str) == idx) {
11594 return str;
11595 }
11596
11597 rb_enc_associate_index(str, idx);
11598
11599 // If the coderange was 7bit and the new encoding is ASCII-compatible
11600 // we can keep the coderange.
11601 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11602 return str;
11603 }
11604
11606 return str;
11607}
11608
11609/*
11610 * call-seq:
11611 * b -> new_string
11612 *
11613 * :include: doc/string/b.rdoc
11614 *
11615 */
11616
11617static VALUE
11618rb_str_b(VALUE str)
11619{
11620 VALUE str2;
11621 if (STR_EMBED_P(str)) {
11622 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11623 }
11624 else {
11625 str2 = str_alloc_heap(rb_cString);
11626 }
11627 str_replace_shared_without_enc(str2, str);
11628
11629 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11630 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11631 // If we know the receiver's code range then we know the result's code range.
11632 int cr = ENC_CODERANGE(str);
11633 switch (cr) {
11634 case ENC_CODERANGE_7BIT:
11636 break;
11640 break;
11641 default:
11642 ENC_CODERANGE_CLEAR(str2);
11643 break;
11644 }
11645 }
11646
11647 return str2;
11648}
11649
11650/*
11651 * call-seq:
11652 * valid_encoding? -> true or false
11653 *
11654 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11655 *
11656 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11657 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11658 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11659 */
11660
11661static VALUE
11662rb_str_valid_encoding_p(VALUE str)
11663{
11664 int cr = rb_enc_str_coderange(str);
11665
11666 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11667}
11668
11669/*
11670 * call-seq:
11671 * ascii_only? -> true or false
11672 *
11673 * Returns whether +self+ contains only ASCII characters:
11674 *
11675 * 'abc'.ascii_only? # => true
11676 * "abc\u{6666}".ascii_only? # => false
11677 *
11678 * Related: see {Querying}[rdoc-ref:String@Querying].
11679 */
11680
11681static VALUE
11682rb_str_is_ascii_only_p(VALUE str)
11683{
11684 int cr = rb_enc_str_coderange(str);
11685
11686 return RBOOL(cr == ENC_CODERANGE_7BIT);
11687}
11688
11689VALUE
11691{
11692 static const char ellipsis[] = "...";
11693 const long ellipsislen = sizeof(ellipsis) - 1;
11694 rb_encoding *const enc = rb_enc_get(str);
11695 const long blen = RSTRING_LEN(str);
11696 const char *const p = RSTRING_PTR(str), *e = p + blen;
11697 VALUE estr, ret = 0;
11698
11699 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11700 if (len * rb_enc_mbminlen(enc) >= blen ||
11701 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11702 ret = str;
11703 }
11704 else if (len <= ellipsislen ||
11705 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11706 if (rb_enc_asciicompat(enc)) {
11707 ret = rb_str_new(ellipsis, len);
11708 rb_enc_associate(ret, enc);
11709 }
11710 else {
11711 estr = rb_usascii_str_new(ellipsis, len);
11712 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11713 }
11714 }
11715 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11716 rb_str_cat(ret, ellipsis, ellipsislen);
11717 }
11718 else {
11719 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11720 rb_enc_from_encoding(enc), 0, Qnil);
11721 rb_str_append(ret, estr);
11722 }
11723 return ret;
11724}
11725
11726static VALUE
11727str_compat_and_valid(VALUE str, rb_encoding *enc)
11728{
11729 int cr;
11730 str = StringValue(str);
11731 cr = rb_enc_str_coderange(str);
11732 if (cr == ENC_CODERANGE_BROKEN) {
11733 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11734 }
11735 else {
11736 rb_encoding *e = STR_ENC_GET(str);
11737 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11738 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11739 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11740 }
11741 }
11742 return str;
11743}
11744
11745static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11746
11747VALUE
11749{
11750 rb_encoding *enc = STR_ENC_GET(str);
11751 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11752}
11753
11754VALUE
11755rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11756{
11757 int cr = ENC_CODERANGE_UNKNOWN;
11758 if (enc == STR_ENC_GET(str)) {
11759 /* cached coderange makes sense only when enc equals the
11760 * actual encoding of str */
11761 cr = ENC_CODERANGE(str);
11762 }
11763 return enc_str_scrub(enc, str, repl, cr);
11764}
11765
11766static VALUE
11767enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11768{
11769 int encidx;
11770 VALUE buf = Qnil;
11771 const char *rep, *p, *e, *p1, *sp;
11772 long replen = -1;
11773 long slen;
11774
11775 if (rb_block_given_p()) {
11776 if (!NIL_P(repl))
11777 rb_raise(rb_eArgError, "both of block and replacement given");
11778 replen = 0;
11779 }
11780
11781 if (ENC_CODERANGE_CLEAN_P(cr))
11782 return Qnil;
11783
11784 if (!NIL_P(repl)) {
11785 repl = str_compat_and_valid(repl, enc);
11786 }
11787
11788 if (rb_enc_dummy_p(enc)) {
11789 return Qnil;
11790 }
11791 encidx = rb_enc_to_index(enc);
11792
11793#define DEFAULT_REPLACE_CHAR(str) do { \
11794 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11795 rep = replace; replen = (int)sizeof(replace); \
11796 } while (0)
11797
11798 slen = RSTRING_LEN(str);
11799 p = RSTRING_PTR(str);
11800 e = RSTRING_END(str);
11801 p1 = p;
11802 sp = p;
11803
11804 if (rb_enc_asciicompat(enc)) {
11805 int rep7bit_p;
11806 if (!replen) {
11807 rep = NULL;
11808 rep7bit_p = FALSE;
11809 }
11810 else if (!NIL_P(repl)) {
11811 rep = RSTRING_PTR(repl);
11812 replen = RSTRING_LEN(repl);
11813 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11814 }
11815 else if (encidx == rb_utf8_encindex()) {
11816 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11817 rep7bit_p = FALSE;
11818 }
11819 else {
11820 DEFAULT_REPLACE_CHAR("?");
11821 rep7bit_p = TRUE;
11822 }
11823 cr = ENC_CODERANGE_7BIT;
11824
11825 p = search_nonascii(p, e);
11826 if (!p) {
11827 p = e;
11828 }
11829 while (p < e) {
11830 int ret = rb_enc_precise_mbclen(p, e, enc);
11831 if (MBCLEN_NEEDMORE_P(ret)) {
11832 break;
11833 }
11834 else if (MBCLEN_CHARFOUND_P(ret)) {
11836 p += MBCLEN_CHARFOUND_LEN(ret);
11837 }
11838 else if (MBCLEN_INVALID_P(ret)) {
11839 /*
11840 * p1~p: valid ascii/multibyte chars
11841 * p ~e: invalid bytes + unknown bytes
11842 */
11843 long clen = rb_enc_mbmaxlen(enc);
11844 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11845 if (p > p1) {
11846 rb_str_buf_cat(buf, p1, p - p1);
11847 }
11848
11849 if (e - p < clen) clen = e - p;
11850 if (clen <= 2) {
11851 clen = 1;
11852 }
11853 else {
11854 const char *q = p;
11855 clen--;
11856 for (; clen > 1; clen--) {
11857 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11858 if (MBCLEN_NEEDMORE_P(ret)) break;
11859 if (MBCLEN_INVALID_P(ret)) continue;
11861 }
11862 }
11863 if (rep) {
11864 rb_str_buf_cat(buf, rep, replen);
11865 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11866 }
11867 else {
11868 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11869 str_mod_check(str, sp, slen);
11870 repl = str_compat_and_valid(repl, enc);
11871 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11874 }
11875 p += clen;
11876 p1 = p;
11877 p = search_nonascii(p, e);
11878 if (!p) {
11879 p = e;
11880 break;
11881 }
11882 }
11883 else {
11885 }
11886 }
11887 if (NIL_P(buf)) {
11888 if (p == e) {
11889 ENC_CODERANGE_SET(str, cr);
11890 return Qnil;
11891 }
11892 buf = rb_str_buf_new(RSTRING_LEN(str));
11893 }
11894 if (p1 < p) {
11895 rb_str_buf_cat(buf, p1, p - p1);
11896 }
11897 if (p < e) {
11898 if (rep) {
11899 rb_str_buf_cat(buf, rep, replen);
11900 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11901 }
11902 else {
11903 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11904 str_mod_check(str, sp, slen);
11905 repl = str_compat_and_valid(repl, enc);
11906 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11909 }
11910 }
11911 }
11912 else {
11913 /* ASCII incompatible */
11914 long mbminlen = rb_enc_mbminlen(enc);
11915 if (!replen) {
11916 rep = NULL;
11917 }
11918 else if (!NIL_P(repl)) {
11919 rep = RSTRING_PTR(repl);
11920 replen = RSTRING_LEN(repl);
11921 }
11922 else if (encidx == ENCINDEX_UTF_16BE) {
11923 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11924 }
11925 else if (encidx == ENCINDEX_UTF_16LE) {
11926 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11927 }
11928 else if (encidx == ENCINDEX_UTF_32BE) {
11929 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11930 }
11931 else if (encidx == ENCINDEX_UTF_32LE) {
11932 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11933 }
11934 else {
11935 DEFAULT_REPLACE_CHAR("?");
11936 }
11937
11938 while (p < e) {
11939 int ret = rb_enc_precise_mbclen(p, e, enc);
11940 if (MBCLEN_NEEDMORE_P(ret)) {
11941 break;
11942 }
11943 else if (MBCLEN_CHARFOUND_P(ret)) {
11944 p += MBCLEN_CHARFOUND_LEN(ret);
11945 }
11946 else if (MBCLEN_INVALID_P(ret)) {
11947 const char *q = p;
11948 long clen = rb_enc_mbmaxlen(enc);
11949 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11950 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11951
11952 if (e - p < clen) clen = e - p;
11953 if (clen <= mbminlen * 2) {
11954 clen = mbminlen;
11955 }
11956 else {
11957 clen -= mbminlen;
11958 for (; clen > mbminlen; clen-=mbminlen) {
11959 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11960 if (MBCLEN_NEEDMORE_P(ret)) break;
11961 if (MBCLEN_INVALID_P(ret)) continue;
11963 }
11964 }
11965 if (rep) {
11966 rb_str_buf_cat(buf, rep, replen);
11967 }
11968 else {
11969 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11970 str_mod_check(str, sp, slen);
11971 repl = str_compat_and_valid(repl, enc);
11972 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11973 }
11974 p += clen;
11975 p1 = p;
11976 }
11977 else {
11979 }
11980 }
11981 if (NIL_P(buf)) {
11982 if (p == e) {
11984 return Qnil;
11985 }
11986 buf = rb_str_buf_new(RSTRING_LEN(str));
11987 }
11988 if (p1 < p) {
11989 rb_str_buf_cat(buf, p1, p - p1);
11990 }
11991 if (p < e) {
11992 if (rep) {
11993 rb_str_buf_cat(buf, rep, replen);
11994 }
11995 else {
11996 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11997 str_mod_check(str, sp, slen);
11998 repl = str_compat_and_valid(repl, enc);
11999 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12000 }
12001 }
12003 }
12004 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12005 return buf;
12006}
12007
12008/*
12009 * call-seq:
12010 * scrub(replacement_string = default_replacement) -> new_string
12011 * scrub{|bytes| ... } -> new_string
12012 *
12013 * :include: doc/string/scrub.rdoc
12014 *
12015 */
12016static VALUE
12017str_scrub(int argc, VALUE *argv, VALUE str)
12018{
12019 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12020 VALUE new = rb_str_scrub(str, repl);
12021 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12022}
12023
12024/*
12025 * call-seq:
12026 * scrub! -> self
12027 * scrub!(replacement_string = default_replacement) -> self
12028 * scrub!{|bytes| ... } -> self
12029 *
12030 * Like String#scrub, except that any replacements are made in +self+.
12031 *
12032 */
12033static VALUE
12034str_scrub_bang(int argc, VALUE *argv, VALUE str)
12035{
12036 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12037 VALUE new = rb_str_scrub(str, repl);
12038 if (!NIL_P(new)) rb_str_replace(str, new);
12039 return str;
12040}
12041
12042static ID id_normalize;
12043static ID id_normalized_p;
12044static VALUE mUnicodeNormalize;
12045
12046static VALUE
12047unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12048{
12049 static int UnicodeNormalizeRequired = 0;
12050 VALUE argv2[2];
12051
12052 if (!UnicodeNormalizeRequired) {
12053 rb_require("unicode_normalize/normalize.rb");
12054 UnicodeNormalizeRequired = 1;
12055 }
12056 argv2[0] = str;
12057 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12058 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12059}
12060
12061/*
12062 * call-seq:
12063 * unicode_normalize(form = :nfc) -> string
12064 *
12065 * Returns a copy of +self+ with
12066 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
12067 *
12068 * Argument +form+ must be one of the following symbols
12069 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
12070 *
12071 * - +:nfc+: Canonical decomposition, followed by canonical composition.
12072 * - +:nfd+: Canonical decomposition.
12073 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
12074 * - +:nfkd+: Compatibility decomposition.
12075 *
12076 * The encoding of +self+ must be one of:
12077 *
12078 * - Encoding::UTF_8
12079 * - Encoding::UTF_16BE
12080 * - Encoding::UTF_16LE
12081 * - Encoding::UTF_32BE
12082 * - Encoding::UTF_32LE
12083 * - Encoding::GB18030
12084 * - Encoding::UCS_2BE
12085 * - Encoding::UCS_4BE
12086 *
12087 * Examples:
12088 *
 * "a\u0300".unicode_normalize # => "à" (U+00E0)
 * "\u00E0".unicode_normalize(:nfd) # => "à" (U+0061 U+0300)
12091 *
12092 * Related: String#unicode_normalize!, String#unicode_normalized?.
12093 */
12094static VALUE
12095rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12096{
12097 return unicode_normalize_common(argc, argv, str, id_normalize);
12098}
12099
12100/*
12101 * call-seq:
12102 * unicode_normalize!(form = :nfc) -> self
12103 *
12104 * Like String#unicode_normalize, except that the normalization
12105 * is performed on +self+.
12106 *
 * Related: String#unicode_normalized?.
12108 *
12109 */
12110static VALUE
12111rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12112{
12113 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12114}
12115
12116/* call-seq:
12117 * unicode_normalized?(form = :nfc) -> true or false
12118 *
12119 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12120 * +false+ otherwise.
12121 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12122 *
12123 * Examples:
12124 *
12125 * "a\u0300".unicode_normalized? # => false
12126 * "a\u0300".unicode_normalized?(:nfd) # => true
12127 * "\u00E0".unicode_normalized? # => true
12128 * "\u00E0".unicode_normalized?(:nfd) # => false
12129 *
12130 *
12131 * Raises an exception if +self+ is not in a Unicode encoding:
12132 *
12133 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12134 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12135 *
12136 * Related: String#unicode_normalize, String#unicode_normalize!.
12137 *
12138 */
12139static VALUE
12140rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12141{
12142 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12143}
12144
12145/**********************************************************************
12146 * Document-class: Symbol
12147 *
12148 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12149 *
12150 * You can create a +Symbol+ object explicitly with:
12151 *
12152 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12153 *
12154 * The same +Symbol+ object will be
12155 * created for a given name or string for the duration of a program's
12156 * execution, regardless of the context or meaning of that name. Thus
12157 * if <code>Fred</code> is a constant in one context, a method in
12158 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12159 * will be the same object in all three contexts.
12160 *
12161 * module One
12162 * class Fred
12163 * end
12164 * $f1 = :Fred
12165 * end
12166 * module Two
12167 * Fred = 1
12168 * $f2 = :Fred
12169 * end
12170 * def Fred()
12171 * end
12172 * $f3 = :Fred
12173 * $f1.object_id #=> 2514190
12174 * $f2.object_id #=> 2514190
12175 * $f3.object_id #=> 2514190
12176 *
12177 * Constant, method, and variable names are returned as symbols:
12178 *
12179 * module One
12180 * Two = 2
12181 * def three; 3 end
12182 * @four = 4
12183 * @@five = 5
12184 * $six = 6
12185 * end
12186 * seven = 7
12187 *
12188 * One.constants
12189 * # => [:Two]
12190 * One.instance_methods(true)
12191 * # => [:three]
12192 * One.instance_variables
12193 * # => [:@four]
12194 * One.class_variables
12195 * # => [:@@five]
12196 * global_variables.grep(/six/)
12197 * # => [:$six]
12198 * local_variables
12199 * # => [:seven]
12200 *
12201 * A +Symbol+ object differs from a String object in that
12202 * a +Symbol+ object represents an identifier, while a String object
12203 * represents text or data.
12204 *
12205 * == What's Here
12206 *
12207 * First, what's elsewhere. Class +Symbol+:
12208 *
12209 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12210 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12211 *
12212 * Here, class +Symbol+ provides methods that are useful for:
12213 *
12214 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12215 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12216 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12217 *
12218 * === Methods for Querying
12219 *
12220 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12221 * - #=~: Returns the index of the first substring in symbol that matches a
12222 * given Regexp or other object; returns +nil+ if no match is found.
12223 * - #[], #slice : Returns a substring of symbol
12224 * determined by a given index, start/length, or range, or string.
12225 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12226 * - #encoding: Returns the Encoding object that represents the encoding
12227 * of symbol.
12228 * - #end_with?: Returns +true+ if symbol ends with
12229 * any of the given strings.
12230 * - #match: Returns a MatchData object if symbol
12231 * matches a given Regexp; +nil+ otherwise.
12232 * - #match?: Returns +true+ if symbol
12233 * matches a given Regexp; +false+ otherwise.
12234 * - #length, #size: Returns the number of characters in symbol.
12235 * - #start_with?: Returns +true+ if symbol starts with
12236 * any of the given strings.
12237 *
12238 * === Methods for Comparing
12239 *
12240 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12241 * or larger than symbol.
12242 * - #==, #===: Returns +true+ if a given symbol has the same content and
12243 * encoding.
12244 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12245 * symbol is smaller than, equal to, or larger than symbol.
12246 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12247 * after Unicode case folding; +false+ otherwise.
12248 *
12249 * === Methods for Converting
12250 *
12251 * - #capitalize: Returns symbol with the first character upcased
12252 * and all other characters downcased.
12253 * - #downcase: Returns symbol with all characters downcased.
12254 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12255 * - #name: Returns the frozen string corresponding to symbol.
12256 * - #succ, #next: Returns the symbol that is the successor to symbol.
12257 * - #swapcase: Returns symbol with all upcase characters downcased
12258 * and all downcase characters upcased.
12259 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12260 * - #to_s, #id2name: Returns the string corresponding to +self+.
12261 * - #to_sym, #intern: Returns +self+.
12262 * - #upcase: Returns symbol with all characters upcased.
12263 *
12264 */
12265
12266
12267/*
12268 * call-seq:
12269 * symbol == object -> true or false
12270 *
12271 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12272 */
12273
12274#define sym_equal rb_obj_equal /* registered as both Symbol#== and Symbol#=== in Init_String */
12275
12276static int
12277sym_printable(const char *s, const char *send, rb_encoding *enc)
12278{
12279 while (s < send) {
12280 int n;
12281 int c = rb_enc_precise_mbclen(s, send, enc);
12282
12283 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12284 n = MBCLEN_CHARFOUND_LEN(c);
12285 c = rb_enc_mbc_to_codepoint(s, send, enc);
12286 if (!rb_enc_isprint(c, enc)) return FALSE;
12287 s += n;
12288 }
12289 return TRUE;
12290}
12291
12292int
12293rb_str_symname_p(VALUE sym)
12294{
12295 rb_encoding *enc;
12296 const char *ptr;
12297 long len;
12298 rb_encoding *resenc = rb_default_internal_encoding();
12299
12300 if (resenc == NULL) resenc = rb_default_external_encoding();
12301 enc = STR_ENC_GET(sym);
12302 ptr = RSTRING_PTR(sym);
12303 len = RSTRING_LEN(sym);
12304 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12305 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12306 return FALSE;
12307 }
12308 return TRUE;
12309}
12310
12311VALUE
12312rb_str_quote_unprintable(VALUE str)
12313{
12314 rb_encoding *enc;
12315 const char *ptr;
12316 long len;
12317 rb_encoding *resenc;
12318
12319 Check_Type(str, T_STRING);
12320 resenc = rb_default_internal_encoding();
12321 if (resenc == NULL) resenc = rb_default_external_encoding();
12322 enc = STR_ENC_GET(str);
12323 ptr = RSTRING_PTR(str);
12324 len = RSTRING_LEN(str);
12325 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12326 !sym_printable(ptr, ptr + len, enc)) {
12327 return rb_str_escape(str);
12328 }
12329 return str;
12330}
12331
12332VALUE
12333rb_id_quote_unprintable(ID id)
12334{
12335 VALUE str = rb_id2str(id);
12336 if (!rb_str_symname_p(str)) {
12337 return rb_str_escape(str);
12338 }
12339 return str;
12340}
12341
12342/*
12343 * call-seq:
12344 * inspect -> string
12345 *
12346 * Returns a string representation of +self+ (including the leading colon):
12347 *
12348 * :foo.inspect # => ":foo"
12349 *
12350 * Related: Symbol#to_s, Symbol#name.
12351 *
12352 */
12353
12354static VALUE
12355sym_inspect(VALUE sym)
12356{
12357 VALUE str = rb_sym2str(sym);
12358 const char *ptr;
12359 long len;
12360 char *dest;
12361
12362 if (!rb_str_symname_p(str)) {
12363 str = rb_str_inspect(str);
12364 len = RSTRING_LEN(str);
12365 rb_str_resize(str, len + 1);
12366 dest = RSTRING_PTR(str);
12367 memmove(dest + 1, dest, len);
12368 }
12369 else {
12370 rb_encoding *enc = STR_ENC_GET(str);
12371 VALUE orig_str = str;
12372
12373 len = RSTRING_LEN(orig_str);
12374 str = rb_enc_str_new(0, len + 1, enc);
12375
12376 // Get data pointer after allocation
12377 ptr = RSTRING_PTR(orig_str);
12378 dest = RSTRING_PTR(str);
12379 memcpy(dest + 1, ptr, len);
12380
12381 RB_GC_GUARD(orig_str);
12382 }
12383 dest[0] = ':';
12384
12386
12387 return str;
12388}
12389
12390VALUE
12392{
12393 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12394 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12395 return str;
12396}
12397
12398VALUE
12399rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12400{
12401 VALUE obj;
12402
12403 if (argc < 1) {
12404 rb_raise(rb_eArgError, "no receiver given");
12405 }
12406 obj = argv[0];
12407 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12408}
12409
12410/*
12411 * call-seq:
12412 * succ
12413 *
12414 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12415 *
12416 * :foo.succ # => :fop
12417 *
12418 * Related: String#succ.
12419 */
12420
12421static VALUE
12422sym_succ(VALUE sym)
12423{
12424 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12425}
12426
12427/*
12428 * call-seq:
12429 * symbol <=> object -> -1, 0, +1, or nil
12430 *
12431 * If +object+ is a symbol,
12432 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12433 *
12434 * :bar <=> :foo # => -1
12435 * :foo <=> :foo # => 0
12436 * :foo <=> :bar # => 1
12437 *
12438 * Otherwise, returns +nil+:
12439 *
12440 * :foo <=> 'bar' # => nil
12441 *
12442 * Related: String#<=>.
12443 */
12444
12445static VALUE
12446sym_cmp(VALUE sym, VALUE other)
12447{
12448 if (!SYMBOL_P(other)) {
12449 return Qnil;
12450 }
12451 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12452}
12453
12454/*
12455 * call-seq:
12456 * casecmp(object) -> -1, 0, 1, or nil
12457 *
12458 * :include: doc/symbol/casecmp.rdoc
12459 *
12460 */
12461
12462static VALUE
12463sym_casecmp(VALUE sym, VALUE other)
12464{
12465 if (!SYMBOL_P(other)) {
12466 return Qnil;
12467 }
12468 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12469}
12470
12471/*
12472 * call-seq:
12473 * casecmp?(object) -> true, false, or nil
12474 *
12475 * :include: doc/symbol/casecmp_p.rdoc
12476 *
12477 */
12478
12479static VALUE
12480sym_casecmp_p(VALUE sym, VALUE other)
12481{
12482 if (!SYMBOL_P(other)) {
12483 return Qnil;
12484 }
12485 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12486}
12487
12488/*
12489 * call-seq:
12490 * symbol =~ object -> integer or nil
12491 *
12492 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12493 * including possible updates to global variables;
12494 * see String#=~.
12495 *
12496 */
12497
12498static VALUE
12499sym_match(VALUE sym, VALUE other)
12500{
12501 return rb_str_match(rb_sym2str(sym), other);
12502}
12503
12504/*
12505 * call-seq:
12506 * match(pattern, offset = 0) -> matchdata or nil
12507 * match(pattern, offset = 0) {|matchdata| } -> object
12508 *
12509 * Equivalent to <tt>self.to_s.match</tt>,
12510 * including possible updates to global variables;
12511 * see String#match.
12512 *
12513 */
12514
12515static VALUE
12516sym_match_m(int argc, VALUE *argv, VALUE sym)
12517{
12518 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12519}
12520
12521/*
12522 * call-seq:
12523 * match?(pattern, offset = 0) -> true or false
12524 *
12525 * Equivalent to <tt>sym.to_s.match?</tt>;
12526 * see String#match?.
12527 *
12528 */
12529
12530static VALUE
12531sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12532{
12533 return rb_str_match_m_p(argc, argv, sym);
12534}
12535
12536/*
12537 * call-seq:
12538 * symbol[index] -> string or nil
12539 * symbol[start, length] -> string or nil
12540 * symbol[range] -> string or nil
12541 * symbol[regexp, capture = 0] -> string or nil
12542 * symbol[substring] -> string or nil
12543 *
12544 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12545 *
12546 */
12547
12548static VALUE
12549sym_aref(int argc, VALUE *argv, VALUE sym)
12550{
12551 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12552}
12553
12554/*
12555 * call-seq:
12556 * length -> integer
12557 *
12558 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12559 */
12560
12561static VALUE
12562sym_length(VALUE sym)
12563{
12564 return rb_str_length(rb_sym2str(sym));
12565}
12566
12567/*
12568 * call-seq:
12569 * empty? -> true or false
12570 *
12571 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12572 *
12573 */
12574
12575static VALUE
12576sym_empty(VALUE sym)
12577{
12578 return rb_str_empty(rb_sym2str(sym));
12579}
12580
12581/*
12582 * call-seq:
12583 * upcase(mapping) -> symbol
12584 *
12585 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12586 *
12587 * See String#upcase.
12588 *
12589 */
12590
12591static VALUE
12592sym_upcase(int argc, VALUE *argv, VALUE sym)
12593{
12594 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12595}
12596
12597/*
12598 * call-seq:
12599 * downcase(mapping) -> symbol
12600 *
12601 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12602 *
12603 * See String#downcase.
12604 *
12605 * Related: Symbol#upcase.
12606 *
12607 */
12608
12609static VALUE
12610sym_downcase(int argc, VALUE *argv, VALUE sym)
12611{
12612 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12613}
12614
12615/*
12616 * call-seq:
12617 * capitalize(mapping) -> symbol
12618 *
12619 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12620 *
12621 * See String#capitalize.
12622 *
12623 */
12624
12625static VALUE
12626sym_capitalize(int argc, VALUE *argv, VALUE sym)
12627{
12628 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12629}
12630
12631/*
12632 * call-seq:
12633 * swapcase(mapping) -> symbol
12634 *
12635 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12636 *
12637 * See String#swapcase.
12638 *
12639 */
12640
12641static VALUE
12642sym_swapcase(int argc, VALUE *argv, VALUE sym)
12643{
12644 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12645}
12646
12647/*
12648 * call-seq:
12649 * start_with?(*string_or_regexp) -> true or false
12650 *
12651 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12652 *
12653 */
12654
12655static VALUE
12656sym_start_with(int argc, VALUE *argv, VALUE sym)
12657{
12658 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12659}
12660
12661/*
12662 * call-seq:
12663 * end_with?(*strings) -> true or false
12664 *
12665 *
12666 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12667 *
12668 */
12669
12670static VALUE
12671sym_end_with(int argc, VALUE *argv, VALUE sym)
12672{
12673 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12674}
12675
12676/*
12677 * call-seq:
12678 * encoding -> encoding
12679 *
12680 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12681 *
12682 */
12683
12684static VALUE
12685sym_encoding(VALUE sym)
12686{
12687 return rb_obj_encoding(rb_sym2str(sym));
12688}
12689
12690static VALUE
12691string_for_symbol(VALUE name)
12692{
12693 if (!RB_TYPE_P(name, T_STRING)) {
12694 VALUE tmp = rb_check_string_type(name);
12695 if (NIL_P(tmp)) {
12696 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12697 name);
12698 }
12699 name = tmp;
12700 }
12701 return name;
12702}
12703
12704ID
12706{
12707 if (SYMBOL_P(name)) {
12708 return SYM2ID(name);
12709 }
12710 name = string_for_symbol(name);
12711 return rb_intern_str(name);
12712}
12713
12714VALUE
12716{
12717 if (SYMBOL_P(name)) {
12718 return name;
12719 }
12720 name = string_for_symbol(name);
12721 return rb_str_intern(name);
12722}
12723
12724/*
12725 * call-seq:
12726 * Symbol.all_symbols -> array_of_symbols
12727 *
12728 * Returns an array of all symbols currently in Ruby's symbol table:
12729 *
12730 * Symbol.all_symbols.size # => 9334
12731 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12732 *
12733 */
12734
12735static VALUE
12736sym_all_symbols(VALUE _)
12737{
12738 return rb_sym_all_symbols();
12739}
12740
12741VALUE
12742rb_str_to_interned_str(VALUE str)
12743{
12744 return rb_fstring(str);
12745}
12746
12747VALUE
12748rb_interned_str(const char *ptr, long len)
12749{
12750 struct RString fake_str = {RBASIC_INIT};
12751 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12752}
12753
12754VALUE
12756{
12757 return rb_interned_str(ptr, strlen(ptr));
12758}
12759
12760VALUE
12761rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12762{
12763 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12764 rb_enc_autoload(enc);
12765 }
12766
12767 struct RString fake_str = {RBASIC_INIT};
12768 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12769}
12770
12771VALUE
12772rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12773{
12774 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12775 rb_enc_autoload(enc);
12776 }
12777
12778 struct RString fake_str = {RBASIC_INIT};
12779 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12780}
12781
12782VALUE
12784{
12785 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12786}
12787
#if USE_YJIT
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    /* Fast path: an ASCII-8BIT receiver and a code in [0, 0xff) can be
     * appended as one raw byte. Everything else falls through to the
     * general rb_str_concat. */
    if (RB_LIKELY(is_binary)) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int in_fast_range = (code >= 0 && code < 0xff);

        if (RB_LIKELY(in_fast_range)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12804
12805static int
12806fstring_set_class_i(VALUE *str, void *data)
12807{
12808 RBASIC_SET_CLASS(*str, rb_cString);
12809
12810 return ST_CONTINUE;
12811}
12812
/*
 * Sets up the String and Symbol classes.
 *
 * In order, this function:
 *  - defines String (superclass rb_cObject) and walks fstring_table_obj so
 *    that every entry gets rb_cString as its class;
 *  - registers String's allocator, singleton methods and instance methods;
 *  - interns the :ascii, :turkic, :lithuanian and :fold option symbols;
 *  - defines the UnicodeNormalize module and the two IDs passed on by the
 *    unicode_normalize family of methods;
 *  - installs the hooked globals $; and $-F, both backed by rb_fs;
 *  - defines Symbol (superclass rb_cObject) and registers its methods.
 *
 * In the registrations below the last argument is -1 for functions in this
 * file that take (argc, argv, self), and the fixed argument count otherwise.
 */
12813void
12814Init_String(void)
12815{
12816 rb_cString = rb_define_class("String", rb_cObject);
12817
 /* NOTE(review): presumably needed because entries were added to the fstring
  * table before rb_cString existed -- confirm against the table's setup. */
12818 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12819
12821 rb_define_alloc_func(rb_cString, empty_str_alloc);
12822 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12823 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12824 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12826 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12827 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12830 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12831 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12832 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12833 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12836 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12837 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12838 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12839 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12842 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12843 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12844 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12845 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12846 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12848 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12850 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12851 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12852 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12853 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12854 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12855 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12856 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12857 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12858 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12859 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12860 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12861 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12862 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12863 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12865 rb_define_method(rb_cString, "+@", str_uplus, 0);
12866 rb_define_method(rb_cString, "-@", str_uminus, 0);
12867 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12868 rb_define_alias(rb_cString, "dedup", "-@");
12869
12870 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12871 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12872 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12873 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12876 rb_define_method(rb_cString, "undump", str_undump, 0);
12877
 /* NOTE(review): presumably the option symbols accepted by the case-mapping
  * methods registered right below -- their consumers are outside this function. */
12878 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12879 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12880 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12881 sym_fold = ID2SYM(rb_intern_const("fold"));
12882
12883 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12884 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12885 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12886 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12887
12888 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12889 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12890 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12891 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12892
12893 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12894 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12895 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12896 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12897 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12898 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12899 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12900 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12901 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12902 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12903 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12904 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12906 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12907 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12908 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12909 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12910 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12911
12912 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12913 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12914 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12915
12916 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12917
12918 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12919 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12920 rb_define_method(rb_cString, "center", rb_str_center, -1);
12921
12922 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12923 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12924 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12925 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12926 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12927 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12928 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12929 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12930 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12931
12932 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12933 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12934 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12935 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12936 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12937 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12938 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12939 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12940 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12941
12942 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12943 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12944 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12945 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12946 rb_define_method(rb_cString, "count", rb_str_count, -1);
12947
12948 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12949 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12950 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12951 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12952
12953 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12954 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12955 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12956 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12957 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12958
12959 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12960
12961 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12962 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12963
12964 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12965 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12966
12967 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12968 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12969 rb_define_method(rb_cString, "b", rb_str_b, 0);
12970 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12971 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12972
12973 /* define UnicodeNormalize module here so that we don't have to look it up */
12974 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12975 id_normalize = rb_intern_const("normalize");
12976 id_normalized_p = rb_intern_const("normalized?");
12977
12978 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12979 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12980 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12981
 /* $; and $-F are two names for the same slot (rb_fs): it starts as nil,
  * assignments go through rb_fs_setter, and its address is registered with
  * the GC. */
12982 rb_fs = Qnil;
12983 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12984 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12985 rb_gc_register_address(&rb_fs);
12986
12987 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12991 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12992
12993 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12994 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12995 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12996 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12997 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12998 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12999
13000 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13001 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13002 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13003 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13004
13005 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13006 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13007 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13008 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13009 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13010 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13011 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13012
13013 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13014 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13015 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13016 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13017
13018 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13019 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13020
13021 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13022}
13023
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1701
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1484
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1602
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2853
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2673
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3143
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1037
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2932
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:676
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2164
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2182
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1341
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3578
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:583
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:177
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1329
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1327
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:932
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1192
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3003
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1211
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12761
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2309
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3707
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1140
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1432
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1333
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:951
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12783
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:816
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2677
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2940
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:701
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1959
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1965
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1927
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4220
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3717
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1727
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1497
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2462
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3772
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1408
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12391
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2535
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1384
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1721
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3031
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5407
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4135
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3128
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11690
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1763
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1174
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:986
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1503
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1971
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4121
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3540
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2398
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always creates an instance of rb_cString regardless of the given object's class...
Definition string.c:1989
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6661
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3136
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12755
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1414
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3738
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3078
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4242
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3362
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7334
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2765
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12748
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4189
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4009
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4164
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3714
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3253
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5917
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11748
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1677
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2925
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3225
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3344
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1186
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2719
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7441
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1396
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1693
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2412
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5835
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9502
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1180
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1825
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1985
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2064
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3359
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1622
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12715
ID rb_to_id(VALUE str)
Definition string.c:12705
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3496
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4464
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1426
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2902
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2784
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1420
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2797
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1754
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:456
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1477
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition string.c:8385
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113