Ruby 3.5.0dev (2025-08-30 revision 96c8938535ff0cb2bf65943d7472a6702a121e16)
string.c (96c8938535ff0cb2bf65943d7472a6702a121e16)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
/* Shorthand for the `beg` / `end` arrays of a variable named `regs`;
 * such a variable must be in scope wherever these macros are used. */
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * stored in a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
/* NOTE(review): RUBY_MAX_CHAR_LEN is presumably the largest byte length of a
 * single character in any supported encoding -- confirm against its users. */
130#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits; the numbers match the flag table above. */
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
137
/* Mark `str` as heap-allocated: set STR_NOEMBED and clear the sharing
 * related bits (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
141} while (0)
/* Mark `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE.
 * Only the flags change; the contents are not moved. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
143
/* Store `n` as the length of `str`.  No terminator is written and no
 * capacity check is performed here. */
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
146} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
/* Terminator length of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` bytes at `ptr`.  One NUL byte is always
 * stored; the memset covers the remaining bytes only when termlen > 1.
 * Both arguments are evaluated exactly once. */
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
175} while (0)
176
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * required by the string's current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents are copied to a fresh heap buffer and the string
 *   is switched to the non-embedded representation.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 *
 * `capacity` and `termlen` are parenthesized at every use so that an
 * expression argument cannot be re-associated by operator precedence.
 * Arguments are still evaluated more than once, so they must be free of
 * side effects. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
/* Make `str` a shared view of `shared_str`.
 * Nothing is done when `str` is a fake string (STR_FAKESTR).  Otherwise the
 * macro asserts that the pointer of `str` lies within the buffer of
 * `shared_str`, records `shared_str` as the shared root through
 * RB_OBJ_WRITE, sets STR_SHARED on `str` and STR_SHARED_ROOT on the root,
 * and additionally sets STR_BORROWED on a root whose class is 0. */
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
210 } \
211} while (0)
212
/* Heap buffer pointer of a non-embedded string. */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Size of the heap buffer in bytes: the recorded capacity plus the
 * terminator length (capa itself does not include the terminator). */
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
215/* TODO: include the terminator size in capa. */
216
/* Encoding object of `str`; forwards to get_encoding(). */
217#define STR_ENC_GET(str) get_encoding(str)
218
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING at its
 * default of 0 this holds only when beg + len == end; when the option is
 * enabled it is always 1. */
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
221#endif
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#else
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* True when the GC can allocate an object large enough to embed `len`
 * content bytes plus `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Debugging aid: report on stderr that `func` is about to return a NULL
 * string pointer, so a crash that follows can be attributed to it. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
/* PRECOMPUTED_FAKESTR_HASH is defined only when `long` is as wide as a
 * pointer.  The fstring table code in this file tests it before reading a
 * hash value out of the `capa` field of a fake string. */
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
/*
 * Return the deduplicated, frozen ("fstring") counterpart of `str`.
 *
 * - A string already flagged RSTRING_FSTR is returned unchanged.
 * - A non-bare string (see BARE_STRING_P) is not returned from the table:
 *   an embedded one is frozen and returned, one that is a shared root (and
 *   not itself shared) is returned as is, and any other one has its
 *   contents replaced by the registered fstring through
 *   str_replace_shared_without_enc(), is frozen, and is returned.
 * - A bare string yields the value returned by register_fstring().
 *
 * Check_Type() rejects arguments that are not T_STRING.
 *
 * NOTE(review): the embedded listing numbers skip 422, so one line inside
 * the shared-root branch is not visible here.
 */
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
/* NOTE(review): resizing to the current length presumably drops unused
 * capacity before interning -- confirm against rb_str_resize. */
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
/* Comparison callback of the fstring table.  Returns true when `a` and `b`
 * have the same byte length, the same encoding index and identical bytes.
 * NOTE(review): the embedded listing numbers skip 468-469, so two lines at
 * the start of the body are not visible here. */
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Members of the argument record that register_fstring() fills in and
 * fstring_concurrent_set_create() reads as `struct fstr_create_arg`.
 * NOTE(review): the opening `struct ... {` line (listing number 478) is not
 * visible here. */
/* copy: for a fake string, allocate a new string holding a copy of the
 * bytes instead of wrapping the caller's buffer. */
479 bool copy;
/* force_precompute_hash: when copying a fake string, try to embed it with
 * room for the hash and store the hash after the terminator. */
480 bool force_precompute_hash;
481};
482
/*
 * Creation callback of the fstring table: build the object to be stored for
 * key `str`.  `data` points to a struct fstr_create_arg.
 *
 * Fake string (STR_FAKESTR):
 *   - arg->copy set: a new String receives a copy of the bytes.  When
 *     arg->force_precompute_hash is set and the string (with room for the
 *     hash) is embeddable, it is allocated embedded and its hash is stored
 *     after the terminator; otherwise str_new() is used and, under
 *     PRECOMPUTED_FAKESTR_HASH, the hash kept in the fake string's capa
 *     field is stored if the new string has room for it.
 *   - arg->copy clear: the buffer is wrapped with str_new_static().
 *   The result is frozen.
 * Real string:
 *   - an unfrozen or chilled string is replaced by str_new_frozen();
 *   - a shared string is made independent;
 *   - a non-bare string is replaced by str_new_frozen() as a plain String.
 *
 * Finally the coderange read on entry is set on the result and RSTRING_FSTR
 * is set.
 *
 * NOTE(review): the embedded listing numbers skip 531, 541, 542 and 545, so
 * some lines of this function are not visible here.
 */
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
/* Callback table handed to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and on-demand creation of entries.  No `free` callback
 * is registered. */
551static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555 .free = NULL,
556};
557
558void
559Init_fstring_table(void)
560{
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
563}
564
565static VALUE
566register_fstring(VALUE str, bool copy, bool force_precompute_hash)
567{
568 struct fstr_create_arg args = {
569 .copy = copy,
570 .force_precompute_hash = force_precompute_hash
571 };
572
573#if SIZEOF_VOIDP == SIZEOF_LONG
574 if (FL_TEST_RAW(str, STR_FAKESTR)) {
575 // if the string hasn't been interned, we'll need the hash twice, so we
576 // compute it once and store it in capa
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
578 }
579#endif
580
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
582
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
585 RUBY_ASSERT(OBJ_FROZEN(result));
586 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
588
589 return result;
590}
591
592bool
593rb_obj_is_fstring_table(VALUE obj)
594{
595 ASSERT_vm_locking();
596
597 return obj == fstring_table_obj;
598}
599
/* Remove `obj` from the fstring table: delete the entry by identity, bump
 * the obj_str_fstr debug counter and clear RSTRING_FSTR on the object.
 * The VM lock must be held by the caller (asserted). */
601void
602rb_gc_free_fstring(VALUE obj)
603{
604 // Assume locking and barrier (which there is no assert for)
605 ASSERT_vm_locking();
606
607 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
608
609 RB_DEBUG_COUNTER_INC(obj_str_fstr);
610
611 FL_UNSET(obj, RSTRING_FSTR);
612}
612
613void
614rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
615{
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
618 }
619}
620
/* Initialize the caller-provided `fake_str` as a non-embedded, STR_NOFREE,
 * STR_FAKESTR String that points at `name` (length `len`, encoding index
 * `encidx`) and return it as a VALUE.  A NULL `name` is replaced by "".
 * The capa field is set to `len`.
 * NOTE(review): the embedded listing numbers skip 628, so one line inside
 * the NULL-name branch is not visible here. */
622static VALUE
623setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
624{
625 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
626 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
627
628 if (!name) {
630 name = "";
631 }
632
633 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
634
635 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
636 fake_str->len = len;
637 fake_str->as.heap.ptr = (char *)name;
638 fake_str->as.heap.aux.capa = len;
639 return (VALUE)fake_str;
640}
640
641/*
642 * set up a fake string which refers a static string literal.
643 */
644VALUE
645rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
646{
647 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
648}
649
650/*
651 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
652 * shared string which refers a static string literal. `ptr` must
653 * point a constant string.
654 */
655VALUE
656rb_fstring_new(const char *ptr, long len)
657{
658 struct RString fake_str;
659 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
660}
661
662VALUE
663rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
664{
665 struct RString fake_str;
666 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
667}
668
669VALUE
670rb_fstring_cstr(const char *ptr)
671{
672 return rb_fstring_new(ptr, strlen(ptr));
673}
674
/* Conservative test for whether every character of `str` is one byte long.
 * ASCII-8BIT and US-ASCII always qualify; otherwise the string qualifies
 * when its cached coderange is 7bit or the encoding's maximum character
 * length is 1.  A false result does not prove that multibyte characters
 * are present.
 * NOTE(review): the embedded listing numbers skip 685, so the statement
 * belonging to the UTF-8 case is not visible here. */
676static inline bool
677single_byte_optimizable(VALUE str)
678{
679 int encindex = ENCODING_GET(str);
680 switch (encindex) {
681 case ENCINDEX_ASCII_8BIT:
682 case ENCINDEX_US_ASCII:
683 return true;
684 case ENCINDEX_UTF_8:
685 // For UTF-8 it's worth scanning the string coderange when unknown.
686 }
687 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
688 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
689 return true;
690 }
691
692 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
693 return true;
694 }
695
696 /* Conservative. Possibly single byte.
697 * "\xa1" in Shift_JIS for example. */
698 return false;
699}
700
702
/* Return a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * The scan has up to three phases:
 *   1. when unaligned word access is not allowed, leading bytes are checked
 *      one at a time until `p` is word aligned;
 *   2. whole words are tested against NONASCII_MASK, and the offending byte
 *      inside a matching word is located by counting leading (big endian)
 *      or trailing (little endian) zero bits;
 *   3. the remaining tail, shorter than a word, is checked byte by byte.
 * The case labels in both switch statements fall through on purpose. */
703static inline const char *
704search_nonascii(const char *p, const char *e)
705{
706 const uintptr_t *s, *t;
707
/* NONASCII_MASK: a word with 0x80 in every byte. */
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
713# else
714# error "don't know what to do."
715# endif
716#else
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL /* or...? */
721# else
722# error "don't know what to do."
723# endif
724#endif
725
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
/* Phase 1: advance p to the next word boundary, checking the skipped bytes. */
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
730 p += l;
731 switch (l) {
732 default: UNREACHABLE;
733#if SIZEOF_VOIDP > 4
734 case 7: if (p[-7]&0x80) return p-7;
735 case 6: if (p[-6]&0x80) return p-6;
736 case 5: if (p[-5]&0x80) return p-5;
737 case 4: if (p[-4]&0x80) return p-4;
738#endif
739 case 3: if (p[-3]&0x80) return p-3;
740 case 2: if (p[-2]&0x80) return p-2;
741 case 1: if (p[-1]&0x80) return p-1;
742 case 0: break;
743 }
744 }
745#endif
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#else
750#define aligned_ptr(value) (uintptr_t *)(value)
751#endif
/* Phase 2: t is set so that every word read below ends at or before e. */
752 s = aligned_ptr(p);
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
754#undef aligned_ptr
755 for (;s < t; s++) {
756 if (*s & NONASCII_MASK) {
757#ifdef WORDS_BIGENDIAN
758 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759#else
760 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
761#endif
762 }
763 }
764 p = (const char *)s;
765 }
766
/* Phase 3: fewer than SIZEOF_VOIDP bytes remain between p and e. */
767 switch (e - p) {
768 default: UNREACHABLE;
769#if SIZEOF_VOIDP > 4
770 case 7: if (e[-7]&0x80) return e-7;
771 case 6: if (e[-6]&0x80) return e-6;
772 case 5: if (e[-5]&0x80) return e-5;
773 case 4: if (e[-4]&0x80) return e-4;
774#endif
775 case 3: if (e[-3]&0x80) return e-3;
776 case 2: if (e[-2]&0x80) return e-2;
777 case 1: if (e[-1]&0x80) return e-1;
778 case 0: return NULL;
779 }
780}
781
/* Determine the coderange of the `len` bytes at `p` under encoding `enc`.
 * For an ASCII-compatible encoding the result is ENC_CODERANGE_7BIT when
 * search_nonascii() finds no high-bit byte; otherwise characters are
 * stepped over with rb_enc_precise_mbclen(), skipping ASCII runs between
 * them, and ENC_CODERANGE_VALID is returned once the end is reached.
 * NOTE(review): the embedded listing numbers skip 790, 798 and 808.  The
 * return statement of the ASCII-8BIT branch and the checks on `ret`
 * (presumably yielding ENC_CODERANGE_BROKEN) are not visible here. */
782static int
783coderange_scan(const char *p, long len, rb_encoding *enc)
784{
785 const char *e = p + len;
786
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
789 p = search_nonascii(p, e);
791 }
792
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
795 if (!p) return ENC_CODERANGE_7BIT;
796 for (;;) {
797 int ret = rb_enc_precise_mbclen(p, e, enc);
799 p += MBCLEN_CHARFOUND_LEN(ret);
800 if (p == e) break;
801 p = search_nonascii(p, e);
802 if (!p) break;
803 }
804 }
805 else {
806 while (p < e) {
807 int ret = rb_enc_precise_mbclen(p, e, enc);
809 p += MBCLEN_CHARFOUND_LEN(ret);
810 }
811 }
812 return ENC_CODERANGE_VALID;
813}
814
/* Scan [s, e) under `enc`, using `*cr` as the coderange accumulated so far,
 * and return how many bytes were consumed.
 * - When *cr is already ENC_CODERANGE_BROKEN the whole range is reported as
 *   consumed without scanning.
 * - For an ASCII-compatible encoding with no high-bit byte, *cr becomes
 *   ENC_CODERANGE_7BIT unless it was ENC_CODERANGE_VALID.
 * - When rb_enc_precise_mbclen() does not find a complete character, the
 *   offset of that position is returned.
 * NOTE(review): the embedded listing numbers skip 827, 839, 852 and 858, so
 * the assignments to *cr on those paths are not visible here. */
815long
816rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
817{
818 const char *p = s;
819
820 if (*cr == ENC_CODERANGE_BROKEN)
821 return e - s;
822
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
824 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
825 if (*cr == ENC_CODERANGE_VALID) return e - s;
826 p = search_nonascii(p, e);
828 return e - s;
829 }
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
832 if (!p) {
833 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
834 return e - s;
835 }
836 for (;;) {
837 int ret = rb_enc_precise_mbclen(p, e, enc);
838 if (!MBCLEN_CHARFOUND_P(ret)) {
840 return p - s;
841 }
842 p += MBCLEN_CHARFOUND_LEN(ret);
843 if (p == e) break;
844 p = search_nonascii(p, e);
845 if (!p) break;
846 }
847 }
848 else {
849 while (p < e) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 }
857 }
859 return e - s;
860}
861
862static inline void
863str_enc_copy(VALUE str1, VALUE str2)
864{
865 rb_enc_set_index(str1, ENCODING_GET(str2));
866}
867
868/* Like str_enc_copy, but does not check frozen status of str1.
869 * You should use this only if you're certain that str1 is not frozen. */
870static inline void
871str_enc_copy_direct(VALUE str1, VALUE str2)
872{
873 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
874 if (inlined_encoding == ENCODING_INLINE_MAX) {
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
876 }
877 else {
878 ENCODING_SET_INLINED(str1, inlined_encoding);
879 }
880}
881
/* Copy the encoding from `src` to `dest`, where `dest` was made from a part
 * of `src`; the coderange of `dest` is then derived from that of `src`.
 * NOTE(review): many lines of this function are not visible in this listing
 * (the embedded numbers skip 891, 893, 897, 898, 900, 903 and 905): the
 * statements of the empty-string branches, two `case` labels and the
 * coderange assignments.  Do not edit this text as if it were complete. */
883static void
884rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
885{
886 /* this function is designed for copying encoding and coderange
887 * from src to new string "dest" which is made from the part of src.
888 */
889 str_enc_copy(dest, src);
890 if (RSTRING_LEN(dest) == 0) {
891 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
893 else
895 return;
896 }
897 switch (ENC_CODERANGE(src)) {
900 break;
902 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
903 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
905 else
907 break;
908 default:
909 break;
910 }
911}
911
/* Copy the encoding of `src` to `dest`.
 * NOTE(review): the embedded listing numbers skip 916; judging by the
 * function name that line presumably copies the coderange too -- confirm. */
913static void
914rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
915{
916 str_enc_copy(dest, src);
918}
918
919static int
920enc_coderange_scan(VALUE str, rb_encoding *enc)
921{
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
923}
924
925int
926rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
927{
928 return enc_coderange_scan(str, enc);
929}
930
/* Return the coderange of `str`, scanning the contents and caching the
 * result on the string when the stored value is ENC_CODERANGE_UNKNOWN.
 * NOTE(review): the line carrying the function name and parameter list
 * (listing number 932) is not visible here; presumably this is
 * rb_enc_str_coderange(VALUE str) -- confirm. */
931int
933{
934 int cr = ENC_CODERANGE(str);
935
936 if (cr == ENC_CODERANGE_UNKNOWN) {
937 cr = enc_coderange_scan(str, get_encoding(str));
938 ENC_CODERANGE_SET(str, cr);
939 }
940 return cr;
941}
942
943static inline bool
944rb_enc_str_asciicompat(VALUE str)
945{
946 int encindex = ENCODING_GET_INLINED(str);
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
948}
949
/* Dispatch on the stored coderange of `str`; the default case returns
 * false.
 * NOTE(review): the function-name line (listing number 951) and two `case`
 * labels (954, 956) are not visible here; presumably this is
 * rb_enc_str_asciionly_p(VALUE str) -- confirm. */
950int
952{
953 switch(ENC_CODERANGE(str)) {
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
957 return true;
958 default:
959 return false;
960 }
961}
962
963static inline void
964str_mod_check(VALUE s, const char *p, long len)
965{
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
967 rb_raise(rb_eRuntimeError, "string modified");
968 }
969}
970
971static size_t
972str_capacity(VALUE str, const int termlen)
973{
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
976 }
977 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
978 return RSTRING(str)->len;
979 }
980 else {
981 return RSTRING(str)->as.heap.aux.capa;
982 }
983}
984
/* Capacity of `str` for its current terminator length; see str_capacity().
 * NOTE(review): the function-name line (listing number 986) is not visible
 * here; presumably this is rb_str_capacity(VALUE str) -- confirm. */
985size_t
987{
988 return str_capacity(str, TERM_LEN(str));
989}
990
991static inline void
992must_not_null(const char *ptr)
993{
994 if (!ptr) {
995 rb_raise(rb_eArgError, "NULL pointer given");
996 }
997}
998
/* Allocate an embedded String of class `klass` whose inline area holds
 * `capa` bytes.  The resulting object size must be non-zero and allocatable
 * by the GC (both asserted).
 * NOTE(review): the continuation line of the NEWOBJ_OF(...) call (listing
 * number 1007) is not visible here, so the flags and size passed to it
 * cannot be read from this listing. */
999static inline VALUE
1000str_alloc_embed(VALUE klass, size_t capa)
1001{
1002 size_t size = rb_str_embed_size(capa);
1003 RUBY_ASSERT(size > 0);
1004 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1005
1006 NEWOBJ_OF(str, struct RString, klass,
1008
1009 return (VALUE)str;
1010}
1011
1012static inline VALUE
1013str_alloc_heap(VALUE klass)
1014{
1015 NEWOBJ_OF(str, struct RString, klass,
1016 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1017
1018 return (VALUE)str;
1019}
1020
/* Allocate an empty embedded String of class `klass` and zero the whole
 * inline area of its slot.
 * NOTE(review): the embedded listing numbers skip 1027, so one statement
 * before the return is not visible here. */
1021static inline VALUE
1022empty_str_alloc(VALUE klass)
1023{
1024 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1025 VALUE str = str_alloc_embed(klass, 0);
1026 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1028 return str;
1029}
1030
1031static VALUE
1032str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1033{
1034 VALUE str;
1035
1036 if (len < 0) {
1037 rb_raise(rb_eArgError, "negative string size (or size too big)");
1038 }
1039
1040 if (enc == NULL) {
1041 enc = rb_ascii8bit_encoding();
1042 }
1043
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045
1046 int termlen = rb_enc_mbminlen(enc);
1047
1048 if (STR_EMBEDDABLE_P(len, termlen)) {
1049 str = str_alloc_embed(klass, len + termlen);
1050 if (len == 0) {
1051 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1052 }
1053 }
1054 else {
1055 str = str_alloc_heap(klass);
1056 RSTRING(str)->as.heap.aux.capa = len;
1057 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1058 * integer overflow. If we can STATIC_ASSERT that, the following
1059 * mul_add_mul can be reverted to a simple ALLOC_N. */
1060 RSTRING(str)->as.heap.ptr =
1061 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1062 }
1063
1064 rb_enc_raw_set(str, enc);
1065
1066 if (ptr) {
1067 memcpy(RSTRING_PTR(str), ptr, len);
1068 }
1069
1070 STR_SET_LEN(str, len);
1071 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1072 return str;
1073}
1074
1075static VALUE
1076str_new(VALUE klass, const char *ptr, long len)
1077{
1078 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1079}
1080
1081VALUE
1082rb_str_new(const char *ptr, long len)
1083{
1084 return str_new(rb_cString, ptr, len);
1085}
1086
1087VALUE
1088rb_usascii_str_new(const char *ptr, long len)
1089{
1090 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1091}
1092
1093VALUE
1094rb_utf8_str_new(const char *ptr, long len)
1095{
1096 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1097}
1098
1099VALUE
1100rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1101{
1102 return str_enc_new(rb_cString, ptr, len, enc);
1103}
1104
/* Create a String from the NUL-terminated C string `ptr`; a NULL pointer
 * raises ArgumentError through must_not_null().
 * NOTE(review): the function-name line (listing number 1106) is not visible
 * here; judging by the comment in the body this is rb_str_new_cstr. */
1105VALUE
1107{
1108 must_not_null(ptr);
1109 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1110 * memory regions, and that cannot be detected by the MSAN. Just
1111 * trust the programmer that the argument passed here is a sane C
1112 * string. */
1113 __msan_unpoison_string(ptr);
1114 return rb_str_new(ptr, strlen(ptr));
1115}
1116
/* Create a US-ASCII String from the C string `ptr` by delegating to
 * rb_enc_str_new_cstr().
 * NOTE(review): the function-name line (listing number 1118) is not visible
 * here; presumably rb_usascii_str_new_cstr(const char *ptr) -- confirm. */
1117VALUE
1119{
1120 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1121}
1122
/* Create a UTF-8 String from the C string `ptr` by delegating to
 * rb_enc_str_new_cstr().
 * NOTE(review): the function-name line (listing number 1124) is not visible
 * here; presumably rb_utf8_str_new_cstr(const char *ptr) -- confirm. */
1123VALUE
1125{
1126 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1127}
1128
/* Create a String with encoding `enc` from the NUL-terminated `ptr`.
 * Raises ArgumentError when `ptr` is NULL or when the minimum character
 * length of `enc` is not 1 (strlen() cannot find the end of such data).
 * NOTE(review): the function-name line (listing number 1130) is not visible
 * here; presumably rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc). */
1129VALUE
1131{
1132 must_not_null(ptr);
1133 if (rb_enc_mbminlen(enc) != 1) {
1134 rb_raise(rb_eArgError, "wchar encoding given");
1135 }
1136 return rb_enc_str_new(ptr, strlen(ptr), enc);
1137}
1138
1139static VALUE
1140str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1141{
1142 VALUE str;
1143
1144 if (len < 0) {
1145 rb_raise(rb_eArgError, "negative string size (or size too big)");
1146 }
1147
1148 if (!ptr) {
1149 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1150 }
1151 else {
1152 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1153 str = str_alloc_heap(klass);
1154 RSTRING(str)->len = len;
1155 RSTRING(str)->as.heap.ptr = (char *)ptr;
1156 RSTRING(str)->as.heap.aux.capa = len;
1157 RBASIC(str)->flags |= STR_NOFREE;
1158 rb_enc_associate_index(str, encindex);
1159 }
1160 return str;
1161}
1162
1163VALUE
1164rb_str_new_static(const char *ptr, long len)
1165{
1166 return str_new_static(rb_cString, ptr, len, 0);
1167}
1168
/* Wrap a static buffer as a US-ASCII String; see str_new_static().
 * NOTE(review): the function-name line (listing number 1170) is not visible
 * here; presumably rb_usascii_str_new_static(const char *ptr, long len). */
1169VALUE
1171{
1172 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1173}
1174
/* Wrap a static buffer as a UTF-8 String; see str_new_static().
 * NOTE(review): the function-name line (listing number 1176) is not visible
 * here; presumably rb_utf8_str_new_static(const char *ptr, long len). */
1175VALUE
1177{
1178 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1179}
1180
/* Wrap a static buffer as a String with encoding `enc`; see
 * str_new_static().
 * NOTE(review): the function-name line (listing number 1182) is not visible
 * here; presumably rb_enc_str_new_static(const char *ptr, long len,
 * rb_encoding *enc). */
1181VALUE
1183{
1184 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1185}
1186
1187static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1188 rb_encoding *from, rb_encoding *to,
1189 int ecflags, VALUE ecopts);
1190
1191static inline bool
1192is_enc_ascii_string(VALUE str, rb_encoding *enc)
1193{
1194 int encidx = rb_enc_to_index(enc);
1195 if (rb_enc_get_index(str) == encidx)
1196 return is_ascii_string(str);
1197 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1198}
1199
1200VALUE
1201rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1202{
1203 long len;
1204 const char *ptr;
1205 VALUE newstr;
1206
1207 if (!to) return str;
1208 if (!from) from = rb_enc_get(str);
1209 if (from == to) return str;
1210 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1211 rb_is_ascii8bit_enc(to)) {
1212 if (STR_ENC_GET(str) != to) {
1213 str = rb_str_dup(str);
1214 rb_enc_associate(str, to);
1215 }
1216 return str;
1217 }
1218
1219 RSTRING_GETMEM(str, ptr, len);
1220 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1221 from, to, ecflags, ecopts);
1222 if (NIL_P(newstr)) {
1223 /* some error, return original */
1224 return str;
1225 }
1226 return newstr;
1227}
1228
1229VALUE
1230rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1231 rb_encoding *from, int ecflags, VALUE ecopts)
1232{
1233 long olen;
1234
1235 olen = RSTRING_LEN(newstr);
1236 if (ofs < -olen || olen < ofs)
1237 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1238 if (ofs < 0) ofs += olen;
1239 if (!from) {
1240 STR_SET_LEN(newstr, ofs);
1241 return rb_str_cat(newstr, ptr, len);
1242 }
1243
1244 rb_str_modify(newstr);
1245 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1246 rb_enc_get(newstr),
1247 ecflags, ecopts);
1248}
1249
1250VALUE
1251rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1252{
1253 STR_SET_LEN(str, 0);
1254 rb_enc_associate(str, enc);
1255 rb_str_cat(str, ptr, len);
1256 return str;
1257}
1258
1259static VALUE
1260str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1261 rb_encoding *from, rb_encoding *to,
1262 int ecflags, VALUE ecopts)
1263{
1264 rb_econv_t *ec;
1266 long olen;
1267 VALUE econv_wrapper;
1268 const unsigned char *start, *sp;
1269 unsigned char *dest, *dp;
1270 size_t converted_output = (size_t)ofs;
1271
1272 olen = rb_str_capacity(newstr);
1273
1274 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1275 RBASIC_CLEAR_CLASS(econv_wrapper);
1276 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1277 if (!ec) return Qnil;
1278 DATA_PTR(econv_wrapper) = ec;
1279
1280 sp = (unsigned char*)ptr;
1281 start = sp;
1282 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1283 (dp = dest + converted_output),
1284 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1286 /* destination buffer short */
1287 size_t converted_input = sp - start;
1288 size_t rest = len - converted_input;
1289 converted_output = dp - dest;
1290 rb_str_set_len(newstr, converted_output);
1291 if (converted_input && converted_output &&
1292 rest < (LONG_MAX / converted_output)) {
1293 rest = (rest * converted_output) / converted_input;
1294 }
1295 else {
1296 rest = olen;
1297 }
1298 olen += rest < 2 ? 2 : rest;
1299 rb_str_resize(newstr, olen);
1300 }
1301 DATA_PTR(econv_wrapper) = 0;
1302 RB_GC_GUARD(econv_wrapper);
1303 rb_econv_close(ec);
1304 switch (ret) {
1305 case econv_finished:
1306 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1307 rb_str_set_len(newstr, len);
1308 rb_enc_associate(newstr, to);
1309 return newstr;
1310
1311 default:
1312 return Qnil;
1313 }
1314}
1315
1316VALUE
1318{
1319 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1320}
1321
1322VALUE
1324{
1325 rb_encoding *ienc;
1326 VALUE str;
1327 const int eidx = rb_enc_to_index(eenc);
1328
1329 if (!ptr) {
1330 return rb_enc_str_new(ptr, len, eenc);
1331 }
1332
1333 /* ASCII-8BIT case, no conversion */
1334 if ((eidx == rb_ascii8bit_encindex()) ||
1335 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1336 return rb_str_new(ptr, len);
1337 }
1338 /* no default_internal or same encoding, no conversion */
1339 ienc = rb_default_internal_encoding();
1340 if (!ienc || eenc == ienc) {
1341 return rb_enc_str_new(ptr, len, eenc);
1342 }
1343 /* ASCII compatible, and ASCII only string, no conversion in
1344 * default_internal */
1345 if ((eidx == rb_ascii8bit_encindex()) ||
1346 (eidx == rb_usascii_encindex()) ||
1347 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1348 return rb_enc_str_new(ptr, len, ienc);
1349 }
1350 /* convert from the given encoding to default_internal */
1351 str = rb_enc_str_new(NULL, 0, ienc);
1352 /* when the conversion failed for some reason, just ignore the
1353 * default_internal and result in the given encoding as-is. */
1354 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1355 rb_str_initialize(str, ptr, len, eenc);
1356 }
1357 return str;
1358}
1359
1360VALUE
1361rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1362{
1363 int eidx = rb_enc_to_index(eenc);
1364 if (eidx == rb_usascii_encindex() &&
1365 !is_ascii_string(str)) {
1366 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1367 return str;
1368 }
1369 rb_enc_associate_index(str, eidx);
1370 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1371}
1372
1373VALUE
1374rb_external_str_new(const char *ptr, long len)
1375{
1376 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1377}
1378
1379VALUE
1381{
1382 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1383}
1384
1385VALUE
1386rb_locale_str_new(const char *ptr, long len)
1387{
1388 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1389}
1390
1391VALUE
1393{
1394 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1395}
1396
1397VALUE
1399{
1400 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1401}
1402
1403VALUE
1405{
1406 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1407}
1408
1409VALUE
1411{
1412 return rb_str_export_to_enc(str, rb_default_external_encoding());
1413}
1414
1415VALUE
1417{
1418 return rb_str_export_to_enc(str, rb_locale_encoding());
1419}
1420
1421VALUE
1423{
1424 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1425}
1426
1427static VALUE
1428str_replace_shared_without_enc(VALUE str2, VALUE str)
1429{
1430 const int termlen = TERM_LEN(str);
1431 char *ptr;
1432 long len;
1433
1434 RSTRING_GETMEM(str, ptr, len);
1435 if (str_embed_capa(str2) >= len + termlen) {
1436 char *ptr2 = RSTRING(str2)->as.embed.ary;
1437 STR_SET_EMBED(str2);
1438 memcpy(ptr2, RSTRING_PTR(str), len);
1439 TERM_FILL(ptr2+len, termlen);
1440 }
1441 else {
1442 VALUE root;
1443 if (STR_SHARED_P(str)) {
1444 root = RSTRING(str)->as.heap.aux.shared;
1445 RSTRING_GETMEM(str, ptr, len);
1446 }
1447 else {
1448 root = rb_str_new_frozen(str);
1449 RSTRING_GETMEM(root, ptr, len);
1450 }
1451 RUBY_ASSERT(OBJ_FROZEN(root));
1452
1453 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1454 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1455 rb_fatal("about to free a possible shared root");
1456 }
1457 char *ptr2 = STR_HEAP_PTR(str2);
1458 if (ptr2 != ptr) {
1459 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1460 }
1461 }
1462 FL_SET(str2, STR_NOEMBED);
1463 RSTRING(str2)->as.heap.ptr = ptr;
1464 STR_SET_SHARED(str2, root);
1465 }
1466
1467 STR_SET_LEN(str2, len);
1468
1469 return str2;
1470}
1471
1472static VALUE
1473str_replace_shared(VALUE str2, VALUE str)
1474{
1475 str_replace_shared_without_enc(str2, str);
1476 rb_enc_cr_str_exact_copy(str2, str);
1477 return str2;
1478}
1479
1480static VALUE
1481str_new_shared(VALUE klass, VALUE str)
1482{
1483 return str_replace_shared(str_alloc_heap(klass), str);
1484}
1485
1486VALUE
1488{
1489 return str_new_shared(rb_obj_class(str), str);
1490}
1491
1492VALUE
1494{
1495 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1496 return str_new_frozen(rb_obj_class(orig), orig);
1497}
1498
1499static VALUE
1500rb_str_new_frozen_String(VALUE orig)
1501{
1502 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1503 return str_new_frozen(rb_cString, orig);
1504}
1505
1506
1507VALUE
1508rb_str_frozen_bare_string(VALUE orig)
1509{
1510 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1511 return str_new_frozen(rb_cString, orig);
1512}
1513
1514VALUE
1515rb_str_tmp_frozen_acquire(VALUE orig)
1516{
1517 if (OBJ_FROZEN_RAW(orig)) return orig;
1518 return str_new_frozen_buffer(0, orig, FALSE);
1519}
1520
1521VALUE
1522rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1523{
1524 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1525 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1526
1527 VALUE str = str_alloc_heap(0);
1528 OBJ_FREEZE(str);
1529 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1530 FL_SET(str, STR_SHARED_ROOT);
1531
1532 size_t capa = str_capacity(orig, TERM_LEN(orig));
1533
1534 /* If the string is embedded then we want to create a copy that is heap
1535 * allocated. If the string is shared then the shared root must be
1536 * embedded, so we want to create a copy. If the string is a shared root
1537 * then it must be embedded, so we want to create a copy. */
1538 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1539 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1540 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1541 }
1542 else {
1543 /* orig must be heap allocated and not shared, so we can safely transfer
1544 * the pointer to str. */
1545 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1546 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1547 RBASIC(orig)->flags &= ~STR_NOFREE;
1548 STR_SET_SHARED(orig, str);
1549 }
1550
1551 RSTRING(str)->len = RSTRING(orig)->len;
1552 RSTRING(str)->as.heap.aux.capa = capa;
1553
1554 return str;
1555}
1556
1557void
1558rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1559{
1560 if (RBASIC_CLASS(tmp) != 0)
1561 return;
1562
1563 if (STR_EMBED_P(tmp)) {
1565 }
1566 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1567 !OBJ_FROZEN_RAW(orig)) {
1568 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1569
1570 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1571 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1572 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1573
1574 /* Unshare orig since the root (tmp) only has this one child. */
1575 FL_UNSET_RAW(orig, STR_SHARED);
1576 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1577 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1579
1580 /* Make tmp embedded and empty so it is safe for sweeping. */
1581 STR_SET_EMBED(tmp);
1582 STR_SET_LEN(tmp, 0);
1583 }
1584 }
1585}
1586
1587static VALUE
1588str_new_frozen(VALUE klass, VALUE orig)
1589{
1590 return str_new_frozen_buffer(klass, orig, TRUE);
1591}
1592
1593static VALUE
1594heap_str_make_shared(VALUE klass, VALUE orig)
1595{
1596 RUBY_ASSERT(!STR_EMBED_P(orig));
1597 RUBY_ASSERT(!STR_SHARED_P(orig));
1598
1599 VALUE str = str_alloc_heap(klass);
1600 STR_SET_LEN(str, RSTRING_LEN(orig));
1601 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1602 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1603 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1604 RBASIC(orig)->flags &= ~STR_NOFREE;
1605 STR_SET_SHARED(orig, str);
1606 if (klass == 0)
1607 FL_UNSET_RAW(str, STR_BORROWED);
1608 return str;
1609}
1610
1611static VALUE
1612str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1613{
1614 VALUE str;
1615
1616 long len = RSTRING_LEN(orig);
1617 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1618 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1619
1620 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1621 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1622 RUBY_ASSERT(STR_EMBED_P(str));
1623 }
1624 else {
1625 if (FL_TEST_RAW(orig, STR_SHARED)) {
1626 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1627 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1628 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1629 RUBY_ASSERT(ofs >= 0);
1630 RUBY_ASSERT(rest >= 0);
1631 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1633
1634 if ((ofs > 0) || (rest > 0) ||
1635 (klass != RBASIC(shared)->klass) ||
1636 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1637 str = str_new_shared(klass, shared);
1638 RUBY_ASSERT(!STR_EMBED_P(str));
1639 RSTRING(str)->as.heap.ptr += ofs;
1640 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1641 }
1642 else {
1643 if (RBASIC_CLASS(shared) == 0)
1644 FL_SET_RAW(shared, STR_BORROWED);
1645 return shared;
1646 }
1647 }
1648 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1649 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1650 STR_SET_EMBED(str);
1651 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1652 STR_SET_LEN(str, RSTRING_LEN(orig));
1653 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1654 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1655 }
1656 else {
1657 str = heap_str_make_shared(klass, orig);
1658 }
1659 }
1660
1661 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1662 OBJ_FREEZE(str);
1663 return str;
1664}
1665
1666VALUE
1667rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1668{
1669 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1670}
1671
1672static VALUE
1673str_new_empty_String(VALUE str)
1674{
1675 VALUE v = rb_str_new(0, 0);
1676 rb_enc_copy(v, str);
1677 return v;
1678}
1679
1680#define STR_BUF_MIN_SIZE 63
1681
1682VALUE
1684{
1685 if (STR_EMBEDDABLE_P(capa, 1)) {
1686 return str_alloc_embed(rb_cString, capa + 1);
1687 }
1688
1689 VALUE str = str_alloc_heap(rb_cString);
1690
1691 RSTRING(str)->as.heap.aux.capa = capa;
1692 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1693 RSTRING(str)->as.heap.ptr[0] = '\0';
1694
1695 return str;
1696}
1697
1698VALUE
1700{
1701 VALUE str;
1702 long len = strlen(ptr);
1703
1704 str = rb_str_buf_new(len);
1705 rb_str_buf_cat(str, ptr, len);
1706
1707 return str;
1708}
1709
1710VALUE
1712{
1713 return str_new(0, 0, len);
1714}
1715
1716void
1718{
1719 if (STR_EMBED_P(str)) {
1720 RB_DEBUG_COUNTER_INC(obj_str_embed);
1721 }
1722 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1724 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1725 }
1726 else {
1727 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1728 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1729 }
1730}
1731
1732size_t
1733rb_str_memsize(VALUE str)
1734{
1735 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1736 return STR_HEAP_SIZE(str);
1737 }
1738 else {
1739 return 0;
1740 }
1741}
1742
1743VALUE
1745{
1746 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1747}
1748
1749static inline void str_discard(VALUE str);
1750static void str_shared_replace(VALUE str, VALUE str2);
1751
1752void
1754{
1755 if (str != str2) str_shared_replace(str, str2);
1756}
1757
1758static void
1759str_shared_replace(VALUE str, VALUE str2)
1760{
1761 rb_encoding *enc;
1762 int cr;
1763 int termlen;
1764
1765 RUBY_ASSERT(str2 != str);
1766 enc = STR_ENC_GET(str2);
1767 cr = ENC_CODERANGE(str2);
1768 str_discard(str);
1769 termlen = rb_enc_mbminlen(enc);
1770
1771 STR_SET_LEN(str, RSTRING_LEN(str2));
1772
1773 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1774 STR_SET_EMBED(str);
1775 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1776 rb_enc_associate(str, enc);
1777 ENC_CODERANGE_SET(str, cr);
1778 }
1779 else {
1780 if (STR_EMBED_P(str2)) {
1781 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1782 long len = RSTRING_LEN(str2);
1783 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1784
1785 char *new_ptr = ALLOC_N(char, len + termlen);
1786 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1787 RSTRING(str2)->as.heap.ptr = new_ptr;
1788 STR_SET_LEN(str2, len);
1789 RSTRING(str2)->as.heap.aux.capa = len;
1790 STR_SET_NOEMBED(str2);
1791 }
1792
1793 STR_SET_NOEMBED(str);
1794 FL_UNSET(str, STR_SHARED);
1795 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1796
1797 if (FL_TEST(str2, STR_SHARED)) {
1798 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1799 STR_SET_SHARED(str, shared);
1800 }
1801 else {
1802 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1803 }
1804
1805 /* abandon str2 */
1806 STR_SET_EMBED(str2);
1807 RSTRING_PTR(str2)[0] = 0;
1808 STR_SET_LEN(str2, 0);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812}
1813
1814VALUE
1816{
1817 VALUE str;
1818
1819 if (RB_TYPE_P(obj, T_STRING)) {
1820 return obj;
1821 }
1822 str = rb_funcall(obj, idTo_s, 0);
1823 return rb_obj_as_string_result(str, obj);
1824}
1825
1826VALUE
1827rb_obj_as_string_result(VALUE str, VALUE obj)
1828{
1829 if (!RB_TYPE_P(str, T_STRING))
1830 return rb_any_to_s(obj);
1831 return str;
1832}
1833
1834static VALUE
1835str_replace(VALUE str, VALUE str2)
1836{
1837 long len;
1838
1839 len = RSTRING_LEN(str2);
1840 if (STR_SHARED_P(str2)) {
1841 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1843 STR_SET_NOEMBED(str);
1844 STR_SET_LEN(str, len);
1845 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1846 STR_SET_SHARED(str, shared);
1847 rb_enc_cr_str_exact_copy(str, str2);
1848 }
1849 else {
1850 str_replace_shared(str, str2);
1851 }
1852
1853 return str;
1854}
1855
1856static inline VALUE
1857ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1858{
1859 size_t size = rb_str_embed_size(capa);
1860 RUBY_ASSERT(size > 0);
1861 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1862
1863 NEWOBJ_OF(str, struct RString, klass,
1865
1866 return (VALUE)str;
1867}
1868
1869static inline VALUE
1870ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1871{
1872 NEWOBJ_OF(str, struct RString, klass,
1873 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1874
1875 return (VALUE)str;
1876}
1877
1878static inline VALUE
1879str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1880{
1881 int encidx = 0;
1882 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1883 encidx = rb_enc_get_index(str);
1884 flags &= ~ENCODING_MASK;
1885 }
1886 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1887 if (encidx) rb_enc_associate_index(dup, encidx);
1888 return dup;
1889}
1890
1891static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1892
1893static inline VALUE
1894str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1895{
1896 VALUE flags = FL_TEST_RAW(str, flag_mask);
1897 long len = RSTRING_LEN(str);
1898
1899 RUBY_ASSERT(STR_EMBED_P(dup));
1900 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1901 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1902 STR_SET_LEN(dup, RSTRING_LEN(str));
1903 return str_duplicate_setup_encoding(str, dup, flags);
1904}
1905
1906static inline VALUE
1907str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1908{
1909 VALUE flags = FL_TEST_RAW(str, flag_mask);
1910 VALUE root = str;
1911 if (FL_TEST_RAW(str, STR_SHARED)) {
1912 root = RSTRING(str)->as.heap.aux.shared;
1913 }
1914 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1915 root = str = str_new_frozen(klass, str);
1916 flags = FL_TEST_RAW(str, flag_mask);
1917 }
1918 RUBY_ASSERT(!STR_SHARED_P(root));
1920
1921 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1922 FL_SET(root, STR_SHARED_ROOT);
1923 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1924 flags |= RSTRING_NOEMBED | STR_SHARED;
1925
1926 STR_SET_LEN(dup, RSTRING_LEN(str));
1927 return str_duplicate_setup_encoding(str, dup, flags);
1928}
1929
1930static inline VALUE
1931str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1932{
1933 if (STR_EMBED_P(str)) {
1934 return str_duplicate_setup_embed(klass, str, dup);
1935 }
1936 else {
1937 return str_duplicate_setup_heap(klass, str, dup);
1938 }
1939}
1940
1941static inline VALUE
1942str_duplicate(VALUE klass, VALUE str)
1943{
1944 VALUE dup;
1945 if (STR_EMBED_P(str)) {
1946 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1947 }
1948 else {
1949 dup = str_alloc_heap(klass);
1950 }
1951
1952 return str_duplicate_setup(klass, str, dup);
1953}
1954
1955VALUE
1957{
1958 return str_duplicate(rb_obj_class(str), str);
1959}
1960
1961/* :nodoc: */
1962VALUE
1963rb_str_dup_m(VALUE str)
1964{
1965 if (LIKELY(BARE_STRING_P(str))) {
1966 return str_duplicate(rb_cString, str);
1967 }
1968 else {
1969 return rb_obj_dup(str);
1970 }
1971}
1972
1973VALUE
1975{
1976 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1977 return str_duplicate(rb_cString, str);
1978}
1979
1980VALUE
1981rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1982{
1983 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1984 VALUE new_str, klass = rb_cString;
1985
1986 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1987 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 str_duplicate_setup_embed(klass, str, new_str);
1989 }
1990 else {
1991 new_str = ec_str_alloc_heap(ec, klass);
1992 str_duplicate_setup_heap(klass, str, new_str);
1993 }
1994 if (chilled) {
1995 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1996 }
1997 return new_str;
1998}
1999
2000VALUE
2001rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2002{
2003 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2004 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2005 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2006 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2007 return rb_str_freeze(str);
2008}
2009
2010/*
2011 * The documentation block below uses an include (instead of inline text)
2012 * because the included text has non-ASCII characters (which are not allowed in a C file).
2013 */
2014
2015/*
2016 *
2017 * call-seq:
2018 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2019 *
2020 * :include: doc/string/new.rdoc
2021 *
2022 */
2023
2024static VALUE
2025rb_str_init(int argc, VALUE *argv, VALUE str)
2026{
2027 static ID keyword_ids[2];
2028 VALUE orig, opt, venc, vcapa;
2029 VALUE kwargs[2];
2030 rb_encoding *enc = 0;
2031 int n;
2032
2033 if (!keyword_ids[0]) {
2034 keyword_ids[0] = rb_id_encoding();
2035 CONST_ID(keyword_ids[1], "capacity");
2036 }
2037
2038 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2039 if (!NIL_P(opt)) {
2040 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2041 venc = kwargs[0];
2042 vcapa = kwargs[1];
2043 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2044 enc = rb_to_encoding(venc);
2045 }
2046 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2047 long capa = NUM2LONG(vcapa);
2048 long len = 0;
2049 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2050
2051 if (capa < STR_BUF_MIN_SIZE) {
2052 capa = STR_BUF_MIN_SIZE;
2053 }
2054 if (n == 1) {
2055 StringValue(orig);
2056 len = RSTRING_LEN(orig);
2057 if (capa < len) {
2058 capa = len;
2059 }
2060 if (orig == str) n = 0;
2061 }
2062 str_modifiable(str);
2063 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2064 /* make noembed always */
2065 const size_t size = (size_t)capa + termlen;
2066 const char *const old_ptr = RSTRING_PTR(str);
2067 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2068 char *new_ptr = ALLOC_N(char, size);
2069 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2070 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2071 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2072 RSTRING(str)->as.heap.ptr = new_ptr;
2073 }
2074 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2075 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2076 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2077 }
2078 STR_SET_LEN(str, len);
2079 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2080 if (n == 1) {
2081 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2082 rb_enc_cr_str_exact_copy(str, orig);
2083 }
2084 FL_SET(str, STR_NOEMBED);
2085 RSTRING(str)->as.heap.aux.capa = capa;
2086 }
2087 else if (n == 1) {
2088 rb_str_replace(str, orig);
2089 }
2090 if (enc) {
2091 rb_enc_associate(str, enc);
2093 }
2094 }
2095 else if (n == 1) {
2096 rb_str_replace(str, orig);
2097 }
2098 return str;
2099}
2100
2101/* :nodoc: */
2102static VALUE
2103rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2104{
2105 if (klass != rb_cString) {
2106 return rb_class_new_instance_pass_kw(argc, argv, klass);
2107 }
2108
2109 static ID keyword_ids[2];
2110 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2111 VALUE kwargs[2];
2112 rb_encoding *enc = NULL;
2113
2114 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2115 if (NIL_P(opt)) {
2116 return rb_class_new_instance_pass_kw(argc, argv, klass);
2117 }
2118
2119 keyword_ids[0] = rb_id_encoding();
2120 CONST_ID(keyword_ids[1], "capacity");
2121 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2122 encoding = kwargs[0];
2123 capacity = kwargs[1];
2124
2125 if (n == 1) {
2126 orig = StringValue(orig);
2127 }
2128 else {
2129 orig = Qnil;
2130 }
2131
2132 if (UNDEF_P(encoding)) {
2133 if (!NIL_P(orig)) {
2134 encoding = rb_obj_encoding(orig);
2135 }
2136 }
2137
2138 if (!UNDEF_P(encoding)) {
2139 enc = rb_to_encoding(encoding);
2140 }
2141
2142 // If capacity is nil, we're basically just duping `orig`.
2143 if (UNDEF_P(capacity)) {
2144 if (NIL_P(orig)) {
2145 VALUE empty_str = str_new(klass, "", 0);
2146 if (enc) {
2147 rb_enc_associate(empty_str, enc);
2148 }
2149 return empty_str;
2150 }
2151 VALUE copy = str_duplicate(klass, orig);
2152 rb_enc_associate(copy, enc);
2153 ENC_CODERANGE_CLEAR(copy);
2154 return copy;
2155 }
2156
2157 long capa = 0;
2158 capa = NUM2LONG(capacity);
2159 if (capa < 0) {
2160 capa = 0;
2161 }
2162
2163 if (!NIL_P(orig)) {
2164 long orig_capa = rb_str_capacity(orig);
2165 if (orig_capa > capa) {
2166 capa = orig_capa;
2167 }
2168 }
2169
2170 VALUE str = str_enc_new(klass, NULL, capa, enc);
2171 STR_SET_LEN(str, 0);
2172 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2173
2174 if (!NIL_P(orig)) {
2175 rb_str_buf_append(str, orig);
2176 }
2177
2178 return str;
2179}
2180
#ifdef NONASCII_MASK
/* True for every byte that is not a UTF-8 continuation byte (10xxxxxx). */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function applies that logic to every byte of the word at +s+ in
 * parallel and returns how many of them are leading bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t word = *s;

    /* After this step, bit0 of each byte says "leading byte". */
    word = (word>>6) | (~word>>7);
    word &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(word);
#else
    word += (word>>8);
    word += (word>>16);
# if SIZEOF_VOIDP == 8
    word += (word>>32);
# endif
    return (word&0xF);
#endif
}
#endif
2220
2221static inline long
2222enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2223{
2224 long c;
2225 const char *q;
2226
2227 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2228 long diff = (long)(e - p);
2229 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2230 }
2231#ifdef NONASCII_MASK
2232 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2233 uintptr_t len = 0;
2234 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2235 const uintptr_t *s, *t;
2236 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2237 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2238 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2239 while (p < (const char *)s) {
2240 if (is_utf8_lead_byte(*p)) len++;
2241 p++;
2242 }
2243 while (s < t) {
2244 len += count_utf8_lead_bytes_with_word(s);
2245 s++;
2246 }
2247 p = (const char *)s;
2248 }
2249 while (p < e) {
2250 if (is_utf8_lead_byte(*p)) len++;
2251 p++;
2252 }
2253 return (long)len;
2254 }
2255#endif
2256 else if (rb_enc_asciicompat(enc)) {
2257 c = 0;
2258 if (ENC_CODERANGE_CLEAN_P(cr)) {
2259 while (p < e) {
2260 if (ISASCII(*p)) {
2261 q = search_nonascii(p, e);
2262 if (!q)
2263 return c + (e - p);
2264 c += q - p;
2265 p = q;
2266 }
2267 p += rb_enc_fast_mbclen(p, e, enc);
2268 c++;
2269 }
2270 }
2271 else {
2272 while (p < e) {
2273 if (ISASCII(*p)) {
2274 q = search_nonascii(p, e);
2275 if (!q)
2276 return c + (e - p);
2277 c += q - p;
2278 p = q;
2279 }
2280 p += rb_enc_mbclen(p, e, enc);
2281 c++;
2282 }
2283 }
2284 return c;
2285 }
2286
2287 for (c=0; p<e; c++) {
2288 p += rb_enc_mbclen(p, e, enc);
2289 }
2290 return c;
2291}
2292
2293long
2294rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2295{
2296 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2297}
2298
2299/* To get strlen with cr
2300 * Note that given cr is not used.
2301 */
2302long
2303rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2304{
2305 long c;
2306 const char *q;
2307 int ret;
2308
2309 *cr = 0;
2310 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2311 long diff = (long)(e - p);
2312 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2313 }
2314 else if (rb_enc_asciicompat(enc)) {
2315 c = 0;
2316 while (p < e) {
2317 if (ISASCII(*p)) {
2318 q = search_nonascii(p, e);
2319 if (!q) {
2320 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2321 return c + (e - p);
2322 }
2323 c += q - p;
2324 p = q;
2325 }
2326 ret = rb_enc_precise_mbclen(p, e, enc);
2327 if (MBCLEN_CHARFOUND_P(ret)) {
2328 *cr |= ENC_CODERANGE_VALID;
2329 p += MBCLEN_CHARFOUND_LEN(ret);
2330 }
2331 else {
2333 p++;
2334 }
2335 c++;
2336 }
2337 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2338 return c;
2339 }
2340
2341 for (c=0; p<e; c++) {
2342 ret = rb_enc_precise_mbclen(p, e, enc);
2343 if (MBCLEN_CHARFOUND_P(ret)) {
2344 *cr |= ENC_CODERANGE_VALID;
2345 p += MBCLEN_CHARFOUND_LEN(ret);
2346 }
2347 else {
2349 if (p + rb_enc_mbminlen(enc) <= e)
2350 p += rb_enc_mbminlen(enc);
2351 else
2352 p = e;
2353 }
2354 }
2355 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2356 return c;
2357}
2358
2359/* enc must be str's enc or rb_enc_check(str, str2) */
2360static long
2361str_strlen(VALUE str, rb_encoding *enc)
2362{
2363 const char *p, *e;
2364 int cr;
2365
2366 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2367 if (!enc) enc = STR_ENC_GET(str);
2368 p = RSTRING_PTR(str);
2369 e = RSTRING_END(str);
2370 cr = ENC_CODERANGE(str);
2371
2372 if (cr == ENC_CODERANGE_UNKNOWN) {
2373 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2374 if (cr) ENC_CODERANGE_SET(str, cr);
2375 return n;
2376 }
2377 else {
2378 return enc_strlen(p, e, enc, cr);
2379 }
2380}
2381
2382long
2384{
2385 return str_strlen(str, NULL);
2386}
2387
2388/*
2389 * call-seq:
2390 * length -> integer
2391 *
2392 * :include: doc/string/length.rdoc
2393 *
2394 */
2395
2396VALUE
2398{
2399 return LONG2NUM(str_strlen(str, NULL));
2400}
2401
2402/*
2403 * call-seq:
2404 * bytesize -> integer
2405 *
2406 * :include: doc/string/bytesize.rdoc
2407 *
2408 */
2409
2410VALUE
2411rb_str_bytesize(VALUE str)
2412{
2413 return LONG2NUM(RSTRING_LEN(str));
2414}
2415
2416/*
2417 * call-seq:
2418 * empty? -> true or false
2419 *
2420 * Returns whether the length of +self+ is zero:
2421 *
2422 * 'hello'.empty? # => false
2423 * ' '.empty? # => false
2424 * ''.empty? # => true
2425 *
2426 * Related: see {Querying}[rdoc-ref:String@Querying].
2427 */
2428
2429static VALUE
2430rb_str_empty(VALUE str)
2431{
2432 return RBOOL(RSTRING_LEN(str) == 0);
2433}
2434
2435/*
2436 * call-seq:
2437 * self + other_string -> new_string
2438 *
2439 * Returns a new string containing +other_string+ concatenated to +self+:
2440 *
2441 * 'Hello from ' + self.to_s # => "Hello from main"
2442 *
2443 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2444 */
2445
2446VALUE
/* NOTE(review): listing numbers jump from 2446 to 2448; the declarator line
 * is absent from this copy.  rb_str_opt_plus() calls
 * `rb_str_plus(str1, str2)`, so it is presumably
 * `rb_str_plus(VALUE str1, VALUE str2)` -- confirm against upstream. */
2448{
2449 VALUE str3;
2450 rb_encoding *enc;
2451 char *ptr1, *ptr2, *ptr3;
2452 long len1, len2;
2453 int termlen;
2454
/* str2 is coerced to a String first; rb_enc_check_str() then yields the
 * encoding used for the result. */
2455 StringValue(str2);
2456 enc = rb_enc_check_str(str1, str2);
2457 RSTRING_GETMEM(str1, ptr1, len1);
2458 RSTRING_GETMEM(str2, ptr2, len2);
2459 termlen = rb_enc_mbminlen(enc);
/* Reject a combined length that would not fit in a long. */
2460 if (len1 > LONG_MAX - len2) {
2461 rb_raise(rb_eArgError, "string size too big");
2462 }
/* Allocate the result at its final size, copy both operands back to back
 * and write a termlen-byte terminator. */
2463 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2464 ptr3 = RSTRING_PTR(str3);
2465 memcpy(ptr3, ptr1, len1);
2466 memcpy(ptr3+len1, ptr2, len2);
2467 TERM_FILL(&ptr3[len1+len2], termlen);
2468
2469 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
/* NOTE(review): listing numbers jump from 2469 to 2471; the line carrying
 * the final (coderange) argument and the closing `);` of the call above is
 * absent from this copy -- restore it from upstream. */
2471 RB_GC_GUARD(str1);
2472 RB_GC_GUARD(str2);
2473 return str3;
2474}
2475
2476/* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2477VALUE
2478rb_str_opt_plus(VALUE str1, VALUE str2)
2479{
/* NOTE(review): listing numbers jump from 2479 to 2482; two lines at the
 * top of the body are absent from this copy -- restore them from upstream. */
2482 long len1, len2;
2483 MAYBE_UNUSED(char) *ptr1, *ptr2;
2484 RSTRING_GETMEM(str1, ptr1, len1);
2485 RSTRING_GETMEM(str2, ptr2, len2);
2486 int enc1 = rb_enc_get_index(str1);
2487 int enc2 = rb_enc_get_index(str2);
2488
/* Every case that is not handled here yields Qundef instead of raising:
 * a negative encoding index on either side, differing encoding indexes,
 * or a combined length that would overflow a long. */
2489 if (enc1 < 0) {
2490 return Qundef;
2491 }
2492 else if (enc2 < 0) {
2493 return Qundef;
2494 }
2495 else if (enc1 != enc2) {
2496 return Qundef;
2497 }
2498 else if (len1 > LONG_MAX - len2) {
2499 return Qundef;
2500 }
2501 else {
2502 return rb_str_plus(str1, str2);
2503 }
2504
2505}
2506
2507/*
2508 * call-seq:
2509 * self * n -> new_string
2510 *
2511 * Returns a new string containing +n+ copies of +self+:
2512 *
2513 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2514 * 'No!' * 0 # => ""
2515 *
2516 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2517 */
2518
2519VALUE
/* NOTE(review): listing numbers jump from 2519 to 2521; the declarator line
 * is absent from this copy.  The body uses `str` and `times`, so it is
 * presumably `rb_str_times(VALUE str, VALUE times)` -- confirm upstream. */
2521{
2522 VALUE str2;
2523 long n, len;
2524 char *ptr2;
2525 int termlen;
2526
/* Fixnum 1: a duplicate of str, no copying loop needed. */
2527 if (times == INT2FIX(1)) {
2528 return str_duplicate(rb_cString, str);
2529 }
/* Fixnum 0: an empty embedded string that takes over str's encoding. */
2530 if (times == INT2FIX(0)) {
2531 str2 = str_alloc_embed(rb_cString, 0);
2532 rb_enc_copy(str2, str);
2533 return str2;
2534 }
2535 len = NUM2LONG(times);
2536 if (len < 0) {
2537 rb_raise(rb_eArgError, "negative argument");
2538 }
/* Special case: str is exactly one zero byte.  The result is then `len`
 * zero bytes, produced by zero-filling a buffer (memset for the embedded
 * form, ZALLOC_N for the heap form) instead of repeated copying. */
2539 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2540 if (STR_EMBEDDABLE_P(len, 1)) {
2541 str2 = str_alloc_embed(rb_cString, len + 1);
2542 memset(RSTRING_PTR(str2), 0, len + 1);
2543 }
2544 else {
2545 str2 = str_alloc_heap(rb_cString);
2546 RSTRING(str2)->as.heap.aux.capa = len;
2547 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2548 }
2549 STR_SET_LEN(str2, len);
2550 rb_enc_copy(str2, str);
2551 return str2;
2552 }
/* Guard the multiplication below against overflowing a long. */
2553 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2554 rb_raise(rb_eArgError, "argument too big");
2555 }
2556
/* From here on `len` is the byte length of the result. */
2557 len *= RSTRING_LEN(str);
2558 termlen = TERM_LEN(str);
2559 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2560 ptr2 = RSTRING_PTR(str2);
2561 if (len) {
/* Copy str once, then keep doubling the filled prefix of the result while
 * a full doubling still fits, and finish with one copy of the remainder. */
2562 n = RSTRING_LEN(str);
2563 memcpy(ptr2, RSTRING_PTR(str), n);
2564 while (n <= len/2) {
2565 memcpy(ptr2 + n, ptr2, n);
2566 n *= 2;
2567 }
2568 memcpy(ptr2 + n, ptr2, len-n);
2569 }
2570 STR_SET_LEN(str2, len);
2571 TERM_FILL(&ptr2[len], termlen);
2572 rb_enc_cr_str_copy_for_substr(str2, str);
2573
2574 return str2;
2575}
2576
2577/*
2578 * call-seq:
2579 * self % object -> new_string
2580 *
2581 * Returns the result of formatting +object+ into the format specifications
2582 * contained in +self+
2583 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2584 *
2585 * '%05d' % 123 # => "00123"
2586 *
2587 * If +self+ contains multiple format specifications,
2588 * +object+ must be an array or hash containing the objects to be formatted:
2589 *
2590 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2591 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2592 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2593 *
2594 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2595 */
2596
2597static VALUE
2598rb_str_format_m(VALUE str, VALUE arg)
2599{
2600 VALUE tmp = rb_check_array_type(arg);
2601
2602 if (!NIL_P(tmp)) {
2603 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2604 }
2605 return rb_str_format(1, &arg, str);
2606}
2607
2608static inline void
2609rb_check_lockedtmp(VALUE str)
2610{
2611 if (FL_TEST(str, STR_TMPLOCK)) {
2612 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2613 }
2614}
2615
2616// If none of these flags are set, we know we have an modifiable string.
2617// If any is set, we need to do more detailed checks.
2618#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2619static inline void
2620str_modifiable(VALUE str)
2621{
2622 RUBY_ASSERT(ruby_thread_has_gvl_p());
2623
2624 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2625 if (CHILLED_STRING_P(str)) {
2626 CHILLED_STRING_MUTATED(str);
2627 }
2628 rb_check_lockedtmp(str);
2629 rb_check_frozen(str);
2630 }
2631}
2632
2633static inline int
2634str_dependent_p(VALUE str)
2635{
2636 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2637 return FALSE;
2638 }
2639 else {
2640 return TRUE;
2641 }
2642}
2643
2644// If none of these flags are set, we know we have an independent string.
2645// If any is set, we need to do more detailed checks.
2646#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2647static inline int
2648str_independent(VALUE str)
2649{
2650 RUBY_ASSERT(ruby_thread_has_gvl_p());
2651
2652 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2653 str_modifiable(str);
2654 return !str_dependent_p(str);
2655 }
2656 return TRUE;
2657}
2658
2659static void
2660str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2661{
2662 RUBY_ASSERT(ruby_thread_has_gvl_p());
2663
2664 char *ptr;
2665 char *oldptr;
2666 long capa = len + expand;
2667
2668 if (len > capa) len = capa;
2669
2670 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2671 ptr = RSTRING(str)->as.heap.ptr;
2672 STR_SET_EMBED(str);
2673 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2674 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2675 STR_SET_LEN(str, len);
2676 return;
2677 }
2678
2679 ptr = ALLOC_N(char, (size_t)capa + termlen);
2680 oldptr = RSTRING_PTR(str);
2681 if (oldptr) {
2682 memcpy(ptr, oldptr, len);
2683 }
2684 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2685 xfree(oldptr);
2686 }
2687 STR_SET_NOEMBED(str);
2688 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2689 TERM_FILL(ptr + len, termlen);
2690 RSTRING(str)->as.heap.ptr = ptr;
2691 STR_SET_LEN(str, len);
2692 RSTRING(str)->as.heap.aux.capa = capa;
2693}
2694
2695void
2696rb_str_modify(VALUE str)
2697{
2698 if (!str_independent(str))
2699 str_make_independent(str);
2701}
2702
2703void
/* NOTE(review): listing numbers jump from 2703 to 2705; the declarator line
 * is absent from this copy.  The body uses `str` and a long-compatible
 * `expand`, so it is presumably `rb_str_modify_expand(VALUE str, long expand)`
 * -- confirm against upstream. */
2705{
2706 RUBY_ASSERT(ruby_thread_has_gvl_p());
2707
2708 int termlen = TERM_LEN(str);
2709 long len = RSTRING_LEN(str);
2710
/* Validate the requested growth before touching the string. */
2711 if (expand < 0) {
2712 rb_raise(rb_eArgError, "negative expanding string size");
2713 }
2714 if (expand >= LONG_MAX - len) {
2715 rb_raise(rb_eArgError, "string size too big");
2716 }
2717
/* A string that does not own its buffer gets one sized len + expand;
 * an independent string is only resized when growth was requested. */
2718 if (!str_independent(str)) {
2719 str_make_independent_expand(str, len, expand, termlen);
2720 }
2721 else if (expand > 0) {
2722 RESIZE_CAPA_TERM(str, len + expand, termlen);
2723 }
/* NOTE(review): listing numbers jump from 2723 to 2725; one statement
 * before the closing brace is absent from this copy -- restore it from
 * upstream. */
2725}
2726
2727/* As rb_str_modify(), but don't clear coderange */
2728static void
2729str_modify_keep_cr(VALUE str)
2730{
/* Same independence handling as rb_str_modify(): run the modifiability
 * checks and give str its own buffer when it does not have one. */
2731 if (!str_independent(str))
2732 str_make_independent(str);
/* NOTE(review): listing numbers jump from 2732 to 2734 and from 2734 to
 * 2736; the line before and the line after the comment below are absent
 * from this copy.  The surviving comment suggests a conditional coderange
 * reset lived here -- restore both lines from upstream. */
2734 /* Force re-scan later */
2736}
2737
2738static inline void
2739str_discard(VALUE str)
2740{
2741 str_modifiable(str);
2742 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2743 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2744 RSTRING(str)->as.heap.ptr = 0;
2745 STR_SET_LEN(str, 0);
2746 }
2747}
2748
2749void
/* NOTE(review): listing numbers jump from 2749 to 2751; the declarator line
 * is absent from this copy -- presumably `rb_must_asciicompat(VALUE str)`,
 * to be confirmed against upstream. */
2751{
2752 int encindex = rb_enc_get_index(str);
2753
/* An index of -1 means no encoding could be obtained from the object. */
2754 if (RB_UNLIKELY(encindex == -1)) {
2755 rb_raise(rb_eTypeError, "not encoding capable object");
2756 }
2757
/* Encoding indexes accepted by str_encindex_fastpath() need no lookup. */
2758 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2759 return;
2760 }
2761
/* Otherwise resolve the encoding and reject ASCII-incompatible ones. */
2762 rb_encoding *enc = rb_enc_from_index(encindex);
2763 if (!rb_enc_asciicompat(enc)) {
2764 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2765 }
2766}
2767
2768VALUE
/* NOTE(review): listing numbers jump from 2768 to 2770; the declarator line
 * is absent from this copy.  Callers in this file invoke
 * `rb_string_value(ptr)` and the body dereferences `ptr` as a VALUE, so the
 * function takes a pointer to VALUE named `ptr`; the exact qualifiers must
 * be taken from upstream. */
2770{
2771 RUBY_ASSERT(ruby_thread_has_gvl_p());
2772
/* A non-String is converted with rb_str_to_str() and the converted object
 * is written back through ptr, so the caller's variable refers to it. */
2773 VALUE s = *ptr;
2774 if (!RB_TYPE_P(s, T_STRING)) {
2775 s = rb_str_to_str(s);
2776 *ptr = s;
2777 }
2778 return s;
2779}
2780
2781char *
/* NOTE(review): listing numbers jump from 2781 to 2783; the declarator line
 * is absent from this copy -- presumably `rb_string_value_ptr(...)` taking
 * the same `ptr` argument as rb_string_value(); confirm against upstream. */
2783{
/* Coerce *ptr to a String, then hand out its byte pointer. */
2784 VALUE str = rb_string_value(ptr);
2785 return RSTRING_PTR(str);
2786}
2787
/* Returns 1 when the first n bytes at s are all zero (also when n <= 0),
 * and 0 as soon as a non-zero byte is found. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) {
            return 0;
        }
    }
    return 1;
}
2796
2797static const char *
2798str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2799{
2800 const char *e = s + len;
2801
2802 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2803 if (zero_filled(s, minlen)) return s;
2804 }
2805 return 0;
2806}
2807
2808static char *
2809str_fill_term(VALUE str, char *s, long len, int termlen)
2810{
2811 /* This function assumes that (capa + termlen) bytes of memory
2812 * is allocated, like many other functions in this file.
2813 */
2814 if (str_dependent_p(str)) {
2815 if (!zero_filled(s + len, termlen))
2816 str_make_independent_expand(str, len, 0L, termlen);
2817 }
2818 else {
2819 TERM_FILL(s + len, termlen);
2820 return s;
2821 }
2822 return RSTRING_PTR(str);
2823}
2824
2825void
2826rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2827{
2828 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2829 long len = RSTRING_LEN(str);
2830
2831 RUBY_ASSERT(capa >= len);
2832 if (capa - len < termlen) {
2833 rb_check_lockedtmp(str);
2834 str_make_independent_expand(str, len, 0L, termlen);
2835 }
2836 else if (str_dependent_p(str)) {
2837 if (termlen > oldtermlen)
2838 str_make_independent_expand(str, len, 0L, termlen);
2839 }
2840 else {
2841 if (!STR_EMBED_P(str)) {
2842 /* modify capa instead of realloc */
2843 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2844 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2845 }
2846 if (termlen > oldtermlen) {
2847 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2848 }
2849 }
2850
2851 return;
2852}
2853
2854static char *
2855str_null_check(VALUE str, int *w)
2856{
2857 char *s = RSTRING_PTR(str);
2858 long len = RSTRING_LEN(str);
2859 rb_encoding *enc = rb_enc_get(str);
2860 const int minlen = rb_enc_mbminlen(enc);
2861
2862 if (minlen > 1) {
2863 *w = 1;
2864 if (str_null_char(s, len, minlen, enc)) {
2865 return NULL;
2866 }
2867 return str_fill_term(str, s, len, minlen);
2868 }
2869 *w = 0;
2870 if (!s || memchr(s, 0, len)) {
2871 return NULL;
2872 }
2873 if (s[len]) {
2874 s = str_fill_term(str, s, len, minlen);
2875 }
2876 return s;
2877}
2878
2879char *
2880rb_str_to_cstr(VALUE str)
2881{
2882 int w;
2883 return str_null_check(str, &w);
2884}
2885
2886char *
/* NOTE(review): listing numbers jump from 2886 to 2888; the declarator line
 * is absent from this copy -- presumably `rb_string_value_cstr(...)` taking
 * the same `ptr` argument as rb_string_value(); confirm against upstream. */
2888{
2889 VALUE str = rb_string_value(ptr);
2890 int w;
2891 char *s = str_null_check(str, &w);
/* str_null_check() returned NULL: report the failure, choosing the wording
 * by the flag it stored in w (1 for encodings whose minimum character
 * length exceeds one byte, 0 otherwise). */
2892 if (!s) {
2893 if (w) {
2894 rb_raise(rb_eArgError, "string contains null char");
2895 }
2896 rb_raise(rb_eArgError, "string contains null byte");
2897 }
2898 return s;
2899}
2900
2901char *
2902rb_str_fill_terminator(VALUE str, const int newminlen)
2903{
2904 char *s = RSTRING_PTR(str);
2905 long len = RSTRING_LEN(str);
2906 return str_fill_term(str, s, len, newminlen);
2907}
2908
2909VALUE
2911{
2912 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2913 return str;
2914}
2915
2916/*
2917 * call-seq:
2918 * String.try_convert(object) -> object, new_string, or nil
2919 *
2920 * Attempts to convert the given +object+ to a string.
2921 *
2922 * If +object+ is already a string, returns +object+, unmodified.
2923 *
2924 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2925 * calls <tt>object.to_str</tt> and returns the result.
2926 *
2927 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2928 *
2929 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2930 */
2931static VALUE
2932rb_str_s_try_convert(VALUE dummy, VALUE str)
2933{
2934 return rb_check_string_type(str);
2935}
2936
/*
 * Advances from p by *nthp characters of encoding enc without reporting a
 * position beyond e, returns the resulting position and stores the
 * character counter back into *nthp.  What the stored counter means depends
 * on the branch taken; see the notes below.
 */
2937static char*
2938str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2939{
2940 long nth = *nthp;
/* One byte per character: plain pointer arithmetic (clamped to e at the
 * end of the function; the counter is stored back unchanged). */
2941 if (rb_enc_mbmaxlen(enc) == 1) {
2942 p += nth;
2943 }
/* Fixed-width encoding: multiply by the character width (same clamping). */
2944 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2945 p += nth * rb_enc_mbmaxlen(enc);
2946 }
/* ASCII-compatible variable-width encoding: skip runs of ASCII bytes in
 * bulk and step over non-ASCII characters one at a time.  Here *nthp
 * receives the number of characters that could not be consumed. */
2947 else if (rb_enc_asciicompat(enc)) {
2948 const char *p2, *e2;
2949 int n;
2950
2951 while (p < e && 0 < nth) {
/* e2 is the position nth bytes ahead, i.e. where the target would be if
 * every remaining character were a single byte.  If that already lies
 * beyond e, the string is too short. */
2952 e2 = p + nth;
2953 if (e < e2) {
2954 *nthp = nth;
2955 return (char *)e;
2956 }
2957 if (ISASCII(*p)) {
/* No non-ASCII byte before e2: the remaining characters are all single
 * bytes and e2 is the answer. */
2958 p2 = search_nonascii(p, e2);
2959 if (!p2) {
2960 nth -= e2 - p;
2961 *nthp = nth;
2962 return (char *)e2;
2963 }
/* Otherwise account for the ASCII run and continue at the non-ASCII byte. */
2964 nth -= p2 - p;
2965 p = p2;
2966 }
2967 n = rb_enc_mbclen(p, e, enc);
2968 p += n;
2969 nth--;
2970 }
2971 *nthp = nth;
2972 if (nth != 0) {
2973 return (char *)e;
2974 }
2975 return (char *)p;
2976 }
/* Any other encoding: step character by character.  Note the
 * post-decrement: when the loop ends because the counter reached zero,
 * nth is -1 afterwards, and that value is what gets stored in *nthp. */
2977 else {
2978 while (p < e && nth--) {
2979 p += rb_enc_mbclen(p, e, enc);
2980 }
2981 }
2982 if (p > e) p = e;
2983 *nthp = nth;
2984 return (char*)p;
2985}
2986
2987char*
2988rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2989{
2990 return str_nth_len(p, e, &nth, enc);
2991}
2992
2993static char*
2994str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2995{
2996 if (singlebyte)
2997 p += nth;
2998 else {
2999 p = str_nth_len(p, e, &nth, enc);
3000 }
3001 if (!p) return 0;
3002 if (p > e) p = e;
3003 return (char *)p;
3004}
3005
3006/* char offset to byte offset */
3007static long
3008str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3009{
3010 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3011 if (!pp) return e - p;
3012 return pp - p;
3013}
3014
3015long
3016rb_str_offset(VALUE str, long pos)
3017{
3018 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3019 STR_ENC_GET(str), single_byte_optimizable(str));
3020}
3021
3022#ifdef NONASCII_MASK
/*
 * Advances from p by *nthp characters by counting UTF-8 lead bytes, stops
 * at e at the latest, returns the position reached and stores the number of
 * characters still outstanding in *nthp.
 * NOTE(review): the visible caller (rb_str_subpos) only takes this path for
 * UTF-8 strings whose coderange is ENC_CODERANGE_VALID; the routine itself
 * performs no validation.
 */
3023static char *
3024str_utf8_nth(const char *p, const char *e, long *nthp)
3025{
3026 long nth = *nthp;
/* Word-at-a-time path, taken only when more than two pointer-sized words of
 * input remain and more than that many characters are requested. */
3027 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3028 const uintptr_t *s, *t;
3029 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
3030 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3031 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Bytes in front of the first aligned word are handled one by one. */
3032 while (p < (const char *)s) {
3033 if (is_utf8_lead_byte(*p)) nth--;
3034 p++;
3035 }
/* Whole words: subtract the lead bytes found in each word, as long as
 * aligned words remain and at least SIZEOF_VOIDP characters are still
 * outstanding (so one word cannot overshoot the target). */
3036 do {
3037 nth -= count_utf8_lead_bytes_with_word(s);
3038 s++;
3039 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3040 p = (char *)s;
3041 }
/* Byte-wise tail: stop on the lead byte reached with nothing left to count. */
3042 while (p < e) {
3043 if (is_utf8_lead_byte(*p)) {
3044 if (nth == 0) break;
3045 nth--;
3046 }
3047 p++;
3048 }
3049 *nthp = nth;
3050 return (char *)p;
3051}
3052
/* Byte distance from p to the position str_utf8_nth() reaches for nth
 * characters within [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long counter = nth;
    const char *const pos = str_utf8_nth(p, e, &counter);

    return pos - p;
}
3059#endif
3060
3061/* byte offset to char offset */
3062long
3063rb_str_sublen(VALUE str, long pos)
3064{
3065 if (single_byte_optimizable(str) || pos < 0)
3066 return pos;
3067 else {
3068 char *p = RSTRING_PTR(str);
3069 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3070 }
3071}
3072
3073static VALUE
3074str_subseq(VALUE str, long beg, long len)
3075{
3076 VALUE str2;
3077
3078 RUBY_ASSERT(beg >= 0);
3079 RUBY_ASSERT(len >= 0);
3080 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3081
3082 const int termlen = TERM_LEN(str);
3083 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3084 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3085 RB_GC_GUARD(str);
3086 return str2;
3087 }
3088
3089 str2 = str_alloc_heap(rb_cString);
3090 if (str_embed_capa(str2) >= len + termlen) {
3091 char *ptr2 = RSTRING(str2)->as.embed.ary;
3092 STR_SET_EMBED(str2);
3093 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3094 TERM_FILL(ptr2+len, termlen);
3095
3096 STR_SET_LEN(str2, len);
3097 RB_GC_GUARD(str);
3098 }
3099 else {
3100 str_replace_shared(str2, str);
3101 RUBY_ASSERT(!STR_EMBED_P(str2));
3102 ENC_CODERANGE_CLEAR(str2);
3103 RSTRING(str2)->as.heap.ptr += beg;
3104 if (RSTRING_LEN(str2) > len) {
3105 STR_SET_LEN(str2, len);
3106 }
3107 }
3108
3109 return str2;
3110}
3111
3112VALUE
3113rb_str_subseq(VALUE str, long beg, long len)
3114{
3115 VALUE str2 = str_subseq(str, beg, len);
3116 rb_enc_cr_str_copy_for_substr(str2, str);
3117 return str2;
3118}
3119
3120char *
3121rb_str_subpos(VALUE str, long beg, long *lenp)
3122{
3123 long len = *lenp;
3124 long slen = -1L;
3125 const long blen = RSTRING_LEN(str);
3126 rb_encoding *enc = STR_ENC_GET(str);
3127 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3128
3129 if (len < 0) return 0;
3130 if (beg < 0 && -beg < 0) return 0;
3131 if (!blen) {
3132 len = 0;
3133 }
3134 if (single_byte_optimizable(str)) {
3135 if (beg > blen) return 0;
3136 if (beg < 0) {
3137 beg += blen;
3138 if (beg < 0) return 0;
3139 }
3140 if (len > blen - beg)
3141 len = blen - beg;
3142 if (len < 0) return 0;
3143 p = s + beg;
3144 goto end;
3145 }
3146 if (beg < 0) {
3147 if (len > -beg) len = -beg;
3148 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3149 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3150 beg = -beg;
3151 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3152 p = e;
3153 if (!p) return 0;
3154 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3155 if (!p) return 0;
3156 len = e - p;
3157 goto end;
3158 }
3159 else {
3160 slen = str_strlen(str, enc);
3161 beg += slen;
3162 if (beg < 0) return 0;
3163 p = s + beg;
3164 if (len == 0) goto end;
3165 }
3166 }
3167 else if (beg > 0 && beg > blen) {
3168 return 0;
3169 }
3170 if (len == 0) {
3171 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3172 p = s + beg;
3173 }
3174#ifdef NONASCII_MASK
3175 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3176 enc == rb_utf8_encoding()) {
3177 p = str_utf8_nth(s, e, &beg);
3178 if (beg > 0) return 0;
3179 len = str_utf8_offset(p, e, len);
3180 }
3181#endif
3182 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3183 int char_sz = rb_enc_mbmaxlen(enc);
3184
3185 p = s + beg * char_sz;
3186 if (p > e) {
3187 return 0;
3188 }
3189 else if (len * char_sz > e - p)
3190 len = e - p;
3191 else
3192 len *= char_sz;
3193 }
3194 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3195 if (beg > 0) return 0;
3196 len = 0;
3197 }
3198 else {
3199 len = str_offset(p, e, len, enc, 0);
3200 }
3201 end:
3202 *lenp = len;
3203 RB_GC_GUARD(str);
3204 return p;
3205}
3206
3207static VALUE str_substr(VALUE str, long beg, long len, int empty);
3208
3209VALUE
3210rb_str_substr(VALUE str, long beg, long len)
3211{
3212 return str_substr(str, beg, len, TRUE);
3213}
3214
3215VALUE
3216rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3217{
3218 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3219}
3220
3221static VALUE
3222str_substr(VALUE str, long beg, long len, int empty)
3223{
3224 char *p = rb_str_subpos(str, beg, &len);
3225
3226 if (!p) return Qnil;
3227 if (!len && !empty) return Qnil;
3228
3229 beg = p - RSTRING_PTR(str);
3230
3231 VALUE str2 = str_subseq(str, beg, len);
3232 rb_enc_cr_str_copy_for_substr(str2, str);
3233 return str2;
3234}
3235
3236/* :nodoc: */
3237VALUE
/* NOTE(review): listing numbers jump from 3237 to 3239; the declarator line
 * is absent from this copy -- presumably `rb_str_freeze(VALUE str)`, to be
 * confirmed against upstream. */
3239{
/* A chilled string loses its STR_CHILLED flag before the frozen check. */
3240 if (CHILLED_STRING_P(str)) {
3241 FL_UNSET_RAW(str, STR_CHILLED);
3242 }
3243
/* Already frozen: nothing to do.  Otherwise the string is resized to its
 * current length before being frozen. */
3244 if (OBJ_FROZEN(str)) return str;
3245 rb_str_resize(str, RSTRING_LEN(str));
3246 return rb_obj_freeze(str);
3247}
3248
3249/*
3250 * call-seq:
3251 * +string -> new_string or self
3252 *
3253 * Returns +self+ if +self+ is not frozen and can be mutated
3254 * without warning issuance.
3255 *
3256 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3257 *
3258 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3259 */
3260static VALUE
3261str_uplus(VALUE str)
3262{
3263 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3264 return rb_str_dup(str);
3265 }
3266 else {
3267 return str;
3268 }
3269}
3270
3271/*
3272 * call-seq:
3273 * -self -> frozen_string
3274 *
3275 * Returns a frozen string equal to +self+.
3276 *
3277 * The returned string is +self+ if and only if all of the following are true:
3278 *
3279 * - +self+ is already frozen.
3280 * - +self+ is an instance of \String (rather than of a subclass of \String)
3281 * - +self+ has no instance variables set on it.
3282 *
3283 * Otherwise, the returned string is a frozen copy of +self+.
3284 *
3285 * Returning +self+, when possible, saves duplicating +self+;
3286 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3287 *
3288 * It may also save duplicating other, already-existing, strings:
3289 *
3290 * s0 = 'foo'
3291 * s1 = 'foo'
3292 * s0.object_id == s1.object_id # => false
3293 * (-s0).object_id == (-s1).object_id # => true
3294 *
3295 * Note that method #-@ is convenient for defining a constant:
3296 *
3297 * FileName = -'config/database.yml'
3298 *
3299 * While its alias #dedup is better suited for chaining:
3300 *
3301 * 'foo'.dedup.gsub!('o')
3302 *
3303 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3304 */
3305static VALUE
3306str_uminus(VALUE str)
3307{
3308 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3309 str = rb_str_dup(str);
3310 }
3311 return rb_fstring(str);
3312}
3313
/* rb_str_dup_frozen(str) is provided as an alias of rb_str_new_frozen(str);
 * the #define additionally redirects uses of the name in the rest of this
 * file to rb_str_new_frozen. */
3314RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3315#define rb_str_dup_frozen rb_str_new_frozen
3316
3317VALUE
3319{
3320 rb_check_frozen(str);
3321 if (FL_TEST(str, STR_TMPLOCK)) {
3322 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3323 }
3324 FL_SET(str, STR_TMPLOCK);
3325 return str;
3326}
3327
3328VALUE
3330{
3331 rb_check_frozen(str);
3332 if (!FL_TEST(str, STR_TMPLOCK)) {
3333 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3334 }
3335 FL_UNSET(str, STR_TMPLOCK);
3336 return str;
3337}
3338
3339VALUE
3340rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3341{
3342 rb_str_locktmp(str);
3343 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3344}
3345
3346void
/* NOTE(review): listing numbers jump from 3346 to 3348; the declarator line
 * is absent from this copy.  The body uses `str` and a long `len`, so it is
 * presumably `rb_str_set_len(VALUE str, long len)` -- confirm upstream. */
3348{
3349 RUBY_ASSERT(ruby_thread_has_gvl_p());
3350
3351 long capa;
3352 const int termlen = TERM_LEN(str);
3353
/* The string must be modifiable and must not be shared. */
3354 str_modifiable(str);
3355 if (STR_SHARED_P(str)) {
3356 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3357 }
/* A length outside [0, capacity] is treated as an interpreter bug. */
3358 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3359 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3360 }
3361
/* Coderange maintenance, by case: empty result, unknown coderange,
 * extension, truncation. */
3362 int cr = ENC_CODERANGE(str);
3363 if (len == 0) {
3364 /* Empty string does not contain non-ASCII */
/* NOTE(review): listing numbers jump from 3364 to 3366; the statement of
 * this branch is absent from this copy -- restore it from upstream. */
3366 }
3367 else if (cr == ENC_CODERANGE_UNKNOWN) {
3368 /* Leave unknown. */
3369 }
3370 else if (len > RSTRING_LEN(str)) {
3371 if (ENC_CODERANGE_CLEAN_P(cr)) {
3372 /* Update the coderange regarding the extended part. */
3373 const char *const prev_end = RSTRING_END(str);
3374 const char *const new_end = RSTRING_PTR(str) + len;
3375 rb_encoding *enc = rb_enc_get(str);
3376 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3377 ENC_CODERANGE_SET(str, cr);
3378 }
3379 else if (cr == ENC_CODERANGE_BROKEN) {
3380 /* May be valid now, by appended part. */
/* NOTE(review): listing numbers jump from 3380 to 3382; the statement of
 * this branch is absent from this copy -- restore it from upstream. */
3382 }
3383 }
3384 else if (len < RSTRING_LEN(str)) {
3385 if (cr != ENC_CODERANGE_7BIT) {
3386 /* ASCII-only string is keeping after truncated. Valid
3387 * and broken may be invalid or valid, leave unknown. */
/* In other words: a 7-bit string stays 7-bit when truncated, while valid
 * or broken content may change class.
 * NOTE(review): listing numbers jump from 3387 to 3389; the statement of
 * this branch is absent from this copy -- restore it from upstream. */
3389 }
3390 }
3391
/* Finally record the new length and write the terminator after it. */
3392 STR_SET_LEN(str, len);
3393 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3394}
3395
/*
 * Changes the byte length of str to len, switching between the embedded
 * and heap representations and reallocating as needed, and returns str.
 * Raises ArgumentError for a negative len.
 */
3396VALUE
3397rb_str_resize(VALUE str, long len)
3398{
3399 if (len < 0) {
3400 rb_raise(rb_eArgError, "negative string size (or size too big)");
3401 }
3402
/* str_independent() also performs the modifiability checks. */
3403 int independent = str_independent(str);
3404 long slen = RSTRING_LEN(str);
3405 const int termlen = TERM_LEN(str);
3406
3407 if (slen > len || (termlen != 1 && slen < len)) {
/* NOTE(review): listing numbers jump from 3407 to 3409; the body of this
 * conditional (shrinking, or growing with a multi-byte terminator) is
 * absent from this copy -- restore it from upstream. */
3409 }
3410
3411 {
3412 long capa;
/* Case 1: currently embedded.  Stay embedded when the new length fits,
 * otherwise move to an own heap buffer. */
3413 if (STR_EMBED_P(str)) {
3414 if (len == slen) return str;
3415 if (str_embed_capa(str) >= len + termlen) {
3416 STR_SET_LEN(str, len);
3417 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3418 return str;
3419 }
3420 str_make_independent_expand(str, slen, len - slen, termlen);
3421 }
/* Case 2: on the heap but the new length fits the embedded area.  Copy the
 * kept bytes over and free the heap buffer only when str owned it. */
3422 else if (str_embed_capa(str) >= len + termlen) {
3423 char *ptr = STR_HEAP_PTR(str);
3424 STR_SET_EMBED(str);
3425 if (slen > len) slen = len;
3426 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3427 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3428 STR_SET_LEN(str, len);
3429 if (independent) ruby_xfree(ptr);
3430 return str;
3431 }
/* Case 3: the buffer is not str's own; take a private copy of the new size. */
3432 else if (!independent) {
3433 if (len == slen) return str;
3434 str_make_independent_expand(str, slen, len - slen, termlen);
3435 }
/* Case 4: own heap buffer.  Reallocate when it is too small, or when the
 * slack exceeds min(len, 1024) bytes. */
3436 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3437 (capa - len) > (len < 1024 ? len : 1024)) {
3438 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3439 (size_t)len + termlen, STR_HEAP_SIZE(str));
3440 RSTRING(str)->as.heap.aux.capa = len;
3441 }
3442 else if (len == slen) return str;
3443 STR_SET_LEN(str, len);
3444 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3445 }
3446 return str;
3447}
3448
3449static void
3450str_ensure_available_capa(VALUE str, long len)
3451{
3452 str_modify_keep_cr(str);
3453
3454 const int termlen = TERM_LEN(str);
3455 long olen = RSTRING_LEN(str);
3456
3457 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3458 rb_raise(rb_eArgError, "string sizes too big");
3459 }
3460
3461 long total = olen + len;
3462 long capa = str_capacity(str, termlen);
3463
3464 if (capa < total) {
3465 if (total >= LONG_MAX / 2) {
3466 capa = total;
3467 }
3468 while (total > capa) {
3469 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3470 }
3471 RESIZE_CAPA_TERM(str, capa, termlen);
3472 }
3473}
3474
3475static VALUE
3476str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3477{
3478 if (keep_cr) {
3479 str_modify_keep_cr(str);
3480 }
3481 else {
3482 rb_str_modify(str);
3483 }
3484 if (len == 0) return 0;
3485
3486 long total, olen, off = -1;
3487 char *sptr;
3488 const int termlen = TERM_LEN(str);
3489
3490 RSTRING_GETMEM(str, sptr, olen);
3491 if (ptr >= sptr && ptr <= sptr + olen) {
3492 off = ptr - sptr;
3493 }
3494
3495 long capa = str_capacity(str, termlen);
3496
3497 if (olen > LONG_MAX - len) {
3498 rb_raise(rb_eArgError, "string sizes too big");
3499 }
3500 total = olen + len;
3501 if (capa < total) {
3502 if (total >= LONG_MAX / 2) {
3503 capa = total;
3504 }
3505 while (total > capa) {
3506 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3507 }
3508 RESIZE_CAPA_TERM(str, capa, termlen);
3509 sptr = RSTRING_PTR(str);
3510 }
3511 if (off != -1) {
3512 ptr = sptr + off;
3513 }
3514 memcpy(sptr + olen, ptr, len);
3515 STR_SET_LEN(str, total);
3516 TERM_FILL(sptr + total, termlen); /* sentinel */
3517
3518 return str;
3519}
3520
/* Shorthands for str_buf_cat4() with keep_cr == false.  str_buf_cat2()
 * obtains the length with rb_strlen_lit(ptr) -- presumably intended for
 * string-literal arguments only; verify before passing anything else. */
3521#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3522#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3523
3524VALUE
3525rb_str_cat(VALUE str, const char *ptr, long len)
3526{
3527 if (len == 0) return str;
3528 if (len < 0) {
3529 rb_raise(rb_eArgError, "negative string size (or size too big)");
3530 }
3531 return str_buf_cat(str, ptr, len);
3532}
3533
3534VALUE
3535rb_str_cat_cstr(VALUE str, const char *ptr)
3536{
3537 must_not_null(ptr);
3538 return rb_str_buf_cat(str, ptr, strlen(ptr));
3539}
3540
/*
 * Appends one byte to str, which is asserted to be ASCII-8BIT or US-ASCII
 * encoded, with a fast path for strings that have spare capacity.
 */
3541static void
3542rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3543{
3544 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3545
3546 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3547 if (UNLIKELY(!str_independent(str))) {
3548 str_make_independent(str);
3549 }
3550
3551 long string_length = -1;
3552 const int null_terminator_length = 1;
3553 char *sptr;
3554 RSTRING_GETMEM(str, sptr, string_length);
3555
3556 // Ensure the resulting string wouldn't be too long.
3557 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3558 rb_raise(rb_eArgError, "string sizes too big");
3559 }
3560
3561 long string_capacity = str_capacity(str, null_terminator_length);
3562
3563 // Get the code range before any modifications since those might clear the code range.
3564 int cr = ENC_CODERANGE(str);
3565
3566 // Check if the string has spare string_capacity to write the new byte.
3567 if (LIKELY(string_capacity >= string_length + 1)) {
3568 // In fast path we can write the new byte and note the string's new length.
3569 sptr[string_length] = byte;
3570 STR_SET_LEN(str, string_length + 1);
3571 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3572 }
3573 else {
3574 // If there's not enough string_capacity, make a call into the general string concatenation function.
3575 str_buf_cat(str, (char *)&byte, 1);
3576 }
3577
3578 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3579 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3580 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3581 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3582 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3583 if (ISASCII(byte)) {
/* NOTE(review): listing numbers jump from 3583 to 3585; the statement of
 * the ASCII branch is absent from this copy -- restore it from upstream. */
3585 }
3586 else {
/* NOTE(review): listing numbers jump from 3586 to 3588; the first
 * statement of the non-ASCII branch is absent from this copy -- restore it
 * from upstream. */
3588
3589 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3590 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3591 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3592 }
3593 }
3594 }
3595}
3596
/* Alternative public names for the two append entry points: rb_str_buf_cat
 * forwards to rb_str_cat, while rb_str_buf_cat2 and rb_str_cat2 forward to
 * rb_str_cat_cstr. */
3597RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3598RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3599RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3600
/*
 * Appends len bytes at ptr, which are in the encoding with index
 * ptr_encindex and have coderange ptr_cr, to str, and sets the encoding
 * index and coderange of the result.
 *
 * ptr_cr_ret - when non-null, receives the coderange of the appended bytes
 *              as known at that point (it may have been scanned here).
 *
 * Returns str.  Raises Encoding::CompatibilityError for incompatible
 * encodings and ArgumentError for a negative len.
 */
3601static VALUE
3602rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3603 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3604{
3605 int str_encindex = ENCODING_GET(str);
3606 int res_encindex;
3607 int str_cr, res_cr;
3608 rb_encoding *str_enc, *ptr_enc;
3609
/* An empty receiver is treated as 7-bit whatever its cached coderange is. */
3610 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3611
/* Same encoding: the appended bytes are scanned only when the receiver's
 * coderange is known, since otherwise the result stays unknown anyway. */
3612 if (str_encindex == ptr_encindex) {
3613 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3614 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3615 }
3616 }
3617 else {
3618 str_enc = rb_enc_from_index(str_encindex);
3619 ptr_enc = rb_enc_from_index(ptr_encindex);
/* Differing encodings of which at least one is not ASCII-compatible:
 * appending nothing is a no-op, an empty receiver takes over the appended
 * data's encoding (and terminator width), anything else is an error. */
3620 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3621 if (len == 0)
3622 return str;
3623 if (RSTRING_LEN(str) == 0) {
3624 rb_str_buf_cat(str, ptr, len);
3625 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3626 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3627 return str;
3628 }
3629 goto incompatible;
3630 }
/* Both ASCII-compatible: the coderanges decide, so compute what is missing. */
3631 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3632 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3633 }
3634 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3635 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3636 str_cr = rb_enc_str_coderange(str);
3637 }
3638 }
3639 }
3640 if (ptr_cr_ret)
3641 *ptr_cr_ret = ptr_cr;
3642
/* Differing encodings are only acceptable when at least one side is 7-bit. */
3643 if (str_encindex != ptr_encindex &&
3644 str_cr != ENC_CODERANGE_7BIT &&
3645 ptr_cr != ENC_CODERANGE_7BIT) {
3646 str_enc = rb_enc_from_index(str_encindex);
3647 ptr_enc = rb_enc_from_index(ptr_encindex);
3648 goto incompatible;
3649 }
3650
/* Derive the encoding index and coderange of the result from the
 * receiver's coderange. */
3651 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3652 res_encindex = str_encindex;
3653 res_cr = ENC_CODERANGE_UNKNOWN;
3654 }
3655 else if (str_cr == ENC_CODERANGE_7BIT) {
3656 if (ptr_cr == ENC_CODERANGE_7BIT) {
3657 res_encindex = str_encindex;
3658 res_cr = ENC_CODERANGE_7BIT;
3659 }
3660 else {
/* 7-bit receiver, non-7-bit addition: the addition's encoding wins. */
3661 res_encindex = ptr_encindex;
3662 res_cr = ptr_cr;
3663 }
3664 }
3665 else if (str_cr == ENC_CODERANGE_VALID) {
3666 res_encindex = str_encindex;
3667 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3668 res_cr = str_cr;
3669 else
3670 res_cr = ptr_cr;
3671 }
3672 else { /* str_cr == ENC_CODERANGE_BROKEN */
3673 res_encindex = str_encindex;
3674 res_cr = str_cr;
/* Appending bytes to broken content resets the coderange to unknown. */
3675 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3676 }
3677
3678 if (len < 0) {
3679 rb_raise(rb_eArgError, "negative string size (or size too big)");
3680 }
3681 str_buf_cat(str, ptr, len);
3682 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3683 return str;
3684
3685 incompatible:
3686 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3687 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
/* NOTE(review): listing numbers jump from 3687 to 3689; one line after the
 * rb_raise() call is absent from this copy -- restore it from upstream. */
3689}
3690
3691VALUE
3692rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3693{
3694 return rb_enc_cr_str_buf_cat(str, ptr, len,
3695 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3696}
3697
3698VALUE
3700{
3701 /* ptr must reference NUL terminated ASCII string. */
3702 int encindex = ENCODING_GET(str);
3703 rb_encoding *enc = rb_enc_from_index(encindex);
3704 if (rb_enc_asciicompat(enc)) {
3705 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3706 encindex, ENC_CODERANGE_7BIT, 0);
3707 }
3708 else {
3709 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3710 while (*ptr) {
3711 unsigned int c = (unsigned char)*ptr;
3712 int len = rb_enc_codelen(c, enc);
3713 rb_enc_mbcput(c, buf, enc);
3714 rb_enc_cr_str_buf_cat(str, buf, len,
3715 encindex, ENC_CODERANGE_VALID, 0);
3716 ptr++;
3717 }
3718 return str;
3719 }
3720}
3721
3722VALUE
3724{
3725 int str2_cr = rb_enc_str_coderange(str2);
3726
3727 if (str_enc_fastpath(str)) {
3728 switch (str2_cr) {
3729 case ENC_CODERANGE_7BIT:
3730 // If RHS is 7bit we can do simple concatenation
3731 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3732 RB_GC_GUARD(str2);
3733 return str;
3735 // If RHS is valid, we can do simple concatenation if encodings are the same
3736 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3737 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3738 int str_cr = ENC_CODERANGE(str);
3739 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3740 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3741 }
3742 RB_GC_GUARD(str2);
3743 return str;
3744 }
3745 }
3746 }
3747
3748 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3749 ENCODING_GET(str2), str2_cr, &str2_cr);
3750
3751 ENC_CODERANGE_SET(str2, str2_cr);
3752
3753 return str;
3754}
3755
3756VALUE
3758{
3759 StringValue(str2);
3760 return rb_str_buf_append(str, str2);
3761}
3762
3763VALUE
3764rb_str_concat_literals(size_t num, const VALUE *strary)
3765{
3766 VALUE str;
3767 size_t i, s = 0;
3768 unsigned long len = 1;
3769
3770 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3771 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3772
3773 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3774 str = rb_str_buf_new(len);
3775 str_enc_copy_direct(str, strary[0]);
3776
3777 for (i = s; i < num; ++i) {
3778 const VALUE v = strary[i];
3779 int encidx = ENCODING_GET(v);
3780
3781 rb_str_buf_append(str, v);
3782 if (encidx != ENCINDEX_US_ASCII) {
3783 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3784 rb_enc_set_index(str, encidx);
3785 }
3786 }
3787 return str;
3788}
3789
3790/*
3791 * call-seq:
3792 * concat(*objects) -> string
3793 *
3794 * :include: doc/string/concat.rdoc
3795 */
3796static VALUE
3797rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3798{
3799 str_modifiable(str);
3800
3801 if (argc == 1) {
3802 return rb_str_concat(str, argv[0]);
3803 }
3804 else if (argc > 1) {
3805 int i;
3806 VALUE arg_str = rb_str_tmp_new(0);
3807 rb_enc_copy(arg_str, str);
3808 for (i = 0; i < argc; i++) {
3809 rb_str_concat(arg_str, argv[i]);
3810 }
3811 rb_str_buf_append(str, arg_str);
3812 }
3813
3814 return str;
3815}
3816
3817/*
3818 * call-seq:
3819 * append_as_bytes(*objects) -> self
3820 *
3821 * Concatenates each object in +objects+ into +self+; returns +self+;
3822 * performs no encoding validation or conversion:
3823 *
3824 * s = 'foo'
3825 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3826 * s.valid_encoding? # => false
3827 * s.append_as_bytes("\xAC 12")
3828 * s.valid_encoding? # => true
3829 *
3830 * When a given object is an integer,
3831 * the value is considered an 8-bit byte;
 * if the integer occupies more than one byte (i.e., is greater than 255),
3833 * appends only the low-order byte (similar to String#setbyte):
3834 *
3835 * s = ""
3836 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3837 * s.bytesize # => 2
3838 *
3839 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3840 */
3841
3842VALUE
3843rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3844{
3845 long needed_capacity = 0;
3846 volatile VALUE t0;
3847 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3848
3849 for (int index = 0; index < argc; index++) {
3850 VALUE obj = argv[index];
3851 enum ruby_value_type type = types[index] = rb_type(obj);
3852 switch (type) {
3853 case T_FIXNUM:
3854 case T_BIGNUM:
3855 needed_capacity++;
3856 break;
3857 case T_STRING:
3858 needed_capacity += RSTRING_LEN(obj);
3859 break;
3860 default:
3861 rb_raise(
3863 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3864 rb_obj_class(obj)
3865 );
3866 break;
3867 }
3868 }
3869
3870 str_ensure_available_capa(str, needed_capacity);
3871 char *sptr = RSTRING_END(str);
3872
3873 for (int index = 0; index < argc; index++) {
3874 VALUE obj = argv[index];
3875 enum ruby_value_type type = types[index];
3876 switch (type) {
3877 case T_FIXNUM:
3878 case T_BIGNUM: {
3879 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3880 char byte = (char)(NUM2INT(obj) & 0xFF);
3881 *sptr = byte;
3882 sptr++;
3883 break;
3884 }
3885 case T_STRING: {
3886 const char *ptr;
3887 long len;
3888 RSTRING_GETMEM(obj, ptr, len);
3889 memcpy(sptr, ptr, len);
3890 sptr += len;
3891 break;
3892 }
3893 default:
3894 rb_bug("append_as_bytes arguments should have been validated");
3895 }
3896 }
3897
3898 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3899 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3900
3901 int cr = ENC_CODERANGE(str);
3902 switch (cr) {
3903 case ENC_CODERANGE_7BIT: {
3904 for (int index = 0; index < argc; index++) {
3905 VALUE obj = argv[index];
3906 enum ruby_value_type type = types[index];
3907 switch (type) {
3908 case T_FIXNUM:
3909 case T_BIGNUM: {
3910 if (!ISASCII(NUM2INT(obj))) {
3911 goto clear_cr;
3912 }
3913 break;
3914 }
3915 case T_STRING: {
3916 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3917 goto clear_cr;
3918 }
3919 break;
3920 }
3921 default:
3922 rb_bug("append_as_bytes arguments should have been validated");
3923 }
3924 }
3925 break;
3926 }
3928 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3929 goto keep_cr;
3930 }
3931 else {
3932 goto clear_cr;
3933 }
3934 break;
3935 default:
3936 goto clear_cr;
3937 break;
3938 }
3939
3940 RB_GC_GUARD(t0);
3941
3942 clear_cr:
3943 // If no fast path was hit, we clear the coderange.
3944 // append_as_bytes is predominently meant to be used in
3945 // buffering situation, hence it's likely the coderange
3946 // will never be scanned, so it's not worth spending time
3947 // precomputing the coderange except for simple and common
3948 // situations.
3950 keep_cr:
3951 return str;
3952}
3953
3954/*
3955 * call-seq:
3956 * self << object -> self
3957 *
3958 * Appends a string representation of +object+ to +self+;
3959 * returns +self+.
3960 *
3961 * If +object+ is a string, appends it to +self+:
3962 *
3963 * s = 'foo'
3964 * s << 'bar' # => "foobar"
3965 * s # => "foobar"
3966 *
3967 * If +object+ is an integer,
3968 * its value is considered a codepoint;
3969 * converts the value to a character before concatenating:
3970 *
3971 * s = 'foo'
3972 * s << 33 # => "foo!"
3973 *
3974 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3975 * and the encoding of +self+ is Encoding::US_ASCII,
3976 * changes the encoding to Encoding::ASCII_8BIT:
3977 *
3978 * s = 'foo'.encode(Encoding::US_ASCII)
3979 * s.encoding # => #<Encoding:US-ASCII>
3980 * s << 0xff # => "foo\xFF"
3981 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3982 *
3983 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3984 *
3985 * s = 'foo'
3986 * s.encoding # => <Encoding:UTF-8>
3987 * s << 0x00110000 # 1114112 out of char range (RangeError)
3988 * s = 'foo'.encode(Encoding::EUC_JP)
3989 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3990 *
3991 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3992 */
3993VALUE
3995{
3996 unsigned int code;
3997 rb_encoding *enc = STR_ENC_GET(str1);
3998 int encidx;
3999
4000 if (RB_INTEGER_TYPE_P(str2)) {
4001 if (rb_num_to_uint(str2, &code) == 0) {
4002 }
4003 else if (FIXNUM_P(str2)) {
4004 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4005 }
4006 else {
4007 rb_raise(rb_eRangeError, "bignum out of char range");
4008 }
4009 }
4010 else {
4011 return rb_str_append(str1, str2);
4012 }
4013
4014 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4015
4016 if (encidx >= 0) {
4017 rb_str_buf_cat_byte(str1, (unsigned char)code);
4018 }
4019 else {
4020 long pos = RSTRING_LEN(str1);
4021 int cr = ENC_CODERANGE(str1);
4022 int len;
4023 char *buf;
4024
4025 switch (len = rb_enc_codelen(code, enc)) {
4026 case ONIGERR_INVALID_CODE_POINT_VALUE:
4027 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4028 break;
4029 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4030 case 0:
4031 rb_raise(rb_eRangeError, "%u out of char range", code);
4032 break;
4033 }
4034 buf = ALLOCA_N(char, len + 1);
4035 rb_enc_mbcput(code, buf, enc);
4036 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4037 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4038 }
4039 rb_str_resize(str1, pos+len);
4040 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4041 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4043 }
4044 else if (cr == ENC_CODERANGE_BROKEN) {
4046 }
4047 ENC_CODERANGE_SET(str1, cr);
4048 }
4049 return str1;
4050}
4051
4052int
4053rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4054{
4055 int encidx = rb_enc_to_index(enc);
4056
4057 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4058 /* US-ASCII automatically extended to ASCII-8BIT */
4059 if (code > 0xFF) {
4060 rb_raise(rb_eRangeError, "%u out of char range", code);
4061 }
4062 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4063 return ENCINDEX_ASCII_8BIT;
4064 }
4065 return encidx;
4066 }
4067 else {
4068 return -1;
4069 }
4070}
4071
4072/*
4073 * call-seq:
4074 * prepend(*other_strings) -> string
4075 *
4076 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4077 *
4078 * s = 'foo'
4079 * s.prepend('bar', 'baz') # => "barbazfoo"
4080 * s # => "barbazfoo"
4081 *
4082 * Related: String#concat.
4083 */
4084
4085static VALUE
4086rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4087{
4088 str_modifiable(str);
4089
4090 if (argc == 1) {
4091 rb_str_update(str, 0L, 0L, argv[0]);
4092 }
4093 else if (argc > 1) {
4094 int i;
4095 VALUE arg_str = rb_str_tmp_new(0);
4096 rb_enc_copy(arg_str, str);
4097 for (i = 0; i < argc; i++) {
4098 rb_str_append(arg_str, argv[i]);
4099 }
4100 rb_str_update(str, 0L, 0L, arg_str);
4101 }
4102
4103 return str;
4104}
4105
4106st_index_t
4108{
4109 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4110 st_index_t precomputed_hash;
4111 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4112
4113 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4114 return precomputed_hash;
4115 }
4116
4117 return str_do_hash(str);
4118}
4119
4120int
4122{
4123 long len1, len2;
4124 const char *ptr1, *ptr2;
4125 RSTRING_GETMEM(str1, ptr1, len1);
4126 RSTRING_GETMEM(str2, ptr2, len2);
4127 return (len1 != len2 ||
4128 !rb_str_comparable(str1, str2) ||
4129 memcmp(ptr1, ptr2, len1) != 0);
4130}
4131
4132/*
4133 * call-seq:
4134 * hash -> integer
4135 *
4136 * :include: doc/string/hash.rdoc
4137 *
4138 */
4139
4140static VALUE
4141rb_str_hash_m(VALUE str)
4142{
4143 st_index_t hval = rb_str_hash(str);
4144 return ST2FIX(hval);
4145}
4146
/* Smaller of a and b.  Each argument can be evaluated twice, so pass only
 * expressions without side effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4148
4149int
4151{
4152 int idx1, idx2;
4153 int rc1, rc2;
4154
4155 if (RSTRING_LEN(str1) == 0) return TRUE;
4156 if (RSTRING_LEN(str2) == 0) return TRUE;
4157 idx1 = ENCODING_GET(str1);
4158 idx2 = ENCODING_GET(str2);
4159 if (idx1 == idx2) return TRUE;
4160 rc1 = rb_enc_str_coderange(str1);
4161 rc2 = rb_enc_str_coderange(str2);
4162 if (rc1 == ENC_CODERANGE_7BIT) {
4163 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4164 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4165 return TRUE;
4166 }
4167 if (rc2 == ENC_CODERANGE_7BIT) {
4168 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4169 return TRUE;
4170 }
4171 return FALSE;
4172}
4173
4174int
4176{
4177 long len1, len2;
4178 const char *ptr1, *ptr2;
4179 int retval;
4180
4181 if (str1 == str2) return 0;
4182 RSTRING_GETMEM(str1, ptr1, len1);
4183 RSTRING_GETMEM(str2, ptr2, len2);
4184 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4185 if (len1 == len2) {
4186 if (!rb_str_comparable(str1, str2)) {
4187 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4188 return 1;
4189 return -1;
4190 }
4191 return 0;
4192 }
4193 if (len1 > len2) return 1;
4194 return -1;
4195 }
4196 if (retval > 0) return 1;
4197 return -1;
4198}
4199
4200/*
4201 * call-seq:
4202 * self == object -> true or false
4203 *
4204 * Returns whether +object+ is equal to +self+.
4205 *
4206 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4207 *
4208 * s = 'foo'
4209 * s == 'foo' # => true
4210 * s == 'food' # => false
4211 * s == 'FOO' # => false
4212 *
4213 * Returns +false+ if the two strings' encodings are not compatible:
4214 *
4215 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4216 *
4217 * When +object+ is not a string:
4218 *
4219 * - If +object+ responds to method <tt>to_str</tt>,
4220 * <tt>object == self</tt> is called and its return value is returned.
4221 * - If +object+ does not respond to <tt>to_str</tt>,
4222 * +false+ is returned.
4223 *
4224 * Related: {Comparing}[rdoc-ref:String@Comparing].
4225 */
4226
4227VALUE
4229{
4230 if (str1 == str2) return Qtrue;
4231 if (!RB_TYPE_P(str2, T_STRING)) {
4232 if (!rb_respond_to(str2, idTo_str)) {
4233 return Qfalse;
4234 }
4235 return rb_equal(str2, str1);
4236 }
4237 return rb_str_eql_internal(str1, str2);
4238}
4239
4240/*
4241 * call-seq:
4242 * eql?(object) -> true or false
4243 *
4244 * :include: doc/string/eql_p.rdoc
4245 *
4246 */
4247
4248VALUE
4249rb_str_eql(VALUE str1, VALUE str2)
4250{
4251 if (str1 == str2) return Qtrue;
4252 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4253 return rb_str_eql_internal(str1, str2);
4254}
4255
4256/*
4257 * call-seq:
4258 * self <=> other_string -> -1, 0, 1, or nil
4259 *
4260 * Compares +self+ and +other_string+, returning:
4261 *
4262 * - -1 if +other_string+ is larger.
4263 * - 0 if the two are equal.
4264 * - 1 if +other_string+ is smaller.
4265 * - +nil+ if the two are incomparable.
4266 *
4267 * Examples:
4268 *
4269 * 'foo' <=> 'foo' # => 0
4270 * 'foo' <=> 'food' # => -1
4271 * 'food' <=> 'foo' # => 1
4272 * 'FOO' <=> 'foo' # => -1
4273 * 'foo' <=> 'FOO' # => 1
4274 * 'foo' <=> 1 # => nil
4275 *
4276 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4277 */
4278
4279static VALUE
4280rb_str_cmp_m(VALUE str1, VALUE str2)
4281{
4282 int result;
4283 VALUE s = rb_check_string_type(str2);
4284 if (NIL_P(s)) {
4285 return rb_invcmp(str1, str2);
4286 }
4287 result = rb_str_cmp(str1, s);
4288 return INT2FIX(result);
4289}
4290
/* Forward declarations: both helpers are defined after the method
 * functions that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4293
4294/*
4295 * call-seq:
4296 * casecmp(other_string) -> -1, 0, 1, or nil
4297 *
4298 * Ignoring case, compares +self+ and +other_string+; returns:
4299 *
4300 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4301 * - 0 if the two are equal.
4302 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4303 * - +nil+ if the two are incomparable.
4304 *
4305 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4306 *
4307 * Examples:
4308 *
4309 * 'foo'.casecmp('goo') # => -1
4310 * 'goo'.casecmp('foo') # => 1
4311 * 'foo'.casecmp('food') # => -1
4312 * 'food'.casecmp('foo') # => 1
4313 * 'FOO'.casecmp('foo') # => 0
4314 * 'foo'.casecmp('FOO') # => 0
4315 * 'foo'.casecmp(1) # => nil
4316 *
4317 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4318 */
4319
4320static VALUE
4321rb_str_casecmp(VALUE str1, VALUE str2)
4322{
4323 VALUE s = rb_check_string_type(str2);
4324 if (NIL_P(s)) {
4325 return Qnil;
4326 }
4327 return str_casecmp(str1, s);
4328}
4329
4330static VALUE
4331str_casecmp(VALUE str1, VALUE str2)
4332{
4333 long len;
4334 rb_encoding *enc;
4335 const char *p1, *p1end, *p2, *p2end;
4336
4337 enc = rb_enc_compatible(str1, str2);
4338 if (!enc) {
4339 return Qnil;
4340 }
4341
4342 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4343 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4344 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4345 while (p1 < p1end && p2 < p2end) {
4346 if (*p1 != *p2) {
4347 unsigned int c1 = TOLOWER(*p1 & 0xff);
4348 unsigned int c2 = TOLOWER(*p2 & 0xff);
4349 if (c1 != c2)
4350 return INT2FIX(c1 < c2 ? -1 : 1);
4351 }
4352 p1++;
4353 p2++;
4354 }
4355 }
4356 else {
4357 while (p1 < p1end && p2 < p2end) {
4358 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4359 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4360
4361 if (0 <= c1 && 0 <= c2) {
4362 c1 = TOLOWER(c1);
4363 c2 = TOLOWER(c2);
4364 if (c1 != c2)
4365 return INT2FIX(c1 < c2 ? -1 : 1);
4366 }
4367 else {
4368 int r;
4369 l1 = rb_enc_mbclen(p1, p1end, enc);
4370 l2 = rb_enc_mbclen(p2, p2end, enc);
4371 len = l1 < l2 ? l1 : l2;
4372 r = memcmp(p1, p2, len);
4373 if (r != 0)
4374 return INT2FIX(r < 0 ? -1 : 1);
4375 if (l1 != l2)
4376 return INT2FIX(l1 < l2 ? -1 : 1);
4377 }
4378 p1 += l1;
4379 p2 += l2;
4380 }
4381 }
4382 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4383 if (p1 == p1end) return INT2FIX(-1);
4384 return INT2FIX(1);
4385}
4386
4387/*
4388 * call-seq:
4389 * casecmp?(other_string) -> true, false, or nil
4390 *
4391 * Returns +true+ if +self+ and +other_string+ are equal after
4392 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4393 *
4394 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4395 *
4396 * Examples:
4397 *
4398 * 'foo'.casecmp?('goo') # => false
4399 * 'goo'.casecmp?('foo') # => false
4400 * 'foo'.casecmp?('food') # => false
4401 * 'food'.casecmp?('foo') # => false
4402 * 'FOO'.casecmp?('foo') # => true
4403 * 'foo'.casecmp?('FOO') # => true
4404 * 'foo'.casecmp?(1) # => nil
4405 *
4406 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4407 */
4408
4409static VALUE
4410rb_str_casecmp_p(VALUE str1, VALUE str2)
4411{
4412 VALUE s = rb_check_string_type(str2);
4413 if (NIL_P(s)) {
4414 return Qnil;
4415 }
4416 return str_casecmp_p(str1, s);
4417}
4418
4419static VALUE
4420str_casecmp_p(VALUE str1, VALUE str2)
4421{
4422 rb_encoding *enc;
4423 VALUE folded_str1, folded_str2;
4424 VALUE fold_opt = sym_fold;
4425
4426 enc = rb_enc_compatible(str1, str2);
4427 if (!enc) {
4428 return Qnil;
4429 }
4430
4431 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4432 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4433
4434 return rb_str_eql(folded_str1, folded_str2);
4435}
4436
/*
 * Searches for the sub_len bytes at sub_ptr, starting at str_ptr, and
 * returns the byte index of the first hit that begins on a character
 * boundary, or a negative value when there is none.
 *
 * offset is the number of bytes the caller already skipped: it shortens the
 * searched length (str_len - offset) and is added to the returned index.
 * str_ptr_end bounds the character-head lookup.
 */
static long
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
            const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
{
    const char *search_start = str_ptr;
    long pos, search_len = str_len - offset;

    for (;;) {
        const char *t;
        pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
        if (pos < 0) return pos;
        /* Reject hits that start in the middle of a character. */
        t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
        if (t == search_start + pos) break;
        /* Resume at the next character head after the false hit. */
        search_len -= t - search_start;
        if (search_len <= 0) return -1;
        offset += t - search_start;
        search_start = t;
    }
    return pos + offset;
}
4457
/* found index in byte */
/* rb_str_index passes in_byte = 0 (offset counted in characters),
 * rb_str_byteindex passes in_byte = 1 (offset counted in bytes). */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4461
/*
 * Finds sub in str starting at offset and returns the byte index of the
 * match, or -1 when there is none (including when sub is a broken string).
 *
 * offset is counted in bytes when in_byte is non-zero and in characters
 * otherwise; a negative offset counts from the end.  rb_enc_check is
 * applied to the pair first.
 */
static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{
    const char *str_ptr, *str_ptr_end, *sub_ptr;
    long str_len, sub_len;
    rb_encoding *enc;

    enc = rb_enc_check(str, sub);
    if (is_broken_string(sub)) return -1;

    str_ptr = RSTRING_PTR(str);
    str_ptr_end = RSTRING_END(str);
    str_len = RSTRING_LEN(str);
    sub_ptr = RSTRING_PTR(sub);
    sub_len = RSTRING_LEN(sub);

    if (str_len < sub_len) return -1;

    if (offset != 0) {
        long str_len_char, sub_len_char;
        int single_byte = single_byte_optimizable(str);
        /* Lengths in the same unit as offset (bytes or characters). */
        str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
        sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
        if (offset < 0) {
            offset += str_len_char;
            if (offset < 0) return -1;
        }
        if (str_len_char - offset < sub_len_char) return -1;
        /* From here on offset is a byte offset. */
        if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
        str_ptr += offset;
    }
    if (sub_len == 0) return offset;

    /* need proceed one character at a time */
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
}
4498
4499
4500/*
4501 * call-seq:
4502 * index(pattern, offset = 0) -> integer or nil
4503 *
4504 * :include: doc/string/index.rdoc
4505 *
4506 */
4507
4508static VALUE
4509rb_str_index_m(int argc, VALUE *argv, VALUE str)
4510{
4511 VALUE sub;
4512 VALUE initpos;
4513 rb_encoding *enc = STR_ENC_GET(str);
4514 long pos;
4515
4516 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4517 long slen = str_strlen(str, enc); /* str's enc */
4518 pos = NUM2LONG(initpos);
4519 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4520 if (RB_TYPE_P(sub, T_REGEXP)) {
4522 }
4523 return Qnil;
4524 }
4525 }
4526 else {
4527 pos = 0;
4528 }
4529
4530 if (RB_TYPE_P(sub, T_REGEXP)) {
4531 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4532 enc, single_byte_optimizable(str));
4533
4534 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4535 VALUE match = rb_backref_get();
4536 struct re_registers *regs = RMATCH_REGS(match);
4537 pos = rb_str_sublen(str, BEG(0));
4538 return LONG2NUM(pos);
4539 }
4540 }
4541 else {
4542 StringValue(sub);
4543 pos = rb_str_index(str, sub, pos);
4544 if (pos >= 0) {
4545 pos = rb_str_sublen(str, pos);
4546 return LONG2NUM(pos);
4547 }
4548 }
4549 return Qnil;
4550}
4551
4552/* Ensure that the given pos is a valid character boundary.
4553 * Note that in this function, "character" means a code point
4554 * (Unicode scalar value), not a grapheme cluster.
4555 */
4556static void
4557str_ensure_byte_pos(VALUE str, long pos)
4558{
4559 if (!single_byte_optimizable(str)) {
4560 const char *s = RSTRING_PTR(str);
4561 const char *e = RSTRING_END(str);
4562 const char *p = s + pos;
4563 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4564 rb_raise(rb_eIndexError,
4565 "offset %ld does not land on character boundary", pos);
4566 }
4567 }
4568}
4569
4570/*
4571 * call-seq:
4572 * byteindex(object, offset = 0) -> integer or nil
4573 *
4574 * Returns the 0-based integer index of a substring of +self+
4575 * specified by +object+ (a string or Regexp) and +offset+,
4576 * or +nil+ if there is no such substring;
4577 * the returned index is the count of _bytes_ (not characters).
4578 *
4579 * When +object+ is a string,
4580 * returns the index of the first found substring equal to +object+:
4581 *
4582 * s = 'foo' # => "foo"
4583 * s.size # => 3 # Three 1-byte characters.
4584 * s.bytesize # => 3 # Three bytes.
4585 * s.byteindex('f') # => 0
4586 * s.byteindex('o') # => 1
4587 * s.byteindex('oo') # => 1
4588 * s.byteindex('ooo') # => nil
4589 *
4590 * When +object+ is a Regexp,
4591 * returns the index of the first found substring matching +object+;
4592 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4593 *
4594 * s = 'foo'
4595 * s.byteindex(/f/) # => 0
4596 * $~ # => #<MatchData "f">
4597 * s.byteindex(/o/) # => 1
4598 * s.byteindex(/oo/) # => 1
4599 * s.byteindex(/ooo/) # => nil
4600 * $~ # => nil
4601 *
4602 * \Integer argument +offset+, if given, specifies the 0-based index
4603 * of the byte where searching is to begin.
4604 *
4605 * When +offset+ is non-negative,
4606 * searching begins at byte position +offset+:
4607 *
4608 * s = 'foo'
4609 * s.byteindex('o', 1) # => 1
4610 * s.byteindex('o', 2) # => 2
4611 * s.byteindex('o', 3) # => nil
4612 *
4613 * When +offset+ is negative, counts backward from the end of +self+:
4614 *
4615 * s = 'foo'
4616 * s.byteindex('o', -1) # => 2
4617 * s.byteindex('o', -2) # => 1
4618 * s.byteindex('o', -3) # => 1
4619 * s.byteindex('o', -4) # => nil
4620 *
4621 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4622 *
4623 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4624 * s.size # => 2 # Two 3-byte characters.
4625 * s.bytesize # => 6 # Six bytes.
4626 * s.byteindex("\uFFFF") # => 0
4627 * s.byteindex("\uFFFF", 1) # Raises IndexError
4628 * s.byteindex("\uFFFF", 2) # Raises IndexError
4629 * s.byteindex("\uFFFF", 3) # => 3
4630 * s.byteindex("\uFFFF", 4) # Raises IndexError
4631 * s.byteindex("\uFFFF", 5) # Raises IndexError
4632 * s.byteindex("\uFFFF", 6) # => nil
4633 *
4634 * Related: see {Querying}[rdoc-ref:String@Querying].
4635 */
4636
4637static VALUE
4638rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4639{
4640 VALUE sub;
4641 VALUE initpos;
4642 long pos;
4643
4644 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4645 long slen = RSTRING_LEN(str);
4646 pos = NUM2LONG(initpos);
4647 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4648 if (RB_TYPE_P(sub, T_REGEXP)) {
4650 }
4651 return Qnil;
4652 }
4653 }
4654 else {
4655 pos = 0;
4656 }
4657
4658 str_ensure_byte_pos(str, pos);
4659
4660 if (RB_TYPE_P(sub, T_REGEXP)) {
4661 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4662 VALUE match = rb_backref_get();
4663 struct re_registers *regs = RMATCH_REGS(match);
4664 pos = BEG(0);
4665 return LONG2NUM(pos);
4666 }
4667 }
4668 else {
4669 StringValue(sub);
4670 pos = rb_str_byteindex(str, sub, pos);
4671 if (pos >= 0) return LONG2NUM(pos);
4672 }
4673 return Qnil;
4674}
4675
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr: returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or
 * NULL when there is none (always NULL for search_len <= 0).
 *
 * As with the libc function, chr is converted to unsigned char before the
 * comparison, so a value outside 0..255 matches by its low byte.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4688
/*
 * Backward search: returns the byte index of the last occurrence of sub in
 * str that starts at or before s and on a character boundary, or -1.
 * An empty sub matches at s itself.
 *
 * NOTE(review): nothing here checks that sub fits between s and the end of
 * str; the callers in this file clamp the start position beforehand --
 * confirm for any new caller.
 */
static long
str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
{
    char *hit, *adjusted;
    int c;
    long slen, searchlen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    slen = RSTRING_LEN(sub);
    if (slen == 0) return s - sbeg;
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    /* First byte of sub, used as the needle for memrchr. */
    c = *t & 0xff;
    searchlen = s - sbeg + 1;

    /* Cheap check at the starting position itself. */
    if (memcmp(s, t, slen) == 0) {
        return s - sbeg;
    }

    do {
        hit = memrchr(sbeg, c, searchlen);
        if (!hit) break;
        adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
        if (hit != adjusted) {
            /* The byte sits inside a character: skip that whole character. */
            searchlen = adjusted - sbeg;
            continue;
        }
        if (memcmp(hit, t, slen) == 0)
            return hit - sbeg;
        searchlen = adjusted - sbeg;
    } while (searchlen > 0);

    return -1;
}
4724
4725/* found index in byte */
4726static long
4727rb_str_rindex(VALUE str, VALUE sub, long pos)
4728{
4729 long len, slen;
4730 char *sbeg, *s;
4731 rb_encoding *enc;
4732 int singlebyte;
4733
4734 enc = rb_enc_check(str, sub);
4735 if (is_broken_string(sub)) return -1;
4736 singlebyte = single_byte_optimizable(str);
4737 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4738 slen = str_strlen(sub, enc); /* rb_enc_check */
4739
4740 /* substring longer than string */
4741 if (len < slen) return -1;
4742 if (len - pos < slen) pos = len - slen;
4743 if (len == 0) return pos;
4744
4745 sbeg = RSTRING_PTR(str);
4746
4747 if (pos == 0) {
4748 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4749 return 0;
4750 else
4751 return -1;
4752 }
4753
4754 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4755 return str_rindex(str, sub, s, enc);
4756}
4757
4758/*
4759 * call-seq:
4760 * rindex(substring, offset = self.length) -> integer or nil
4761 * rindex(regexp, offset = self.length) -> integer or nil
4762 *
4763 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4764 * or +nil+ if none found:
4765 *
4766 * 'foo'.rindex('f') # => 0
4767 * 'foo'.rindex('o') # => 2
4768 * 'foo'.rindex('oo') # => 1
4769 * 'foo'.rindex('ooo') # => nil
4770 *
4771 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4772 * or +nil+ if none found:
4773 *
4774 * 'foo'.rindex(/f/) # => 0
4775 * 'foo'.rindex(/o/) # => 2
4776 * 'foo'.rindex(/oo/) # => 1
4777 * 'foo'.rindex(/ooo/) # => nil
4778 *
4779 * The _last_ match means starting at the possible last position, not
4780 * the last of longest matches.
4781 *
4782 * 'foo'.rindex(/o+/) # => 2
4783 * $~ #=> #<MatchData "o">
4784 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
4787 *
4788 * 'foo'.rindex(/(?<!o)o+/) # => 1
4789 * $~ #=> #<MatchData "oo">
4790 *
 * Or use String#index with a negative lookahead:
4792 *
4793 * 'foo'.index(/o+(?!.*o)/) # => 1
4794 * $~ #=> #<MatchData "oo">
4795 *
4796 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4797 * string to _end_ the search:
4798 *
4799 * 'foo'.rindex('o', 0) # => nil
4800 * 'foo'.rindex('o', 1) # => 1
4801 * 'foo'.rindex('o', 2) # => 2
4802 * 'foo'.rindex('o', 3) # => 2
4803 *
4804 * If +offset+ is a negative Integer, the maximum starting position in the
4805 * string to _end_ the search is the sum of the string's length and +offset+:
4806 *
4807 * 'foo'.rindex('o', -1) # => 2
4808 * 'foo'.rindex('o', -2) # => 1
4809 * 'foo'.rindex('o', -3) # => nil
4810 * 'foo'.rindex('o', -4) # => nil
4811 *
4812 * Related: String#index.
4813 */
4814
4815static VALUE
4816rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4817{
4818 VALUE sub;
4819 VALUE initpos;
4820 rb_encoding *enc = STR_ENC_GET(str);
4821 long pos, len = str_strlen(str, enc); /* str's enc */
4822
4823 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4824 pos = NUM2LONG(initpos);
4825 if (pos < 0 && (pos += len) < 0) {
4826 if (RB_TYPE_P(sub, T_REGEXP)) {
4828 }
4829 return Qnil;
4830 }
4831 if (pos > len) pos = len;
4832 }
4833 else {
4834 pos = len;
4835 }
4836
4837 if (RB_TYPE_P(sub, T_REGEXP)) {
4838 /* enc = rb_enc_check(str, sub); */
4839 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4840 enc, single_byte_optimizable(str));
4841
4842 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4843 VALUE match = rb_backref_get();
4844 struct re_registers *regs = RMATCH_REGS(match);
4845 pos = rb_str_sublen(str, BEG(0));
4846 return LONG2NUM(pos);
4847 }
4848 }
4849 else {
4850 StringValue(sub);
4851 pos = rb_str_rindex(str, sub, pos);
4852 if (pos >= 0) {
4853 pos = rb_str_sublen(str, pos);
4854 return LONG2NUM(pos);
4855 }
4856 }
4857 return Qnil;
4858}
4859
4860static long
4861rb_str_byterindex(VALUE str, VALUE sub, long pos)
4862{
4863 long len, slen;
4864 char *sbeg, *s;
4865 rb_encoding *enc;
4866
4867 enc = rb_enc_check(str, sub);
4868 if (is_broken_string(sub)) return -1;
4869 len = RSTRING_LEN(str);
4870 slen = RSTRING_LEN(sub);
4871
4872 /* substring longer than string */
4873 if (len < slen) return -1;
4874 if (len - pos < slen) pos = len - slen;
4875 if (len == 0) return pos;
4876
4877 sbeg = RSTRING_PTR(str);
4878
4879 if (pos == 0) {
4880 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4881 return 0;
4882 else
4883 return -1;
4884 }
4885
4886 s = sbeg + pos;
4887 return str_rindex(str, sub, s, enc);
4888}
4889
4890/*
4891 * call-seq:
4892 * byterindex(object, offset = self.bytesize) -> integer or nil
4893 *
4894 * Returns the 0-based integer index of a substring of +self+
4895 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4896 * or +nil+ if there is no such substring;
4897 * the returned index is the count of _bytes_ (not characters).
4898 *
4899 * When +object+ is a string,
4900 * returns the index of the _last_ found substring equal to +object+:
4901 *
4902 * s = 'foo' # => "foo"
4903 * s.size # => 3 # Three 1-byte characters.
4904 * s.bytesize # => 3 # Three bytes.
4905 * s.byterindex('f') # => 0
 *   s.byterindex('o') # => 2
 *   s.byterindex('oo') # => 1
 *   s.byterindex('ooo') # => nil
4909 *
4910 * When +object+ is a Regexp,
4911 * returns the index of the last found substring matching +object+;
4912 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4913 *
4914 * s = 'foo'
4915 * s.byterindex(/f/) # => 0
4916 * $~ # => #<MatchData "f">
4917 * s.byterindex(/o/) # => 2
4918 * s.byterindex(/oo/) # => 1
4919 * s.byterindex(/ooo/) # => nil
4920 * $~ # => nil
4921 *
4922 * The last match means starting at the possible last position,
4923 * not the last of the longest matches:
4924 *
4925 * s = 'foo'
4926 * s.byterindex(/o+/) # => 2
4927 * $~ #=> #<MatchData "o">
4928 *
4929 * To get the last longest match, use a negative lookbehind:
4930 *
4931 * s = 'foo'
4932 * s.byterindex(/(?<!o)o+/) # => 1
4933 * $~ # => #<MatchData "oo">
4934 *
4935 * Or use method #byteindex with negative lookahead:
4936 *
4937 * s = 'foo'
4938 * s.byteindex(/o+(?!.*o)/) # => 1
4939 * $~ #=> #<MatchData "oo">
4940 *
4941 * \Integer argument +offset+, if given, specifies the 0-based index
4942 * of the byte where searching is to end.
4943 *
4944 * When +offset+ is non-negative,
4945 * searching ends at byte position +offset+:
4946 *
4947 * s = 'foo'
4948 * s.byterindex('o', 0) # => nil
4949 * s.byterindex('o', 1) # => 1
4950 * s.byterindex('o', 2) # => 2
4951 * s.byterindex('o', 3) # => 2
4952 *
4953 * When +offset+ is negative, counts backward from the end of +self+:
4954 *
4955 * s = 'foo'
4956 * s.byterindex('o', -1) # => 2
4957 * s.byterindex('o', -2) # => 1
4958 * s.byterindex('o', -3) # => nil
4959 *
4960 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4961 *
4962 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4963 * s.size # => 2 # Two 3-byte characters.
4964 * s.bytesize # => 6 # Six bytes.
4965 * s.byterindex("\uFFFF") # => 3
4966 * s.byterindex("\uFFFF", 1) # Raises IndexError
4967 * s.byterindex("\uFFFF", 2) # Raises IndexError
4968 * s.byterindex("\uFFFF", 3) # => 3
4969 * s.byterindex("\uFFFF", 4) # Raises IndexError
4970 * s.byterindex("\uFFFF", 5) # Raises IndexError
4971 * s.byterindex("\uFFFF", 6) # => nil
4972 *
4973 * Related: see {Querying}[rdoc-ref:String@Querying].
4974 */
4975
4976static VALUE
4977rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4978{
4979 VALUE sub;
4980 VALUE initpos;
4981 long pos, len = RSTRING_LEN(str);
4982
4983 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4984 pos = NUM2LONG(initpos);
4985 if (pos < 0 && (pos += len) < 0) {
4986 if (RB_TYPE_P(sub, T_REGEXP)) {
4988 }
4989 return Qnil;
4990 }
4991 if (pos > len) pos = len;
4992 }
4993 else {
4994 pos = len;
4995 }
4996
4997 str_ensure_byte_pos(str, pos);
4998
4999 if (RB_TYPE_P(sub, T_REGEXP)) {
5000 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5001 VALUE match = rb_backref_get();
5002 struct re_registers *regs = RMATCH_REGS(match);
5003 pos = BEG(0);
5004 return LONG2NUM(pos);
5005 }
5006 }
5007 else {
5008 StringValue(sub);
5009 pos = rb_str_byterindex(str, sub, pos);
5010 if (pos >= 0) return LONG2NUM(pos);
5011 }
5012 return Qnil;
5013}
5014
5015/*
5016 * call-seq:
5017 * self =~ object -> integer or nil
5018 *
5019 * When +object+ is a Regexp, returns the index of the first substring in +self+
5020 * matched by +object+,
5021 * or +nil+ if no match is found;
5022 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5023 *
5024 * 'foo' =~ /f/ # => 0
5025 * $~ # => #<MatchData "f">
5026 * 'foo' =~ /o/ # => 1
5027 * $~ # => #<MatchData "o">
5028 * 'foo' =~ /x/ # => nil
5029 * $~ # => nil
5030 *
5031 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5032 * (see Regexp#=~):
5033 *
5034 * number = nil
5035 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5036 * number # => nil # Not assigned.
5037 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5038 * number # => "9" # Assigned.
5039 *
5040 * If +object+ is not a Regexp, returns the value
5041 * returned by <tt>object =~ self</tt>.
5042 *
5043 * Related: see {Querying}[rdoc-ref:String@Querying].
5044 */
5045
5046static VALUE
5047rb_str_match(VALUE x, VALUE y)
5048{
5049 switch (OBJ_BUILTIN_TYPE(y)) {
5050 case T_STRING:
5051 rb_raise(rb_eTypeError, "type mismatch: String given");
5052
5053 case T_REGEXP:
5054 return rb_reg_match(y, x);
5055
5056 default:
5057 return rb_funcall(y, idEqTilde, 1, x);
5058 }
5059}
5060
5061
5062static VALUE get_pat(VALUE);
5063
5064
5065/*
5066 * call-seq:
5067 * match(pattern, offset = 0) -> matchdata or nil
5068 * match(pattern, offset = 0) {|matchdata| ... } -> object
5069 *
5070 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5071 *
5072 * Note: also updates Regexp@Global+Variables.
5073 *
5074 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5075 * regexp = Regexp.new(pattern)
5076 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5077 * (see Regexp#match):
5078 * matchdata = regexp.match(self)
5079 *
5080 * With no block given, returns the computed +matchdata+:
5081 *
5082 * 'foo'.match('f') # => #<MatchData "f">
5083 * 'foo'.match('o') # => #<MatchData "o">
5084 * 'foo'.match('x') # => nil
5085 *
5086 * If Integer argument +offset+ is given, the search begins at index +offset+:
5087 *
5088 * 'foo'.match('f', 1) # => nil
5089 * 'foo'.match('o', 1) # => #<MatchData "o">
5090 *
5091 * With a block given, calls the block with the computed +matchdata+
5092 * and returns the block's return value:
5093 *
5094 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5095 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5096 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5097 *
5098 */
5099
5100static VALUE
5101rb_str_match_m(int argc, VALUE *argv, VALUE str)
5102{
5103 VALUE re, result;
5104 if (argc < 1)
5105 rb_check_arity(argc, 1, 2);
5106 re = argv[0];
5107 argv[0] = str;
5108 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5109 if (!NIL_P(result) && rb_block_given_p()) {
5110 return rb_yield(result);
5111 }
5112 return result;
5113}
5114
5115/*
5116 * call-seq:
5117 * match?(pattern, offset = 0) -> true or false
5118 *
5119 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5120 *
5121 * Note: does not update Regexp@Global+Variables.
5122 *
5123 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5124 * regexp = Regexp.new(pattern)
5125 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5127 * +false+ otherwise:
5128 *
5129 * 'foo'.match?(/o/) # => true
5130 * 'foo'.match?('o') # => true
5131 * 'foo'.match?(/x/) # => false
5132 *
5133 * If Integer argument +offset+ is given, the search begins at index +offset+:
5134 * 'foo'.match?('f', 1) # => false
5135 * 'foo'.match?('o', 1) # => true
5136 *
5137 */
5138
5139static VALUE
5140rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5141{
5142 VALUE re;
5143 rb_check_arity(argc, 1, 2);
5144 re = get_pat(argv[0]);
5145 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5146}
5147
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* no usable neighbouring character */
    NEIGHBOR_FOUND = 1,    /* neighbour written in place */
    NEIGHBOR_WRAPPED = 2   /* stepping ran off the end of the range */
};
5153
5154static enum neighbor_char
5155enc_succ_char(char *p, long len, rb_encoding *enc)
5156{
5157 long i;
5158 int l;
5159
5160 if (rb_enc_mbminlen(enc) > 1) {
5161 /* wchar, trivial case */
5162 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5163 if (!MBCLEN_CHARFOUND_P(r)) {
5164 return NEIGHBOR_NOT_CHAR;
5165 }
5166 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5167 l = rb_enc_code_to_mbclen(c, enc);
5168 if (!l) return NEIGHBOR_NOT_CHAR;
5169 if (l != len) return NEIGHBOR_WRAPPED;
5170 rb_enc_mbcput(c, p, enc);
5171 r = rb_enc_precise_mbclen(p, p + len, enc);
5172 if (!MBCLEN_CHARFOUND_P(r)) {
5173 return NEIGHBOR_NOT_CHAR;
5174 }
5175 return NEIGHBOR_FOUND;
5176 }
5177 while (1) {
5178 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5179 p[i] = '\0';
5180 if (i < 0)
5181 return NEIGHBOR_WRAPPED;
5182 ++((unsigned char*)p)[i];
5183 l = rb_enc_precise_mbclen(p, p+len, enc);
5184 if (MBCLEN_CHARFOUND_P(l)) {
5185 l = MBCLEN_CHARFOUND_LEN(l);
5186 if (l == len) {
5187 return NEIGHBOR_FOUND;
5188 }
5189 else {
5190 memset(p+l, 0xff, len-l);
5191 }
5192 }
5193 if (MBCLEN_INVALID_P(l) && i < len-1) {
5194 long len2;
5195 int l2;
5196 for (len2 = len-1; 0 < len2; len2--) {
5197 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5198 if (!MBCLEN_INVALID_P(l2))
5199 break;
5200 }
5201 memset(p+len2+1, 0xff, len-(len2+1));
5202 }
5203 }
5204}
5205
5206static enum neighbor_char
5207enc_pred_char(char *p, long len, rb_encoding *enc)
5208{
5209 long i;
5210 int l;
5211 if (rb_enc_mbminlen(enc) > 1) {
5212 /* wchar, trivial case */
5213 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5214 if (!MBCLEN_CHARFOUND_P(r)) {
5215 return NEIGHBOR_NOT_CHAR;
5216 }
5217 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5218 if (!c) return NEIGHBOR_NOT_CHAR;
5219 --c;
5220 l = rb_enc_code_to_mbclen(c, enc);
5221 if (!l) return NEIGHBOR_NOT_CHAR;
5222 if (l != len) return NEIGHBOR_WRAPPED;
5223 rb_enc_mbcput(c, p, enc);
5224 r = rb_enc_precise_mbclen(p, p + len, enc);
5225 if (!MBCLEN_CHARFOUND_P(r)) {
5226 return NEIGHBOR_NOT_CHAR;
5227 }
5228 return NEIGHBOR_FOUND;
5229 }
5230 while (1) {
5231 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5232 p[i] = '\xff';
5233 if (i < 0)
5234 return NEIGHBOR_WRAPPED;
5235 --((unsigned char*)p)[i];
5236 l = rb_enc_precise_mbclen(p, p+len, enc);
5237 if (MBCLEN_CHARFOUND_P(l)) {
5238 l = MBCLEN_CHARFOUND_LEN(l);
5239 if (l == len) {
5240 return NEIGHBOR_FOUND;
5241 }
5242 else {
5243 memset(p+l, 0, len-l);
5244 }
5245 }
5246 if (MBCLEN_INVALID_P(l) && i < len-1) {
5247 long len2;
5248 int l2;
5249 for (len2 = len-1; 0 < len2; len2--) {
5250 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5251 if (!MBCLEN_INVALID_P(l2))
5252 break;
5253 }
5254 memset(p+len2+1, 0, len-(len2+1));
5255 }
5256 }
5257}
5258
5259/*
5260 overwrite +p+ by succeeding letter in +enc+ and returns
5261 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5262 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5263 assuming each ranges are successive, and mbclen
5264 never change in each ranges.
5265 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5266 character.
5267 */
5268static enum neighbor_char
5269enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5270{
5271 enum neighbor_char ret;
5272 unsigned int c;
5273 int ctype;
5274 int range;
5275 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5276
5277 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5278 int try;
5279 const int max_gaps = 1;
5280
5281 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5282 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5283 ctype = ONIGENC_CTYPE_DIGIT;
5284 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5285 ctype = ONIGENC_CTYPE_ALPHA;
5286 else
5287 return NEIGHBOR_NOT_CHAR;
5288
5289 MEMCPY(save, p, char, len);
5290 for (try = 0; try <= max_gaps; ++try) {
5291 ret = enc_succ_char(p, len, enc);
5292 if (ret == NEIGHBOR_FOUND) {
5293 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5294 if (rb_enc_isctype(c, ctype, enc))
5295 return NEIGHBOR_FOUND;
5296 }
5297 }
5298 MEMCPY(p, save, char, len);
5299 range = 1;
5300 while (1) {
5301 MEMCPY(save, p, char, len);
5302 ret = enc_pred_char(p, len, enc);
5303 if (ret == NEIGHBOR_FOUND) {
5304 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5305 if (!rb_enc_isctype(c, ctype, enc)) {
5306 MEMCPY(p, save, char, len);
5307 break;
5308 }
5309 }
5310 else {
5311 MEMCPY(p, save, char, len);
5312 break;
5313 }
5314 range++;
5315 }
5316 if (range == 1) {
5317 return NEIGHBOR_NOT_CHAR;
5318 }
5319
5320 if (ctype != ONIGENC_CTYPE_DIGIT) {
5321 MEMCPY(carry, p, char, len);
5322 return NEIGHBOR_WRAPPED;
5323 }
5324
5325 MEMCPY(carry, p, char, len);
5326 enc_succ_char(carry, len, enc);
5327 return NEIGHBOR_WRAPPED;
5328}
5329
5330
5331static VALUE str_succ(VALUE str);
5332
5333/*
5334 * call-seq:
5335 * succ -> new_str
5336 *
5337 * Returns the successor to +self+. The successor is calculated by
5338 * incrementing characters.
5339 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
5342 *
5343 * 'THX1138'.succ # => "THX1139"
5344 * '<<koala>>'.succ # => "<<koalb>>"
5345 * '***'.succ # => '**+'
5346 *
5347 * The successor to a digit is another digit, "carrying" to the next-left
5348 * character for a "rollover" from 9 to 0, and prepending another digit
5349 * if necessary:
5350 *
5351 * '00'.succ # => "01"
5352 * '09'.succ # => "10"
5353 * '99'.succ # => "100"
5354 *
5355 * The successor to a letter is another letter of the same case,
5356 * carrying to the next-left character for a rollover,
5357 * and prepending another same-case letter if necessary:
5358 *
5359 * 'aa'.succ # => "ab"
5360 * 'az'.succ # => "ba"
5361 * 'zz'.succ # => "aaa"
5362 * 'AA'.succ # => "AB"
5363 * 'AZ'.succ # => "BA"
5364 * 'ZZ'.succ # => "AAA"
5365 *
5366 * The successor to a non-alphanumeric character is the next character
5367 * in the underlying character set's collating sequence,
5368 * carrying to the next-left character for a rollover,
5369 * and prepending another character if necessary:
5370 *
5371 * s = 0.chr * 3
5372 * s # => "\x00\x00\x00"
5373 * s.succ # => "\x00\x00\x01"
5374 * s = 255.chr * 3
5375 * s # => "\xFF\xFF\xFF"
5376 * s.succ # => "\x01\x00\x00\x00"
5377 *
5378 * Carrying can occur between and among mixtures of alphanumeric characters:
5379 *
5380 * s = 'zz99zz99'
5381 * s.succ # => "aaa00aa00"
5382 * s = '99zz99zz'
5383 * s.succ # => "100aa00aa"
5384 *
5385 * The successor to an empty +String+ is a new empty +String+:
5386 *
5387 * ''.succ # => ""
5388 *
5389 */
5390
5391VALUE
5393{
5394 VALUE str;
5395 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5396 rb_enc_cr_str_copy_for_substr(str, orig);
5397 return str_succ(str);
5398}
5399
5400static VALUE
5401str_succ(VALUE str)
5402{
5403 rb_encoding *enc;
5404 char *sbeg, *s, *e, *last_alnum = 0;
5405 int found_alnum = 0;
5406 long l, slen;
5407 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5408 long carry_pos = 0, carry_len = 1;
5409 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5410
5411 slen = RSTRING_LEN(str);
5412 if (slen == 0) return str;
5413
5414 enc = STR_ENC_GET(str);
5415 sbeg = RSTRING_PTR(str);
5416 s = e = sbeg + slen;
5417
5418 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5419 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5420 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5421 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5422 break;
5423 }
5424 }
5425 l = rb_enc_precise_mbclen(s, e, enc);
5426 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5427 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5428 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5429 switch (neighbor) {
5430 case NEIGHBOR_NOT_CHAR:
5431 continue;
5432 case NEIGHBOR_FOUND:
5433 return str;
5434 case NEIGHBOR_WRAPPED:
5435 last_alnum = s;
5436 break;
5437 }
5438 found_alnum = 1;
5439 carry_pos = s - sbeg;
5440 carry_len = l;
5441 }
5442 if (!found_alnum) { /* str contains no alnum */
5443 s = e;
5444 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5445 enum neighbor_char neighbor;
5446 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5447 l = rb_enc_precise_mbclen(s, e, enc);
5448 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5449 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5450 MEMCPY(tmp, s, char, l);
5451 neighbor = enc_succ_char(tmp, l, enc);
5452 switch (neighbor) {
5453 case NEIGHBOR_FOUND:
5454 MEMCPY(s, tmp, char, l);
5455 return str;
5456 break;
5457 case NEIGHBOR_WRAPPED:
5458 MEMCPY(s, tmp, char, l);
5459 break;
5460 case NEIGHBOR_NOT_CHAR:
5461 break;
5462 }
5463 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5464 /* wrapped to \0...\0. search next valid char. */
5465 enc_succ_char(s, l, enc);
5466 }
5467 if (!rb_enc_asciicompat(enc)) {
5468 MEMCPY(carry, s, char, l);
5469 carry_len = l;
5470 }
5471 carry_pos = s - sbeg;
5472 }
5474 }
5475 RESIZE_CAPA(str, slen + carry_len);
5476 sbeg = RSTRING_PTR(str);
5477 s = sbeg + carry_pos;
5478 memmove(s + carry_len, s, slen - carry_pos);
5479 memmove(s, carry, carry_len);
5480 slen += carry_len;
5481 STR_SET_LEN(str, slen);
5482 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5484 return str;
5485}
5486
5487
5488/*
5489 * call-seq:
5490 * succ! -> self
5491 *
5492 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5493 */
5494
5495static VALUE
5496rb_str_succ_bang(VALUE str)
5497{
5498 rb_str_modify(str);
5499 str_succ(str);
5500 return str;
5501}
5502
/*
 * Returns 1 when every one of the first +len+ bytes at +s+ satisfies
 * ISDIGIT (vacuously so when +len+ is not positive), and 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5512
5513static int
5514str_upto_i(VALUE str, VALUE arg)
5515{
5516 rb_yield(str);
5517 return 0;
5518}
5519
5520/*
5521 * call-seq:
5522 * upto(other_string, exclusive = false) {|string| ... } -> self
5523 * upto(other_string, exclusive = false) -> new_enumerator
5524 *
5525 * With a block given, calls the block with each +String+ value
5526 * returned by successive calls to String#succ;
5527 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5528 * the sequence terminates when value +other_string+ is reached;
5529 * returns +self+:
5530 *
5531 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5532 * Output:
5533 *
5534 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5535 *
5536 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5537 *
5538 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5539 *
5540 * Output:
5541 *
5542 * a8 a9 b0 b1 b2 b3 b4 b5
5543 *
5544 * If +other_string+ would not be reached, does not call the block:
5545 *
5546 * '25'.upto('5') {|s| fail s }
5547 * 'aa'.upto('a') {|s| fail s }
5548 *
5549 * With no block given, returns a new Enumerator:
5550 *
5551 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5552 *
5553 */
5554
5555static VALUE
5556rb_str_upto(int argc, VALUE *argv, VALUE beg)
5557{
5558 VALUE end, exclusive;
5559
5560 rb_scan_args(argc, argv, "11", &end, &exclusive);
5561 RETURN_ENUMERATOR(beg, argc, argv);
5562 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5563}
5564
5565VALUE
5566rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5567{
5568 VALUE current, after_end;
5569 ID succ;
5570 int n, ascii;
5571 rb_encoding *enc;
5572
5573 CONST_ID(succ, "succ");
5574 StringValue(end);
5575 enc = rb_enc_check(beg, end);
5576 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5577 /* single character */
5578 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5579 char c = RSTRING_PTR(beg)[0];
5580 char e = RSTRING_PTR(end)[0];
5581
5582 if (c > e || (excl && c == e)) return beg;
5583 for (;;) {
5584 VALUE str = rb_enc_str_new(&c, 1, enc);
5586 if ((*each)(str, arg)) break;
5587 if (!excl && c == e) break;
5588 c++;
5589 if (excl && c == e) break;
5590 }
5591 return beg;
5592 }
5593 /* both edges are all digits */
5594 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5595 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5596 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5597 VALUE b, e;
5598 int width;
5599
5600 width = RSTRING_LENINT(beg);
5601 b = rb_str_to_inum(beg, 10, FALSE);
5602 e = rb_str_to_inum(end, 10, FALSE);
5603 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5604 long bi = FIX2LONG(b);
5605 long ei = FIX2LONG(e);
5606 rb_encoding *usascii = rb_usascii_encoding();
5607
5608 while (bi <= ei) {
5609 if (excl && bi == ei) break;
5610 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5611 bi++;
5612 }
5613 }
5614 else {
5615 ID op = excl ? '<' : idLE;
5616 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5617
5618 args[0] = INT2FIX(width);
5619 while (rb_funcall(b, op, 1, e)) {
5620 args[1] = b;
5621 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5622 b = rb_funcallv(b, succ, 0, 0);
5623 }
5624 }
5625 return beg;
5626 }
5627 /* normal case */
5628 n = rb_str_cmp(beg, end);
5629 if (n > 0 || (excl && n == 0)) return beg;
5630
5631 after_end = rb_funcallv(end, succ, 0, 0);
5632 current = str_duplicate(rb_cString, beg);
5633 while (!rb_str_equal(current, after_end)) {
5634 VALUE next = Qnil;
5635 if (excl || !rb_str_equal(current, end))
5636 next = rb_funcallv(current, succ, 0, 0);
5637 if ((*each)(current, arg)) break;
5638 if (NIL_P(next)) break;
5639 current = next;
5640 StringValue(current);
5641 if (excl && rb_str_equal(current, end)) break;
5642 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5643 break;
5644 }
5645
5646 return beg;
5647}
5648
5649VALUE
5650rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5651{
5652 VALUE current;
5653 ID succ;
5654
5655 CONST_ID(succ, "succ");
5656 /* both edges are all digits */
5657 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5658 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5659 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5660 int width = RSTRING_LENINT(beg);
5661 b = rb_str_to_inum(beg, 10, FALSE);
5662 if (FIXNUM_P(b)) {
5663 long bi = FIX2LONG(b);
5664 rb_encoding *usascii = rb_usascii_encoding();
5665
5666 while (FIXABLE(bi)) {
5667 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5668 bi++;
5669 }
5670 b = LONG2NUM(bi);
5671 }
5672 args[0] = INT2FIX(width);
5673 while (1) {
5674 args[1] = b;
5675 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5676 b = rb_funcallv(b, succ, 0, 0);
5677 }
5678 }
5679 /* normal case */
5680 current = str_duplicate(rb_cString, beg);
5681 while (1) {
5682 VALUE next = rb_funcallv(current, succ, 0, 0);
5683 if ((*each)(current, arg)) break;
5684 current = next;
5685 StringValue(current);
5686 if (RSTRING_LEN(current) == 0)
5687 break;
5688 }
5689
5690 return beg;
5691}
5692
5693static int
5694include_range_i(VALUE str, VALUE arg)
5695{
5696 VALUE *argp = (VALUE *)arg;
5697 if (!rb_equal(str, *argp)) return 0;
5698 *argp = Qnil;
5699 return 1;
5700}
5701
5702VALUE
5703rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5704{
5705 beg = rb_str_new_frozen(beg);
5706 StringValue(end);
5707 end = rb_str_new_frozen(end);
5708 if (NIL_P(val)) return Qfalse;
5709 val = rb_check_string_type(val);
5710 if (NIL_P(val)) return Qfalse;
5711 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5712 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5713 rb_enc_asciicompat(STR_ENC_GET(val))) {
5714 const char *bp = RSTRING_PTR(beg);
5715 const char *ep = RSTRING_PTR(end);
5716 const char *vp = RSTRING_PTR(val);
5717 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5718 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5719 return Qfalse;
5720 else {
5721 char b = *bp;
5722 char e = *ep;
5723 char v = *vp;
5724
5725 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5726 if (b <= v && v < e) return Qtrue;
5727 return RBOOL(!RTEST(exclusive) && v == e);
5728 }
5729 }
5730 }
5731#if 0
5732 /* both edges are all digits */
5733 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5734 all_digits_p(bp, RSTRING_LEN(beg)) &&
5735 all_digits_p(ep, RSTRING_LEN(end))) {
5736 /* TODO */
5737 }
5738#endif
5739 }
5740 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5741
5742 return RBOOL(NIL_P(val));
5743}
5744
5745static VALUE
5746rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5747{
5748 if (rb_reg_search(re, str, 0, 0) >= 0) {
5749 VALUE match = rb_backref_get();
5750 int nth = rb_reg_backref_number(match, backref);
5751 return rb_reg_nth_match(nth, match);
5752 }
5753 return Qnil;
5754}
5755
5756static VALUE
5757rb_str_aref(VALUE str, VALUE indx)
5758{
5759 long idx;
5760
5761 if (FIXNUM_P(indx)) {
5762 idx = FIX2LONG(indx);
5763 }
5764 else if (RB_TYPE_P(indx, T_REGEXP)) {
5765 return rb_str_subpat(str, indx, INT2FIX(0));
5766 }
5767 else if (RB_TYPE_P(indx, T_STRING)) {
5768 if (rb_str_index(str, indx, 0) != -1)
5769 return str_duplicate(rb_cString, indx);
5770 return Qnil;
5771 }
5772 else {
5773 /* check if indx is Range */
5774 long beg, len = str_strlen(str, NULL);
5775 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5776 case Qfalse:
5777 break;
5778 case Qnil:
5779 return Qnil;
5780 default:
5781 return rb_str_substr(str, beg, len);
5782 }
5783 idx = NUM2LONG(indx);
5784 }
5785
5786 return str_substr(str, idx, 1, FALSE);
5787}
5788
5789
5790/*
5791 * call-seq:
5792 * self[index] -> new_string or nil
5793 * self[start, length] -> new_string or nil
5794 * self[range] -> new_string or nil
5795 * self[regexp, capture = 0] -> new_string or nil
5796 * self[substring] -> new_string or nil
5797 *
5798 * Returns the substring of +self+ specified by the arguments.
5799 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5800 *
5801 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5802 */
5803
5804static VALUE
5805rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5806{
5807 if (argc == 2) {
5808 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5809 return rb_str_subpat(str, argv[0], argv[1]);
5810 }
5811 else {
5812 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5813 }
5814 }
5815 rb_check_arity(argc, 1, 2);
5816 return rb_str_aref(str, argv[0]);
5817}
5818
5819VALUE
5821{
5822 char *ptr = RSTRING_PTR(str);
5823 long olen = RSTRING_LEN(str), nlen;
5824
5825 str_modifiable(str);
5826 if (len > olen) len = olen;
5827 nlen = olen - len;
5828 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5829 char *oldptr = ptr;
5830 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5831 STR_SET_EMBED(str);
5832 ptr = RSTRING(str)->as.embed.ary;
5833 memmove(ptr, oldptr + len, nlen);
5834 if (fl == STR_NOEMBED) xfree(oldptr);
5835 }
5836 else {
5837 if (!STR_SHARED_P(str)) {
5838 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5839 rb_enc_cr_str_exact_copy(shared, str);
5840 OBJ_FREEZE(shared);
5841 }
5842 ptr = RSTRING(str)->as.heap.ptr += len;
5843 }
5844 STR_SET_LEN(str, nlen);
5845
5846 if (!SHARABLE_MIDDLE_SUBSTRING) {
5847 TERM_FILL(ptr + nlen, TERM_LEN(str));
5848 }
5850 return str;
5851}
5852
5853static void
5854rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5855{
5856 char *sptr;
5857 long slen;
5858 int cr;
5859
5860 if (beg == 0 && vlen == 0) {
5861 rb_str_drop_bytes(str, len);
5862 return;
5863 }
5864
5865 str_modify_keep_cr(str);
5866 RSTRING_GETMEM(str, sptr, slen);
5867 if (len < vlen) {
5868 /* expand string */
5869 RESIZE_CAPA(str, slen + vlen - len);
5870 sptr = RSTRING_PTR(str);
5871 }
5872
5874 cr = rb_enc_str_coderange(val);
5875 else
5877
5878 if (vlen != len) {
5879 memmove(sptr + beg + vlen,
5880 sptr + beg + len,
5881 slen - (beg + len));
5882 }
5883 if (vlen < beg && len < 0) {
5884 MEMZERO(sptr + slen, char, -len);
5885 }
5886 if (vlen > 0) {
5887 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5888 }
5889 slen += vlen - len;
5890 STR_SET_LEN(str, slen);
5891 TERM_FILL(&sptr[slen], TERM_LEN(str));
5892 ENC_CODERANGE_SET(str, cr);
5893}
5894
5895static inline void
5896rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5897{
5898 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5899}
5900
5901void
5902rb_str_update(VALUE str, long beg, long len, VALUE val)
5903{
5904 long slen;
5905 char *p, *e;
5906 rb_encoding *enc;
5907 int singlebyte = single_byte_optimizable(str);
5908 int cr;
5909
5910 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5911
5912 StringValue(val);
5913 enc = rb_enc_check(str, val);
5914 slen = str_strlen(str, enc); /* rb_enc_check */
5915
5916 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5917 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5918 }
5919 if (beg < 0) {
5920 beg += slen;
5921 }
5922 RUBY_ASSERT(beg >= 0);
5923 RUBY_ASSERT(beg <= slen);
5924
5925 if (len > slen - beg) {
5926 len = slen - beg;
5927 }
5928 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5929 if (!p) p = RSTRING_END(str);
5930 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5931 if (!e) e = RSTRING_END(str);
5932 /* error check */
5933 beg = p - RSTRING_PTR(str); /* physical position */
5934 len = e - p; /* physical length */
5935 rb_str_update_0(str, beg, len, val);
5936 rb_enc_associate(str, enc);
5938 if (cr != ENC_CODERANGE_BROKEN)
5939 ENC_CODERANGE_SET(str, cr);
5940}
5941
5942static void
5943rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5944{
5945 int nth;
5946 VALUE match;
5947 long start, end, len;
5948 rb_encoding *enc;
5949 struct re_registers *regs;
5950
5951 if (rb_reg_search(re, str, 0, 0) < 0) {
5952 rb_raise(rb_eIndexError, "regexp not matched");
5953 }
5954 match = rb_backref_get();
5955 nth = rb_reg_backref_number(match, backref);
5956 regs = RMATCH_REGS(match);
5957 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5958 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5959 }
5960 if (nth < 0) {
5961 nth += regs->num_regs;
5962 }
5963
5964 start = BEG(nth);
5965 if (start == -1) {
5966 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5967 }
5968 end = END(nth);
5969 len = end - start;
5970 StringValue(val);
5971 enc = rb_enc_check_str(str, val);
5972 rb_str_update_0(str, start, len, val);
5973 rb_enc_associate(str, enc);
5974}
5975
5976static VALUE
5977rb_str_aset(VALUE str, VALUE indx, VALUE val)
5978{
5979 long idx, beg;
5980
5981 switch (TYPE(indx)) {
5982 case T_REGEXP:
5983 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5984 return val;
5985
5986 case T_STRING:
5987 beg = rb_str_index(str, indx, 0);
5988 if (beg < 0) {
5989 rb_raise(rb_eIndexError, "string not matched");
5990 }
5991 beg = rb_str_sublen(str, beg);
5992 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5993 return val;
5994
5995 default:
5996 /* check if indx is Range */
5997 {
5998 long beg, len;
5999 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6000 rb_str_update(str, beg, len, val);
6001 return val;
6002 }
6003 }
6004 /* FALLTHROUGH */
6005
6006 case T_FIXNUM:
6007 idx = NUM2LONG(indx);
6008 rb_str_update(str, idx, 1, val);
6009 return val;
6010 }
6011}
6012
6013/*
6014 * call-seq:
6015 * self[index] = new_string
6016 * self[start, length] = new_string
6017 * self[range] = new_string
6018 * self[regexp, capture = 0] = new_string
6019 * self[substring] = new_string
6020 *
6021 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6022 * See {String Slices}[rdoc-ref:String@String+Slices].
6023 *
6024 * A few examples:
6025 *
6026 * s = 'foo'
6027 * s[2] = 'rtune' # => "rtune"
6028 * s # => "fortune"
6029 * s[1, 5] = 'init' # => "init"
6030 * s # => "finite"
6031 * s[3..4] = 'al' # => "al"
6032 * s # => "finale"
6033 * s[/e$/] = 'ly' # => "ly"
6034 * s # => "finally"
6035 * s['lly'] = 'ncial' # => "ncial"
6036 * s # => "financial"
6037 *
6038 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6039 */
6040
6041static VALUE
6042rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6043{
6044 if (argc == 3) {
6045 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6046 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6047 }
6048 else {
6049 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6050 }
6051 return argv[2];
6052 }
6053 rb_check_arity(argc, 2, 3);
6054 return rb_str_aset(str, argv[0], argv[1]);
6055}
6056
6057/*
6058 * call-seq:
6059 * insert(offset, other_string) -> self
6060 *
6061 * :include: doc/string/insert.rdoc
6062 *
6063 */
6064
6065static VALUE
6066rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6067{
6068 long pos = NUM2LONG(idx);
6069
6070 if (pos == -1) {
6071 return rb_str_append(str, str2);
6072 }
6073 else if (pos < 0) {
6074 pos++;
6075 }
6076 rb_str_update(str, pos, 0, str2);
6077 return str;
6078}
6079
6080
6081/*
6082 * call-seq:
6083 * slice!(index) -> new_string or nil
6084 * slice!(start, length) -> new_string or nil
6085 * slice!(range) -> new_string or nil
6086 * slice!(regexp, capture = 0) -> new_string or nil
6087 * slice!(substring) -> new_string or nil
6088 *
6089 * Removes and returns the substring of +self+ specified by the arguments.
6090 * See {String Slices}[rdoc-ref:String@String+Slices].
6091 *
6092 * A few examples:
6093 *
6094 * string = "This is a string"
6095 * string.slice!(2) #=> "i"
6096 * string.slice!(3..6) #=> " is "
6097 * string.slice!(/s.*t/) #=> "sa st"
6098 * string.slice!("r") #=> "r"
6099 * string #=> "Thing"
6100 *
6101 */
6102
/*
 * Implementation of String#slice!: removes the selected part of str in
 * place and returns it as a new string, or returns nil when the selector
 * picks nothing.
 *
 * argv[0] may be a Regexp (argv[1], when given, selects the capture group),
 * a String, a Fixnum, a Range, or another value converted with NUM2LONG.
 * Two non-Regexp arguments are taken as start and length.
 *
 * The branches converge on three labels:
 *   num_index - beg/len still need rb_str_subpos() (presumably a
 *               character-to-byte conversion; confirm against its definition)
 *   subseq    - beg/len are byte based; copy that span into result
 *   squash    - result is ready; delete len bytes at byte offset beg
 */
6103static VALUE
6104rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6105{
6106 VALUE result = Qnil;
6107 VALUE indx;
6108 long beg, len = 1;
6109 char *p;
6110
6111 rb_check_arity(argc, 1, 2);
6112 str_modify_keep_cr(str);
6113 indx = argv[0];
6114 if (RB_TYPE_P(indx, T_REGEXP)) {
6115 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6116 VALUE match = rb_backref_get();
6117 struct re_registers *regs = RMATCH_REGS(match);
6118 int nth = 0;
 /* a negative group number counts back from num_regs; out-of-range groups yield nil */
6119 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6120 if ((nth += regs->num_regs) <= 0) return Qnil;
6121 }
6122 else if (nth >= regs->num_regs) return Qnil;
6123 beg = BEG(nth);
6124 len = END(nth) - beg;
6125 goto subseq;
6126 }
6127 else if (argc == 2) {
6128 beg = NUM2LONG(indx);
6129 len = NUM2LONG(argv[1]);
6130 goto num_index;
6131 }
6132 else if (FIXNUM_P(indx)) {
 /* single index: len is still its initial value of 1 */
6133 beg = FIX2LONG(indx);
6134 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6135 if (!len) return Qnil;
6136 beg = p - RSTRING_PTR(str);
6137 goto subseq;
6138 }
6139 else if (RB_TYPE_P(indx, T_STRING)) {
 /* the removed text equals indx, so the result is a duplicate of indx */
6140 beg = rb_str_index(str, indx, 0);
6141 if (beg == -1) return Qnil;
6142 len = RSTRING_LEN(indx);
6143 result = str_duplicate(rb_cString, indx);
6144 goto squash;
6145 }
6146 else {
6147 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6148 case Qnil:
6149 return Qnil;
6150 case Qfalse:
 /* not a Range: handle like the Fixnum branch after NUM2LONG conversion */
6151 beg = NUM2LONG(indx);
6152 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6153 if (!len) return Qnil;
6154 beg = p - RSTRING_PTR(str);
6155 goto subseq;
6156 default:
6157 goto num_index;
6158 }
6159 }
6160
6161 num_index:
6162 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6163 beg = p - RSTRING_PTR(str);
6164
6165 subseq:
6166 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6167 rb_enc_cr_str_copy_for_substr(result, str);
6168
6169 squash:
6170 if (len > 0) {
6171 if (beg == 0) {
6172 rb_str_drop_bytes(str, len);
6173 }
6174 else {
 /* close the gap by moving the tail down, then shrink and re-terminate */
6175 char *sptr = RSTRING_PTR(str);
6176 long slen = RSTRING_LEN(str);
6177 if (beg + len > slen) /* pathological check */
6178 len = slen - beg;
6179 memmove(sptr + beg,
6180 sptr + beg + len,
6181 slen - (beg + len));
6182 slen -= len;
6183 STR_SET_LEN(str, slen);
6184 TERM_FILL(&sptr[slen], TERM_LEN(str));
6185 }
6186 }
6187 return result;
6188}
6189
6190static VALUE
6191get_pat(VALUE pat)
6192{
6193 VALUE val;
6194
6195 switch (OBJ_BUILTIN_TYPE(pat)) {
6196 case T_REGEXP:
6197 return pat;
6198
6199 case T_STRING:
6200 break;
6201
6202 default:
6203 val = rb_check_string_type(pat);
6204 if (NIL_P(val)) {
6205 Check_Type(pat, T_REGEXP);
6206 }
6207 pat = val;
6208 }
6209
6210 return rb_reg_regcomp(pat);
6211}
6212
6213static VALUE
6214get_pat_quoted(VALUE pat, int check)
6215{
6216 VALUE val;
6217
6218 switch (OBJ_BUILTIN_TYPE(pat)) {
6219 case T_REGEXP:
6220 return pat;
6221
6222 case T_STRING:
6223 break;
6224
6225 default:
6226 val = rb_check_string_type(pat);
6227 if (NIL_P(val)) {
6228 Check_Type(pat, T_REGEXP);
6229 }
6230 pat = val;
6231 }
6232 if (check && is_broken_string(pat)) {
6233 rb_exc_raise(rb_reg_check_preprocess(pat));
6234 }
6235 return pat;
6236}
6237
6238static long
6239rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6240{
6241 if (BUILTIN_TYPE(pat) == T_STRING) {
6242 pos = rb_str_byteindex(str, pat, pos);
6243 if (set_backref_str) {
6244 if (pos >= 0) {
6245 str = rb_str_new_frozen_String(str);
6246 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6247 if (match) {
6248 *match = match_data;
6249 }
6250 }
6251 else {
6253 }
6254 }
6255 return pos;
6256 }
6257 else {
6258 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6259 }
6260}
6261
6262static long
6263rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6264{
6265 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6266}
6267
6268
6269/*
6270 * call-seq:
6271 * sub!(pattern, replacement) -> self or nil
6272 * sub!(pattern) {|match| ... } -> self or nil
6273 *
6274 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6275 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6276 *
6277 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6278 *
6279 * Related: String#sub, String#gsub, String#gsub!.
6280 *
6281 */
6282
6283static VALUE
6284rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6285{
6286 VALUE pat, repl, hash = Qnil;
6287 int iter = 0;
6288 long plen;
6289 int min_arity = rb_block_given_p() ? 1 : 2;
6290 long beg;
6291
6292 rb_check_arity(argc, min_arity, 2);
6293 if (argc == 1) {
6294 iter = 1;
6295 }
6296 else {
6297 repl = argv[1];
6298 hash = rb_check_hash_type(argv[1]);
6299 if (NIL_P(hash)) {
6300 StringValue(repl);
6301 }
6302 }
6303
6304 pat = get_pat_quoted(argv[0], 1);
6305
6306 str_modifiable(str);
6307 beg = rb_pat_search(pat, str, 0, 1);
6308 if (beg >= 0) {
6309 rb_encoding *enc;
6310 int cr = ENC_CODERANGE(str);
6311 long beg0, end0;
6312 VALUE match, match0 = Qnil;
6313 struct re_registers *regs;
6314 char *p, *rp;
6315 long len, rlen;
6316
6317 match = rb_backref_get();
6318 regs = RMATCH_REGS(match);
6319 if (RB_TYPE_P(pat, T_STRING)) {
6320 beg0 = beg;
6321 end0 = beg0 + RSTRING_LEN(pat);
6322 match0 = pat;
6323 }
6324 else {
6325 beg0 = BEG(0);
6326 end0 = END(0);
6327 if (iter) match0 = rb_reg_nth_match(0, match);
6328 }
6329
6330 if (iter || !NIL_P(hash)) {
6331 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6332
6333 if (iter) {
6334 repl = rb_obj_as_string(rb_yield(match0));
6335 }
6336 else {
6337 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6338 repl = rb_obj_as_string(repl);
6339 }
6340 str_mod_check(str, p, len);
6341 rb_check_frozen(str);
6342 }
6343 else {
6344 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6345 }
6346
6347 enc = rb_enc_compatible(str, repl);
6348 if (!enc) {
6349 rb_encoding *str_enc = STR_ENC_GET(str);
6350 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6351 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6352 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6353 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6354 rb_enc_inspect_name(str_enc),
6355 rb_enc_inspect_name(STR_ENC_GET(repl)));
6356 }
6357 enc = STR_ENC_GET(repl);
6358 }
6359 rb_str_modify(str);
6360 rb_enc_associate(str, enc);
6362 int cr2 = ENC_CODERANGE(repl);
6363 if (cr2 == ENC_CODERANGE_BROKEN ||
6364 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6366 else
6367 cr = cr2;
6368 }
6369 plen = end0 - beg0;
6370 rlen = RSTRING_LEN(repl);
6371 len = RSTRING_LEN(str);
6372 if (rlen > plen) {
6373 RESIZE_CAPA(str, len + rlen - plen);
6374 }
6375 p = RSTRING_PTR(str);
6376 if (rlen != plen) {
6377 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6378 }
6379 rp = RSTRING_PTR(repl);
6380 memmove(p + beg0, rp, rlen);
6381 len += rlen - plen;
6382 STR_SET_LEN(str, len);
6383 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6384 ENC_CODERANGE_SET(str, cr);
6385
6386 RB_GC_GUARD(match);
6387
6388 return str;
6389 }
6390 return Qnil;
6391}
6392
6393
6394/*
6395 * call-seq:
6396 * sub(pattern, replacement) -> new_string
6397 * sub(pattern) {|match| ... } -> new_string
6398 *
6399 * Returns a copy of +self+ with only the first occurrence
6400 * (not all occurrences) of the given +pattern+ replaced.
6401 *
6402 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6403 *
6404 * Related: String#sub!, String#gsub, String#gsub!.
6405 *
6406 */
6407
6408static VALUE
6409rb_str_sub(int argc, VALUE *argv, VALUE str)
6410{
6411 str = str_duplicate(rb_cString, str);
6412 rb_str_sub_bang(argc, argv, str);
6413 return str;
6414}
6415
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * With one argument the replacement for each match comes from the block
 * (RETURN_ENUMERATOR handles the enumerator case).  With two arguments the
 * second is either a Hash, looked up once per match, or a replacement
 * String.  Any other argc is rejected through rb_error_arity().
 *
 * When nothing matches, returns nil (bang) or a duplicate of str.
 * Otherwise the result is assembled in a fresh buffer, which is either
 * installed into str (bang) or returned directly.
 */
6416static VALUE
6417str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6418{
6419 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6420 long beg, beg0, end0;
6421 long offset, blen, slen, len, last;
6422 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6423 char *sp, *cp;
 /* -1: not decided yet; settled after the first rb_reg_regsub() call below */
6424 int need_backref_str = -1;
6425 rb_encoding *str_enc;
6426
6427 switch (argc) {
6428 case 1:
6429 RETURN_ENUMERATOR(str, argc, argv);
6430 mode = ITER;
6431 break;
6432 case 2:
6433 repl = argv[1];
6434 hash = rb_check_hash_type(argv[1]);
6435 if (NIL_P(hash)) {
6436 StringValue(repl);
6437 }
6438 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6439 mode = FAST_MAP;
6440 }
6441 else {
6442 mode = MAP;
6443 }
6444 break;
6445 default:
6446 rb_error_arity(argc, 1, 2);
6447 }
6448
6449 pat = get_pat_quoted(argv[0], 1);
6450 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6451
6452 if (beg < 0) {
6453 if (bang) return Qnil; /* no match, no substitution */
6454 return str_duplicate(rb_cString, str);
6455 }
6456
6457 offset = 0;
6458 blen = RSTRING_LEN(str) + 30; /* len + margin */
6459 dest = rb_str_buf_new(blen);
 /* sp/slen snapshot the buffer so str_mod_check() can detect mutation by user code */
6460 sp = RSTRING_PTR(str);
6461 slen = RSTRING_LEN(str);
6462 cp = sp;
6463 str_enc = STR_ENC_GET(str);
6464 rb_enc_associate(dest, str_enc);
6465 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6466
6467 do {
6468 struct re_registers *regs = RMATCH_REGS(match);
6469 if (RB_TYPE_P(pat, T_STRING)) {
6470 beg0 = beg;
6471 end0 = beg0 + RSTRING_LEN(pat);
6472 match0 = pat;
6473 }
6474 else {
6475 beg0 = BEG(0);
6476 end0 = END(0);
6477 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6478 }
6479
6480 if (mode != STR) {
6481 if (mode == ITER) {
6482 val = rb_obj_as_string(rb_yield(match0));
6483 }
6484 else {
6485 struct RString fake_str;
6486 VALUE key;
6487 if (mode == FAST_MAP) {
6488 // It is safe to use a fake_str here because we established that it won't escape,
6489 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6490 // default proc.
6491 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6492 }
6493 else {
6494 key = rb_str_subseq(str, beg0, end0 - beg0);
6495 }
6496 val = rb_hash_aref(hash, key);
6497 val = rb_obj_as_string(val);
6498 }
6499 str_mod_check(str, sp, slen);
6500 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6501 rb_raise(rb_eRuntimeError, "block should not cheat");
6502 }
6503 }
6504 else if (need_backref_str) {
6505 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
 /* first pass only: keep substituting on later matches only if regsub changed repl */
6506 if (need_backref_str < 0) {
6507 need_backref_str = val != repl;
6508 }
6509 }
6510 else {
6511 val = repl;
6512 }
6513
6514 len = beg0 - offset; /* copy pre-match substr */
6515 if (len) {
6516 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6517 }
6518
6519 rb_str_buf_append(dest, val);
6520
 /* last keeps the start offset of the search that produced this match; it is reused after the loop */
6521 last = offset;
6522 offset = end0;
6523 if (beg0 == end0) {
6524 /*
6525 * Always consume at least one character of the input string
6526 * in order to prevent infinite loops.
6527 */
6528 if (RSTRING_LEN(str) <= end0) break;
6529 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6530 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6531 offset = end0 + len;
6532 }
6533 cp = RSTRING_PTR(str) + offset;
6534 if (offset > RSTRING_LEN(str)) break;
6535
6536 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6537 if (mode != FAST_MAP && mode != STR) {
6538 match = Qnil;
6539 }
6540 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6541
6542 RB_GC_GUARD(match);
6543 } while (beg >= 0);
6544
 /* copy whatever follows the final match */
6545 if (RSTRING_LEN(str) > offset) {
6546 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6547 }
 /* search once more from last with backref setting forced on */
6548 rb_pat_search0(pat, str, last, 1, &match);
6549 if (bang) {
6550 str_shared_replace(str, dest);
6551 }
6552 else {
6553 str = dest;
6554 }
6555
6556 return str;
6557}
6558
6559
6560/*
6561 * call-seq:
6562 * gsub!(pattern, replacement) -> self or nil
6563 * gsub!(pattern) {|match| ... } -> self or nil
6564 * gsub!(pattern) -> an_enumerator
6565 *
6566 * Like String#gsub, except that:
6567 *
6568 * - Performs substitutions in +self+ (not in a copy of +self+).
6569 * - Returns +self+ if any substitutions are performed, +nil+ otherwise.
6570 *
6571 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6572 */
6573
6574static VALUE
6575rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6576{
6577 str_modify_keep_cr(str);
6578 return str_gsub(argc, argv, str, 1);
6579}
6580
6581
6582/*
6583 * call-seq:
6584 * gsub(pattern, replacement) -> new_string
6585 * gsub(pattern) {|match| ... } -> new_string
6586 * gsub(pattern) -> enumerator
6587 *
6588 * Returns a copy of +self+ with zero or more substrings replaced.
6589 *
6590 * Argument +pattern+ may be a string or a Regexp;
6591 * argument +replacement+ may be a string or a Hash.
6592 * Varying types for the argument values makes this method very versatile.
6593 *
6594 * Below are some simple examples;
6595 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6596 *
6597 * With arguments +pattern+ and string +replacement+ given,
6598 * replaces each matching substring with the given +replacement+ string:
6599 *
6600 * s = 'abracadabra'
6601 * s.gsub('ab', 'AB') # => "ABracadABra"
6602 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6603 *
6604 * With arguments +pattern+ and hash +replacement+ given,
6605 * replaces each matching substring with a value from the given +replacement+ hash,
6606 * or removes it:
6607 *
6608 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6609 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6610 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6611 *
6612 * With argument +pattern+ and a block given,
6613 * calls the block with each matching substring;
6614 * replaces that substring with the block's return value:
6615 *
6616 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6617 * # => "ABrACADABrA"
6618 *
6619 * With argument +pattern+ and no block given,
6620 * returns a new Enumerator.
6621 *
6622 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6623 */
6624
6625static VALUE
6626rb_str_gsub(int argc, VALUE *argv, VALUE str)
6627{
6628 return str_gsub(argc, argv, str, 0);
6629}
6630
6631
6632/*
6633 * call-seq:
6634 * replace(other_string) -> self
6635 *
6636 * Replaces the contents of +self+ with the contents of +other_string+:
6637 *
6638 * s = 'foo' # => "foo"
6639 * s.replace('bar') # => "bar"
6640 *
6641 */
6642
6643VALUE
6645{
6646 str_modifiable(str);
6647 if (str == str2) return str;
6648
6649 StringValue(str2);
6650 str_discard(str);
6651 return str_replace(str, str2);
6652}
6653
6654/*
6655 * call-seq:
6656 * clear -> self
6657 *
6658 * Removes the contents of +self+:
6659 *
6660 * s = 'foo'
6661 * s.clear # => ""
6662 * s # => ""
6663 *
6664 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6665 */
6666
6667static VALUE
6668rb_str_clear(VALUE str)
6669{
6670 str_discard(str);
6671 STR_SET_EMBED(str);
6672 STR_SET_LEN(str, 0);
6673 RSTRING_PTR(str)[0] = 0;
6674 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6676 else
6678 return str;
6679}
6680
6681/*
6682 * call-seq:
6683 * chr -> string
6684 *
6685 * :include: doc/string/chr.rdoc
6686 *
6687 */
6688
6689static VALUE
6690rb_str_chr(VALUE str)
6691{
6692 return rb_str_substr(str, 0, 1);
6693}
6694
6695/*
6696 * call-seq:
6697 * getbyte(index) -> integer or nil
6698 *
6699 * :include: doc/string/getbyte.rdoc
6700 *
6701 */
6702VALUE
6703rb_str_getbyte(VALUE str, VALUE index)
6704{
6705 long pos = NUM2LONG(index);
6706
6707 if (pos < 0)
6708 pos += RSTRING_LEN(str);
6709 if (pos < 0 || RSTRING_LEN(str) <= pos)
6710 return Qnil;
6711
6712 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6713}
6714
6715/*
6716 * call-seq:
6717 * setbyte(index, integer) -> integer
6718 *
6719 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6720 *
6721 * s = 'abcde' # => "abcde"
6722 * s.setbyte(0, 98) # => 98
6723 * s # => "bbcde"
6724 *
6725 * Related: String#getbyte.
6726 */
6727VALUE
6728rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6729{
6730 long pos = NUM2LONG(index);
6731 long len = RSTRING_LEN(str);
6732 char *ptr, *head, *left = 0;
6733 rb_encoding *enc;
6734 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6735
6736 if (pos < -len || len <= pos)
6737 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6738 if (pos < 0)
6739 pos += len;
6740
6741 VALUE v = rb_to_int(value);
6742 VALUE w = rb_int_and(v, INT2FIX(0xff));
6743 char byte = (char)(NUM2INT(w) & 0xFF);
6744
6745 if (!str_independent(str))
6746 str_make_independent(str);
6747 enc = STR_ENC_GET(str);
6748 head = RSTRING_PTR(str);
6749 ptr = &head[pos];
6750 if (!STR_EMBED_P(str)) {
6751 cr = ENC_CODERANGE(str);
6752 switch (cr) {
6753 case ENC_CODERANGE_7BIT:
6754 left = ptr;
6755 *ptr = byte;
6756 if (ISASCII(byte)) goto end;
6757 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6758 if (!MBCLEN_CHARFOUND_P(nlen))
6760 else
6762 goto end;
6764 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6765 width = rb_enc_precise_mbclen(left, head+len, enc);
6766 *ptr = byte;
6767 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6768 if (!MBCLEN_CHARFOUND_P(nlen))
6770 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6772 goto end;
6773 }
6774 }
6776 *ptr = byte;
6777
6778 end:
6779 return value;
6780}
6781
6782static VALUE
6783str_byte_substr(VALUE str, long beg, long len, int empty)
6784{
6785 long n = RSTRING_LEN(str);
6786
6787 if (beg > n || len < 0) return Qnil;
6788 if (beg < 0) {
6789 beg += n;
6790 if (beg < 0) return Qnil;
6791 }
6792 if (len > n - beg)
6793 len = n - beg;
6794 if (len <= 0) {
6795 if (!empty) return Qnil;
6796 len = 0;
6797 }
6798
6799 VALUE str2 = str_subseq(str, beg, len);
6800
6801 str_enc_copy_direct(str2, str);
6802
6803 if (RSTRING_LEN(str2) == 0) {
6804 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6806 else
6808 }
6809 else {
6810 switch (ENC_CODERANGE(str)) {
6811 case ENC_CODERANGE_7BIT:
6813 break;
6814 default:
6816 break;
6817 }
6818 }
6819
6820 return str2;
6821}
6822
6823VALUE
6824rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6825{
6826 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6827}
6828
6829static VALUE
6830str_byte_aref(VALUE str, VALUE indx)
6831{
6832 long idx;
6833 if (FIXNUM_P(indx)) {
6834 idx = FIX2LONG(indx);
6835 }
6836 else {
6837 /* check if indx is Range */
6838 long beg, len = RSTRING_LEN(str);
6839
6840 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6841 case Qfalse:
6842 break;
6843 case Qnil:
6844 return Qnil;
6845 default:
6846 return str_byte_substr(str, beg, len, TRUE);
6847 }
6848
6849 idx = NUM2LONG(indx);
6850 }
6851 return str_byte_substr(str, idx, 1, FALSE);
6852}
6853
6854/*
6855 * call-seq:
6856 * byteslice(offset, length = 1) -> string or nil
6857 * byteslice(range) -> string or nil
6858 *
6859 * :include: doc/string/byteslice.rdoc
6860 */
6861
6862static VALUE
6863rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6864{
6865 if (argc == 2) {
6866 long beg = NUM2LONG(argv[0]);
6867 long len = NUM2LONG(argv[1]);
6868 return str_byte_substr(str, beg, len, TRUE);
6869 }
6870 rb_check_arity(argc, 1, 2);
6871 return str_byte_aref(str, argv[0]);
6872}
6873
6874static void
6875str_check_beg_len(VALUE str, long *beg, long *len)
6876{
6877 long end, slen = RSTRING_LEN(str);
6878
6879 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6880 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6881 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6882 }
6883 if (*beg < 0) {
6884 *beg += slen;
6885 }
6886 RUBY_ASSERT(*beg >= 0);
6887 RUBY_ASSERT(*beg <= slen);
6888
6889 if (*len > slen - *beg) {
6890 *len = slen - *beg;
6891 }
6892 end = *beg + *len;
6893 str_ensure_byte_pos(str, *beg);
6894 str_ensure_byte_pos(str, end);
6895}
6896
6897/*
6898 * call-seq:
6899 * bytesplice(offset, length, str) -> self
6900 * bytesplice(offset, length, str, str_offset, str_length) -> self
6901 * bytesplice(range, str) -> self
6902 * bytesplice(range, str, str_range) -> self
6903 *
6904 * :include: doc/string/bytesplice.rdoc
6905 */
6906
6907static VALUE
6908rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6909{
6910 long beg, len, vbeg, vlen;
6911 VALUE val;
6912 int cr;
6913
6914 rb_check_arity(argc, 2, 5);
6915 if (!(argc == 2 || argc == 3 || argc == 5)) {
6916 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6917 }
6918 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6919 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6920 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6921 rb_builtin_class_name(argv[0]));
6922 }
6923 val = argv[1];
6924 StringValue(val);
6925 if (argc == 2) {
6926 /* bytesplice(range, str) */
6927 vbeg = 0;
6928 vlen = RSTRING_LEN(val);
6929 }
6930 else {
6931 /* bytesplice(range, str, str_range) */
6932 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6933 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6934 rb_builtin_class_name(argv[2]));
6935 }
6936 }
6937 }
6938 else {
6939 beg = NUM2LONG(argv[0]);
6940 len = NUM2LONG(argv[1]);
6941 val = argv[2];
6942 StringValue(val);
6943 if (argc == 3) {
6944 /* bytesplice(index, length, str) */
6945 vbeg = 0;
6946 vlen = RSTRING_LEN(val);
6947 }
6948 else {
6949 /* bytesplice(index, length, str, str_index, str_length) */
6950 vbeg = NUM2LONG(argv[3]);
6951 vlen = NUM2LONG(argv[4]);
6952 }
6953 }
6954 str_check_beg_len(str, &beg, &len);
6955 str_check_beg_len(val, &vbeg, &vlen);
6956 str_modify_keep_cr(str);
6957
6958 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6959 rb_enc_associate(str, rb_enc_check(str, val));
6960 }
6961
6962 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6964 if (cr != ENC_CODERANGE_BROKEN)
6965 ENC_CODERANGE_SET(str, cr);
6966 return str;
6967}
6968
6969/*
6970 * call-seq:
6971 * reverse -> string
6972 *
6973 * Returns a new string with the characters from +self+ in reverse order.
6974 *
6975 * 'stressed'.reverse # => "desserts"
6976 *
6977 */
6978
6979static VALUE
6980rb_str_reverse(VALUE str)
6981{
6982 rb_encoding *enc;
6983 VALUE rev;
6984 char *s, *e, *p;
6985 int cr;
6986
6987 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6988 enc = STR_ENC_GET(str);
6989 rev = rb_str_new(0, RSTRING_LEN(str));
6990 s = RSTRING_PTR(str); e = RSTRING_END(str);
6991 p = RSTRING_END(rev);
6992 cr = ENC_CODERANGE(str);
6993
6994 if (RSTRING_LEN(str) > 1) {
6995 if (single_byte_optimizable(str)) {
6996 while (s < e) {
6997 *--p = *s++;
6998 }
6999 }
7000 else if (cr == ENC_CODERANGE_VALID) {
7001 while (s < e) {
7002 int clen = rb_enc_fast_mbclen(s, e, enc);
7003
7004 p -= clen;
7005 memcpy(p, s, clen);
7006 s += clen;
7007 }
7008 }
7009 else {
7010 cr = rb_enc_asciicompat(enc) ?
7012 while (s < e) {
7013 int clen = rb_enc_mbclen(s, e, enc);
7014
7015 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7016 p -= clen;
7017 memcpy(p, s, clen);
7018 s += clen;
7019 }
7020 }
7021 }
7022 STR_SET_LEN(rev, RSTRING_LEN(str));
7023 str_enc_copy_direct(rev, str);
7024 ENC_CODERANGE_SET(rev, cr);
7025
7026 return rev;
7027}
7028
7029
7030/*
7031 * call-seq:
7032 * reverse! -> self
7033 *
7034 * Returns +self+ with its characters reversed:
7035 *
7036 * s = 'stressed'
7037 * s.reverse! # => "desserts"
7038 * s # => "desserts"
7039 *
7040 */
7041
7042static VALUE
7043rb_str_reverse_bang(VALUE str)
7044{
7045 if (RSTRING_LEN(str) > 1) {
7046 if (single_byte_optimizable(str)) {
7047 char *s, *e, c;
7048
7049 str_modify_keep_cr(str);
7050 s = RSTRING_PTR(str);
7051 e = RSTRING_END(str) - 1;
7052 while (s < e) {
7053 c = *s;
7054 *s++ = *e;
7055 *e-- = c;
7056 }
7057 }
7058 else {
7059 str_shared_replace(str, rb_str_reverse(str));
7060 }
7061 }
7062 else {
7063 str_modify_keep_cr(str);
7064 }
7065 return str;
7066}
7067
7068
7069/*
7070 * call-seq:
7071 * include?(other_string) -> true or false
7072 *
7073 * Returns whether +self+ contains +other_string+:
7074 *
7075 * s = 'bar'
7076 * s.include?('ba') # => true
7077 * s.include?('ar') # => true
7078 * s.include?('bar') # => true
7079 * s.include?('a') # => true
7080 * s.include?('') # => true
7081 * s.include?('foo') # => false
7082 *
7083 * Related: see {Querying}[rdoc-ref:String@Querying].
7084 */
7085
7086VALUE
7087rb_str_include(VALUE str, VALUE arg)
7088{
7089 long i;
7090
7091 StringValue(arg);
7092 i = rb_str_index(str, arg, 0);
7093
7094 return RBOOL(i != -1);
7095}
7096
7097
7098/*
7099 * call-seq:
7100 * to_i(base = 10) -> integer
7101 *
7102 * Returns the result of interpreting leading characters in +self+
7103 * as an integer in the given +base+ (which must be in (0, 2..36)):
7104 *
7105 * '123456'.to_i # => 123456
7106 * '123def'.to_i(16) # => 1195503
7107 *
7108 * With +base+ zero, +self+ may contain leading characters
7109 * to specify the actual base:
7110 *
7111 * '123def'.to_i(0) # => 123
7112 * '0123def'.to_i(0) # => 83
7113 * '0b123def'.to_i(0) # => 1
7114 * '0o123def'.to_i(0) # => 83
7115 * '0d123def'.to_i(0) # => 123
7116 * '0x123def'.to_i(0) # => 1195503
7117 *
7118 * Characters past a leading valid number (in the given +base+) are ignored:
7119 *
7120 * '12.345'.to_i # => 12
7121 * '12345'.to_i(2) # => 1
7122 *
7123 * Returns zero if there is no leading valid number:
7124 *
7125 * 'abcdef'.to_i # => 0
7126 * '2'.to_i(2) # => 0
7127 *
7128 */
7129
7130static VALUE
7131rb_str_to_i(int argc, VALUE *argv, VALUE str)
7132{
7133 int base = 10;
7134
7135 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7136 rb_raise(rb_eArgError, "invalid radix %d", base);
7137 }
7138 return rb_str_to_inum(str, base, FALSE);
7139}
7140
7141
7142/*
7143 * call-seq:
7144 * to_f -> float
7145 *
7146 * Returns the result of interpreting leading characters in +self+ as a Float:
7147 *
7148 * '3.14159'.to_f # => 3.14159
7149 * '1.234e-2'.to_f # => 0.01234
7150 *
7151 * Characters past a leading valid number are ignored:
7152 *
7153 * '3.14 (pi to two places)'.to_f # => 3.14
7154 *
7155 * Returns zero if there is no leading valid number:
7156 *
7157 * 'abcdef'.to_f # => 0.0
7158 *
7159 */
7160
7161static VALUE
7162rb_str_to_f(VALUE str)
7163{
7164 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7165}
7166
7167
7168/*
7169 * call-seq:
7170 * to_s -> self or string
7171 *
7172 * Returns +self+ if +self+ is a +String+,
7173 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7174 */
7175
7176static VALUE
7177rb_str_to_s(VALUE str)
7178{
7179 if (rb_obj_class(str) != rb_cString) {
7180 return str_duplicate(rb_cString, str);
7181 }
7182 return str;
7183}
7184
/*
 * Compiled out by the surrounding #if 0.
 * As written, it would encode code point c in enc into a stack buffer with
 * rb_enc_mbcput() and append the rb_enc_codelen(c, enc) bytes to str.
 */
7185#if 0
7186static void
7187str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
7188{
7189 char s[RUBY_MAX_CHAR_LEN];
7190 int n = rb_enc_codelen(c, enc);
7191
7192 rb_enc_mbcput(c, s, enc);
7193 rb_enc_str_buf_cat(str, s, n, enc);
7194}
7195#endif
7196
/*
 * Buffer size for one escaped character.  The longest form, "\x{FFFFFFFF}",
 * is 12 characters plus the terminating NUL.
 */
7197#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7198
7199int
7200rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7201{
7202 char buf[CHAR_ESC_LEN + 1];
7203 int l;
7204
7205#if SIZEOF_INT > 4
7206 c &= 0xffffffff;
7207#endif
7208 if (unicode_p) {
7209 if (c < 0x7F && ISPRINT(c)) {
7210 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7211 }
7212 else if (c < 0x10000) {
7213 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7214 }
7215 else {
7216 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7217 }
7218 }
7219 else {
7220 if (c < 0x100) {
7221 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7222 }
7223 else {
7224 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7225 }
7226 }
7227 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7228 rb_str_buf_cat(result, buf, l);
7229 return l;
7230}
7231
/*
 * Returns the backslash escape sequence used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        { '\0',   "\\0"  },
        { '\n',   "\\n"  },
        { '\r',   "\\r"  },
        { '\t',   "\\t"  },
        { '\f',   "\\f"  },
        { '\013', "\\v"  },
        { '\010', "\\b"  },
        { '\007', "\\a"  },
        { '\033', "\\e"  },
        { '\x7f', "\\c?" },
    };

    for (size_t i = 0; i < sizeof escapes / sizeof escapes[0]; i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7249
/*
 * Returns a new string holding an escaped rendering of str; the result is
 * tagged US-ASCII with a 7-bit coderange.
 *
 * The input is walked one character at a time:
 *  - bytes that do not form a valid character are written as \xHH;
 *  - characters known to ruby_escaped_char() use that escape;
 *  - printable ASCII characters (in ASCII-compatible encodings) are copied
 *    unchanged;
 *  - everything else goes through rb_str_buf_cat_escaped_char().
 *
 * prev marks the start of the pending run of unchanged bytes; the run is
 * flushed to result whenever an escape has to be emitted, and once more at
 * the end.
 */
7250VALUE
7251rb_str_escape(VALUE str)
7252{
7253 int encidx = ENCODING_GET(str);
7254 rb_encoding *enc = rb_enc_from_index(encidx);
7255 const char *p = RSTRING_PTR(str);
7256 const char *pend = RSTRING_END(str);
7257 const char *prev = p;
7258 char buf[CHAR_ESC_LEN + 1];
7259 VALUE result = rb_str_buf_new(0);
7260 int unicode_p = rb_enc_unicode_p(enc);
7261 int asciicompat = rb_enc_asciicompat(enc);
7262
7263 while (p < pend) {
7264 unsigned int c;
7265 const char *cc;
7266 int n = rb_enc_precise_mbclen(p, pend, enc);
7267 if (!MBCLEN_CHARFOUND_P(n)) {
 /* invalid sequence: escape up to mbminlen bytes, clipped to the end of the string */
7268 if (p > prev) str_buf_cat(result, prev, p - prev);
7269 n = rb_enc_mbminlen(enc);
7270 if (pend < p + n)
7271 n = (int)(pend - p);
7272 while (n--) {
7273 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7274 str_buf_cat(result, buf, strlen(buf));
7275 prev = ++p;
7276 }
7277 continue;
7278 }
7279 n = MBCLEN_CHARFOUND_LEN(n);
7280 c = rb_enc_mbc_to_codepoint(p, pend, enc);
 /* p now points past the character; p - n is where it started */
7281 p += n;
7282 cc = ruby_escaped_char(c);
7283 if (cc) {
7284 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7285 str_buf_cat(result, cc, strlen(cc));
7286 prev = p;
7287 }
7288 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
 /* printable ASCII: leave it in the pending run */
7289 }
7290 else {
7291 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7292 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7293 prev = p;
7294 }
7295 }
7296 if (p > prev) str_buf_cat(result, prev, p - prev);
7297 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7298
7299 return result;
7300}
7301
7302/*
7303 * call-seq:
7304 * inspect -> string
7305 *
7306 * :include: doc/string/inspect.rdoc
7307 *
7308 */
7309
7310VALUE
7312{
7313 int encidx = ENCODING_GET(str);
7314 rb_encoding *enc = rb_enc_from_index(encidx);
7315 const char *p, *pend, *prev;
7316 char buf[CHAR_ESC_LEN + 1];
7317 VALUE result = rb_str_buf_new(0);
7318 rb_encoding *resenc = rb_default_internal_encoding();
7319 int unicode_p = rb_enc_unicode_p(enc);
7320 int asciicompat = rb_enc_asciicompat(enc);
7321
7322 if (resenc == NULL) resenc = rb_default_external_encoding();
7323 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7324 rb_enc_associate(result, resenc);
7325 str_buf_cat2(result, "\"");
7326
7327 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7328 prev = p;
7329 while (p < pend) {
7330 unsigned int c, cc;
7331 int n;
7332
7333 n = rb_enc_precise_mbclen(p, pend, enc);
7334 if (!MBCLEN_CHARFOUND_P(n)) {
7335 if (p > prev) str_buf_cat(result, prev, p - prev);
7336 n = rb_enc_mbminlen(enc);
7337 if (pend < p + n)
7338 n = (int)(pend - p);
7339 while (n--) {
7340 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7341 str_buf_cat(result, buf, strlen(buf));
7342 prev = ++p;
7343 }
7344 continue;
7345 }
7346 n = MBCLEN_CHARFOUND_LEN(n);
7347 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7348 p += n;
7349 if ((asciicompat || unicode_p) &&
7350 (c == '"'|| c == '\\' ||
7351 (c == '#' &&
7352 p < pend &&
7353 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7354 (cc = rb_enc_codepoint(p,pend,enc),
7355 (cc == '$' || cc == '@' || cc == '{'))))) {
7356 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7357 str_buf_cat2(result, "\\");
7358 if (asciicompat || enc == resenc) {
7359 prev = p - n;
7360 continue;
7361 }
7362 }
7363 switch (c) {
7364 case '\n': cc = 'n'; break;
7365 case '\r': cc = 'r'; break;
7366 case '\t': cc = 't'; break;
7367 case '\f': cc = 'f'; break;
7368 case '\013': cc = 'v'; break;
7369 case '\010': cc = 'b'; break;
7370 case '\007': cc = 'a'; break;
7371 case 033: cc = 'e'; break;
7372 default: cc = 0; break;
7373 }
7374 if (cc) {
7375 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7376 buf[0] = '\\';
7377 buf[1] = (char)cc;
7378 str_buf_cat(result, buf, 2);
7379 prev = p;
7380 continue;
7381 }
7382 /* The special casing of 0x85 (NEXT_LINE) here is because
7383 * Oniguruma historically treats it as printable, but it
7384 * doesn't match the print POSIX bracket class or character
7385 * property in regexps.
7386 *
7387 * See Ruby Bug #16842 for details:
7388 * https://bugs.ruby-lang.org/issues/16842
7389 */
7390 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7391 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7392 continue;
7393 }
7394 else {
7395 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7396 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7397 prev = p;
7398 continue;
7399 }
7400 }
7401 if (p > prev) str_buf_cat(result, prev, p - prev);
7402 str_buf_cat2(result, "\"");
7403
7404 return result;
7405}
7406
/* True when p (bounded by e) points at '$', '@' or '{', i.e. a preceding
 * '#' would begin an interpolation sequence. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7408
7409/*
7410 * call-seq:
7411 * dump -> new_string
7412 *
7413 * :include: doc/string/dump.rdoc
7414 *
7415 */
7416
7417VALUE
7419{
7420 int encidx = rb_enc_get_index(str);
7421 rb_encoding *enc = rb_enc_from_index(encidx);
7422 long len;
7423 const char *p, *pend;
7424 char *q, *qend;
7425 VALUE result;
7426 int u8 = (encidx == rb_utf8_encindex());
7427 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7428
7429 len = 2; /* "" */
7430 if (!rb_enc_asciicompat(enc)) {
7431 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7432 len += strlen(enc->name);
7433 }
7434
7435 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7436 while (p < pend) {
7437 int clen;
7438 unsigned char c = *p++;
7439
7440 switch (c) {
7441 case '"': case '\\':
7442 case '\n': case '\r':
7443 case '\t': case '\f':
7444 case '\013': case '\010': case '\007': case '\033':
7445 clen = 2;
7446 break;
7447
7448 case '#':
7449 clen = IS_EVSTR(p, pend) ? 2 : 1;
7450 break;
7451
7452 default:
7453 if (ISPRINT(c)) {
7454 clen = 1;
7455 }
7456 else {
7457 if (u8 && c > 0x7F) { /* \u notation */
7458 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7459 if (MBCLEN_CHARFOUND_P(n)) {
7460 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7461 if (cc <= 0xFFFF)
7462 clen = 6; /* \uXXXX */
7463 else if (cc <= 0xFFFFF)
7464 clen = 9; /* \u{XXXXX} */
7465 else
7466 clen = 10; /* \u{XXXXXX} */
7467 p += MBCLEN_CHARFOUND_LEN(n)-1;
7468 break;
7469 }
7470 }
7471 clen = 4; /* \xNN */
7472 }
7473 break;
7474 }
7475
7476 if (clen > LONG_MAX - len) {
7477 rb_raise(rb_eRuntimeError, "string size too big");
7478 }
7479 len += clen;
7480 }
7481
7482 result = rb_str_new(0, len);
7483 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7484 q = RSTRING_PTR(result); qend = q + len + 1;
7485
7486 *q++ = '"';
7487 while (p < pend) {
7488 unsigned char c = *p++;
7489
7490 if (c == '"' || c == '\\') {
7491 *q++ = '\\';
7492 *q++ = c;
7493 }
7494 else if (c == '#') {
7495 if (IS_EVSTR(p, pend)) *q++ = '\\';
7496 *q++ = '#';
7497 }
7498 else if (c == '\n') {
7499 *q++ = '\\';
7500 *q++ = 'n';
7501 }
7502 else if (c == '\r') {
7503 *q++ = '\\';
7504 *q++ = 'r';
7505 }
7506 else if (c == '\t') {
7507 *q++ = '\\';
7508 *q++ = 't';
7509 }
7510 else if (c == '\f') {
7511 *q++ = '\\';
7512 *q++ = 'f';
7513 }
7514 else if (c == '\013') {
7515 *q++ = '\\';
7516 *q++ = 'v';
7517 }
7518 else if (c == '\010') {
7519 *q++ = '\\';
7520 *q++ = 'b';
7521 }
7522 else if (c == '\007') {
7523 *q++ = '\\';
7524 *q++ = 'a';
7525 }
7526 else if (c == '\033') {
7527 *q++ = '\\';
7528 *q++ = 'e';
7529 }
7530 else if (ISPRINT(c)) {
7531 *q++ = c;
7532 }
7533 else {
7534 *q++ = '\\';
7535 if (u8) {
7536 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7537 if (MBCLEN_CHARFOUND_P(n)) {
7538 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7539 p += n;
7540 if (cc <= 0xFFFF)
7541 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7542 else
7543 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7544 q += strlen(q);
7545 continue;
7546 }
7547 }
7548 snprintf(q, qend-q, "x%02X", c);
7549 q += 3;
7550 }
7551 }
7552 *q++ = '"';
7553 *q = '\0';
7554 if (!rb_enc_asciicompat(enc)) {
7555 snprintf(q, qend-q, nonascii_suffix, enc->name);
7556 encidx = rb_ascii8bit_encindex();
7557 }
7558 /* result from dump is ASCII */
7559 rb_enc_associate_index(result, encidx);
7561 return result;
7562}
7563
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it stands for ('n' -> newline, 'e' -> ESC, ...).
 * Returns -1 for any other input so that control never falls off the end
 * of this non-void function.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not an escape letter handled here */
        return -1;
    }
}
7587
/*
 * Decodes one escape sequence of a dumped string and appends the decoded
 * bytes to +undumped+.
 *
 * On entry *ss points at the character right after the backslash; on
 * return it points just past the consumed escape.  +s_end+ bounds the
 * input.  *utf8 / *binary record whether a \u or a \x escape has been seen;
 * mixing the two raises RuntimeError.  The first \u escape switches *penc
 * (and the encoding of +undumped+) to UTF-8.
 */
static void
undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
{
    const char *s = *ss;
    unsigned int c;
    int codelen;
    size_t hexlen;
    unsigned char buf[6];
    static rb_encoding *enc_utf8 = NULL;

    switch (*s) {
      case '\\':
      case '"':
      case '#':
        rb_str_cat(undumped, s, 1); /* cat itself */
        s++;
        break;
      case 'n':
      case 'r':
      case 't':
      case 'f':
      case 'v':
      case 'b':
      case 'a':
      case 'e':
        /* single-letter control escapes */
        *buf = unescape_ascii(*s);
        rb_str_cat(undumped, (char *)buf, 1);
        s++;
        break;
      case 'u':
        if (*binary) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *utf8 = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
        }
        /* cache the UTF-8 encoding and retag the output on first use */
        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
        if (*penc != enc_utf8) {
            *penc = enc_utf8;
            rb_enc_associate(undumped, enc_utf8);
        }
        if (*s == '{') { /* handle \u{...} form */
            s++;
            for (;;) {
                if (s >= s_end) {
                    rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
                }
                if (*s == '}') {
                    s++;
                    break;
                }
                /* whitespace between codepoints is skipped */
                if (ISSPACE(*s)) {
                    s++;
                    continue;
                }
                c = scan_hex(s, s_end-s, &hexlen);
                if (hexlen == 0 || hexlen > 6) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
                }
                if (c > 0x10ffff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
                }
                /* surrogate halves are rejected */
                if (0xd800 <= c && c <= 0xdfff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
                }
                codelen = rb_enc_mbcput(c, (char *)buf, *penc);
                rb_str_cat(undumped, (char *)buf, codelen);
                s += hexlen;
            }
        }
        else { /* handle \uXXXX form */
            c = scan_hex(s, 4, &hexlen);
            if (hexlen != 4) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            if (0xd800 <= c && c <= 0xdfff) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
            }
            codelen = rb_enc_mbcput(c, (char *)buf, *penc);
            rb_str_cat(undumped, (char *)buf, codelen);
            s += hexlen;
        }
        break;
      case 'x':
        if (*utf8) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *binary = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        /* exactly two hex digits are required */
        *buf = scan_hex(s, 2, &hexlen);
        if (hexlen != 2) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        rb_str_cat(undumped, (char *)buf, 1);
        s += hexlen;
        break;
      default:
        /* unknown escape: keep the backslash and the character as they are */
        rb_str_cat(undumped, s-1, 2);
        s++;
    }

    *ss = s;
}
7694
7695static VALUE rb_str_is_ascii_only_p(VALUE str);
7696
7697/*
7698 * call-seq:
7699 * undump -> string
7700 *
7701 * Returns an unescaped version of +self+:
7702 *
7703 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7704 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7705 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7706 * s_undumped == s_orig # => true
7707 *
7708 * Related: String#dump (inverse of String#undump).
7709 *
7710 */
7711
7712static VALUE
7713str_undump(VALUE str)
7714{
7715 const char *s = RSTRING_PTR(str);
7716 const char *s_end = RSTRING_END(str);
7717 rb_encoding *enc = rb_enc_get(str);
7718 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7719 bool utf8 = false;
7720 bool binary = false;
7721 int w;
7722
7724 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7725 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7726 }
7727 if (!str_null_check(str, &w)) {
7728 rb_raise(rb_eRuntimeError, "string contains null byte");
7729 }
7730 if (RSTRING_LEN(str) < 2) goto invalid_format;
7731 if (*s != '"') goto invalid_format;
7732
7733 /* strip '"' at the start */
7734 s++;
7735
7736 for (;;) {
7737 if (s >= s_end) {
7738 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7739 }
7740
7741 if (*s == '"') {
7742 /* epilogue */
7743 s++;
7744 if (s == s_end) {
7745 /* ascii compatible dumped string */
7746 break;
7747 }
7748 else {
7749 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7750 static const char dup_suffix[] = ".dup";
7751 const char *encname;
7752 int encidx;
7753 ptrdiff_t size;
7754
7755 /* check separately for strings dumped by older versions */
7756 size = sizeof(dup_suffix) - 1;
7757 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7758
7759 size = sizeof(force_encoding_suffix) - 1;
7760 if (s_end - s <= size) goto invalid_format;
7761 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7762 s += size;
7763
7764 if (utf8) {
7765 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7766 }
7767
7768 encname = s;
7769 s = memchr(s, '"', s_end-s);
7770 size = s - encname;
7771 if (!s) goto invalid_format;
7772 if (s_end - s != 2) goto invalid_format;
7773 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7774
7775 encidx = rb_enc_find_index2(encname, (long)size);
7776 if (encidx < 0) {
7777 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7778 }
7779 rb_enc_associate_index(undumped, encidx);
7780 }
7781 break;
7782 }
7783
7784 if (*s == '\\') {
7785 s++;
7786 if (s >= s_end) {
7787 rb_raise(rb_eRuntimeError, "invalid escape");
7788 }
7789 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7790 }
7791 else {
7792 rb_str_cat(undumped, s++, 1);
7793 }
7794 }
7795
7796 RB_GC_GUARD(str);
7797
7798 return undumped;
7799invalid_format:
7800 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7801}
7802
7803static void
7804rb_str_check_dummy_enc(rb_encoding *enc)
7805{
7806 if (rb_enc_dummy_p(enc)) {
7807 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7808 rb_enc_name(enc));
7809 }
7810}
7811
7812static rb_encoding *
7813str_true_enc(VALUE str)
7814{
7815 rb_encoding *enc = STR_ENC_GET(str);
7816 rb_str_check_dummy_enc(enc);
7817 return enc;
7818}
7819
7820static OnigCaseFoldType
7821check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7822{
7823 if (argc==0)
7824 return flags;
7825 if (argc>2)
7826 rb_raise(rb_eArgError, "too many options");
7827 if (argv[0]==sym_turkic) {
7828 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7829 if (argc==2) {
7830 if (argv[1]==sym_lithuanian)
7831 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7832 else
7833 rb_raise(rb_eArgError, "invalid second option");
7834 }
7835 }
7836 else if (argv[0]==sym_lithuanian) {
7837 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7838 if (argc==2) {
7839 if (argv[1]==sym_turkic)
7840 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7841 else
7842 rb_raise(rb_eArgError, "invalid second option");
7843 }
7844 }
7845 else if (argc>1)
7846 rb_raise(rb_eArgError, "too many options");
7847 else if (argv[0]==sym_ascii)
7848 flags |= ONIGENC_CASE_ASCII_ONLY;
7849 else if (argv[0]==sym_fold) {
7850 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7851 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7852 else
7853 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7854 }
7855 else
7856 rb_raise(rb_eArgError, "invalid option");
7857 return flags;
7858}
7859
7860static inline bool
7861case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7862{
7863 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7864 return true;
7865 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7866}
7867
/* 20 bytes of slack should be long enough to absorb any kind of single character length increase */
7869#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7870#ifndef CASEMAP_DEBUG
7871# define CASEMAP_DEBUG 0
7872#endif
7873
7874struct mapping_buffer;
7875typedef struct mapping_buffer {
7876 size_t capa;
7877 size_t used;
7878 struct mapping_buffer *next;
7879 OnigUChar space[FLEX_ARY_LEN];
7881
7882static void
7883mapping_buffer_free(void *p)
7884{
7885 mapping_buffer *previous_buffer;
7886 mapping_buffer *current_buffer = p;
7887 while (current_buffer) {
7888 previous_buffer = current_buffer;
7889 current_buffer = current_buffer->next;
7890 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7891 }
7892}
7893
/*
 * Typed-data descriptor for the anchor object that owns a chain of
 * mapping buffers; mapping_buffer_free is registered as its free callback.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7899
/*
 * Case-maps +source+ through the encoding's case_map callback and returns
 * the result as a new string carrying the encoding of +source+.
 *
 * The output is collected in a chain of heap buffers, because the mapped
 * text may be longer than the input.  The chain hangs off a typed-data
 * anchor object while it is alive.  *flags is passed through to case_map.
 * Raises ArgumentError when case_map reports a negative length.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    const OnigUChar *source_current, *source_end;
    int target_length = 0;
    VALUE buffer_anchor;
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    /* pre_buffer always points at the link that the next buffer is stored in */
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    while (source_current < source_end) {
        /* increase multiplier using buffer count to converge quickly */
        size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
        if (CASEMAP_DEBUG) {
            fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
        }
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
        current_buffer->next = NULL;
        current_buffer->capa = capa;
        /* source_current is passed by address so case_map can advance it */
        buffer_length_or_invalid = enc->case_map(flags,
                                                 &source_current, source_end,
                                                 current_buffer->space,
                                                 current_buffer->space+current_buffer->capa,
                                                 enc);
        if (buffer_length_or_invalid < 0) {
            /* detach the chain from the anchor and free it before raising */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
            mapping_buffer_free(current_buffer);
            rb_raise(rb_eArgError, "input string invalid");
        }
        target_length  += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
        fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
        /* single buffer: copy it straight into the result */
        target = rb_str_new((const char*)current_buffer->space, target_length);
    }
    else {
        char *target_current;

        /* several buffers: concatenate them in chain order */
        target = rb_str_new(0, target_length);
        target_current = RSTRING_PTR(target);
        current_buffer = DATA_PTR(buffer_anchor);
        while (current_buffer) {
            memcpy(target_current, current_buffer->space, current_buffer->used);
            target_current += current_buffer->used;
            current_buffer = current_buffer->next;
        }
    }
    /* detach the chain from the anchor and free it here */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    RB_GC_GUARD(buffer_anchor);

    /* TODO: check about string terminator character */
    str_enc_copy_direct(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
7975
7976static VALUE
7977rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7978{
7979 const OnigUChar *source_current, *source_end;
7980 OnigUChar *target_current, *target_end;
7981 long old_length = RSTRING_LEN(source);
7982 int length_or_invalid;
7983
7984 if (old_length == 0) return Qnil;
7985
7986 source_current = (OnigUChar*)RSTRING_PTR(source);
7987 source_end = (OnigUChar*)RSTRING_END(source);
7988 if (source == target) {
7989 target_current = (OnigUChar*)source_current;
7990 target_end = (OnigUChar*)source_end;
7991 }
7992 else {
7993 target_current = (OnigUChar*)RSTRING_PTR(target);
7994 target_end = (OnigUChar*)RSTRING_END(target);
7995 }
7996
7997 length_or_invalid = onigenc_ascii_only_case_map(flags,
7998 &source_current, source_end,
7999 target_current, target_end, enc);
8000 if (length_or_invalid < 0)
8001 rb_raise(rb_eArgError, "input string invalid");
8002 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8003 fprintf(stderr, "problem with rb_str_ascii_casemap"
8004 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8005 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8006 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8007 }
8008
8009 str_enc_copy(target, source);
8010
8011 return target;
8012}
8013
8014static bool
8015upcase_single(VALUE str)
8016{
8017 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8018 bool modified = false;
8019
8020 while (s < send) {
8021 unsigned int c = *(unsigned char*)s;
8022
8023 if ('a' <= c && c <= 'z') {
8024 *s = 'A' + (c - 'a');
8025 modified = true;
8026 }
8027 s++;
8028 }
8029 return modified;
8030}
8031
8032/*
8033 * call-seq:
8034 * upcase!(mapping) -> self or nil
8035 *
8036 * Upcases the characters in +self+;
8037 * returns +self+ if any changes were made, +nil+ otherwise:
8038 *
8039 * s = 'Hello World!' # => "Hello World!"
8040 * s.upcase! # => "HELLO WORLD!"
8041 * s # => "HELLO WORLD!"
8042 * s.upcase! # => nil
8043 *
8044 * The casing may be affected by the given +mapping+;
8045 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8046 *
8047 * Related: String#upcase, String#downcase, String#downcase!.
8048 *
8049 */
8050
8051static VALUE
8052rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8053{
8054 rb_encoding *enc;
8055 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8056
8057 flags = check_case_options(argc, argv, flags);
8058 str_modify_keep_cr(str);
8059 enc = str_true_enc(str);
8060 if (case_option_single_p(flags, enc, str)) {
8061 if (upcase_single(str))
8062 flags |= ONIGENC_CASE_MODIFIED;
8063 }
8064 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8065 rb_str_ascii_casemap(str, str, &flags, enc);
8066 else
8067 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8068
8069 if (ONIGENC_CASE_MODIFIED&flags) return str;
8070 return Qnil;
8071}
8072
8073
8074/*
8075 * call-seq:
8076 * upcase(mapping) -> string
8077 *
8078 * Returns a string containing the upcased characters in +self+:
8079 *
8080 * s = 'Hello World!' # => "Hello World!"
8081 * s.upcase # => "HELLO WORLD!"
8082 *
8083 * The casing may be affected by the given +mapping+;
8084 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8085 *
8086 * Related: String#upcase!, String#downcase, String#downcase!.
8087 *
8088 */
8089
8090static VALUE
8091rb_str_upcase(int argc, VALUE *argv, VALUE str)
8092{
8093 rb_encoding *enc;
8094 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8095 VALUE ret;
8096
8097 flags = check_case_options(argc, argv, flags);
8098 enc = str_true_enc(str);
8099 if (case_option_single_p(flags, enc, str)) {
8100 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8101 str_enc_copy_direct(ret, str);
8102 upcase_single(ret);
8103 }
8104 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8105 ret = rb_str_new(0, RSTRING_LEN(str));
8106 rb_str_ascii_casemap(str, ret, &flags, enc);
8107 }
8108 else {
8109 ret = rb_str_casemap(str, &flags, enc);
8110 }
8111
8112 return ret;
8113}
8114
8115static bool
8116downcase_single(VALUE str)
8117{
8118 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8119 bool modified = false;
8120
8121 while (s < send) {
8122 unsigned int c = *(unsigned char*)s;
8123
8124 if ('A' <= c && c <= 'Z') {
8125 *s = 'a' + (c - 'A');
8126 modified = true;
8127 }
8128 s++;
8129 }
8130
8131 return modified;
8132}
8133
8134/*
8135 * call-seq:
8136 * downcase!(mapping) -> self or nil
8137 *
8138 * Like String#downcase, except that:
8139 *
8140 * - Changes character casings in +self+ (not in a copy of +self+).
8141 * - Returns +self+ if any changes are made, +nil+ otherwise.
8142 *
8143 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8144 */
8145
8146static VALUE
8147rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8148{
8149 rb_encoding *enc;
8150 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8151
8152 flags = check_case_options(argc, argv, flags);
8153 str_modify_keep_cr(str);
8154 enc = str_true_enc(str);
8155 if (case_option_single_p(flags, enc, str)) {
8156 if (downcase_single(str))
8157 flags |= ONIGENC_CASE_MODIFIED;
8158 }
8159 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8160 rb_str_ascii_casemap(str, str, &flags, enc);
8161 else
8162 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8163
8164 if (ONIGENC_CASE_MODIFIED&flags) return str;
8165 return Qnil;
8166}
8167
8168
8169/*
8170 * call-seq:
8171 * downcase(mapping) -> string
8172 *
8173 * :include: doc/string/downcase.rdoc
8174 *
8175 */
8176
8177static VALUE
8178rb_str_downcase(int argc, VALUE *argv, VALUE str)
8179{
8180 rb_encoding *enc;
8181 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8182 VALUE ret;
8183
8184 flags = check_case_options(argc, argv, flags);
8185 enc = str_true_enc(str);
8186 if (case_option_single_p(flags, enc, str)) {
8187 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8188 str_enc_copy_direct(ret, str);
8189 downcase_single(ret);
8190 }
8191 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8192 ret = rb_str_new(0, RSTRING_LEN(str));
8193 rb_str_ascii_casemap(str, ret, &flags, enc);
8194 }
8195 else {
8196 ret = rb_str_casemap(str, &flags, enc);
8197 }
8198
8199 return ret;
8200}
8201
8202
8203/*
8204 * call-seq:
8205 * capitalize!(mapping = :ascii) -> self or nil
8206 *
8207 * Like String#capitalize, except that:
8208 *
8209 * - Changes character casings in +self+ (not in a copy of +self+).
8210 * - Returns +self+ if any changes are made, +nil+ otherwise.
8211 *
8212 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8213 */
8214
8215static VALUE
8216rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8217{
8218 rb_encoding *enc;
8219 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8220
8221 flags = check_case_options(argc, argv, flags);
8222 str_modify_keep_cr(str);
8223 enc = str_true_enc(str);
8224 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8225 if (flags&ONIGENC_CASE_ASCII_ONLY)
8226 rb_str_ascii_casemap(str, str, &flags, enc);
8227 else
8228 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8229
8230 if (ONIGENC_CASE_MODIFIED&flags) return str;
8231 return Qnil;
8232}
8233
8234
8235/*
8236 * call-seq:
8237 * capitalize(mapping = :ascii) -> string
8238 *
8239 * Returns a string containing the characters in +self+,
8240 * each with possibly changed case:
8241 *
8242 * - The first character is upcased.
8243 * - All other characters are downcased.
8244 *
8245 * Examples:
8246 *
8247 * 'hello world'.capitalize # => "Hello world"
8248 * 'HELLO WORLD'.capitalize # => "Hello world"
8249 *
8250 * Some characters do not have upcase and downcase, and so are not changed;
8251 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8252 *
8253 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8254 *
8255 * The casing is affected by the given +mapping+,
8256 * which may be +:ascii+, +:fold+, or +:turkic+;
8257 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8258 *
8259 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8260 */
8261
8262static VALUE
8263rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8264{
8265 rb_encoding *enc;
8266 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8267 VALUE ret;
8268
8269 flags = check_case_options(argc, argv, flags);
8270 enc = str_true_enc(str);
8271 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8272 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8273 ret = rb_str_new(0, RSTRING_LEN(str));
8274 rb_str_ascii_casemap(str, ret, &flags, enc);
8275 }
8276 else {
8277 ret = rb_str_casemap(str, &flags, enc);
8278 }
8279 return ret;
8280}
8281
8282
8283/*
8284 * call-seq:
8285 * swapcase!(mapping) -> self or nil
8286 *
8287 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
8289 * returns +self+ if any changes were made, +nil+ otherwise:
8290 *
8291 * s = 'Hello World!' # => "Hello World!"
8292 * s.swapcase! # => "hELLO wORLD!"
8293 * s # => "hELLO wORLD!"
8294 * ''.swapcase! # => nil
8295 *
8296 * The casing may be affected by the given +mapping+;
8297 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8298 *
8299 * Related: String#swapcase.
8300 *
8301 */
8302
8303static VALUE
8304rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8305{
8306 rb_encoding *enc;
8307 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8308
8309 flags = check_case_options(argc, argv, flags);
8310 str_modify_keep_cr(str);
8311 enc = str_true_enc(str);
8312 if (flags&ONIGENC_CASE_ASCII_ONLY)
8313 rb_str_ascii_casemap(str, str, &flags, enc);
8314 else
8315 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8316
8317 if (ONIGENC_CASE_MODIFIED&flags) return str;
8318 return Qnil;
8319}
8320
8321
8322/*
8323 * call-seq:
8324 * swapcase(mapping) -> string
8325 *
8326 * Returns a string containing the characters in +self+, with cases reversed;
8327 * each uppercase character is downcased;
8328 * each lowercase character is upcased:
8329 *
8330 * s = 'Hello World!' # => "Hello World!"
8331 * s.swapcase # => "hELLO wORLD!"
8332 *
8333 * The casing may be affected by the given +mapping+;
8334 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8335 *
8336 * Related: String#swapcase!.
8337 *
8338 */
8339
8340static VALUE
8341rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8342{
8343 rb_encoding *enc;
8344 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8345 VALUE ret;
8346
8347 flags = check_case_options(argc, argv, flags);
8348 enc = str_true_enc(str);
8349 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8350 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8351 ret = rb_str_new(0, RSTRING_LEN(str));
8352 rb_str_ascii_casemap(str, ret, &flags, enc);
8353 }
8354 else {
8355 ret = rb_str_casemap(str, &flags, enc);
8356 }
8357 return ret;
8358}
8359
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/* Cursor over a transliteration spec string, consumed by trnext(). */
struct tr {
    int gen;                /* nonzero while a "a-z" style range is being expanded */
    unsigned int now, max;  /* current codepoint and upper bound of the active range */
    char *p, *pend;         /* read position in the spec and its end */
};
8367
/*
 * Returns the next codepoint described by the spec in +t+, or
 * (unsigned int)-1 when the spec is exhausted.
 *
 * A backslash that is not the last character makes the following
 * character be taken as is.  "x-y" produces every codepoint from x to y
 * that ONIGENC_CODE_TO_MBCLEN reports as encodable; a range with x > y
 * raises ArgumentError.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item from the spec */
            if (t->p == t->pend) return -1;
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' that is not the last character introduces a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        /* start expanding; the first element is returned below */
                        t->gen = 1;
                        t->max = c;
                    }
                }
            }
            return t->now;
        }
        else {
            /* inside a range: step to the next encodable codepoint */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* upper bound reached: leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8423
8424static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8425
8426static VALUE
8427tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8428{
8429 const unsigned int errc = -1;
8430 unsigned int trans[256];
8431 rb_encoding *enc, *e1, *e2;
8432 struct tr trsrc, trrepl;
8433 int cflag = 0;
8434 unsigned int c, c0, last = 0;
8435 int modify = 0, i, l;
8436 unsigned char *s, *send;
8437 VALUE hash = 0;
8438 int singlebyte = single_byte_optimizable(str);
8439 int termlen;
8440 int cr;
8441
8442#define CHECK_IF_ASCII(c) \
8443 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8444 (cr = ENC_CODERANGE_VALID) : 0)
8445
8446 StringValue(src);
8447 StringValue(repl);
8448 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8449 if (RSTRING_LEN(repl) == 0) {
8450 return rb_str_delete_bang(1, &src, str);
8451 }
8452
8453 cr = ENC_CODERANGE(str);
8454 e1 = rb_enc_check(str, src);
8455 e2 = rb_enc_check(str, repl);
8456 if (e1 == e2) {
8457 enc = e1;
8458 }
8459 else {
8460 enc = rb_enc_check(src, repl);
8461 }
8462 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8463 if (RSTRING_LEN(src) > 1 &&
8464 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8465 trsrc.p + l < trsrc.pend) {
8466 cflag = 1;
8467 trsrc.p += l;
8468 }
8469 trrepl.p = RSTRING_PTR(repl);
8470 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8471 trsrc.gen = trrepl.gen = 0;
8472 trsrc.now = trrepl.now = 0;
8473 trsrc.max = trrepl.max = 0;
8474
8475 if (cflag) {
8476 for (i=0; i<256; i++) {
8477 trans[i] = 1;
8478 }
8479 while ((c = trnext(&trsrc, enc)) != errc) {
8480 if (c < 256) {
8481 trans[c] = errc;
8482 }
8483 else {
8484 if (!hash) hash = rb_hash_new();
8485 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8486 }
8487 }
8488 while ((c = trnext(&trrepl, enc)) != errc)
8489 /* retrieve last replacer */;
8490 last = trrepl.now;
8491 for (i=0; i<256; i++) {
8492 if (trans[i] != errc) {
8493 trans[i] = last;
8494 }
8495 }
8496 }
8497 else {
8498 unsigned int r;
8499
8500 for (i=0; i<256; i++) {
8501 trans[i] = errc;
8502 }
8503 while ((c = trnext(&trsrc, enc)) != errc) {
8504 r = trnext(&trrepl, enc);
8505 if (r == errc) r = trrepl.now;
8506 if (c < 256) {
8507 trans[c] = r;
8508 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8509 }
8510 else {
8511 if (!hash) hash = rb_hash_new();
8512 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8513 }
8514 }
8515 }
8516
8517 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8518 cr = ENC_CODERANGE_7BIT;
8519 str_modify_keep_cr(str);
8520 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8521 termlen = rb_enc_mbminlen(enc);
8522 if (sflag) {
8523 int clen, tlen;
8524 long offset, max = RSTRING_LEN(str);
8525 unsigned int save = -1;
8526 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8527
8528 while (s < send) {
8529 int may_modify = 0;
8530
8531 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8532 if (!MBCLEN_CHARFOUND_P(r)) {
8533 xfree(buf);
8534 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8535 }
8536 clen = MBCLEN_CHARFOUND_LEN(r);
8537 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8538
8539 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8540
8541 s += clen;
8542 if (c < 256) {
8543 c = trans[c];
8544 }
8545 else if (hash) {
8546 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8547 if (NIL_P(tmp)) {
8548 if (cflag) c = last;
8549 else c = errc;
8550 }
8551 else if (cflag) c = errc;
8552 else c = NUM2INT(tmp);
8553 }
8554 else {
8555 c = errc;
8556 }
8557 if (c != (unsigned int)-1) {
8558 if (save == c) {
8559 CHECK_IF_ASCII(c);
8560 continue;
8561 }
8562 save = c;
8563 tlen = rb_enc_codelen(c, enc);
8564 modify = 1;
8565 }
8566 else {
8567 save = -1;
8568 c = c0;
8569 if (enc != e1) may_modify = 1;
8570 }
8571 if ((offset = t - buf) + tlen > max) {
8572 size_t MAYBE_UNUSED(old) = max + termlen;
8573 max = offset + tlen + (send - s);
8574 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8575 t = buf + offset;
8576 }
8577 rb_enc_mbcput(c, t, enc);
8578 if (may_modify && memcmp(s, t, tlen) != 0) {
8579 modify = 1;
8580 }
8581 CHECK_IF_ASCII(c);
8582 t += tlen;
8583 }
8584 if (!STR_EMBED_P(str)) {
8585 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8586 }
8587 TERM_FILL((char *)t, termlen);
8588 RSTRING(str)->as.heap.ptr = (char *)buf;
8589 STR_SET_LEN(str, t - buf);
8590 STR_SET_NOEMBED(str);
8591 RSTRING(str)->as.heap.aux.capa = max;
8592 }
8593 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8594 while (s < send) {
8595 c = (unsigned char)*s;
8596 if (trans[c] != errc) {
8597 if (!cflag) {
8598 c = trans[c];
8599 *s = c;
8600 modify = 1;
8601 }
8602 else {
8603 *s = last;
8604 modify = 1;
8605 }
8606 }
8607 CHECK_IF_ASCII(c);
8608 s++;
8609 }
8610 }
8611 else {
8612 int clen, tlen;
8613 long offset, max = (long)((send - s) * 1.2);
8614 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8615
8616 while (s < send) {
8617 int may_modify = 0;
8618
8619 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8620 if (!MBCLEN_CHARFOUND_P(r)) {
8621 xfree(buf);
8622 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8623 }
8624 clen = MBCLEN_CHARFOUND_LEN(r);
8625 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8626
8627 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8628
8629 if (c < 256) {
8630 c = trans[c];
8631 }
8632 else if (hash) {
8633 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8634 if (NIL_P(tmp)) {
8635 if (cflag) c = last;
8636 else c = errc;
8637 }
8638 else if (cflag) c = errc;
8639 else c = NUM2INT(tmp);
8640 }
8641 else {
8642 c = cflag ? last : errc;
8643 }
8644 if (c != errc) {
8645 tlen = rb_enc_codelen(c, enc);
8646 modify = 1;
8647 }
8648 else {
8649 c = c0;
8650 if (enc != e1) may_modify = 1;
8651 }
8652 if ((offset = t - buf) + tlen > max) {
8653 size_t MAYBE_UNUSED(old) = max + termlen;
8654 max = offset + tlen + (long)((send - s) * 1.2);
8655 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8656 t = buf + offset;
8657 }
8658 if (s != t) {
8659 rb_enc_mbcput(c, t, enc);
8660 if (may_modify && memcmp(s, t, tlen) != 0) {
8661 modify = 1;
8662 }
8663 }
8664 CHECK_IF_ASCII(c);
8665 s += clen;
8666 t += tlen;
8667 }
8668 if (!STR_EMBED_P(str)) {
8669 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8670 }
8671 TERM_FILL((char *)t, termlen);
8672 RSTRING(str)->as.heap.ptr = (char *)buf;
8673 STR_SET_LEN(str, t - buf);
8674 STR_SET_NOEMBED(str);
8675 RSTRING(str)->as.heap.aux.capa = max;
8676 }
8677
8678 if (modify) {
8679 if (cr != ENC_CODERANGE_BROKEN)
8680 ENC_CODERANGE_SET(str, cr);
8681 rb_enc_associate(str, enc);
8682 return str;
8683 }
8684 return Qnil;
8685}
8686
8687
8688/*
8689 * call-seq:
8690 * tr!(selector, replacements) -> self or nil
8691 *
8692 * Like String#tr, but modifies +self+ in place.
8693 * Returns +self+ if any changes were made, +nil+ otherwise.
8694 *
8695 */
8696
8697static VALUE
8698rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8699{
8700 return tr_trans(str, src, repl, 0);
8701}
8702
8703
8704/*
8705 * call-seq:
8706 * tr(selector, replacements) -> new_string
8707 *
8708 * Returns a copy of +self+ with each character specified by string +selector+
8709 * translated to the corresponding character in string +replacements+.
8710 * The correspondence is _positional_:
8711 *
8712 * - Each occurrence of the first character specified by +selector+
8713 * is translated to the first character in +replacements+.
8714 * - Each occurrence of the second character specified by +selector+
8715 * is translated to the second character in +replacements+.
8716 * - And so on.
8717 *
8718 * Example:
8719 *
8720 * 'hello'.tr('el', 'ip') #=> "hippo"
8721 *
8722 * If +replacements+ is shorter than +selector+,
8723 * it is implicitly padded with its own last character:
8724 *
8725 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8726 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8727 *
8728 * Arguments +selector+ and +replacements+ must be valid character selectors
8729 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8730 * and may use any of its valid forms, including negation, ranges, and escaping:
8731 *
8732 * # Negation.
8733 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8734 * # Ranges.
8735 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8736 * # Escapes.
8737 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8738 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8739 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8740 *
8741 */
8742
8743static VALUE
8744rb_str_tr(VALUE str, VALUE src, VALUE repl)
8745{
8746 str = str_duplicate(rb_cString, str);
8747 tr_trans(str, src, repl, 0);
8748 return str;
8749}
8750
8751#define TR_TABLE_MAX (UCHAR_MAX+1)
8752#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8753static void
8754tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8755 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8756{
8757 const unsigned int errc = -1;
8758 char buf[TR_TABLE_MAX];
8759 struct tr tr;
8760 unsigned int c;
8761 VALUE table = 0, ptable = 0;
8762 int i, l, cflag = 0;
8763
8764 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8765 tr.gen = tr.now = tr.max = 0;
8766
8767 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8768 cflag = 1;
8769 tr.p += l;
8770 }
8771 if (first) {
8772 for (i=0; i<TR_TABLE_MAX; i++) {
8773 stable[i] = 1;
8774 }
8775 stable[TR_TABLE_MAX] = cflag;
8776 }
8777 else if (stable[TR_TABLE_MAX] && !cflag) {
8778 stable[TR_TABLE_MAX] = 0;
8779 }
8780 for (i=0; i<TR_TABLE_MAX; i++) {
8781 buf[i] = cflag;
8782 }
8783
8784 while ((c = trnext(&tr, enc)) != errc) {
8785 if (c < TR_TABLE_MAX) {
8786 buf[(unsigned char)c] = !cflag;
8787 }
8788 else {
8789 VALUE key = UINT2NUM(c);
8790
8791 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8792 if (cflag) {
8793 ptable = *ctablep;
8794 table = ptable ? ptable : rb_hash_new();
8795 *ctablep = table;
8796 }
8797 else {
8798 table = rb_hash_new();
8799 ptable = *tablep;
8800 *tablep = table;
8801 }
8802 }
8803 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8804 rb_hash_aset(table, key, Qtrue);
8805 }
8806 }
8807 }
8808 for (i=0; i<TR_TABLE_MAX; i++) {
8809 stable[i] = stable[i] && buf[i];
8810 }
8811 if (!table && !cflag) {
8812 *tablep = 0;
8813 }
8814}
8815
8816
8817static int
8818tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8819{
8820 if (c < TR_TABLE_MAX) {
8821 return table[c] != 0;
8822 }
8823 else {
8824 VALUE v = UINT2NUM(c);
8825
8826 if (del) {
8827 if (!NIL_P(rb_hash_lookup(del, v)) &&
8828 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8829 return TRUE;
8830 }
8831 }
8832 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8833 return FALSE;
8834 }
8835 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8836 }
8837}
8838
8839/*
8840 * call-seq:
8841 * delete!(*selectors) -> self or nil
8842 *
8843 * Like String#delete, but modifies +self+ in place;
8844 * returns +self+ if any characters were deleted, +nil+ otherwise.
8845 *
8846 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8847 */
8848
8849static VALUE
8850rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8851{
8852 char squeez[TR_TABLE_SIZE];
8853 rb_encoding *enc = 0;
8854 char *s, *send, *t;
8855 VALUE del = 0, nodel = 0;
8856 int modify = 0;
8857 int i, ascompat, cr;
8858
8859 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8861 for (i=0; i<argc; i++) {
8862 VALUE s = argv[i];
8863
8864 StringValue(s);
8865 enc = rb_enc_check(str, s);
8866 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8867 }
8868
8869 str_modify_keep_cr(str);
8870 ascompat = rb_enc_asciicompat(enc);
8871 s = t = RSTRING_PTR(str);
8872 send = RSTRING_END(str);
8873 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8874 while (s < send) {
8875 unsigned int c;
8876 int clen;
8877
8878 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8879 if (squeez[c]) {
8880 modify = 1;
8881 }
8882 else {
8883 if (t != s) *t = c;
8884 t++;
8885 }
8886 s++;
8887 }
8888 else {
8889 c = rb_enc_codepoint_len(s, send, &clen, enc);
8890
8891 if (tr_find(c, squeez, del, nodel)) {
8892 modify = 1;
8893 }
8894 else {
8895 if (t != s) rb_enc_mbcput(c, t, enc);
8896 t += clen;
8898 }
8899 s += clen;
8900 }
8901 }
8902 TERM_FILL(t, TERM_LEN(str));
8903 STR_SET_LEN(str, t - RSTRING_PTR(str));
8904 ENC_CODERANGE_SET(str, cr);
8905
8906 if (modify) return str;
8907 return Qnil;
8908}
8909
8910
8911/*
8912 * call-seq:
8913 * delete(*selectors) -> new_string
8914 *
8915 * :include: doc/string/delete.rdoc
8916 *
8917 */
8918
8919static VALUE
8920rb_str_delete(int argc, VALUE *argv, VALUE str)
8921{
8922 str = str_duplicate(rb_cString, str);
8923 rb_str_delete_bang(argc, argv, str);
8924 return str;
8925}
8926
8927
8928/*
8929 * call-seq:
8930 * squeeze!(*selectors) -> self or nil
8931 *
8932 * Like String#squeeze, but modifies +self+ in place.
8933 * Returns +self+ if any changes were made, +nil+ otherwise.
8934 */
8935
8936static VALUE
8937rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8938{
8939 char squeez[TR_TABLE_SIZE];
8940 rb_encoding *enc = 0;
8941 VALUE del = 0, nodel = 0;
8942 unsigned char *s, *send, *t;
8943 int i, modify = 0;
8944 int ascompat, singlebyte = single_byte_optimizable(str);
8945 unsigned int save;
8946
8947 if (argc == 0) {
8948 enc = STR_ENC_GET(str);
8949 }
8950 else {
8951 for (i=0; i<argc; i++) {
8952 VALUE s = argv[i];
8953
8954 StringValue(s);
8955 enc = rb_enc_check(str, s);
8956 if (singlebyte && !single_byte_optimizable(s))
8957 singlebyte = 0;
8958 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8959 }
8960 }
8961
8962 str_modify_keep_cr(str);
8963 s = t = (unsigned char *)RSTRING_PTR(str);
8964 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8965 send = (unsigned char *)RSTRING_END(str);
8966 save = -1;
8967 ascompat = rb_enc_asciicompat(enc);
8968
8969 if (singlebyte) {
8970 while (s < send) {
8971 unsigned int c = *s++;
8972 if (c != save || (argc > 0 && !squeez[c])) {
8973 *t++ = save = c;
8974 }
8975 }
8976 }
8977 else {
8978 while (s < send) {
8979 unsigned int c;
8980 int clen;
8981
8982 if (ascompat && (c = *s) < 0x80) {
8983 if (c != save || (argc > 0 && !squeez[c])) {
8984 *t++ = save = c;
8985 }
8986 s++;
8987 }
8988 else {
8989 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8990
8991 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8992 if (t != s) rb_enc_mbcput(c, t, enc);
8993 save = c;
8994 t += clen;
8995 }
8996 s += clen;
8997 }
8998 }
8999 }
9000
9001 TERM_FILL((char *)t, TERM_LEN(str));
9002 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9003 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9004 modify = 1;
9005 }
9006
9007 if (modify) return str;
9008 return Qnil;
9009}
9010
9011
9012/*
9013 * call-seq:
9014 * squeeze(*selectors) -> new_string
9015 *
9016 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9017 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9018 *
9019 * "Squeezed" means that each multiple-character run of a selected character
9020 * is squeezed down to a single character;
9021 * with no arguments given, squeezes all characters:
9022 *
9023 * "yellow moon".squeeze #=> "yelow mon"
9024 * "  now   is  the".squeeze(" ") #=> " now is the"
9025 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9026 *
9027 */
9028
9029static VALUE
9030rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9031{
9032 str = str_duplicate(rb_cString, str);
9033 rb_str_squeeze_bang(argc, argv, str);
9034 return str;
9035}
9036
9037
9038/*
9039 * call-seq:
9040 * tr_s!(selector, replacements) -> self or nil
9041 *
9042 * Like String#tr_s, but modifies +self+ in place.
9043 * Returns +self+ if any changes were made, +nil+ otherwise.
9044 *
9045 * Related: String#squeeze!.
9046 */
9047
9048static VALUE
9049rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9050{
9051 return tr_trans(str, src, repl, 1);
9052}
9053
9054
9055/*
9056 * call-seq:
9057 * tr_s(selector, replacements) -> string
9058 *
9059 * Like String#tr, but also squeezes the modified portions of the translated string;
9060 * returns a new string (translated and squeezed).
9061 *
9062 * 'hello'.tr_s('l', 'r') #=> "hero"
9063 * 'hello'.tr_s('el', '-') #=> "h-o"
9064 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9065 *
9066 * Related: String#squeeze.
9067 *
9068 */
9069
9070static VALUE
9071rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9072{
9073 str = str_duplicate(rb_cString, str);
9074 tr_trans(str, src, repl, 1);
9075 return str;
9076}
9077
9078
9079/*
9080 * call-seq:
9081 * count(*selectors) -> integer
9082 *
9083 * :include: doc/string/count.rdoc
9084 */
9085
9086static VALUE
9087rb_str_count(int argc, VALUE *argv, VALUE str)
9088{
9089 char table[TR_TABLE_SIZE];
9090 rb_encoding *enc = 0;
9091 VALUE del = 0, nodel = 0, tstr;
9092 char *s, *send;
9093 int i;
9094 int ascompat;
9095 size_t n = 0;
9096
9098
9099 tstr = argv[0];
9100 StringValue(tstr);
9101 enc = rb_enc_check(str, tstr);
9102 if (argc == 1) {
9103 const char *ptstr;
9104 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9105 (ptstr = RSTRING_PTR(tstr),
9106 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9107 !is_broken_string(str)) {
9108 int clen;
9109 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9110
9111 s = RSTRING_PTR(str);
9112 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9113 send = RSTRING_END(str);
9114 while (s < send) {
9115 if (*(unsigned char*)s++ == c) n++;
9116 }
9117 return SIZET2NUM(n);
9118 }
9119 }
9120
9121 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9122 for (i=1; i<argc; i++) {
9123 tstr = argv[i];
9124 StringValue(tstr);
9125 enc = rb_enc_check(str, tstr);
9126 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9127 }
9128
9129 s = RSTRING_PTR(str);
9130 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9131 send = RSTRING_END(str);
9132 ascompat = rb_enc_asciicompat(enc);
9133 while (s < send) {
9134 unsigned int c;
9135
9136 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9137 if (table[c]) {
9138 n++;
9139 }
9140 s++;
9141 }
9142 else {
9143 int clen;
9144 c = rb_enc_codepoint_len(s, send, &clen, enc);
9145 if (tr_find(c, table, del, nodel)) {
9146 n++;
9147 }
9148 s += clen;
9149 }
9150 }
9151
9152 return SIZET2NUM(n);
9153}
9154
9155static VALUE
9156rb_fs_check(VALUE val)
9157{
9158 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9159 val = rb_check_string_type(val);
9160 if (NIL_P(val)) return 0;
9161 }
9162 return val;
9163}
9164
/*
 * Lookup table indexed by byte value: 1 for the six bytes treated as
 * whitespace here, 0 for everything else (entries not listed are
 * zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9185
9186static long
9187split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9188{
9189 if (empty_count >= 0 && len == 0) {
9190 return empty_count + 1;
9191 }
9192 if (empty_count > 0) {
9193 /* make different substrings */
9194 if (result) {
9195 do {
9196 rb_ary_push(result, str_new_empty_String(str));
9197 } while (--empty_count > 0);
9198 }
9199 else {
9200 do {
9201 rb_yield(str_new_empty_String(str));
9202 } while (--empty_count > 0);
9203 }
9204 }
9205 str = rb_str_subseq(str, beg, len);
9206 if (result) {
9207 rb_ary_push(result, str);
9208 }
9209 else {
9210 rb_yield(str);
9211 }
9212 return empty_count;
9213}
9214
/* Strategy chosen by rb_str_split_m() for a given separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9218
9219static split_type_t
9220literal_split_pattern(VALUE spat, split_type_t default_type)
9221{
9222 rb_encoding *enc = STR_ENC_GET(spat);
9223 const char *ptr;
9224 long len;
9225 RSTRING_GETMEM(spat, ptr, len);
9226 if (len == 0) {
9227 /* Special case - split into chars */
9228 return SPLIT_TYPE_CHARS;
9229 }
9230 else if (rb_enc_asciicompat(enc)) {
9231 if (len == 1 && ptr[0] == ' ') {
9232 return SPLIT_TYPE_AWK;
9233 }
9234 }
9235 else {
9236 int l;
9237 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9238 return SPLIT_TYPE_AWK;
9239 }
9240 }
9241 return default_type;
9242}
9243
9244/*
9245 * call-seq:
9246 * split(field_sep = $;, limit = 0) -> array
9247 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9248 *
9249 * :include: doc/string/split.rdoc
9250 *
9251 */
9252
9253static VALUE
9254rb_str_split_m(int argc, VALUE *argv, VALUE str)
9255{
9256 rb_encoding *enc;
9257 VALUE spat;
9258 VALUE limit;
9259 split_type_t split_type;
9260 long beg, end, i = 0, empty_count = -1;
9261 int lim = 0;
9262 VALUE result, tmp;
9263
9264 result = rb_block_given_p() ? Qfalse : Qnil;
9265 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9266 lim = NUM2INT(limit);
9267 if (lim <= 0) limit = Qnil;
9268 else if (lim == 1) {
9269 if (RSTRING_LEN(str) == 0)
9270 return result ? rb_ary_new2(0) : str;
9271 tmp = str_duplicate(rb_cString, str);
9272 if (!result) {
9273 rb_yield(tmp);
9274 return str;
9275 }
9276 return rb_ary_new3(1, tmp);
9277 }
9278 i = 1;
9279 }
9280 if (NIL_P(limit) && !lim) empty_count = 0;
9281
9282 enc = STR_ENC_GET(str);
9283 split_type = SPLIT_TYPE_REGEXP;
9284 if (!NIL_P(spat)) {
9285 spat = get_pat_quoted(spat, 0);
9286 }
9287 else if (NIL_P(spat = rb_fs)) {
9288 split_type = SPLIT_TYPE_AWK;
9289 }
9290 else if (!(spat = rb_fs_check(spat))) {
9291 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9292 }
9293 else {
9294 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9295 }
9296 if (split_type != SPLIT_TYPE_AWK) {
9297 switch (BUILTIN_TYPE(spat)) {
9298 case T_REGEXP:
9299 rb_reg_options(spat); /* check if uninitialized */
9300 tmp = RREGEXP_SRC(spat);
9301 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9302 if (split_type == SPLIT_TYPE_AWK) {
9303 spat = tmp;
9304 split_type = SPLIT_TYPE_STRING;
9305 }
9306 break;
9307
9308 case T_STRING:
9309 mustnot_broken(spat);
9310 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9311 break;
9312
9313 default:
9315 }
9316 }
9317
9318#define SPLIT_STR(beg, len) ( \
9319 empty_count = split_string(result, str, beg, len, empty_count), \
9320 str_mod_check(str, str_start, str_len))
9321
9322 beg = 0;
9323 char *ptr = RSTRING_PTR(str);
9324 char *const str_start = ptr;
9325 const long str_len = RSTRING_LEN(str);
9326 char *const eptr = str_start + str_len;
9327 if (split_type == SPLIT_TYPE_AWK) {
9328 char *bptr = ptr;
9329 int skip = 1;
9330 unsigned int c;
9331
9332 if (result) result = rb_ary_new();
9333 end = beg;
9334 if (is_ascii_string(str)) {
9335 while (ptr < eptr) {
9336 c = (unsigned char)*ptr++;
9337 if (skip) {
9338 if (ascii_isspace(c)) {
9339 beg = ptr - bptr;
9340 }
9341 else {
9342 end = ptr - bptr;
9343 skip = 0;
9344 if (!NIL_P(limit) && lim <= i) break;
9345 }
9346 }
9347 else if (ascii_isspace(c)) {
9348 SPLIT_STR(beg, end-beg);
9349 skip = 1;
9350 beg = ptr - bptr;
9351 if (!NIL_P(limit)) ++i;
9352 }
9353 else {
9354 end = ptr - bptr;
9355 }
9356 }
9357 }
9358 else {
9359 while (ptr < eptr) {
9360 int n;
9361
9362 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9363 ptr += n;
9364 if (skip) {
9365 if (rb_isspace(c)) {
9366 beg = ptr - bptr;
9367 }
9368 else {
9369 end = ptr - bptr;
9370 skip = 0;
9371 if (!NIL_P(limit) && lim <= i) break;
9372 }
9373 }
9374 else if (rb_isspace(c)) {
9375 SPLIT_STR(beg, end-beg);
9376 skip = 1;
9377 beg = ptr - bptr;
9378 if (!NIL_P(limit)) ++i;
9379 }
9380 else {
9381 end = ptr - bptr;
9382 }
9383 }
9384 }
9385 }
9386 else if (split_type == SPLIT_TYPE_STRING) {
9387 char *substr_start = ptr;
9388 char *sptr = RSTRING_PTR(spat);
9389 long slen = RSTRING_LEN(spat);
9390
9391 if (result) result = rb_ary_new();
9392 mustnot_broken(str);
9393 enc = rb_enc_check(str, spat);
9394 while (ptr < eptr &&
9395 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9396 /* Check we are at the start of a char */
9397 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9398 if (t != ptr + end) {
9399 ptr = t;
9400 continue;
9401 }
9402 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9403 str_mod_check(spat, sptr, slen);
9404 ptr += end + slen;
9405 substr_start = ptr;
9406 if (!NIL_P(limit) && lim <= ++i) break;
9407 }
9408 beg = ptr - str_start;
9409 }
9410 else if (split_type == SPLIT_TYPE_CHARS) {
9411 int n;
9412
9413 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9414 mustnot_broken(str);
9415 enc = rb_enc_get(str);
9416 while (ptr < eptr &&
9417 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9418 SPLIT_STR(ptr - str_start, n);
9419 ptr += n;
9420 if (!NIL_P(limit) && lim <= ++i) break;
9421 }
9422 beg = ptr - str_start;
9423 }
9424 else {
9425 if (result) result = rb_ary_new();
9426 long len = RSTRING_LEN(str);
9427 long start = beg;
9428 long idx;
9429 int last_null = 0;
9430 struct re_registers *regs;
9431 VALUE match = 0;
9432
9433 for (; rb_reg_search(spat, str, start, 0) >= 0;
9434 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9435 match = rb_backref_get();
9436 if (!result) rb_match_busy(match);
9437 regs = RMATCH_REGS(match);
9438 end = BEG(0);
9439 if (start == end && BEG(0) == END(0)) {
9440 if (!ptr) {
9441 SPLIT_STR(0, 0);
9442 break;
9443 }
9444 else if (last_null == 1) {
9445 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9446 beg = start;
9447 }
9448 else {
9449 if (start == len)
9450 start++;
9451 else
9452 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9453 last_null = 1;
9454 continue;
9455 }
9456 }
9457 else {
9458 SPLIT_STR(beg, end-beg);
9459 beg = start = END(0);
9460 }
9461 last_null = 0;
9462
9463 for (idx=1; idx < regs->num_regs; idx++) {
9464 if (BEG(idx) == -1) continue;
9465 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9466 }
9467 if (!NIL_P(limit) && lim <= ++i) break;
9468 }
9469 if (match) rb_match_unbusy(match);
9470 }
9471 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9472 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9473 }
9474
9475 return result ? result : str;
9476}
9477
9478VALUE
9479rb_str_split(VALUE str, const char *sep0)
9480{
9481 VALUE sep;
9482
9483 StringValue(str);
9484 sep = rb_str_new_cstr(sep0);
9485 return rb_str_split_m(1, &sep, str);
9486}
9487
9488#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9489
9490static inline int
9491enumerator_element(VALUE ary, VALUE e)
9492{
9493 if (ary) {
9494 rb_ary_push(ary, e);
9495 return 0;
9496 }
9497 else {
9498 rb_yield(e);
9499 return 1;
9500 }
9501}
9502
9503#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9504
9505static const char *
9506chomp_newline(const char *p, const char *e, rb_encoding *enc)
9507{
9508 const char *prev = rb_enc_prev_char(p, e, e, enc);
9509 if (rb_enc_is_newline(prev, e, enc)) {
9510 e = prev;
9511 prev = rb_enc_prev_char(p, e, e, enc);
9512 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9513 e = prev;
9514 }
9515 return e;
9516}
9517
9518static VALUE
9519get_rs(void)
9520{
9521 VALUE rs = rb_rs;
9522 if (!NIL_P(rs) &&
9523 (!RB_TYPE_P(rs, T_STRING) ||
9524 RSTRING_LEN(rs) != 1 ||
9525 RSTRING_PTR(rs)[0] != '\n')) {
9526 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9527 }
9528 return rs;
9529}
9530
9531#define rb_rs get_rs()
9532
/*
 * Worker shared by rb_str_each_line() and rb_str_lines().
 *
 * Splits +str+ into lines on the record separator (first positional
 * argument, falling back to rb_rs) and delivers each line through
 * ENUM_ELEM: pushed onto +ary+ when it is non-zero, yielded otherwise.
 * The +chomp+ keyword removes the separator from each line.
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
9533static VALUE
9534rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9535{
9536 rb_encoding *enc;
9537 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9538 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9539 long pos, len, rslen;
 /* set when rs is exactly one minimal-length newline character */
9540 int rsnewline = 0;
9541
9542 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9543 rs = rb_rs;
9544 if (!NIL_P(opts)) {
9545 static ID keywords[1];
9546 if (!keywords[0]) {
9547 keywords[0] = rb_intern_const("chomp");
9548 }
9549 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
 /* collapse the keyword value into a plain C truth value */
9550 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9551 }
9552
 /* nil separator: the whole string is delivered as a single element */
9553 if (NIL_P(rs)) {
9554 if (!ENUM_ELEM(ary, str)) {
9555 return ary;
9556 }
9557 else {
9558 return orig;
9559 }
9560 }
9561
 /* empty receiver: nothing to deliver */
9562 if (!RSTRING_LEN(str)) goto end;
 /* iterate over a frozen version of the string; ptr/len are remembered
  * for the str_mod_check() calls below */
9563 str = rb_str_new_frozen(str);
9564 ptr = subptr = RSTRING_PTR(str);
9565 pend = RSTRING_END(str);
9566 len = RSTRING_LEN(str);
9567 StringValue(rs);
9568 rslen = RSTRING_LEN(rs);
9569
9570 if (rs == rb_default_rs)
9571 enc = rb_enc_get(str);
9572 else
9573 enc = rb_enc_check(str, rs);
9574
9575 if (rslen == 0) {
9576 /* paragraph mode */
9577 int n;
 /* eol: end of the newline that terminated the current paragraph */
9578 const char *eol = NULL;
9579 subend = subptr;
9580 while (subend < pend) {
9581 long chomp_rslen = 0;
 /* Scan one paragraph.  NOTE(review): rslen serves both as the length
  * of the character being looked at and, once the loop is left, as a
  * flag: it is non-zero only when the loop exited through the break. */
9582 do {
 /* n is the length of a '\r' directly before the character, else 0 */
9583 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9584 n = 0;
9585 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9586 if (rb_enc_is_newline(subend + n, pend, enc)) {
 /* a second newline right after the first ends the paragraph */
9587 if (eol == subend) break;
9588 subend += rslen;
9589 if (subptr) {
9590 eol = subend;
9591 chomp_rslen = -rslen;
9592 }
9593 }
9594 else {
 /* first non-newline character after a gap starts a paragraph */
9595 if (!subptr) subptr = subend;
9596 subend += rslen;
9597 }
9598 rslen = 0;
9599 } while (subend < pend);
9600 if (!subptr) break;
9601 if (rslen == 0) chomp_rslen = 0;
9602 line = rb_str_subseq(str, subptr - ptr,
9603 subend - subptr + (chomp ? chomp_rslen : rslen));
 /* only after a yield: presumably detects a modified string (TODO confirm) */
9604 if (ENUM_ELEM(ary, line)) {
9605 str_mod_check(str, ptr, len);
9606 }
9607 subptr = eol = NULL;
9608 }
9609 goto end;
9610 }
9611 else {
9612 rsptr = RSTRING_PTR(rs);
9613 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9614 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9615 rsnewline = 1;
9616 }
9617 }
9618
 /* the default separator is transcoded when the string's encoding is
  * not ASCII-compatible, so that a byte search can find it */
9619 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9620 rs = rb_str_new(rsptr, rslen);
9621 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9622 rsptr = RSTRING_PTR(rs);
9623 rslen = RSTRING_LEN(rs);
9624 }
9625
 /* main loop: one line per separator occurrence */
9626 while (subptr < pend) {
9627 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9628 if (pos < 0) break;
9629 hit = subptr + pos;
 /* ignore a byte match that does not start on a character boundary */
9630 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9631 if (hit != adjusted) {
9632 subptr = adjusted;
9633 continue;
9634 }
 /* hit now points just past the separator */
9635 subend = hit += rslen;
9636 if (chomp) {
9637 if (rsnewline) {
9638 subend = chomp_newline(subptr, subend, enc);
9639 }
9640 else {
9641 subend -= rslen;
9642 }
9643 }
9644 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9645 if (ENUM_ELEM(ary, line)) {
9646 str_mod_check(str, ptr, len);
9647 }
9648 subptr = hit;
9649 }
9650
 /* trailing text after the last separator found */
9651 if (subptr != pend) {
9652 if (chomp) {
9653 if (rsnewline) {
9654 pend = chomp_newline(subptr, pend, enc);
9655 }
9656 else if (pend - subptr >= rslen &&
9657 memcmp(pend - rslen, rsptr, rslen) == 0) {
9658 pend -= rslen;
9659 }
9660 }
9661 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9662 ENUM_ELEM(ary, line);
9663 RB_GC_GUARD(str);
9664 }
9665
9666 end:
9667 if (ary)
9668 return ary;
9669 else
9670 return orig;
9671}
9672
9673/*
9674 * call-seq:
9675 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9676 * each_line(record_separator = $/, chomp: false) -> enumerator
9677 *
9678 * :include: doc/string/each_line.rdoc
9679 *
9680 */
9681
9682static VALUE
9683rb_str_each_line(int argc, VALUE *argv, VALUE str)
9684{
9685 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9686 return rb_str_enumerate_lines(argc, argv, str, 0);
9687}
9688
9689/*
9690 * call-seq:
9691 * lines(record_separator = $/, chomp: false) -> array_of_strings
9692 *
9693 * Returns substrings ("lines") of +self+
9694 * according to the given arguments:
9695 *
9696 * s = <<~EOT
9697 * This is the first line.
9698 * This is line two.
9699 *
9700 * This is line four.
9701 * This is line five.
9702 * EOT
9703 *
9704 * With the default argument values:
9705 *
9706 * $/ # => "\n"
9707 * s.lines
9708 * # =>
9709 * ["This is the first line.\n",
9710 * "This is line two.\n",
9711 * "\n",
9712 * "This is line four.\n",
9713 * "This is line five.\n"]
9714 *
9715 * With a different +record_separator+:
9716 *
9717 * record_separator = ' is '
9718 * s.lines(record_separator)
9719 * # =>
9720 * ["This is ",
9721 * "the first line.\nThis is ",
9722 * "line two.\n\nThis is ",
9723 * "line four.\nThis is ",
9724 * "line five.\n"]
9725 *
9726 * With keyword argument +chomp+ as +true+,
9727 * removes the trailing newline from each line:
9728 *
9729 * s.lines(chomp: true)
9730 * # =>
9731 * ["This is the first line.",
9732 * "This is line two.",
9733 * "",
9734 * "This is line four.",
9735 * "This is line five."]
9736 *
9737 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9738 */
9739
9740static VALUE
9741rb_str_lines(int argc, VALUE *argv, VALUE str)
9742{
9743 VALUE ary = WANTARRAY("lines", 0);
9744 return rb_str_enumerate_lines(argc, argv, str, ary);
9745}
9746
9747static VALUE
9748rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9749{
9750 return LONG2FIX(RSTRING_LEN(str));
9751}
9752
9753static VALUE
9754rb_str_enumerate_bytes(VALUE str, VALUE ary)
9755{
9756 long i;
9757
9758 for (i=0; i<RSTRING_LEN(str); i++) {
9759 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9760 }
9761 if (ary)
9762 return ary;
9763 else
9764 return str;
9765}
9766
9767/*
9768 * call-seq:
9769 * each_byte {|byte| ... } -> self
9770 * each_byte -> enumerator
9771 *
9772 * :include: doc/string/each_byte.rdoc
9773 *
9774 */
9775
9776static VALUE
9777rb_str_each_byte(VALUE str)
9778{
9779 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9780 return rb_str_enumerate_bytes(str, 0);
9781}
9782
9783/*
9784 * call-seq:
9785 * bytes -> array_of_bytes
9786 *
9787 * :include: doc/string/bytes.rdoc
9788 *
9789 */
9790
9791static VALUE
9792rb_str_bytes(VALUE str)
9793{
9794 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9795 return rb_str_enumerate_bytes(str, ary);
9796}
9797
9798static VALUE
9799rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9800{
9801 return rb_str_length(str);
9802}
9803
9804static VALUE
9805rb_str_enumerate_chars(VALUE str, VALUE ary)
9806{
9807 VALUE orig = str;
9808 long i, len, n;
9809 const char *ptr;
9810 rb_encoding *enc;
9811
9812 str = rb_str_new_frozen(str);
9813 ptr = RSTRING_PTR(str);
9814 len = RSTRING_LEN(str);
9815 enc = rb_enc_get(str);
9816
9818 for (i = 0; i < len; i += n) {
9819 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9820 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9821 }
9822 }
9823 else {
9824 for (i = 0; i < len; i += n) {
9825 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9826 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9827 }
9828 }
9829 RB_GC_GUARD(str);
9830 if (ary)
9831 return ary;
9832 else
9833 return orig;
9834}
9835
9836/*
9837 * call-seq:
9838 * each_char {|char| ... } -> self
9839 * each_char -> enumerator
9840 *
9841 * :include: doc/string/each_char.rdoc
9842 *
9843 */
9844
9845static VALUE
9846rb_str_each_char(VALUE str)
9847{
9848 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9849 return rb_str_enumerate_chars(str, 0);
9850}
9851
9852/*
9853 * call-seq:
9854 * chars -> array_of_characters
9855 *
9856 * :include: doc/string/chars.rdoc
9857 *
9858 */
9859
9860static VALUE
9861rb_str_chars(VALUE str)
9862{
9863 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9864 return rb_str_enumerate_chars(str, ary);
9865}
9866
9867static VALUE
9868rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9869{
9870 VALUE orig = str;
9871 int n;
9872 unsigned int c;
9873 const char *ptr, *end;
9874 rb_encoding *enc;
9875
9876 if (single_byte_optimizable(str))
9877 return rb_str_enumerate_bytes(str, ary);
9878
9879 str = rb_str_new_frozen(str);
9880 ptr = RSTRING_PTR(str);
9881 end = RSTRING_END(str);
9882 enc = STR_ENC_GET(str);
9883
9884 while (ptr < end) {
9885 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9886 ENUM_ELEM(ary, UINT2NUM(c));
9887 ptr += n;
9888 }
9889 RB_GC_GUARD(str);
9890 if (ary)
9891 return ary;
9892 else
9893 return orig;
9894}
9895
9896/*
9897 * call-seq:
9898 * each_codepoint {|codepoint| ... } -> self
9899 * each_codepoint -> enumerator
9900 *
9901 * :include: doc/string/each_codepoint.rdoc
9902 *
9903 */
9904
9905static VALUE
9906rb_str_each_codepoint(VALUE str)
9907{
9908 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9909 return rb_str_enumerate_codepoints(str, 0);
9910}
9911
9912/*
9913 * call-seq:
9914 * codepoints -> array_of_integers
9915 *
9916 * :include: doc/string/codepoints.rdoc
9917 *
9918 */
9919
9920static VALUE
9921rb_str_codepoints(VALUE str)
9922{
9923 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9924 return rb_str_enumerate_codepoints(str, ary);
9925}
9926
9927static regex_t *
9928get_reg_grapheme_cluster(rb_encoding *enc)
9929{
9930 int encidx = rb_enc_to_index(enc);
9931
9932 const OnigUChar source_ascii[] = "\\X";
9933 const OnigUChar *source = source_ascii;
9934 size_t source_len = sizeof(source_ascii) - 1;
9935
9936 switch (encidx) {
9937#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9938#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9939#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9940#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9941#define CASE_UTF(e) \
9942 case ENCINDEX_UTF_##e: { \
9943 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9944 source = source_UTF_##e; \
9945 source_len = sizeof(source_UTF_##e); \
9946 break; \
9947 }
9948 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9949#undef CASE_UTF
9950#undef CHARS_16BE
9951#undef CHARS_16LE
9952#undef CHARS_32BE
9953#undef CHARS_32LE
9954 }
9955
9956 regex_t *reg_grapheme_cluster;
9957 OnigErrorInfo einfo;
9958 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9959 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9960 if (r) {
9961 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9962 onig_error_code_to_str(message, r, &einfo);
9963 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9964 }
9965
9966 return reg_grapheme_cluster;
9967}
9968
9969static regex_t *
9970get_cached_reg_grapheme_cluster(rb_encoding *enc)
9971{
9972 int encidx = rb_enc_to_index(enc);
9973 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9974
9975 if (encidx == rb_utf8_encindex()) {
9976 if (!reg_grapheme_cluster_utf8) {
9977 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9978 }
9979
9980 return reg_grapheme_cluster_utf8;
9981 }
9982
9983 return NULL;
9984}
9985
9986static VALUE
9987rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9988{
9989 size_t grapheme_cluster_count = 0;
9990 rb_encoding *enc = get_encoding(str);
9991 const char *ptr, *end;
9992
9993 if (!rb_enc_unicode_p(enc)) {
9994 return rb_str_length(str);
9995 }
9996
9997 bool cached_reg_grapheme_cluster = true;
9998 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9999 if (!reg_grapheme_cluster) {
10000 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10001 cached_reg_grapheme_cluster = false;
10002 }
10003
10004 ptr = RSTRING_PTR(str);
10005 end = RSTRING_END(str);
10006
10007 while (ptr < end) {
10008 OnigPosition len = onig_match(reg_grapheme_cluster,
10009 (const OnigUChar *)ptr, (const OnigUChar *)end,
10010 (const OnigUChar *)ptr, NULL, 0);
10011 if (len <= 0) break;
10012 grapheme_cluster_count++;
10013 ptr += len;
10014 }
10015
10016 if (!cached_reg_grapheme_cluster) {
10017 onig_free(reg_grapheme_cluster);
10018 }
10019
10020 return SIZET2NUM(grapheme_cluster_count);
10021}
10022
10023static VALUE
10024rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10025{
10026 VALUE orig = str;
10027 rb_encoding *enc = get_encoding(str);
10028 const char *ptr0, *ptr, *end;
10029
10030 if (!rb_enc_unicode_p(enc)) {
10031 return rb_str_enumerate_chars(str, ary);
10032 }
10033
10034 if (!ary) str = rb_str_new_frozen(str);
10035
10036 bool cached_reg_grapheme_cluster = true;
10037 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10038 if (!reg_grapheme_cluster) {
10039 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10040 cached_reg_grapheme_cluster = false;
10041 }
10042
10043 ptr0 = ptr = RSTRING_PTR(str);
10044 end = RSTRING_END(str);
10045
10046 while (ptr < end) {
10047 OnigPosition len = onig_match(reg_grapheme_cluster,
10048 (const OnigUChar *)ptr, (const OnigUChar *)end,
10049 (const OnigUChar *)ptr, NULL, 0);
10050 if (len <= 0) break;
10051 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10052 ptr += len;
10053 }
10054
10055 if (!cached_reg_grapheme_cluster) {
10056 onig_free(reg_grapheme_cluster);
10057 }
10058
10059 RB_GC_GUARD(str);
10060 if (ary)
10061 return ary;
10062 else
10063 return orig;
10064}
10065
10066/*
10067 * call-seq:
10068 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
10069 * each_grapheme_cluster -> enumerator
10070 *
10071 * :include: doc/string/each_grapheme_cluster.rdoc
10072 *
10073 */
10074
10075static VALUE
10076rb_str_each_grapheme_cluster(VALUE str)
10077{
10078 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10079 return rb_str_enumerate_grapheme_clusters(str, 0);
10080}
10081
10082/*
10083 * call-seq:
10084 * grapheme_clusters -> array_of_grapheme_clusters
10085 *
10086 * :include: doc/string/grapheme_clusters.rdoc
10087 *
10088 */
10089
10090static VALUE
10091rb_str_grapheme_clusters(VALUE str)
10092{
10093 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10094 return rb_str_enumerate_grapheme_clusters(str, ary);
10095}
10096
10097static long
10098chopped_length(VALUE str)
10099{
10100 rb_encoding *enc = STR_ENC_GET(str);
10101 const char *p, *p2, *beg, *end;
10102
10103 beg = RSTRING_PTR(str);
10104 end = beg + RSTRING_LEN(str);
10105 if (beg >= end) return 0;
10106 p = rb_enc_prev_char(beg, end, end, enc);
10107 if (!p) return 0;
10108 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10109 p2 = rb_enc_prev_char(beg, p, end, enc);
10110 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10111 }
10112 return p - beg;
10113}
10114
10115/*
10116 * call-seq:
10117 * chop! -> self or nil
10118 *
10119 * Like String#chop, except that:
10120 *
10121 * - Removes trailing characters from +self+ (not from a copy of +self+).
10122 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10123 *
10124 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10125 */
10126
10127static VALUE
10128rb_str_chop_bang(VALUE str)
10129{
10130 str_modify_keep_cr(str);
10131 if (RSTRING_LEN(str) > 0) {
10132 long len;
10133 len = chopped_length(str);
10134 STR_SET_LEN(str, len);
10135 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10136 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10138 }
10139 return str;
10140 }
10141 return Qnil;
10142}
10143
10144
10145/*
10146 * call-seq:
10147 * chop -> new_string
10148 *
10149 * :include: doc/string/chop.rdoc
10150 *
10151 */
10152
10153static VALUE
10154rb_str_chop(VALUE str)
10155{
10156 return rb_str_subseq(str, 0, chopped_length(str));
10157}
10158
10159static long
10160smart_chomp(VALUE str, const char *e, const char *p)
10161{
10162 rb_encoding *enc = rb_enc_get(str);
10163 if (rb_enc_mbminlen(enc) > 1) {
10164 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10165 if (rb_enc_is_newline(pp, e, enc)) {
10166 e = pp;
10167 }
10168 pp = e - rb_enc_mbminlen(enc);
10169 if (pp >= p) {
10170 pp = rb_enc_left_char_head(p, pp, e, enc);
10171 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10172 e = pp;
10173 }
10174 }
10175 }
10176 else {
10177 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10178 case '\n':
10179 if (--e > p && *(e-1) == '\r') {
10180 --e;
10181 }
10182 break;
10183 case '\r':
10184 --e;
10185 break;
10186 }
10187 }
10188 return e - p;
10189}
10190
10191static long
10192chompped_length(VALUE str, VALUE rs)
10193{
10194 rb_encoding *enc;
10195 int newline;
10196 char *pp, *e, *rsptr;
10197 long rslen;
10198 char *const p = RSTRING_PTR(str);
10199 long len = RSTRING_LEN(str);
10200
10201 if (len == 0) return 0;
10202 e = p + len;
10203 if (rs == rb_default_rs) {
10204 return smart_chomp(str, e, p);
10205 }
10206
10207 enc = rb_enc_get(str);
10208 RSTRING_GETMEM(rs, rsptr, rslen);
10209 if (rslen == 0) {
10210 if (rb_enc_mbminlen(enc) > 1) {
10211 while (e > p) {
10212 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10213 if (!rb_enc_is_newline(pp, e, enc)) break;
10214 e = pp;
10215 pp -= rb_enc_mbminlen(enc);
10216 if (pp >= p) {
10217 pp = rb_enc_left_char_head(p, pp, e, enc);
10218 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10219 e = pp;
10220 }
10221 }
10222 }
10223 }
10224 else {
10225 while (e > p && *(e-1) == '\n') {
10226 --e;
10227 if (e > p && *(e-1) == '\r')
10228 --e;
10229 }
10230 }
10231 return e - p;
10232 }
10233 if (rslen > len) return len;
10234
10235 enc = rb_enc_get(rs);
10236 newline = rsptr[rslen-1];
10237 if (rslen == rb_enc_mbminlen(enc)) {
10238 if (rslen == 1) {
10239 if (newline == '\n')
10240 return smart_chomp(str, e, p);
10241 }
10242 else {
10243 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10244 return smart_chomp(str, e, p);
10245 }
10246 }
10247
10248 enc = rb_enc_check(str, rs);
10249 if (is_broken_string(rs)) {
10250 return len;
10251 }
10252 pp = e - rslen;
10253 if (p[len-1] == newline &&
10254 (rslen <= 1 ||
10255 memcmp(rsptr, pp, rslen) == 0)) {
10256 if (at_char_boundary(p, pp, e, enc))
10257 return len - rslen;
10258 RB_GC_GUARD(rs);
10259 }
10260 return len;
10261}
10262
10268static VALUE
10269chomp_rs(int argc, const VALUE *argv)
10270{
10271 rb_check_arity(argc, 0, 1);
10272 if (argc > 0) {
10273 VALUE rs = argv[0];
10274 if (!NIL_P(rs)) StringValue(rs);
10275 return rs;
10276 }
10277 else {
10278 return rb_rs;
10279 }
10280}
10281
10282VALUE
10283rb_str_chomp_string(VALUE str, VALUE rs)
10284{
10285 long olen = RSTRING_LEN(str);
10286 long len = chompped_length(str, rs);
10287 if (len >= olen) return Qnil;
10288 str_modify_keep_cr(str);
10289 STR_SET_LEN(str, len);
10290 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10291 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10293 }
10294 return str;
10295}
10296
10297/*
10298 * call-seq:
10299 * chomp!(line_sep = $/) -> self or nil
10300 *
10301 * Like String#chomp, except that:
10302 *
10303 * - Removes trailing characters from +self+ (not from a copy of +self+).
10304 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10305 *
10306 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10307 */
10308
10309static VALUE
10310rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10311{
10312 VALUE rs;
10313 str_modifiable(str);
10314 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10315 rs = chomp_rs(argc, argv);
10316 if (NIL_P(rs)) return Qnil;
10317 return rb_str_chomp_string(str, rs);
10318}
10319
10320
10321/*
10322 * call-seq:
10323 * chomp(line_sep = $/) -> new_string
10324 *
10325 * :include: doc/string/chomp.rdoc
10326 *
10327 */
10328
10329static VALUE
10330rb_str_chomp(int argc, VALUE *argv, VALUE str)
10331{
10332 VALUE rs = chomp_rs(argc, argv);
10333 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10334 return rb_str_subseq(str, 0, chompped_length(str, rs));
10335}
10336
10337static long
10338lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10339{
10340 const char *const start = s;
10341
10342 if (!s || s >= e) return 0;
10343
10344 /* remove spaces at head */
10345 if (single_byte_optimizable(str)) {
10346 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10347 }
10348 else {
10349 while (s < e) {
10350 int n;
10351 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10352
10353 if (cc && !rb_isspace(cc)) break;
10354 s += n;
10355 }
10356 }
10357 return s - start;
10358}
10359
10360/*
10361 * call-seq:
10362 * lstrip! -> self or nil
10363 *
10364 * Like String#lstrip, except that any modifications are made in +self+;
10365 * returns +self+ if any modifications are made, +nil+ otherwise.
10366 *
10367 * Related: String#rstrip!, String#strip!.
10368 */
10369
10370static VALUE
10371rb_str_lstrip_bang(VALUE str)
10372{
10373 rb_encoding *enc;
10374 char *start, *s;
10375 long olen, loffset;
10376
10377 str_modify_keep_cr(str);
10378 enc = STR_ENC_GET(str);
10379 RSTRING_GETMEM(str, start, olen);
10380 loffset = lstrip_offset(str, start, start+olen, enc);
10381 if (loffset > 0) {
10382 long len = olen-loffset;
10383 s = start + loffset;
10384 memmove(start, s, len);
10385 STR_SET_LEN(str, len);
10386 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10387 return str;
10388 }
10389 return Qnil;
10390}
10391
10392
10393/*
10394 * call-seq:
10395 * lstrip -> new_string
10396 *
10397 * Returns a copy of +self+ with leading whitespace removed;
10398 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10399 *
10400 * whitespace = "\x00\t\n\v\f\r "
10401 * s = whitespace + 'abc' + whitespace
10402 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10403 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10404 *
10405 * Related: String#rstrip, String#strip.
10406 */
10407
10408static VALUE
10409rb_str_lstrip(VALUE str)
10410{
10411 char *start;
10412 long len, loffset;
10413 RSTRING_GETMEM(str, start, len);
10414 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10415 if (loffset <= 0) return str_duplicate(rb_cString, str);
10416 return rb_str_subseq(str, loffset, len - loffset);
10417}
10418
10419static long
10420rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10421{
10422 const char *t;
10423
10424 rb_str_check_dummy_enc(enc);
10426 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10427 }
10428 if (!s || s >= e) return 0;
10429 t = e;
10430
10431 /* remove trailing spaces or '\0's */
10432 if (single_byte_optimizable(str)) {
10433 unsigned char c;
10434 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10435 }
10436 else {
10437 char *tp;
10438
10439 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10440 unsigned int c = rb_enc_codepoint(tp, e, enc);
10441 if (c && !rb_isspace(c)) break;
10442 t = tp;
10443 }
10444 }
10445 return e - t;
10446}
10447
10448/*
10449 * call-seq:
10450 * rstrip! -> self or nil
10451 *
10452 * Like String#rstrip, except that any modifications are made in +self+;
10453 * returns +self+ if any modifications are made, +nil+ otherwise.
10454 *
10455 * Related: String#lstrip!, String#strip!.
10456 */
10457
10458static VALUE
10459rb_str_rstrip_bang(VALUE str)
10460{
10461 rb_encoding *enc;
10462 char *start;
10463 long olen, roffset;
10464
10465 str_modify_keep_cr(str);
10466 enc = STR_ENC_GET(str);
10467 RSTRING_GETMEM(str, start, olen);
10468 roffset = rstrip_offset(str, start, start+olen, enc);
10469 if (roffset > 0) {
10470 long len = olen - roffset;
10471
10472 STR_SET_LEN(str, len);
10473 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10474 return str;
10475 }
10476 return Qnil;
10477}
10478
10479
10480/*
10481 * call-seq:
10482 * rstrip -> new_string
10483 *
10484 * Returns a copy of the receiver with trailing whitespace removed;
10485 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10486 *
10487 * whitespace = "\x00\t\n\v\f\r "
10488 * s = whitespace + 'abc' + whitespace
10489 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10490 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10491 *
10492 * Related: String#lstrip, String#strip.
10493 */
10494
10495static VALUE
10496rb_str_rstrip(VALUE str)
10497{
10498 rb_encoding *enc;
10499 char *start;
10500 long olen, roffset;
10501
10502 enc = STR_ENC_GET(str);
10503 RSTRING_GETMEM(str, start, olen);
10504 roffset = rstrip_offset(str, start, start+olen, enc);
10505
10506 if (roffset <= 0) return str_duplicate(rb_cString, str);
10507 return rb_str_subseq(str, 0, olen-roffset);
10508}
10509
10510
10511/*
10512 * call-seq:
10513 * strip! -> self or nil
10514 *
10515 * Like String#strip, except that any modifications are made in +self+;
10516 * returns +self+ if any modifications are made, +nil+ otherwise.
10517 *
10518 * Related: String#lstrip!, String#rstrip!.
10519 */
10520
10521static VALUE
10522rb_str_strip_bang(VALUE str)
10523{
10524 char *start;
10525 long olen, loffset, roffset;
10526 rb_encoding *enc;
10527
10528 str_modify_keep_cr(str);
10529 enc = STR_ENC_GET(str);
10530 RSTRING_GETMEM(str, start, olen);
10531 loffset = lstrip_offset(str, start, start+olen, enc);
10532 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10533
10534 if (loffset > 0 || roffset > 0) {
10535 long len = olen-roffset;
10536 if (loffset > 0) {
10537 len -= loffset;
10538 memmove(start, start + loffset, len);
10539 }
10540 STR_SET_LEN(str, len);
10541 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10542 return str;
10543 }
10544 return Qnil;
10545}
10546
10547
10548/*
10549 * call-seq:
10550 * strip -> new_string
10551 *
10552 * Returns a copy of the receiver with leading and trailing whitespace removed;
10553 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10554 *
10555 * whitespace = "\x00\t\n\v\f\r "
10556 * s = whitespace + 'abc' + whitespace
10557 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10558 * s.strip # => "abc"
10559 *
10560 * Related: String#lstrip, String#rstrip.
10561 */
10562
10563static VALUE
10564rb_str_strip(VALUE str)
10565{
10566 char *start;
10567 long olen, loffset, roffset;
10568 rb_encoding *enc = STR_ENC_GET(str);
10569
10570 RSTRING_GETMEM(str, start, olen);
10571 loffset = lstrip_offset(str, start, start+olen, enc);
10572 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10573
10574 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10575 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10576}
10577
10578static VALUE
10579scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10580{
10581 VALUE result = Qnil;
10582 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10583 if (pos >= 0) {
10584 VALUE match;
10585 struct re_registers *regs;
10586 if (BUILTIN_TYPE(pat) == T_STRING) {
10587 regs = NULL;
10588 end = pos + RSTRING_LEN(pat);
10589 }
10590 else {
10591 match = rb_backref_get();
10592 regs = RMATCH_REGS(match);
10593 pos = BEG(0);
10594 end = END(0);
10595 }
10596
10597 if (pos == end) {
10598 rb_encoding *enc = STR_ENC_GET(str);
10599 /*
10600 * Always consume at least one character of the input string
10601 */
10602 if (RSTRING_LEN(str) > end)
10603 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10604 RSTRING_END(str), enc);
10605 else
10606 *start = end + 1;
10607 }
10608 else {
10609 *start = end;
10610 }
10611
10612 if (!regs || regs->num_regs == 1) {
10613 result = rb_str_subseq(str, pos, end - pos);
10614 return result;
10615 }
10616 else {
10617 result = rb_ary_new2(regs->num_regs);
10618 for (int i = 1; i < regs->num_regs; i++) {
10619 VALUE s = Qnil;
10620 if (BEG(i) >= 0) {
10621 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10622 }
10623
10624 rb_ary_push(result, s);
10625 }
10626 }
10627
10628 RB_GC_GUARD(match);
10629 }
10630
10631 return result;
10632}
10633
10634
10635/*
10636 * call-seq:
10637 * scan(string_or_regexp) -> array
10638 * scan(string_or_regexp) {|matches| ... } -> self
10639 *
10640 * Matches a pattern against +self+; the pattern is:
10641 *
10642 * - +string_or_regexp+ itself, if it is a Regexp.
10643 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10644 *
10645 * Iterates through +self+, generating a collection of matching results:
10646 *
10647 * - If the pattern contains no groups, each result is the
10648 * matched string, <code>$&</code>.
10649 * - If the pattern contains groups, each result is an array
10650 * containing one entry per group.
10651 *
10652 * With no block given, returns an array of the results:
10653 *
10654 * s = 'cruel world'
10655 * s.scan(/\w+/) # => ["cruel", "world"]
10656 * s.scan(/.../) # => ["cru", "el ", "wor"]
10657 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10658 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10659 *
10660 * With a block given, calls the block with each result; returns +self+:
10661 *
10662 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10663 * print "\n"
10664 * s.scan(/(.)(.)/) {|x,y| print y, x }
10665 * print "\n"
10666 *
10667 * Output:
10668 *
10669 * <<cruel>> <<world>>
10670 * rceu lowlr
10671 *
10672 */
10673
10674static VALUE
10675rb_str_scan(VALUE str, VALUE pat)
10676{
10677 VALUE result;
10678 long start = 0;
10679 long last = -1, prev = 0;
10680 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10681
10682 pat = get_pat_quoted(pat, 1);
10683 mustnot_broken(str);
10684 if (!rb_block_given_p()) {
10685 VALUE ary = rb_ary_new();
10686
10687 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10688 last = prev;
10689 prev = start;
10690 rb_ary_push(ary, result);
10691 }
10692 if (last >= 0) rb_pat_search(pat, str, last, 1);
10693 else rb_backref_set(Qnil);
10694 return ary;
10695 }
10696
10697 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10698 last = prev;
10699 prev = start;
10700 rb_yield(result);
10701 str_mod_check(str, p, len);
10702 }
10703 if (last >= 0) rb_pat_search(pat, str, last, 1);
10704 return str;
10705}
10706
10707
10708/*
10709 * call-seq:
10710 * hex -> integer
10711 *
10712 * Interprets the leading substring of +self+ as hexadecimal;
10713 * returns its integer value:
10714 *
10715 * '0xFFFF'.hex # => 65535
10716 * 'FFzzzFF'.hex # => 255 # Hex ends at first non-hex character, 'z'.
10717 * 'ffzzzFF'.hex # => 255 # Case does not matter.
10718 * '-FFzzzFF'.hex # => -255 # May have leading '-'.
10719 * '0xFFzzzFF'.hex # => 255 # May have leading '0x'.
10720 * '-0xFFzzzFF'.hex # => -255 # May have leading '-0x'.
10721 *
10722 * Returns zero if there is no such leading substring:
10723 *
10724 * 'zzz'.hex # => 0
10725 *
10726 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10727 */
10728
10729static VALUE
10730rb_str_hex(VALUE str)
10731{
10732 return rb_str_to_inum(str, 16, FALSE);
10733}
10734
10735
10736/*
10737 * call-seq:
10738 * oct -> integer
10739 *
10740 * Interprets the leading substring of +self+ as a string of octal digits
10741 * (with an optional sign) and returns the corresponding number;
10742 * returns zero if there is no such leading substring:
10743 *
10744 * '123'.oct # => 83
10745 * '-377'.oct # => -255
10746 * '0377non-numeric'.oct # => 255
10747 * 'non-numeric'.oct # => 0
10748 *
10749 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10750 * see Kernel#Integer.
10751 *
10752 * Related: String#hex.
10753 *
10754 */
10755
10756static VALUE
10757rb_str_oct(VALUE str)
10758{
10759 return rb_str_to_inum(str, -8, FALSE);
10760}
10761
10762#ifndef HAVE_CRYPT_R
10763# include "ruby/thread_native.h"
10764# include "ruby/atomic.h"
10765
10766static struct {
10767 rb_nativethread_lock_t lock;
10768} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10769#endif
10770
10771/*
10772 * call-seq:
10773 * crypt(salt_str) -> new_string
10774 *
10775 * Returns the string generated by calling <code>crypt(3)</code>
10776 * standard library function with <code>str</code> and
10777 * <code>salt_str</code>, in this order, as its arguments. Please do
10778 * not use this method any longer. It is legacy; provided only for
10779 * backward compatibility with ruby scripts in earlier days. It is
10780 * bad to use in contemporary programs for several reasons:
10781 *
10782 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10783 * run. The generated string lacks data portability.
10784 *
10785 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10786 * (i.e. silently ends up in unexpected results).
10787 *
10788 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10789 * thread safe.
10790 *
10791 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10792 * very very weak. According to its manpage, Linux's traditional
10793 * <code>crypt(3)</code> output has only 2**56 variations; too
10794 * easy to brute force today. And this is the default behaviour.
10795 *
10796 * * In order to make things robust some OSes implement so-called
10797 * "modular" usage. To go through, you have to do a complex
10798 * build-up of the <code>salt_str</code> parameter, by hand.
10799 * Failure in generation of a proper salt string tends not to
10800 * yield any errors; typos in parameters are normally not
10801 * detectable.
10802 *
10803 * * For instance, in the following example, the second invocation
10804 * of String#crypt is wrong; it has a typo in "round=" (lacks
10805 * "s"). However the call does not fail and something unexpected
10806 * is generated.
10807 *
10808 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10809 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10810 *
10811 * * Even in the "modular" mode, some hash functions are considered
10812 * archaic and no longer recommended at all; for instance module
10813 * <code>$1$</code> is officially abandoned by its author: see
10814 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10815 * instance module <code>$3$</code> is considered completely
10816 * broken: see the manpage of FreeBSD.
10817 *
10818 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10819 * written above, <code>crypt(3)</code> on Mac OS never fails.
10820 * This means even if you build up a proper salt string it
10821 * generates a traditional DES hash anyways, and there is no way
10822 * for you to be aware of.
10823 *
10824 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10825 *
10826 * If for some reason you cannot migrate to other secure contemporary
10827 * password hashing algorithms, install the string-crypt gem and
10828 * <code>require 'string/crypt'</code> to continue using it.
10829 */
10830
10831static VALUE
10832rb_str_crypt(VALUE str, VALUE salt)
10833{
10834#ifdef HAVE_CRYPT_R
10835 VALUE databuf;
10836 struct crypt_data *data;
10837# define CRYPT_END() ALLOCV_END(databuf)
10838#else
10839 char *tmp_buf;
10840 extern char *crypt(const char *, const char *);
10841# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10842#endif
10843 VALUE result;
10844 const char *s, *saltp;
10845 char *res;
10846#ifdef BROKEN_CRYPT
10847 char salt_8bit_clean[3];
10848#endif
10849
10850 StringValue(salt);
10851 mustnot_wchar(str);
10852 mustnot_wchar(salt);
10853 s = StringValueCStr(str);
10854 saltp = RSTRING_PTR(salt);
10855 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10856 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10857 }
10858
10859#ifdef BROKEN_CRYPT
10860 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10861 salt_8bit_clean[0] = saltp[0] & 0x7f;
10862 salt_8bit_clean[1] = saltp[1] & 0x7f;
10863 salt_8bit_clean[2] = '\0';
10864 saltp = salt_8bit_clean;
10865 }
10866#endif
10867#ifdef HAVE_CRYPT_R
10868 data = ALLOCV(databuf, sizeof(struct crypt_data));
10869# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10870 data->initialized = 0;
10871# endif
10872 res = crypt_r(s, saltp, data);
10873#else
10874 rb_nativethread_lock_lock(&crypt_mutex.lock);
10875 res = crypt(s, saltp);
10876#endif
10877 if (!res) {
10878 int err = errno;
10879 CRYPT_END();
10880 rb_syserr_fail(err, "crypt");
10881 }
10882#ifdef HAVE_CRYPT_R
10883 result = rb_str_new_cstr(res);
10884 CRYPT_END();
10885#else
10886 // We need to copy this buffer because it's static and we need to unlock the mutex
10887 // before allocating a new object (the string to be returned). If we allocate while
10888 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10889 // if other ractors are waiting on this lock.
10890 size_t res_size = strlen(res)+1;
10891 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10892 memcpy(tmp_buf, res, res_size);
10893 res = tmp_buf;
10894 CRYPT_END();
10895 result = rb_str_new_cstr(res);
10896#endif
10897 return result;
10898}
10899
10900
10901/*
10902 * call-seq:
10903 * ord -> integer
10904 *
10905 * :include: doc/string/ord.rdoc
10906 *
10907 */
10908
10909static VALUE
10910rb_str_ord(VALUE s)
10911{
10912 unsigned int c;
10913
10914 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10915 return UINT2NUM(c);
10916}
10917/*
10918 * call-seq:
10919 * sum(n = 16) -> integer
10920 *
10921 * :include: doc/string/sum.rdoc
10922 *
10923 */
10924
10925static VALUE
10926rb_str_sum(int argc, VALUE *argv, VALUE str)
10927{
10928 int bits = 16;
10929 char *ptr, *p, *pend;
10930 long len;
10931 VALUE sum = INT2FIX(0);
10932 unsigned long sum0 = 0;
10933
10934 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10935 bits = 0;
10936 }
10937 ptr = p = RSTRING_PTR(str);
10938 len = RSTRING_LEN(str);
10939 pend = p + len;
10940
10941 while (p < pend) {
10942 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10943 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10944 str_mod_check(str, ptr, len);
10945 sum0 = 0;
10946 }
10947 sum0 += (unsigned char)*p;
10948 p++;
10949 }
10950
10951 if (bits == 0) {
10952 if (sum0) {
10953 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10954 }
10955 }
10956 else {
10957 if (sum == INT2FIX(0)) {
10958 if (bits < (int)sizeof(long)*CHAR_BIT) {
10959 sum0 &= (((unsigned long)1)<<bits)-1;
10960 }
10961 sum = LONG2FIX(sum0);
10962 }
10963 else {
10964 VALUE mod;
10965
10966 if (sum0) {
10967 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10968 }
10969
10970 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10971 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10972 sum = rb_funcall(sum, '&', 1, mod);
10973 }
10974 }
10975 return sum;
10976}
10977
10978static VALUE
10979rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10980{
10981 rb_encoding *enc;
10982 VALUE w;
10983 long width, len, flen = 1, fclen = 1;
10984 VALUE res;
10985 char *p;
10986 const char *f = " ";
10987 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10988 VALUE pad;
10989 int singlebyte = 1, cr;
10990 int termlen;
10991
10992 rb_scan_args(argc, argv, "11", &w, &pad);
10993 enc = STR_ENC_GET(str);
10994 termlen = rb_enc_mbminlen(enc);
10995 width = NUM2LONG(w);
10996 if (argc == 2) {
10997 StringValue(pad);
10998 enc = rb_enc_check(str, pad);
10999 f = RSTRING_PTR(pad);
11000 flen = RSTRING_LEN(pad);
11001 fclen = str_strlen(pad, enc); /* rb_enc_check */
11002 singlebyte = single_byte_optimizable(pad);
11003 if (flen == 0 || fclen == 0) {
11004 rb_raise(rb_eArgError, "zero width padding");
11005 }
11006 }
11007 len = str_strlen(str, enc); /* rb_enc_check */
11008 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11009 n = width - len;
11010 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11011 rlen = n - llen;
11012 cr = ENC_CODERANGE(str);
11013 if (flen > 1) {
11014 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11015 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11016 }
11017 size = RSTRING_LEN(str);
11018 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11019 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11020 (len += llen2 + rlen2) >= LONG_MAX - size) {
11021 rb_raise(rb_eArgError, "argument too big");
11022 }
11023 len += size;
11024 res = str_enc_new(rb_cString, 0, len, enc);
11025 p = RSTRING_PTR(res);
11026 if (flen <= 1) {
11027 memset(p, *f, llen);
11028 p += llen;
11029 }
11030 else {
11031 while (llen >= fclen) {
11032 memcpy(p,f,flen);
11033 p += flen;
11034 llen -= fclen;
11035 }
11036 if (llen > 0) {
11037 memcpy(p, f, llen2);
11038 p += llen2;
11039 }
11040 }
11041 memcpy(p, RSTRING_PTR(str), size);
11042 p += size;
11043 if (flen <= 1) {
11044 memset(p, *f, rlen);
11045 p += rlen;
11046 }
11047 else {
11048 while (rlen >= fclen) {
11049 memcpy(p,f,flen);
11050 p += flen;
11051 rlen -= fclen;
11052 }
11053 if (rlen > 0) {
11054 memcpy(p, f, rlen2);
11055 p += rlen2;
11056 }
11057 }
11058 TERM_FILL(p, termlen);
11059 STR_SET_LEN(res, p-RSTRING_PTR(res));
11060
11061 if (argc == 2)
11062 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11063 if (cr != ENC_CODERANGE_BROKEN)
11064 ENC_CODERANGE_SET(res, cr);
11065
11066 RB_GC_GUARD(pad);
11067 return res;
11068}
11069
11070
11071/*
11072 * call-seq:
11073 * ljust(width, pad_string = ' ') -> new_string
11074 *
11075 * :include: doc/string/ljust.rdoc
11076 *
11077 */
11078
11079static VALUE
11080rb_str_ljust(int argc, VALUE *argv, VALUE str)
11081{
11082 return rb_str_justify(argc, argv, str, 'l');
11083}
11084
11085/*
11086 * call-seq:
11087 * rjust(size, pad_string = ' ') -> new_string
11088 *
11089 * :include: doc/string/rjust.rdoc
11090 *
11091 * Related: String#ljust, String#center.
11092 *
11093 */
11094
11095static VALUE
11096rb_str_rjust(int argc, VALUE *argv, VALUE str)
11097{
11098 return rb_str_justify(argc, argv, str, 'r');
11099}
11100
11101
11102/*
11103 * call-seq:
11104 * center(size, pad_string = ' ') -> new_string
11105 *
11106 * :include: doc/string/center.rdoc
11107 *
11108 */
11109
11110static VALUE
11111rb_str_center(int argc, VALUE *argv, VALUE str)
11112{
11113 return rb_str_justify(argc, argv, str, 'c');
11114}
11115
11116/*
11117 * call-seq:
11118 * partition(string_or_regexp) -> [head, match, tail]
11119 *
11120 * :include: doc/string/partition.rdoc
11121 *
11122 */
11123
11124static VALUE
11125rb_str_partition(VALUE str, VALUE sep)
11126{
11127 long pos;
11128
11129 sep = get_pat_quoted(sep, 0);
11130 if (RB_TYPE_P(sep, T_REGEXP)) {
11131 if (rb_reg_search(sep, str, 0, 0) < 0) {
11132 goto failed;
11133 }
11134 VALUE match = rb_backref_get();
11135 struct re_registers *regs = RMATCH_REGS(match);
11136
11137 pos = BEG(0);
11138 sep = rb_str_subseq(str, pos, END(0) - pos);
11139 }
11140 else {
11141 pos = rb_str_index(str, sep, 0);
11142 if (pos < 0) goto failed;
11143 }
11144 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11145 sep,
11146 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11147 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11148
11149 failed:
11150 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11151}
11152
11153/*
11154 * call-seq:
11155 * rpartition(sep) -> [head, match, tail]
11156 *
11157 * :include: doc/string/rpartition.rdoc
11158 *
11159 */
11160
11161static VALUE
11162rb_str_rpartition(VALUE str, VALUE sep)
11163{
11164 long pos = RSTRING_LEN(str);
11165
11166 sep = get_pat_quoted(sep, 0);
11167 if (RB_TYPE_P(sep, T_REGEXP)) {
11168 if (rb_reg_search(sep, str, pos, 1) < 0) {
11169 goto failed;
11170 }
11171 VALUE match = rb_backref_get();
11172 struct re_registers *regs = RMATCH_REGS(match);
11173
11174 pos = BEG(0);
11175 sep = rb_str_subseq(str, pos, END(0) - pos);
11176 }
11177 else {
11178 pos = rb_str_sublen(str, pos);
11179 pos = rb_str_rindex(str, sep, pos);
11180 if (pos < 0) {
11181 goto failed;
11182 }
11183 }
11184
11185 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11186 sep,
11187 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11188 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11189 failed:
11190 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11191}
11192
11193/*
11194 * call-seq:
11195 * start_with?(*string_or_regexp) -> true or false
11196 *
11197 * :include: doc/string/start_with_p.rdoc
11198 *
11199 */
11200
11201static VALUE
11202rb_str_start_with(int argc, VALUE *argv, VALUE str)
11203{
11204 int i;
11205
11206 for (i=0; i<argc; i++) {
11207 VALUE tmp = argv[i];
11208 if (RB_TYPE_P(tmp, T_REGEXP)) {
11209 if (rb_reg_start_with_p(tmp, str))
11210 return Qtrue;
11211 }
11212 else {
11213 const char *p, *s, *e;
11214 long slen, tlen;
11215 rb_encoding *enc;
11216
11217 StringValue(tmp);
11218 enc = rb_enc_check(str, tmp);
11219 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11220 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11221 p = RSTRING_PTR(str);
11222 e = p + slen;
11223 s = p + tlen;
11224 if (!at_char_right_boundary(p, s, e, enc))
11225 continue;
11226 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11227 return Qtrue;
11228 }
11229 }
11230 return Qfalse;
11231}
11232
11233/*
11234 * call-seq:
11235 * end_with?(*strings) -> true or false
11236 *
11237 * :include: doc/string/end_with_p.rdoc
11238 *
11239 */
11240
11241static VALUE
11242rb_str_end_with(int argc, VALUE *argv, VALUE str)
11243{
11244 int i;
11245
11246 for (i=0; i<argc; i++) {
11247 VALUE tmp = argv[i];
11248 const char *p, *s, *e;
11249 long slen, tlen;
11250 rb_encoding *enc;
11251
11252 StringValue(tmp);
11253 enc = rb_enc_check(str, tmp);
11254 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11255 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11256 p = RSTRING_PTR(str);
11257 e = p + slen;
11258 s = e - tlen;
11259 if (!at_char_boundary(p, s, e, enc))
11260 continue;
11261 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11262 return Qtrue;
11263 }
11264 return Qfalse;
11265}
11266
11276static long
11277deleted_prefix_length(VALUE str, VALUE prefix)
11278{
11279 const char *strptr, *prefixptr;
11280 long olen, prefixlen;
11281 rb_encoding *enc = rb_enc_get(str);
11282
11283 StringValue(prefix);
11284
11285 if (!is_broken_string(prefix) ||
11286 !rb_enc_asciicompat(enc) ||
11287 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11288 enc = rb_enc_check(str, prefix);
11289 }
11290
11291 /* return 0 if not start with prefix */
11292 prefixlen = RSTRING_LEN(prefix);
11293 if (prefixlen <= 0) return 0;
11294 olen = RSTRING_LEN(str);
11295 if (olen < prefixlen) return 0;
11296 strptr = RSTRING_PTR(str);
11297 prefixptr = RSTRING_PTR(prefix);
11298 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11299 if (is_broken_string(prefix)) {
11300 if (!is_broken_string(str)) {
11301 /* prefix in a valid string cannot be broken */
11302 return 0;
11303 }
11304 const char *strend = strptr + olen;
11305 const char *after_prefix = strptr + prefixlen;
11306 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11307 /* prefix does not end at char-boundary */
11308 return 0;
11309 }
11310 }
11311 /* prefix part in `str` also should be valid. */
11312
11313 return prefixlen;
11314}
11315
11316/*
11317 * call-seq:
11318 * delete_prefix!(prefix) -> self or nil
11319 *
11320 * Like String#delete_prefix, except that +self+ is modified in place;
11321 * returns +self+ if the prefix is removed, +nil+ otherwise.
11322 *
11323 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11324 */
11325
11326static VALUE
11327rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11328{
11329 long prefixlen;
11330 str_modify_keep_cr(str);
11331
11332 prefixlen = deleted_prefix_length(str, prefix);
11333 if (prefixlen <= 0) return Qnil;
11334
11335 return rb_str_drop_bytes(str, prefixlen);
11336}
11337
11338/*
11339 * call-seq:
11340 * delete_prefix(prefix) -> new_string
11341 *
11342 * :include: doc/string/delete_prefix.rdoc
11343 *
11344 */
11345
11346static VALUE
11347rb_str_delete_prefix(VALUE str, VALUE prefix)
11348{
11349 long prefixlen;
11350
11351 prefixlen = deleted_prefix_length(str, prefix);
11352 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11353
11354 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11355}
11356
11366static long
11367deleted_suffix_length(VALUE str, VALUE suffix)
11368{
11369 const char *strptr, *suffixptr;
11370 long olen, suffixlen;
11371 rb_encoding *enc;
11372
11373 StringValue(suffix);
11374 if (is_broken_string(suffix)) return 0;
11375 enc = rb_enc_check(str, suffix);
11376
11377 /* return 0 if not start with suffix */
11378 suffixlen = RSTRING_LEN(suffix);
11379 if (suffixlen <= 0) return 0;
11380 olen = RSTRING_LEN(str);
11381 if (olen < suffixlen) return 0;
11382 strptr = RSTRING_PTR(str);
11383 suffixptr = RSTRING_PTR(suffix);
11384 const char *strend = strptr + olen;
11385 const char *before_suffix = strend - suffixlen;
11386 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11387 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11388
11389 return suffixlen;
11390}
11391
11392/*
11393 * call-seq:
11394 * delete_suffix!(suffix) -> self or nil
11395 *
11396 * Like String#delete_suffix, except that +self+ is modified in place;
11397 * returns +self+ if the suffix is removed, +nil+ otherwise.
11398 *
11399 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11400 */
11401
11402static VALUE
11403rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11404{
11405 long olen, suffixlen, len;
11406 str_modifiable(str);
11407
11408 suffixlen = deleted_suffix_length(str, suffix);
11409 if (suffixlen <= 0) return Qnil;
11410
11411 olen = RSTRING_LEN(str);
11412 str_modify_keep_cr(str);
11413 len = olen - suffixlen;
11414 STR_SET_LEN(str, len);
11415 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11416 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11418 }
11419 return str;
11420}
11421
11422/*
11423 * call-seq:
11424 * delete_suffix(suffix) -> new_string
11425 *
11426 * :include: doc/string/delete_suffix.rdoc
11427 *
11428 */
11429
11430static VALUE
11431rb_str_delete_suffix(VALUE str, VALUE suffix)
11432{
11433 long suffixlen;
11434
11435 suffixlen = deleted_suffix_length(str, suffix);
11436 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11437
11438 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11439}
11440
11441void
11442rb_str_setter(VALUE val, ID id, VALUE *var)
11443{
11444 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11445 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11446 }
11447 *var = val;
11448}
11449
11450static void
11451rb_fs_setter(VALUE val, ID id, VALUE *var)
11452{
11453 val = rb_fs_check(val);
11454 if (!val) {
11455 rb_raise(rb_eTypeError,
11456 "value of %"PRIsVALUE" must be String or Regexp",
11457 rb_id2str(id));
11458 }
11459 if (!NIL_P(val)) {
11460 rb_warn_deprecated("'$;'", NULL);
11461 }
11462 *var = val;
11463}
11464
11465
11466/*
11467 * call-seq:
11468 * force_encoding(encoding) -> self
11469 *
11470 * :include: doc/string/force_encoding.rdoc
11471 *
11472 */
11473
11474static VALUE
11475rb_str_force_encoding(VALUE str, VALUE enc)
11476{
11477 str_modifiable(str);
11478
11479 rb_encoding *encoding = rb_to_encoding(enc);
11480 int idx = rb_enc_to_index(encoding);
11481
11482 // If the encoding is unchanged, we do nothing.
11483 if (ENCODING_GET(str) == idx) {
11484 return str;
11485 }
11486
11487 rb_enc_associate_index(str, idx);
11488
11489 // If the coderange was 7bit and the new encoding is ASCII-compatible
11490 // we can keep the coderange.
11491 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11492 return str;
11493 }
11494
11496 return str;
11497}
11498
11499/*
11500 * call-seq:
11501 * b -> new_string
11502 *
11503 * :include: doc/string/b.rdoc
11504 *
11505 */
11506
11507static VALUE
11508rb_str_b(VALUE str)
11509{
11510 VALUE str2;
11511 if (STR_EMBED_P(str)) {
11512 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11513 }
11514 else {
11515 str2 = str_alloc_heap(rb_cString);
11516 }
11517 str_replace_shared_without_enc(str2, str);
11518
11519 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11520 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11521 // If we know the receiver's code range then we know the result's code range.
11522 int cr = ENC_CODERANGE(str);
11523 switch (cr) {
11524 case ENC_CODERANGE_7BIT:
11526 break;
11530 break;
11531 default:
11532 ENC_CODERANGE_CLEAR(str2);
11533 break;
11534 }
11535 }
11536
11537 return str2;
11538}
11539
11540/*
11541 * call-seq:
11542 * valid_encoding? -> true or false
11543 *
11544 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11545 *
11546 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11547 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11548 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11549 */
11550
11551static VALUE
11552rb_str_valid_encoding_p(VALUE str)
11553{
11554 int cr = rb_enc_str_coderange(str);
11555
11556 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11557}
11558
11559/*
11560 * call-seq:
11561 * ascii_only? -> true or false
11562 *
11563 * Returns whether +self+ contains only ASCII characters:
11564 *
11565 * 'abc'.ascii_only? # => true
11566 * "abc\u{6666}".ascii_only? # => false
11567 *
11568 * Related: see {Querying}[rdoc-ref:String@Querying].
11569 */
11570
11571static VALUE
11572rb_str_is_ascii_only_p(VALUE str)
11573{
11574 int cr = rb_enc_str_coderange(str);
11575
11576 return RBOOL(cr == ENC_CODERANGE_7BIT);
11577}
11578
11579VALUE
11581{
11582 static const char ellipsis[] = "...";
11583 const long ellipsislen = sizeof(ellipsis) - 1;
11584 rb_encoding *const enc = rb_enc_get(str);
11585 const long blen = RSTRING_LEN(str);
11586 const char *const p = RSTRING_PTR(str), *e = p + blen;
11587 VALUE estr, ret = 0;
11588
11589 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11590 if (len * rb_enc_mbminlen(enc) >= blen ||
11591 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11592 ret = str;
11593 }
11594 else if (len <= ellipsislen ||
11595 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11596 if (rb_enc_asciicompat(enc)) {
11597 ret = rb_str_new(ellipsis, len);
11598 rb_enc_associate(ret, enc);
11599 }
11600 else {
11601 estr = rb_usascii_str_new(ellipsis, len);
11602 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11603 }
11604 }
11605 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11606 rb_str_cat(ret, ellipsis, ellipsislen);
11607 }
11608 else {
11609 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11610 rb_enc_from_encoding(enc), 0, Qnil);
11611 rb_str_append(ret, estr);
11612 }
11613 return ret;
11614}
11615
11616static VALUE
11617str_compat_and_valid(VALUE str, rb_encoding *enc)
11618{
11619 int cr;
11620 str = StringValue(str);
11621 cr = rb_enc_str_coderange(str);
11622 if (cr == ENC_CODERANGE_BROKEN) {
11623 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11624 }
11625 else {
11626 rb_encoding *e = STR_ENC_GET(str);
11627 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11628 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11629 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11630 }
11631 }
11632 return str;
11633}
11634
11635static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11636
11637VALUE
11639{
11640 rb_encoding *enc = STR_ENC_GET(str);
11641 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11642}
11643
11644VALUE
11645rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11646{
11647 int cr = ENC_CODERANGE_UNKNOWN;
11648 if (enc == STR_ENC_GET(str)) {
11649 /* cached coderange makes sense only when enc equals the
11650 * actual encoding of str */
11651 cr = ENC_CODERANGE(str);
11652 }
11653 return enc_str_scrub(enc, str, repl, cr);
11654}
11655
11656static VALUE
11657enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11658{
11659 int encidx;
11660 VALUE buf = Qnil;
11661 const char *rep, *p, *e, *p1, *sp;
11662 long replen = -1;
11663 long slen;
11664
11665 if (rb_block_given_p()) {
11666 if (!NIL_P(repl))
11667 rb_raise(rb_eArgError, "both of block and replacement given");
11668 replen = 0;
11669 }
11670
11671 if (ENC_CODERANGE_CLEAN_P(cr))
11672 return Qnil;
11673
11674 if (!NIL_P(repl)) {
11675 repl = str_compat_and_valid(repl, enc);
11676 }
11677
11678 if (rb_enc_dummy_p(enc)) {
11679 return Qnil;
11680 }
11681 encidx = rb_enc_to_index(enc);
11682
11683#define DEFAULT_REPLACE_CHAR(str) do { \
11684 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11685 rep = replace; replen = (int)sizeof(replace); \
11686 } while (0)
11687
11688 slen = RSTRING_LEN(str);
11689 p = RSTRING_PTR(str);
11690 e = RSTRING_END(str);
11691 p1 = p;
11692 sp = p;
11693
11694 if (rb_enc_asciicompat(enc)) {
11695 int rep7bit_p;
11696 if (!replen) {
11697 rep = NULL;
11698 rep7bit_p = FALSE;
11699 }
11700 else if (!NIL_P(repl)) {
11701 rep = RSTRING_PTR(repl);
11702 replen = RSTRING_LEN(repl);
11703 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11704 }
11705 else if (encidx == rb_utf8_encindex()) {
11706 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11707 rep7bit_p = FALSE;
11708 }
11709 else {
11710 DEFAULT_REPLACE_CHAR("?");
11711 rep7bit_p = TRUE;
11712 }
11713 cr = ENC_CODERANGE_7BIT;
11714
11715 p = search_nonascii(p, e);
11716 if (!p) {
11717 p = e;
11718 }
11719 while (p < e) {
11720 int ret = rb_enc_precise_mbclen(p, e, enc);
11721 if (MBCLEN_NEEDMORE_P(ret)) {
11722 break;
11723 }
11724 else if (MBCLEN_CHARFOUND_P(ret)) {
11726 p += MBCLEN_CHARFOUND_LEN(ret);
11727 }
11728 else if (MBCLEN_INVALID_P(ret)) {
11729 /*
11730 * p1~p: valid ascii/multibyte chars
11731 * p ~e: invalid bytes + unknown bytes
11732 */
11733 long clen = rb_enc_mbmaxlen(enc);
11734 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11735 if (p > p1) {
11736 rb_str_buf_cat(buf, p1, p - p1);
11737 }
11738
11739 if (e - p < clen) clen = e - p;
11740 if (clen <= 2) {
11741 clen = 1;
11742 }
11743 else {
11744 const char *q = p;
11745 clen--;
11746 for (; clen > 1; clen--) {
11747 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11748 if (MBCLEN_NEEDMORE_P(ret)) break;
11749 if (MBCLEN_INVALID_P(ret)) continue;
11751 }
11752 }
11753 if (rep) {
11754 rb_str_buf_cat(buf, rep, replen);
11755 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11756 }
11757 else {
11758 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11759 str_mod_check(str, sp, slen);
11760 repl = str_compat_and_valid(repl, enc);
11761 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11764 }
11765 p += clen;
11766 p1 = p;
11767 p = search_nonascii(p, e);
11768 if (!p) {
11769 p = e;
11770 break;
11771 }
11772 }
11773 else {
11775 }
11776 }
11777 if (NIL_P(buf)) {
11778 if (p == e) {
11779 ENC_CODERANGE_SET(str, cr);
11780 return Qnil;
11781 }
11782 buf = rb_str_buf_new(RSTRING_LEN(str));
11783 }
11784 if (p1 < p) {
11785 rb_str_buf_cat(buf, p1, p - p1);
11786 }
11787 if (p < e) {
11788 if (rep) {
11789 rb_str_buf_cat(buf, rep, replen);
11790 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11791 }
11792 else {
11793 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11794 str_mod_check(str, sp, slen);
11795 repl = str_compat_and_valid(repl, enc);
11796 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11799 }
11800 }
11801 }
11802 else {
11803 /* ASCII incompatible */
11804 long mbminlen = rb_enc_mbminlen(enc);
11805 if (!replen) {
11806 rep = NULL;
11807 }
11808 else if (!NIL_P(repl)) {
11809 rep = RSTRING_PTR(repl);
11810 replen = RSTRING_LEN(repl);
11811 }
11812 else if (encidx == ENCINDEX_UTF_16BE) {
11813 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11814 }
11815 else if (encidx == ENCINDEX_UTF_16LE) {
11816 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11817 }
11818 else if (encidx == ENCINDEX_UTF_32BE) {
11819 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11820 }
11821 else if (encidx == ENCINDEX_UTF_32LE) {
11822 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11823 }
11824 else {
11825 DEFAULT_REPLACE_CHAR("?");
11826 }
11827
11828 while (p < e) {
11829 int ret = rb_enc_precise_mbclen(p, e, enc);
11830 if (MBCLEN_NEEDMORE_P(ret)) {
11831 break;
11832 }
11833 else if (MBCLEN_CHARFOUND_P(ret)) {
11834 p += MBCLEN_CHARFOUND_LEN(ret);
11835 }
11836 else if (MBCLEN_INVALID_P(ret)) {
11837 const char *q = p;
11838 long clen = rb_enc_mbmaxlen(enc);
11839 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11840 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11841
11842 if (e - p < clen) clen = e - p;
11843 if (clen <= mbminlen * 2) {
11844 clen = mbminlen;
11845 }
11846 else {
11847 clen -= mbminlen;
11848 for (; clen > mbminlen; clen-=mbminlen) {
11849 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11850 if (MBCLEN_NEEDMORE_P(ret)) break;
11851 if (MBCLEN_INVALID_P(ret)) continue;
11853 }
11854 }
11855 if (rep) {
11856 rb_str_buf_cat(buf, rep, replen);
11857 }
11858 else {
11859 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11860 str_mod_check(str, sp, slen);
11861 repl = str_compat_and_valid(repl, enc);
11862 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11863 }
11864 p += clen;
11865 p1 = p;
11866 }
11867 else {
11869 }
11870 }
11871 if (NIL_P(buf)) {
11872 if (p == e) {
11874 return Qnil;
11875 }
11876 buf = rb_str_buf_new(RSTRING_LEN(str));
11877 }
11878 if (p1 < p) {
11879 rb_str_buf_cat(buf, p1, p - p1);
11880 }
11881 if (p < e) {
11882 if (rep) {
11883 rb_str_buf_cat(buf, rep, replen);
11884 }
11885 else {
11886 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11887 str_mod_check(str, sp, slen);
11888 repl = str_compat_and_valid(repl, enc);
11889 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11890 }
11891 }
11893 }
11894 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11895 return buf;
11896}
11897
11898/*
11899 * call-seq:
11900 * scrub(replacement_string = default_replacement) -> new_string
11901 * scrub{|bytes| ... } -> new_string
11902 *
11903 * :include: doc/string/scrub.rdoc
11904 *
11905 */
11906static VALUE
11907str_scrub(int argc, VALUE *argv, VALUE str)
11908{
11909 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11910 VALUE new = rb_str_scrub(str, repl);
11911 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11912}
11913
11914/*
11915 * call-seq:
11916 * scrub! -> self
11917 * scrub!(replacement_string = default_replacement) -> self
11918 * scrub!{|bytes| ... } -> self
11919 *
11920 * Like String#scrub, except that any replacements are made in +self+.
11921 *
11922 */
11923static VALUE
11924str_scrub_bang(int argc, VALUE *argv, VALUE str)
11925{
11926 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11927 VALUE new = rb_str_scrub(str, repl);
11928 if (!NIL_P(new)) rb_str_replace(str, new);
11929 return str;
11930}
11931
11932static ID id_normalize;
11933static ID id_normalized_p;
11934static VALUE mUnicodeNormalize;
11935
11936static VALUE
11937unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11938{
11939 static int UnicodeNormalizeRequired = 0;
11940 VALUE argv2[2];
11941
11942 if (!UnicodeNormalizeRequired) {
11943 rb_require("unicode_normalize/normalize.rb");
11944 UnicodeNormalizeRequired = 1;
11945 }
11946 argv2[0] = str;
11947 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11948 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11949}
11950
11951/*
11952 * call-seq:
11953 * unicode_normalize(form = :nfc) -> string
11954 *
11955 * Returns a copy of +self+ with
11956 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11957 *
11958 * Argument +form+ must be one of the following symbols
11959 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11960 *
11961 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11962 * - +:nfd+: Canonical decomposition.
11963 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11964 * - +:nfkd+: Compatibility decomposition.
11965 *
11966 * The encoding of +self+ must be one of:
11967 *
11968 * - Encoding::UTF_8
11969 * - Encoding::UTF_16BE
11970 * - Encoding::UTF_16LE
11971 * - Encoding::UTF_32BE
11972 * - Encoding::UTF_32LE
11973 * - Encoding::GB18030
11974 * - Encoding::UCS_2BE
11975 * - Encoding::UCS_4BE
11976 *
11977 * Examples:
11978 *
11979 * "a\u0300".unicode_normalize # => "\u00E0" (the single precomposed character)
11980 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" (letter plus combining accent)
11981 *
11982 * Related: String#unicode_normalize!, String#unicode_normalized?.
11983 */
11984static VALUE
11985rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11986{
11987 return unicode_normalize_common(argc, argv, str, id_normalize);
11988}
11989
11990/*
11991 * call-seq:
11992 * unicode_normalize!(form = :nfc) -> self
11993 *
11994 * Like String#unicode_normalize, except that the normalization
11995 * is performed on +self+.
11996 *
11997 * Related: String#unicode_normalized?.
11998 *
11999 */
12000static VALUE
12001rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12002{
12003 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12004}
12005
12006/* call-seq:
12007 * unicode_normalized?(form = :nfc) -> true or false
12008 *
12009 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12010 * +false+ otherwise.
12011 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12012 *
12013 * Examples:
12014 *
12015 * "a\u0300".unicode_normalized? # => false
12016 * "a\u0300".unicode_normalized?(:nfd) # => true
12017 * "\u00E0".unicode_normalized? # => true
12018 * "\u00E0".unicode_normalized?(:nfd) # => false
12019 *
12020 *
12021 * Raises an exception if +self+ is not in a Unicode encoding:
12022 *
12023 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12024 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12025 *
12026 * Related: String#unicode_normalize, String#unicode_normalize!.
12027 *
12028 */
12029static VALUE
12030rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12031{
12032 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12033}
12034
12035/**********************************************************************
12036 * Document-class: Symbol
12037 *
12038 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12039 *
12040 * You can create a +Symbol+ object explicitly with:
12041 *
12042 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12043 *
12044 * The same +Symbol+ object will be
12045 * created for a given name or string for the duration of a program's
12046 * execution, regardless of the context or meaning of that name. Thus
12047 * if <code>Fred</code> is a constant in one context, a method in
12048 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12049 * will be the same object in all three contexts.
12050 *
12051 * module One
12052 * class Fred
12053 * end
12054 * $f1 = :Fred
12055 * end
12056 * module Two
12057 * Fred = 1
12058 * $f2 = :Fred
12059 * end
12060 * def Fred()
12061 * end
12062 * $f3 = :Fred
12063 * $f1.object_id #=> 2514190
12064 * $f2.object_id #=> 2514190
12065 * $f3.object_id #=> 2514190
12066 *
12067 * Constant, method, and variable names are returned as symbols:
12068 *
12069 * module One
12070 * Two = 2
12071 * def three; 3 end
12072 * @four = 4
12073 * @@five = 5
12074 * $six = 6
12075 * end
12076 * seven = 7
12077 *
12078 * One.constants
12079 * # => [:Two]
12080 * One.instance_methods(true)
12081 * # => [:three]
12082 * One.instance_variables
12083 * # => [:@four]
12084 * One.class_variables
12085 * # => [:@@five]
12086 * global_variables.grep(/six/)
12087 * # => [:$six]
12088 * local_variables
12089 * # => [:seven]
12090 *
12091 * A +Symbol+ object differs from a String object in that
12092 * a +Symbol+ object represents an identifier, while a String object
12093 * represents text or data.
12094 *
12095 * == What's Here
12096 *
12097 * First, what's elsewhere. Class +Symbol+:
12098 *
12099 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12100 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12101 *
12102 * Here, class +Symbol+ provides methods that are useful for:
12103 *
12104 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12105 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12106 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12107 *
12108 * === Methods for Querying
12109 *
12110 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12111 * - #=~: Returns the index of the first substring in symbol that matches a
12112 * given Regexp or other object; returns +nil+ if no match is found.
12113 * - #[], #slice : Returns a substring of symbol
12114 * determined by a given index, start/length, or range, or string.
12115 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12116 * - #encoding: Returns the Encoding object that represents the encoding
12117 * of symbol.
12118 * - #end_with?: Returns +true+ if symbol ends with
12119 * any of the given strings.
12120 * - #match: Returns a MatchData object if symbol
12121 * matches a given Regexp; +nil+ otherwise.
12122 * - #match?: Returns +true+ if symbol
12123 * matches a given Regexp; +false+ otherwise.
12124 * - #length, #size: Returns the number of characters in symbol.
12125 * - #start_with?: Returns +true+ if symbol starts with
12126 * any of the given strings.
12127 *
12128 * === Methods for Comparing
12129 *
12130 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12131 * or larger than a given symbol.
12132 * - #==, #===: Returns +true+ if a given symbol has the same content and
12133 * encoding.
12134 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12135 * is smaller than, equal to, or larger than a given symbol.
12136 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12137 * after Unicode case folding; +false+ otherwise.
12138 *
12139 * === Methods for Converting
12140 *
12141 * - #capitalize: Returns symbol with the first character upcased
12142 * and all other characters downcased.
12143 * - #downcase: Returns symbol with all characters downcased.
12144 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12145 * - #name: Returns the frozen string corresponding to symbol.
12146 * - #succ, #next: Returns the symbol that is the successor to symbol.
12147 * - #swapcase: Returns symbol with all upcase characters downcased
12148 * and all downcase characters upcased.
12149 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12150 * - #to_s, #id2name: Returns the string corresponding to +self+.
12151 * - #to_sym, #intern: Returns +self+.
12152 * - #upcase: Returns symbol with all characters upcased.
12153 *
12154 */
12155
12156
12157/*
12158 * call-seq:
12159 * symbol == object -> true or false
12160 *
12161 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12162 */
12163
12164#define sym_equal rb_obj_equal
12165
12166static int
12167sym_printable(const char *s, const char *send, rb_encoding *enc)
12168{
12169 while (s < send) {
12170 int n;
12171 int c = rb_enc_precise_mbclen(s, send, enc);
12172
12173 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12174 n = MBCLEN_CHARFOUND_LEN(c);
12175 c = rb_enc_mbc_to_codepoint(s, send, enc);
12176 if (!rb_enc_isprint(c, enc)) return FALSE;
12177 s += n;
12178 }
12179 return TRUE;
12180}
12181
12182int
12183rb_str_symname_p(VALUE sym)
12184{
12185 rb_encoding *enc;
12186 const char *ptr;
12187 long len;
12188 rb_encoding *resenc = rb_default_internal_encoding();
12189
12190 if (resenc == NULL) resenc = rb_default_external_encoding();
12191 enc = STR_ENC_GET(sym);
12192 ptr = RSTRING_PTR(sym);
12193 len = RSTRING_LEN(sym);
12194 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12195 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12196 return FALSE;
12197 }
12198 return TRUE;
12199}
12200
12201VALUE
12202rb_str_quote_unprintable(VALUE str)
12203{
12204 rb_encoding *enc;
12205 const char *ptr;
12206 long len;
12207 rb_encoding *resenc;
12208
12209 Check_Type(str, T_STRING);
12210 resenc = rb_default_internal_encoding();
12211 if (resenc == NULL) resenc = rb_default_external_encoding();
12212 enc = STR_ENC_GET(str);
12213 ptr = RSTRING_PTR(str);
12214 len = RSTRING_LEN(str);
12215 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12216 !sym_printable(ptr, ptr + len, enc)) {
12217 return rb_str_escape(str);
12218 }
12219 return str;
12220}
12221
12222VALUE
12223rb_id_quote_unprintable(ID id)
12224{
12225 VALUE str = rb_id2str(id);
12226 if (!rb_str_symname_p(str)) {
12227 return rb_str_escape(str);
12228 }
12229 return str;
12230}
12231
12232/*
12233 * call-seq:
12234 * inspect -> string
12235 *
12236 * Returns a string representation of +self+ (including the leading colon):
12237 *
12238 * :foo.inspect # => ":foo"
12239 *
12240 * Related: Symbol#to_s, Symbol#name.
12241 *
12242 */
12243
12244static VALUE
12245sym_inspect(VALUE sym)
12246{
12247 VALUE str = rb_sym2str(sym);
12248 const char *ptr;
12249 long len;
12250 char *dest;
12251
12252 if (!rb_str_symname_p(str)) {
12253 str = rb_str_inspect(str);
12254 len = RSTRING_LEN(str);
12255 rb_str_resize(str, len + 1);
12256 dest = RSTRING_PTR(str);
12257 memmove(dest + 1, dest, len);
12258 }
12259 else {
12260 rb_encoding *enc = STR_ENC_GET(str);
12261 VALUE orig_str = str;
12262
12263 len = RSTRING_LEN(orig_str);
12264 str = rb_enc_str_new(0, len + 1, enc);
12265
12266 // Get data pointer after allocation
12267 ptr = RSTRING_PTR(orig_str);
12268 dest = RSTRING_PTR(str);
12269 memcpy(dest + 1, ptr, len);
12270
12271 RB_GC_GUARD(orig_str);
12272 }
12273 dest[0] = ':';
12274
12276
12277 return str;
12278}
12279
12280VALUE
12282{
12283 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12284 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12285 return str;
12286}
12287
12288VALUE
12289rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12290{
12291 VALUE obj;
12292
12293 if (argc < 1) {
12294 rb_raise(rb_eArgError, "no receiver given");
12295 }
12296 obj = argv[0];
12297 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12298}
12299
12300/*
12301 * call-seq:
12302 * succ
12303 *
12304 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12305 *
12306 * :foo.succ # => :fop
12307 *
12308 * Related: String#succ.
12309 */
12310
12311static VALUE
12312sym_succ(VALUE sym)
12313{
12314 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12315}
12316
12317/*
12318 * call-seq:
12319 * symbol <=> object -> -1, 0, +1, or nil
12320 *
12321 * If +object+ is a symbol,
12322 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12323 *
12324 * :bar <=> :foo # => -1
12325 * :foo <=> :foo # => 0
12326 * :foo <=> :bar # => 1
12327 *
12328 * Otherwise, returns +nil+:
12329 *
12330 * :foo <=> 'bar' # => nil
12331 *
12332 * Related: String#<=>.
12333 */
12334
12335static VALUE
12336sym_cmp(VALUE sym, VALUE other)
12337{
12338 if (!SYMBOL_P(other)) {
12339 return Qnil;
12340 }
12341 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12342}
12343
12344/*
12345 * call-seq:
12346 * casecmp(object) -> -1, 0, 1, or nil
12347 *
12348 * :include: doc/symbol/casecmp.rdoc
12349 *
12350 */
12351
12352static VALUE
12353sym_casecmp(VALUE sym, VALUE other)
12354{
12355 if (!SYMBOL_P(other)) {
12356 return Qnil;
12357 }
12358 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12359}
12360
12361/*
12362 * call-seq:
12363 * casecmp?(object) -> true, false, or nil
12364 *
12365 * :include: doc/symbol/casecmp_p.rdoc
12366 *
12367 */
12368
12369static VALUE
12370sym_casecmp_p(VALUE sym, VALUE other)
12371{
12372 if (!SYMBOL_P(other)) {
12373 return Qnil;
12374 }
12375 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12376}
12377
12378/*
12379 * call-seq:
12380 * symbol =~ object -> integer or nil
12381 *
12382 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12383 * including possible updates to global variables;
12384 * see String#=~.
12385 *
12386 */
12387
12388static VALUE
12389sym_match(VALUE sym, VALUE other)
12390{
12391 return rb_str_match(rb_sym2str(sym), other);
12392}
12393
12394/*
12395 * call-seq:
12396 * match(pattern, offset = 0) -> matchdata or nil
12397 * match(pattern, offset = 0) {|matchdata| } -> object
12398 *
12399 * Equivalent to <tt>self.to_s.match</tt>,
12400 * including possible updates to global variables;
12401 * see String#match.
12402 *
12403 */
12404
12405static VALUE
12406sym_match_m(int argc, VALUE *argv, VALUE sym)
12407{
12408 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12409}
12410
12411/*
12412 * call-seq:
12413 * match?(pattern, offset) -> true or false
12414 *
12415 * Equivalent to <tt>sym.to_s.match?</tt>;
12416 * see String#match?.
12417 *
12418 */
12419
12420static VALUE
12421sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12422{
12423 return rb_str_match_m_p(argc, argv, sym);
12424}
12425
12426/*
12427 * call-seq:
12428 * symbol[index] -> string or nil
12429 * symbol[start, length] -> string or nil
12430 * symbol[range] -> string or nil
12431 * symbol[regexp, capture = 0] -> string or nil
12432 * symbol[substring] -> string or nil
12433 *
12434 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12435 *
12436 */
12437
12438static VALUE
12439sym_aref(int argc, VALUE *argv, VALUE sym)
12440{
12441 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12442}
12443
12444/*
12445 * call-seq:
12446 * length -> integer
12447 *
12448 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12449 */
12450
12451static VALUE
12452sym_length(VALUE sym)
12453{
12454 return rb_str_length(rb_sym2str(sym));
12455}
12456
12457/*
12458 * call-seq:
12459 * empty? -> true or false
12460 *
12461 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12462 *
12463 */
12464
12465static VALUE
12466sym_empty(VALUE sym)
12467{
12468 return rb_str_empty(rb_sym2str(sym));
12469}
12470
12471/*
12472 * call-seq:
12473 * upcase(mapping) -> symbol
12474 *
12475 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12476 *
12477 * See String#upcase.
12478 *
12479 */
12480
12481static VALUE
12482sym_upcase(int argc, VALUE *argv, VALUE sym)
12483{
12484 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12485}
12486
12487/*
12488 * call-seq:
12489 * downcase(mapping) -> symbol
12490 *
12491 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12492 *
12493 * See String#downcase.
12494 *
12495 * Related: Symbol#upcase.
12496 *
12497 */
12498
12499static VALUE
12500sym_downcase(int argc, VALUE *argv, VALUE sym)
12501{
12502 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12503}
12504
12505/*
12506 * call-seq:
12507 * capitalize(mapping) -> symbol
12508 *
12509 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12510 *
12511 * See String#capitalize.
12512 *
12513 */
12514
12515static VALUE
12516sym_capitalize(int argc, VALUE *argv, VALUE sym)
12517{
12518 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12519}
12520
12521/*
12522 * call-seq:
12523 * swapcase(mapping) -> symbol
12524 *
12525 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12526 *
12527 * See String#swapcase.
12528 *
12529 */
12530
12531static VALUE
12532sym_swapcase(int argc, VALUE *argv, VALUE sym)
12533{
12534 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12535}
12536
12537/*
12538 * call-seq:
12539 * start_with?(*string_or_regexp) -> true or false
12540 *
12541 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12542 *
12543 */
12544
12545static VALUE
12546sym_start_with(int argc, VALUE *argv, VALUE sym)
12547{
12548 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12549}
12550
12551/*
12552 * call-seq:
12553 * end_with?(*strings) -> true or false
12554 *
12555 *
12556 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12557 *
12558 */
12559
12560static VALUE
12561sym_end_with(int argc, VALUE *argv, VALUE sym)
12562{
12563 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12564}
12565
12566/*
12567 * call-seq:
12568 * encoding -> encoding
12569 *
12570 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12571 *
12572 */
12573
12574static VALUE
12575sym_encoding(VALUE sym)
12576{
12577 return rb_obj_encoding(rb_sym2str(sym));
12578}
12579
12580static VALUE
12581string_for_symbol(VALUE name)
12582{
12583 if (!RB_TYPE_P(name, T_STRING)) {
12584 VALUE tmp = rb_check_string_type(name);
12585 if (NIL_P(tmp)) {
12586 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12587 name);
12588 }
12589 name = tmp;
12590 }
12591 return name;
12592}
12593
12594ID
12596{
12597 if (SYMBOL_P(name)) {
12598 return SYM2ID(name);
12599 }
12600 name = string_for_symbol(name);
12601 return rb_intern_str(name);
12602}
12603
12604VALUE
12606{
12607 if (SYMBOL_P(name)) {
12608 return name;
12609 }
12610 name = string_for_symbol(name);
12611 return rb_str_intern(name);
12612}
12613
12614/*
12615 * call-seq:
12616 * Symbol.all_symbols -> array_of_symbols
12617 *
12618 * Returns an array of all symbols currently in Ruby's symbol table:
12619 *
12620 * Symbol.all_symbols.size # => 9334
12621 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12622 *
12623 */
12624
12625static VALUE
12626sym_all_symbols(VALUE _)
12627{
12628 return rb_sym_all_symbols();
12629}
12630
12631VALUE
12632rb_str_to_interned_str(VALUE str)
12633{
12634 return rb_fstring(str);
12635}
12636
12637VALUE
12638rb_interned_str(const char *ptr, long len)
12639{
12640 struct RString fake_str;
12641 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12642}
12643
12644VALUE
12646{
12647 return rb_interned_str(ptr, strlen(ptr));
12648}
12649
12650VALUE
12651rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12652{
12653 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12654 rb_enc_autoload(enc);
12655 }
12656
12657 struct RString fake_str;
12658 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12659}
12660
12661VALUE
12662rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12663{
12664 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12665 rb_enc_autoload(enc);
12666 }
12667
12668 struct RString fake_str;
12669 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12670}
12671
12672VALUE
12674{
12675 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12676}
12677
#if USE_YJIT
/*
 * Appends codepoint to str.
 *
 * Fast path: when str's inlined encoding index is the ASCII-8BIT index and
 * the code is in the range 0 <= code < 0xff, the single byte is appended
 * directly.  Every other case (including the value 0xff itself) is
 * delegated to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(is_binary)) {
        /* The conversion is attempted only for binary strings. */
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int single_byte = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12694
12695static int
12696fstring_set_class_i(VALUE *str, void *data)
12697{
12698 RBASIC_SET_CLASS(*str, rb_cString);
12699
12700 return ST_CONTINUE;
12701}
12702
12703void
12704Init_String(void)
12705{
    /*
     * Defines the String class, registers its allocator and methods, sets
     * up the $; / $-F hooked variable, and then defines the Symbol class
     * and registers its methods.
     */
12706 rb_cString = rb_define_class("String", rb_cObject);
12707
    /* Visit every entry of fstring_table_obj; the callback
     * (fstring_set_class_i) sets each entry's class to rb_cString. */
12708 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12709
12711 rb_define_alloc_func(rb_cString, empty_str_alloc);
12712 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12713 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12714 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12715 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12716 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12719 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12720 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12721 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12722 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12725 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12726 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12727 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12728 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12731 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12732 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12733 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12734 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12735 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12737 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12739 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12740 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12741 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12742 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12743 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12744 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12746 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12747 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12748 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12749 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12750 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12751 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12752 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12753 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12755 rb_define_method(rb_cString, "+@", str_uplus, 0);
12756 rb_define_method(rb_cString, "-@", str_uminus, 0);
12757 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12758 rb_define_alias(rb_cString, "dedup", "-@");
12759
    /* "to_s" and "to_str" share one implementation (rb_str_to_s). */
12760 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12761 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12762 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12763 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12766 rb_define_method(rb_cString, "undump", str_undump, 0);
12767
    /* Symbols :ascii, :turkic, :lithuanian and :fold, kept in variables
     * declared outside this function.
     * NOTE(review): presumably the option names accepted by the
     * case-mapping methods registered just below -- confirm. */
12768 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12769 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12770 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12771 sym_fold = ID2SYM(rb_intern_const("fold"));
12772
12773 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12774 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12775 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12776 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12777
12778 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12779 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12780 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12781 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12782
12783 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12784 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12785 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12786 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12787 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12788 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12789 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12790 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12791 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12792 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12793 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12794 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12796 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12797 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12798 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12799 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12800 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12801
12802 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12803 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12804 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12805
12806 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12807
12808 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12809 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12810 rb_define_method(rb_cString, "center", rb_str_center, -1);
12811
12812 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12813 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12814 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12815 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12816 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12817 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12818 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12819 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12820 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12821
12822 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12823 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12824 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12825 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12826 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12827 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12828 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12829 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12830 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12831
12832 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12833 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12834 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12835 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12836 rb_define_method(rb_cString, "count", rb_str_count, -1);
12837
12838 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12839 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12840 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12841 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12842
12843 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12844 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12845 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12846 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12847 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12848
12849 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12850
    /* "slice" uses the same implementation as "[]" (rb_str_aref_m). */
12851 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12852 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12853
12854 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12855 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12856
12857 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12858 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12859 rb_define_method(rb_cString, "b", rb_str_b, 0);
12860 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12861 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12862
12863 /* define UnicodeNormalize module here so that we don't have to look it up */
12864 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12865 id_normalize = rb_intern_const("normalize");
12866 id_normalized_p = rb_intern_const("normalized?");
12867
12868 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12869 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12870 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12871
    /* $; and $-F are both backed by rb_fs (initially nil), with
     * rb_fs_setter as the setter hook; rb_fs is also registered with
     * the GC via rb_gc_register_address(). */
12872 rb_fs = Qnil;
12873 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12874 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12875 rb_gc_register_address(&rb_fs);
12876
    /* Symbol class and its methods; most implementations are the sym_*
     * wrappers defined earlier in this file. */
12877 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12881 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12882
12883 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12884 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12885 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12886 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12887 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12888 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12889
12890 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12891 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12892 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12893 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12894
12895 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12896 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12897 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12898 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12899 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12900 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12901 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12902
12903 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12904 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12905 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12906 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12907
12908 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12909 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12910
12911 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12912}
12913
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1691
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1474
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1592
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2663
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1036
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:643
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2122
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2140
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1308
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3536
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:242
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:550
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:174
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1296
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3220
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1317
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:932
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1182
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2988
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1201
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12651
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2294
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3692
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1130
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1422
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1323
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:951
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12673
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:816
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
Return value of rb_econv_convert().
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2677
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2940
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:701
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1862
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1868
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1927
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4220
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3717
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1717
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1487
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2447
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3757
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1398
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12281
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2520
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1374
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1711
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3016
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5392
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4121
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3113
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11580
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1778
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1753
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1164
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:986
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1493
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1956
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4107
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3525
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2383
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:1974
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6644
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3121
rb_gvar_setter_t rb_str_setter
This is an rb_gvar_setter_t that rejects non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12645
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1404
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3723
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3063
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4228
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3347
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7311
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2750
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12638
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4175
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3994
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4150
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3699
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3238
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5902
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11638
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1667
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2910
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3210
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3329
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1176
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2704
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7418
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1386
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1683
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2397
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5820
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9479
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1170
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1815
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1984
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2063
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3340
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1603
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12605
ID rb_to_id(VALUE str)
Definition string.c:12595
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3496
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4464
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1416
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a...
Definition string.c:2887
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2769
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1410
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2782
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1744
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:458
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1586
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:204
Definition string.c:8362
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113