Ruby 3.5.0dev (2025-09-18 revision e6879401feba22e3657a231cbedc751998cb7176)
string.c (e6879401feba22e3657a231cbedc751998cb7176)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * in a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
/* RString-specific flag bits, expressed in terms of the generic FL_USERn
 * bits; the bit numbers match the flag table in the comment above.
 * RUBY_MAX_CHAR_LEN is presumably the largest byte length of one character
 * -- TODO confirm against its users. */
130#define RUBY_MAX_CHAR_LEN 16
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
137
/* STR_SET_NOEMBED: set STR_NOEMBED on `str` and clear the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags. */
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
141} while (0)
/* STR_SET_EMBED: clear STR_NOEMBED, STR_SHARED and STR_NOFREE on `str`. */
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
143
/* STR_SET_LEN: store `n` into the len field of `str`; nothing else is
 * touched (no terminator is written, no capacity check is made). */
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
146} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
/* TERM_LEN: terminator length in bytes for `str`: 1 for the fast-path
 * encodings, otherwise the minimum character length (rb_enc_mbminlen) of
 * the string's encoding. */
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* TERM_FILL: write a terminator of `termlen` zero bytes at `ptr`.  Each
 * argument is evaluated once (captured in a local).  The first byte is
 * always cleared; memset runs only for terminators wider than one byte. */
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
175} while (0)
176
/* RESIZE_CAPA: resize the buffer of `str` to `capacity` bytes plus the
 * terminator, taking the terminator length from the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* RESIZE_CAPA_TERM: make room for `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the current contents
 *   (without terminator) are copied, and the string becomes non-embedded
 *   with capa = capacity.
 * - Heap string: the buffer is reallocated (it must not be shared) and capa
 *   is updated.  STR_HEAP_SIZE(str) is read as the old size, so capa must
 *   be assigned only after the realloc.
 *
 * The macro arguments are parenthesized in the size comparison so that an
 * argument containing a lower-precedence operator (e.g. `a ? b : c`)
 * cannot change the meaning of the expression. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
/* STR_SET_SHARED: make `str` a shared string whose buffer belongs to
 * `shared_str`.  Does nothing for fake strings (STR_FAKESTR).  Otherwise:
 * asserts that str's pointer lies within shared_str's buffer, records
 * shared_str in aux.shared through the RB_OBJ_WRITE write barrier, sets
 * STR_SHARED on str and STR_SHARED_ROOT on shared_str, and additionally
 * marks shared_str STR_BORROWED when its class is 0. */
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
210 } \
211} while (0)
212
/* STR_HEAP_PTR: the heap buffer pointer of a non-embedded string.
 * STR_HEAP_SIZE: heap buffer size in bytes, i.e. capa plus the terminator
 * length (capa itself excludes the terminator). */
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
215/* TODO: include the terminator size in capa. */
216
/* STR_ENC_GET: the rb_encoding of `str`, via get_encoding(). */
217#define STR_ENC_GET(str) get_encoding(str)
218
/* SHARABLE_SUBSTRING_P(beg, len, end): whether the substring [beg, beg+len)
 * of a string ending at `end` may share the buffer.  With
 * SHARABLE_MIDDLE_SUBSTRING off (the default here) only substrings that
 * reach the end qualify; with it on, every substring does. */
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
221#endif
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#else
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* Whether a string of `len` content bytes plus `termlen` terminator bytes
 * fits in an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(object_size);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Debugging aid: report on stderr that the accessor named `func` is about
 * to hand out a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
/* PRECOMPUTED_FAKESTR_HASH is defined only when a long is as wide as a
 * pointer.  Code in this file that is guarded by it keeps a fake string's
 * hash in the `capa` field (a long) and reads it back as st_index_t. */
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
/*
 * Return the fstring (entry of the fstring table) for `str`.
 *
 * `str` must be a T_STRING (Check_Type raises otherwise).  A string that
 * already has RSTRING_FSTR set is returned as is.
 *
 * Non-bare strings (see BARE_STRING_P) are never registered themselves:
 *  - an embedded one is simply frozen and returned;
 *  - one that is a shared root (and not itself shared) is returned as is;
 *  - otherwise the bare fstring is registered, `str` is switched to share
 *    its buffer, frozen, and `str` itself is returned.
 * For bare strings the registered fstring is returned.
 *
 * NOTE(review): this listing appears to skip a line inside the shared-root
 * branch; the comments describe the visible statements only.
 */
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
 /* Resize the string to its own length, unless it is frozen, chilled or
  * points at a buffer that must not be freed. */
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
/* Equality callback of the fstring table: two strings match when their byte
 * lengths are equal, their encoding indexes are equal and their bytes are
 * identical.
 * NOTE(review): this listing appears to skip lines right after the local
 * declarations; only the visible statements are described. */
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
 /* fetch pointer and length of both strings */
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
 /* the length check comes first, so memcmp only compares equal-sized buffers */
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* NOTE(review): the opening line of this declaration is not present in this
 * listing; these are the fields fstring_concurrent_set_create reads through
 * its `struct fstr_create_arg *` argument. */
/* copy: for a fake string, build the new string from a copy of the bytes
 * instead of pointing at the original buffer. */
479 bool copy;
/* force_precompute_hash: when copying, prefer an embedded allocation with
 * room for the hash and store the hash in it (if such a size is embeddable). */
480 bool force_precompute_hash;
481};
482
/*
 * Create callback of the fstring table: produce the string object that will
 * be stored for `str`.  `data` is a struct fstr_create_arg.
 *
 * Fake strings (STR_FAKESTR) are materialized as real objects, either by
 * copying the bytes (arg->copy) or by referencing the same buffer via
 * str_new_static, and are then frozen.  Real strings are replaced by a
 * frozen String copy when they are not frozen, are chilled, or are not bare;
 * shared ones are made independent.
 *
 * In all cases the coderange observed on entry is re-applied and
 * RSTRING_FSTR is set on the result.
 *
 * NOTE(review): this listing appears to skip a few lines (after
 * str_make_independent and around the trailing assertions); only the
 * visible statements are described.
 */
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
 /* capacity that leaves room for a stored hash after the contents */
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
 /* embedded copy: bytes, terminator, encoding, then the hash */
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
 /* reuse the hash kept in the fake string's capa field, but only when
  * the new string has spare room for it */
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
 /* no copy requested: the new string points at the fake string's buffer */
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
 /* restore the coderange captured at the top and mark the result as fstring */
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
/* Callback table handed to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and creation use the functions above; no free callback
 * is installed. */
551static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555 .free = NULL,
556};
557
558void
559Init_fstring_table(void)
560{
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
563}
564
/*
 * Find `str` in the fstring table, inserting a newly created entry when it
 * is absent, and return the table's string.
 *
 * `copy` and `force_precompute_hash` are forwarded to the create callback
 * through a struct fstr_create_arg.  The result is asserted to be frozen
 * and not a fake string.
 *
 * NOTE(review): this listing appears to skip lines among the trailing
 * assertions; only the visible ones are described.
 */
565static VALUE
566register_fstring(VALUE str, bool copy, bool force_precompute_hash)
567{
568 struct fstr_create_arg args = {
569 .copy = copy,
570 .force_precompute_hash = force_precompute_hash
571 };
572
573#if SIZEOF_VOIDP == SIZEOF_LONG
574 if (FL_TEST_RAW(str, STR_FAKESTR)) {
575 // if the string hasn't been interned, we'll need the hash twice, so we
576 // compute it once and store it in capa
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
578 }
579#endif
580
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
582
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
585 RUBY_ASSERT(OBJ_FROZEN(result));
586 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
588
589 return result;
590}
591
592bool
593rb_obj_is_fstring_table(VALUE obj)
594{
595 ASSERT_vm_locking();
596
597 return obj == fstring_table_obj;
598}
599
600void
601rb_gc_free_fstring(VALUE obj)
602{
603 // Assume locking and barrier (which there is no assert for)
604 ASSERT_vm_locking();
605
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
607
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
609
610 FL_UNSET(obj, RSTRING_FSTR);
611}
612
613void
614rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
615{
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
618 }
619}
620
/*
 * Initialize the caller-provided `fake_str` header as a non-embedded,
 * no-free, fake String (T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR)
 * that points at `name` with length and capa `len` and encoding index
 * `encidx`.  The buffer is not copied.  Returns the header cast to VALUE.
 *
 * A NULL `name` is replaced by the empty string literal.
 * NOTE(review): this listing appears to skip a line inside that branch.
 */
622static VALUE
623setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
624{
625 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
626 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
627
628 if (!name) {
630 name = "";
631 }
632
633 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
634
635 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
636 fake_str->len = len;
 /* const is cast away only to fit the field type */
637 fake_str->as.heap.ptr = (char *)name;
638 fake_str->as.heap.aux.capa = len;
639 return (VALUE)fake_str;
640}
640
641/*
642 * Set up a fake string which refers to a static string literal.
643 */
644VALUE
645rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
646{
647 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
648}
649
650/*
651 * The rb_fstring_new and rb_fstring_cstr family creates or looks up a frozen
652 * shared string which refers to a static string literal. `ptr` must
653 * point to a constant string.
654 */
655VALUE
656rb_fstring_new(const char *ptr, long len)
657{
658 struct RString fake_str;
659 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
660}
661
662VALUE
663rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
664{
665 struct RString fake_str;
666 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
667}
668
669VALUE
670rb_fstring_cstr(const char *ptr)
671{
672 return rb_fstring_new(ptr, strlen(ptr));
673}
674
/*
 * Whether `str` can be handled one byte per character.
 *
 * True for ASCII-8BIT and US-ASCII strings, for strings whose cached
 * coderange is 7bit, and for encodings whose maximum character length is 1.
 * A false result is conservative: the string may still consist of single
 * bytes only.
 *
 * NOTE(review): this listing appears to skip the statement of the UTF-8
 * case; as shown, that case falls out of the switch into the checks below.
 */
675static inline bool
676single_byte_optimizable(VALUE str)
677{
678 int encindex = ENCODING_GET(str);
679 switch (encindex) {
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
682 return true;
683 case ENCINDEX_UTF_8:
684 // For UTF-8 it's worth scanning the string coderange when unknown.
686 }
687 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
688 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
689 return true;
690 }
691
692 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
693 return true;
694 }
695
696 /* Conservative. Possibly single byte.
697 * "\xa1" in Shift_JIS for example. */
698 return false;
699}
700
702
/*
 * Find the first byte in [p, e) whose high bit (0x80) is set.
 *
 * Returns a pointer to that byte, or NULL when every byte is 7-bit.
 *
 * The range is processed in up to three phases:
 *   1. (only without unaligned word access) bytes before the first
 *      word-aligned address are tested one at a time;
 *   2. whole uintptr_t words are tested against NONASCII_MASK;
 *   3. the remaining tail, shorter than one word, is tested byte by byte.
 * The `case` labels of both switches fall through on purpose, so entering
 * at `case k` tests the k bytes in order.
 */
703static inline const char *
704search_nonascii(const char *p, const char *e)
705{
706 const uintptr_t *s, *t;
707
 /* NONASCII_MASK: 0x80 replicated into every byte of a uintptr_t */
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
713# else
714# error "don't know what to do."
715# endif
716#else
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL /* or...? */
721# else
722# error "don't know what to do."
723# endif
724#endif
725
 /* word-wise scan only when at least one full word may fit */
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
 /* l = number of bytes up to the next word boundary; p is advanced
  * first, so the skipped bytes are addressed as p[-l] .. p[-1] */
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
730 p += l;
731 switch (l) {
732 default: UNREACHABLE;
733#if SIZEOF_VOIDP > 4
734 case 7: if (p[-7]&0x80) return p-7;
735 case 6: if (p[-6]&0x80) return p-6;
736 case 5: if (p[-5]&0x80) return p-5;
737 case 4: if (p[-4]&0x80) return p-4;
738#endif
739 case 3: if (p[-3]&0x80) return p-3;
740 case 2: if (p[-2]&0x80) return p-2;
741 case 1: if (p[-1]&0x80) return p-1;
742 case 0: break;
743 }
744 }
745#endif
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#else
750#define aligned_ptr(value) (uintptr_t *)(value)
751#endif
752 s = aligned_ptr(p);
 /* t bounds the loop so that a full word starting at s never reaches past e */
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
754#undef aligned_ptr
755 for (;s < t; s++) {
756 if (*s & NONASCII_MASK) {
 /* bit position of the first set mask bit, divided by 8, is the byte
  * offset within the word: counted from the most significant end on
  * big-endian targets, from the least significant end otherwise */
757#ifdef WORDS_BIGENDIAN
758 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759#else
760 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
761#endif
762 }
763 }
764 p = (const char *)s;
765 }
766
 /* tail: the bytes left are addressed backwards from e */
767 switch (e - p) {
768 default: UNREACHABLE;
769#if SIZEOF_VOIDP > 4
770 case 7: if (e[-7]&0x80) return e-7;
771 case 6: if (e[-6]&0x80) return e-6;
772 case 5: if (e[-5]&0x80) return e-5;
773 case 4: if (e[-4]&0x80) return e-4;
774#endif
775 case 3: if (e[-3]&0x80) return e-3;
776 case 2: if (e[-2]&0x80) return e-2;
777 case 1: if (e[-1]&0x80) return e-1;
778 case 0: return NULL;
779 }
780}
781
/*
 * Scan `len` bytes at `p` under encoding `enc` and return a coderange.
 *
 * For ASCII-compatible encodings, runs of 7-bit bytes are skipped with
 * search_nonascii() and each non-ASCII position is measured with
 * rb_enc_precise_mbclen(); ENC_CODERANGE_7BIT is returned when no non-ASCII
 * byte exists.  For other encodings every character is measured in turn.
 *
 * NOTE(review): this listing appears to skip several lines (the result of
 * the ASCII-8BIT branch and the handling of a failed rb_enc_precise_mbclen
 * in both loops); only the visible statements are described.
 */
782static int
783coderange_scan(const char *p, long len, rb_encoding *enc)
784{
785 const char *e = p + len;
786
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
789 p = search_nonascii(p, e);
791 }
792
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
795 if (!p) return ENC_CODERANGE_7BIT;
796 for (;;) {
797 int ret = rb_enc_precise_mbclen(p, e, enc);
799 p += MBCLEN_CHARFOUND_LEN(ret);
800 if (p == e) break;
 /* jump to the next non-ASCII byte, if any */
801 p = search_nonascii(p, e);
802 if (!p) break;
803 }
804 }
805 else {
806 while (p < e) {
807 int ret = rb_enc_precise_mbclen(p, e, enc);
809 p += MBCLEN_CHARFOUND_LEN(ret);
810 }
811 }
812 return ENC_CODERANGE_VALID;
813}
814
/*
 * Coderange scan over [s, e) that can be resumed across chunks: `*cr`
 * carries the coderange known so far.
 *
 * Returns the number of bytes consumed: e - s when the whole range was
 * scanned, or the offset of the position where rb_enc_precise_mbclen() did
 * not find a complete character.  If `*cr` is already ENC_CODERANGE_BROKEN
 * the range is not inspected at all.
 *
 * NOTE(review): this listing appears to skip several lines that update
 * `*cr` (in the ASCII-8BIT branch, before the early `return p - s`
 * statements and before the final return); only the visible statements are
 * described.
 */
815long
816rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
817{
818 const char *p = s;
819
820 if (*cr == ENC_CODERANGE_BROKEN)
821 return e - s;
822
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
824 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
825 if (*cr == ENC_CODERANGE_VALID) return e - s;
826 p = search_nonascii(p, e);
828 return e - s;
829 }
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
832 if (!p) {
 /* all bytes are 7-bit: keep an earlier VALID, otherwise report 7BIT */
833 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
834 return e - s;
835 }
836 for (;;) {
837 int ret = rb_enc_precise_mbclen(p, e, enc);
838 if (!MBCLEN_CHARFOUND_P(ret)) {
840 return p - s;
841 }
842 p += MBCLEN_CHARFOUND_LEN(ret);
843 if (p == e) break;
844 p = search_nonascii(p, e);
845 if (!p) break;
846 }
847 }
848 else {
849 while (p < e) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 }
857 }
859 return e - s;
860}
861
862static inline void
863str_enc_copy(VALUE str1, VALUE str2)
864{
865 rb_enc_set_index(str1, ENCODING_GET(str2));
866}
867
868/* Like str_enc_copy, but does not check frozen status of str1.
869 * You should use this only if you're certain that str1 is not frozen. */
870static inline void
871str_enc_copy_direct(VALUE str1, VALUE str2)
872{
873 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
874 if (inlined_encoding == ENCODING_INLINE_MAX) {
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
876 }
877 else {
878 ENCODING_SET_INLINED(str1, inlined_encoding);
879 }
880}
881
/* Copy the encoding of `src` to `dest`, a new string made from a part of
 * `src`, and derive a coderange for `dest`.
 *
 * NOTE(review): this listing appears to skip most of the statements that
 * set the coderange (the branch bodies for an empty `dest` and the `case`
 * labels and bodies of the switch).  Only the visible conditions are
 * described: the empty-string case depends on whether src's encoding is
 * ASCII-compatible, and one switch case additionally checks `dest` for
 * non-ASCII bytes with search_nonascii(). */
883static void
884rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
885{
886 /* this function is designed for copying encoding and coderange
887 * from src to new string "dest" which is made from the part of src.
888 */
889 str_enc_copy(dest, src);
890 if (RSTRING_LEN(dest) == 0) {
891 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
893 else
895 return;
896 }
897 switch (ENC_CODERANGE(src)) {
900 break;
902 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
903 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
905 else
907 break;
908 default:
909 break;
910 }
911}
911
/* Copy the encoding of `src` to `dest`.
 * NOTE(review): this listing appears to skip one statement after
 * str_enc_copy(); judging by the function name it probably copies the
 * coderange as well -- TODO confirm against the full source. */
913static void
914rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
915{
916 str_enc_copy(dest, src);
918}
918
919static int
920enc_coderange_scan(VALUE str, rb_encoding *enc)
921{
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
923}
924
925int
926rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
927{
928 return enc_coderange_scan(str, enc);
929}
930
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses a parameter named `str`.
 *
 * Returns the coderange of `str`.  When the cached value is
 * ENC_CODERANGE_UNKNOWN the string is scanned under its own encoding and
 * the result is stored back on the string before being returned. */
931int
933{
934 int cr = ENC_CODERANGE(str);
935
936 if (cr == ENC_CODERANGE_UNKNOWN) {
937 cr = enc_coderange_scan(str, get_encoding(str));
938 ENC_CODERANGE_SET(str, cr);
939 }
940 return cr;
941}
942
943static inline bool
944rb_enc_str_asciicompat(VALUE str)
945{
946 int encindex = ENCODING_GET_INLINED(str);
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
948}
949
/* NOTE(review): the declarator line and the `case` labels of the switch are
 * not present in this listing; the body uses a parameter named `str`.
 *
 * Dispatches on the cached coderange of `str`: one case answers with
 * "encoding is ASCII-compatible and the string is ASCII-only", another
 * answers true, and every other coderange answers false. */
950int
952{
953 switch(ENC_CODERANGE(str)) {
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
957 return true;
958 default:
959 return false;
960 }
961}
962
963static inline void
964str_mod_check(VALUE s, const char *p, long len)
965{
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
967 rb_raise(rb_eRuntimeError, "string modified");
968 }
969}
970
971static size_t
972str_capacity(VALUE str, const int termlen)
973{
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
976 }
977 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
978 return RSTRING(str)->len;
979 }
980 else {
981 return RSTRING(str)->as.heap.aux.capa;
982 }
983}
984
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses a parameter named `str`.
 *
 * Returns str_capacity() of `str` using the terminator length of the
 * string's own encoding. */
985size_t
987{
988 return str_capacity(str, TERM_LEN(str));
989}
990
991static inline void
992must_not_null(const char *ptr)
993{
994 if (!ptr) {
995 rb_raise(rb_eArgError, "NULL pointer given");
996 }
997}
998
/* Allocate an embedded string object of class `klass` whose embedded area
 * can hold `capa` bytes (the caller includes the terminator in `capa`).
 * The new string has length 0 and a zero first byte.
 *
 * The resulting object size must be non-zero and allocatable by the GC
 * (asserted).
 * NOTE(review): this listing appears to skip the continuation line of the
 * NEWOBJ_OF(...) call that carries the flags and size arguments. */
999static inline VALUE
1000str_alloc_embed(VALUE klass, size_t capa)
1001{
1002 size_t size = rb_str_embed_size(capa);
1003 RUBY_ASSERT(size > 0);
1004 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1005
1006 NEWOBJ_OF(str, struct RString, klass,
1008
1009 str->len = 0;
1010 str->as.embed.ary[0] = 0;
1011
1012 return (VALUE)str;
1013}
1014
1015static inline VALUE
1016str_alloc_heap(VALUE klass)
1017{
1018 NEWOBJ_OF(str, struct RString, klass,
1019 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1020
1021 str->len = 0;
1022 str->as.heap.aux.capa = 0;
1023 str->as.heap.ptr = NULL;
1024
1025 return (VALUE)str;
1026}
1027
/* Allocate an empty embedded string of class `klass`, fire the DTrace
 * string-creation hook with length 0, and zero the whole embedded area.
 * NOTE(review): this listing appears to skip one statement before the
 * return. */
1028static inline VALUE
1029empty_str_alloc(VALUE klass)
1030{
1031 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1032 VALUE str = str_alloc_embed(klass, 0);
 /* clear every byte of the slot's embedded area, not only the terminator */
1033 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1035 return str;
1036}
1037
1038static VALUE
1039str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1040{
1041 VALUE str;
1042
1043 if (len < 0) {
1044 rb_raise(rb_eArgError, "negative string size (or size too big)");
1045 }
1046
1047 if (enc == NULL) {
1048 enc = rb_ascii8bit_encoding();
1049 }
1050
1051 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1052
1053 int termlen = rb_enc_mbminlen(enc);
1054
1055 if (STR_EMBEDDABLE_P(len, termlen)) {
1056 str = str_alloc_embed(klass, len + termlen);
1057 if (len == 0) {
1058 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1059 }
1060 }
1061 else {
1062 str = str_alloc_heap(klass);
1063 RSTRING(str)->as.heap.aux.capa = len;
1064 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1065 * integer overflow. If we can STATIC_ASSERT that, the following
1066 * mul_add_mul can be reverted to a simple ALLOC_N. */
1067 RSTRING(str)->as.heap.ptr =
1068 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1069 }
1070
1071 rb_enc_raw_set(str, enc);
1072
1073 if (ptr) {
1074 memcpy(RSTRING_PTR(str), ptr, len);
1075 }
1076 else {
1077 memset(RSTRING_PTR(str), 0, len);
1078 }
1079
1080 STR_SET_LEN(str, len);
1081 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1082 return str;
1083}
1084
1085static VALUE
1086str_new(VALUE klass, const char *ptr, long len)
1087{
1088 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1089}
1090
1091VALUE
1092rb_str_new(const char *ptr, long len)
1093{
1094 return str_new(rb_cString, ptr, len);
1095}
1096
1097VALUE
1098rb_usascii_str_new(const char *ptr, long len)
1099{
1100 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1101}
1102
1103VALUE
1104rb_utf8_str_new(const char *ptr, long len)
1105{
1106 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1107}
1108
1109VALUE
1110rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1111{
1112 return str_enc_new(rb_cString, ptr, len, enc);
1113}
1114
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses a parameter named `ptr`.
 *
 * Builds a String with rb_str_new() from the NUL-terminated C string `ptr`.
 * A NULL `ptr` raises ArgumentError (must_not_null). */
1115VALUE
1117{
1118 must_not_null(ptr);
1119 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1120 * memory regions, and that cannot be detected by the MSAN. Just
1121 * trust the programmer that the argument passed here is a sane C
1122 * string. */
1123 __msan_unpoison_string(ptr);
1124 return rb_str_new(ptr, strlen(ptr));
1125}
1126
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses a parameter named `ptr`.
 *
 * Delegates to rb_enc_str_new_cstr() with the US-ASCII encoding. */
1127VALUE
1129{
1130 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1131}
1132
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses a parameter named `ptr`.
 *
 * Delegates to rb_enc_str_new_cstr() with the UTF-8 encoding. */
1133VALUE
1135{
1136 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1137}
1138
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses parameters named `ptr` and
 * `enc`.
 *
 * Builds a String in encoding `enc` from the NUL-terminated C string `ptr`.
 * Raises ArgumentError for a NULL `ptr`, and for an encoding whose minimum
 * character length is not 1 (strlen() could not measure such a string). */
1139VALUE
1141{
1142 must_not_null(ptr);
1143 if (rb_enc_mbminlen(enc) != 1) {
1144 rb_raise(rb_eArgError, "wchar encoding given");
1145 }
1146 return rb_enc_str_new(ptr, strlen(ptr), enc);
1147}
1148
1149static VALUE
1150str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1151{
1152 VALUE str;
1153
1154 if (len < 0) {
1155 rb_raise(rb_eArgError, "negative string size (or size too big)");
1156 }
1157
1158 if (!ptr) {
1159 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1160 }
1161 else {
1162 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1163 str = str_alloc_heap(klass);
1164 RSTRING(str)->len = len;
1165 RSTRING(str)->as.heap.ptr = (char *)ptr;
1166 RSTRING(str)->as.heap.aux.capa = len;
1167 RBASIC(str)->flags |= STR_NOFREE;
1168 rb_enc_associate_index(str, encindex);
1169 }
1170 return str;
1171}
1172
1173VALUE
1174rb_str_new_static(const char *ptr, long len)
1175{
1176 return str_new_static(rb_cString, ptr, len, 0);
1177}
1178
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses parameters named `ptr` and
 * `len`.
 *
 * Delegates to str_new_static() with class String and the US-ASCII
 * encoding index. */
1179VALUE
1181{
1182 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1183}
1184
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses parameters named `ptr` and
 * `len`.
 *
 * Delegates to str_new_static() with class String and the UTF-8 encoding
 * index. */
1185VALUE
1187{
1188 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1189}
1190
/* NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; the body uses parameters named `ptr`, `len`
 * and `enc`.
 *
 * Delegates to str_new_static() with class String and the index of `enc`. */
1191VALUE
1193{
1194 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1195}
1196
1197static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1198 rb_encoding *from, rb_encoding *to,
1199 int ecflags, VALUE ecopts);
1200
1201static inline bool
1202is_enc_ascii_string(VALUE str, rb_encoding *enc)
1203{
1204 int encidx = rb_enc_to_index(enc);
1205 if (rb_enc_get_index(str) == encidx)
1206 return is_ascii_string(str);
1207 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1208}
1209
1210VALUE
1211rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1212{
1213 long len;
1214 const char *ptr;
1215 VALUE newstr;
1216
1217 if (!to) return str;
1218 if (!from) from = rb_enc_get(str);
1219 if (from == to) return str;
1220 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1221 rb_is_ascii8bit_enc(to)) {
1222 if (STR_ENC_GET(str) != to) {
1223 str = rb_str_dup(str);
1224 rb_enc_associate(str, to);
1225 }
1226 return str;
1227 }
1228
1229 RSTRING_GETMEM(str, ptr, len);
1230 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1231 from, to, ecflags, ecopts);
1232 if (NIL_P(newstr)) {
1233 /* some error, return original */
1234 return str;
1235 }
1236 return newstr;
1237}
1238
1239VALUE
1240rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1241 rb_encoding *from, int ecflags, VALUE ecopts)
1242{
1243 long olen;
1244
1245 olen = RSTRING_LEN(newstr);
1246 if (ofs < -olen || olen < ofs)
1247 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1248 if (ofs < 0) ofs += olen;
1249 if (!from) {
1250 STR_SET_LEN(newstr, ofs);
1251 return rb_str_cat(newstr, ptr, len);
1252 }
1253
1254 rb_str_modify(newstr);
1255 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1256 rb_enc_get(newstr),
1257 ecflags, ecopts);
1258}
1259
1260VALUE
1261rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1262{
1263 STR_SET_LEN(str, 0);
1264 rb_enc_associate(str, enc);
1265 rb_str_cat(str, ptr, len);
1266 return str;
1267}
1268
/*
 * Converts `ptr`/`len` from encoding `from` to encoding `to` and stores the
 * output in `newstr` starting at byte offset `ofs`, growing `newstr` as
 * needed.  On success `newstr` is truncated to the converted length,
 * associated with `to`, and returned; Qnil is returned when the converter
 * cannot be opened or the conversion ends in any state other than
 * econv_finished.
 *
 * NOTE(review): the embedded listing numbers skip 1275 and 1295.  `ret` is
 * used below without a visible declaration, and the `while` condition has no
 * visible final operand or opening brace -- presumably the missing lines are
 * the declaration of `ret` and a comparison of `ret` against a
 * "destination buffer full" result; confirm against upstream.
 */
1269static VALUE
1270str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1271 rb_encoding *from, rb_encoding *to,
1272 int ecflags, VALUE ecopts)
1273{
1274 rb_econv_t *ec;
1276 long olen;
1277 VALUE econv_wrapper;
1278 const unsigned char *start, *sp;
1279 unsigned char *dest, *dp;
1280 size_t converted_output = (size_t)ofs;
1281
1282 olen = rb_str_capacity(newstr);
1283
/* The converter is parked in a class-less EncodingConverter object while the
 * loop runs (presumably so it is released if an exception unwinds -- confirm). */
1284 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1285 RBASIC_CLEAR_CLASS(econv_wrapper);
1286 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1287 if (!ec) return Qnil;
1288 DATA_PTR(econv_wrapper) = ec;
1289
1290 sp = (unsigned char*)ptr;
1291 start = sp;
/* `dest` and `dp` are recomputed on every pass because rb_str_resize() below
 * may change the buffer address. */
1292 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1293 (dp = dest + converted_output),
1294 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1296 /* destination buffer short */
1297 size_t converted_input = sp - start;
1298 size_t rest = len - converted_input;
1299 converted_output = dp - dest;
1300 rb_str_set_len(newstr, converted_output);
/* Estimate the extra room from the output/input ratio seen so far; fall back
 * to doubling when no ratio is available or the product could overflow. */
1301 if (converted_input && converted_output &&
1302 rest < (LONG_MAX / converted_output)) {
1303 rest = (rest * converted_output) / converted_input;
1304 }
1305 else {
1306 rest = olen;
1307 }
1308 olen += rest < 2 ? 2 : rest;
1309 rb_str_resize(newstr, olen);
1310 }
/* Detach the converter from the wrapper before closing it by hand. */
1311 DATA_PTR(econv_wrapper) = 0;
1312 RB_GC_GUARD(econv_wrapper);
1313 rb_econv_close(ec);
1314 switch (ret) {
1315 case econv_finished:
1316 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1317 rb_str_set_len(newstr, len);
1318 rb_enc_associate(newstr, to);
1319 return newstr;
1320
1321 default:
1322 return Qnil;
1323 }
1324}
1325
/*
 * Converts `str` from `from` to `to` via rb_str_conv_enc_opts() with no
 * converter flags and no options.
 *
 * NOTE(review): the embedded listing numbers jump from 1326 to 1328; the
 * function-name line is absent -- presumably
 * rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to); confirm.
 */
1326VALUE
1328{
1329 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1330}
1331
/*
 * Creates a string from external bytes `ptr`/`len` declared to be in `eenc`,
 * converting to the default internal encoding when one is set and conversion
 * is needed.  If the conversion fails the bytes are kept as-is in `eenc`.
 *
 * NOTE(review): the embedded listing numbers jump from 1332 to 1334; the
 * function-name line is absent -- presumably
 * rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc);
 * confirm against upstream.
 */
1332VALUE
1334{
1335 rb_encoding *ienc;
1336 VALUE str;
1337 const int eidx = rb_enc_to_index(eenc);
1338
1339 if (!ptr) {
1340 return rb_enc_str_new(ptr, len, eenc);
1341 }
1342
1343 /* ASCII-8BIT case, no conversion */
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1346 return rb_str_new(ptr, len);
1347 }
1348 /* no default_internal or same encoding, no conversion */
1349 ienc = rb_default_internal_encoding();
1350 if (!ienc || eenc == ienc) {
1351 return rb_enc_str_new(ptr, len, eenc);
1352 }
1353 /* ASCII compatible, and ASCII only string, no conversion in
1354 * default_internal */
/* NOTE(review): the ASCII-8BIT test below cannot be true here, because that
 * case already returned above. */
1355 if ((eidx == rb_ascii8bit_encindex()) ||
1356 (eidx == rb_usascii_encindex()) ||
1357 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1358 return rb_enc_str_new(ptr, len, ienc);
1359 }
1360 /* convert from the given encoding to default_internal */
1361 str = rb_enc_str_new(NULL, 0, ienc);
1362 /* when the conversion failed for some reason, just ignore the
1363 * default_internal and result in the given encoding as-is. */
1364 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1365 rb_str_initialize(str, ptr, len, eenc);
1366 }
1367 return str;
1368}
1369
1370VALUE
1371rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1372{
1373 int eidx = rb_enc_to_index(eenc);
1374 if (eidx == rb_usascii_encindex() &&
1375 !is_ascii_string(str)) {
1376 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1377 return str;
1378 }
1379 rb_enc_associate_index(str, eidx);
1380 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1381}
1382
1383VALUE
1384rb_external_str_new(const char *ptr, long len)
1385{
1386 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1387}
1388
/*
 * NUL-terminated variant: measures `ptr` with strlen() and treats the bytes
 * as being in the default external encoding.
 *
 * NOTE(review): the embedded listing numbers jump from 1389 to 1391; the
 * function-name line is absent -- presumably
 * rb_external_str_new_cstr(const char *ptr); confirm against upstream.
 */
1389VALUE
1391{
1392 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1393}
1394
1395VALUE
1396rb_locale_str_new(const char *ptr, long len)
1397{
1398 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1399}
1400
/*
 * NUL-terminated variant: measures `ptr` with strlen() and treats the bytes
 * as being in the locale encoding.
 *
 * NOTE(review): the embedded listing numbers jump from 1401 to 1403; the
 * function-name line is absent -- presumably
 * rb_locale_str_new_cstr(const char *ptr); confirm against upstream.
 */
1401VALUE
1403{
1404 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1405}
1406
/*
 * Creates a string from `ptr`/`len`, treating the bytes as being in the
 * filesystem encoding.
 *
 * NOTE(review): the embedded listing numbers jump from 1407 to 1409; the
 * function-name line is absent -- presumably
 * rb_filesystem_str_new(const char *ptr, long len); confirm against upstream.
 */
1407VALUE
1409{
1410 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1411}
1412
/*
 * NUL-terminated variant: measures `ptr` with strlen() and treats the bytes
 * as being in the filesystem encoding.
 *
 * NOTE(review): the embedded listing numbers jump from 1413 to 1415; the
 * function-name line is absent -- presumably
 * rb_filesystem_str_new_cstr(const char *ptr); confirm against upstream.
 */
1413VALUE
1415{
1416 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1417}
1418
/*
 * Converts `str` to the default external encoding via rb_str_export_to_enc().
 *
 * NOTE(review): the embedded listing numbers jump from 1419 to 1421; the
 * function-name line is absent -- presumably rb_str_export(VALUE str);
 * confirm against upstream.
 */
1419VALUE
1421{
1422 return rb_str_export_to_enc(str, rb_default_external_encoding());
1423}
1424
/*
 * Converts `str` to the locale encoding via rb_str_export_to_enc().
 *
 * NOTE(review): the embedded listing numbers jump from 1425 to 1427; the
 * function-name line is absent -- presumably rb_str_export_locale(VALUE str);
 * confirm against upstream.
 */
1425VALUE
1427{
1428 return rb_str_export_to_enc(str, rb_locale_encoding());
1429}
1430
/*
 * Converts `str` from its current encoding to `enc` via rb_str_conv_enc().
 *
 * NOTE(review): the embedded listing numbers jump from 1431 to 1433; the
 * function-name line is absent -- presumably
 * rb_str_export_to_enc(VALUE str, rb_encoding *enc); confirm against upstream.
 */
1431VALUE
1433{
1434 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1435}
1436
/*
 * Makes `str2` hold the same bytes as `str` without touching the encoding
 * of `str2`.
 *
 * When the bytes plus terminator fit into the embedded storage of `str2`
 * they are copied there.  Otherwise `str2` is pointed at the buffer of a
 * frozen root (the existing shared root of `str`, or a new frozen copy made
 * by rb_str_new_frozen()) and marked as sharing it; a private heap buffer
 * previously owned by `str2` is released first.
 *
 * Sets the length of `str2` and returns `str2`.
 */
1437static VALUE
1438str_replace_shared_without_enc(VALUE str2, VALUE str)
1439{
1440 const int termlen = TERM_LEN(str);
1441 char *ptr;
1442 long len;
1443
1444 RSTRING_GETMEM(str, ptr, len);
1445 if (str_embed_capa(str2) >= len + termlen) {
1446 char *ptr2 = RSTRING(str2)->as.embed.ary;
1447 STR_SET_EMBED(str2);
1448 memcpy(ptr2, RSTRING_PTR(str), len);
1449 TERM_FILL(ptr2+len, termlen);
1450 }
1451 else {
1452 VALUE root;
1453 if (STR_SHARED_P(str)) {
1454 root = RSTRING(str)->as.heap.aux.shared;
1455 RSTRING_GETMEM(str, ptr, len);
1456 }
1457 else {
1458 root = rb_str_new_frozen(str);
/* Re-read pointer and length from the root: the bytes that will be shared
 * are the root's, not necessarily those of `str`. */
1459 RSTRING_GETMEM(root, ptr, len);
1460 }
1461 RUBY_ASSERT(OBJ_FROZEN(root));
1462
/* Only a heap buffer that str2 owns outright (not shared, not NOFREE) may be
 * freed here. */
1463 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1464 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1465 rb_fatal("about to free a possible shared root");
1466 }
1467 char *ptr2 = STR_HEAP_PTR(str2);
1468 if (ptr2 != ptr) {
1469 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1470 }
1471 }
1472 FL_SET(str2, STR_NOEMBED);
1473 RSTRING(str2)->as.heap.ptr = ptr;
1474 STR_SET_SHARED(str2, root);
1475 }
1476
1477 STR_SET_LEN(str2, len);
1478
1479 return str2;
1480}
1481
1482static VALUE
1483str_replace_shared(VALUE str2, VALUE str)
1484{
1485 str_replace_shared_without_enc(str2, str);
1486 rb_enc_cr_str_exact_copy(str2, str);
1487 return str2;
1488}
1489
1490static VALUE
1491str_new_shared(VALUE klass, VALUE str)
1492{
1493 return str_replace_shared(str_alloc_heap(klass), str);
1494}
1495
/*
 * Creates a string of the same class as `str` through str_new_shared().
 *
 * NOTE(review): the embedded listing numbers jump from 1496 to 1498; the
 * function-name line is absent -- presumably rb_str_new_shared(VALUE str);
 * confirm against upstream.
 */
1496VALUE
1498{
1499 return str_new_shared(rb_obj_class(str), str);
1500}
1501
/*
 * Returns a frozen string with the contents and class of `orig`.
 * `orig` itself is returned when it has FL_FREEZE set and STR_CHILLED clear;
 * otherwise a frozen copy is made by str_new_frozen().
 *
 * NOTE(review): the embedded listing numbers jump from 1502 to 1504; the
 * function-name line is absent -- presumably rb_str_new_frozen(VALUE orig);
 * confirm against upstream.
 */
1502VALUE
1504{
1505 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1506 return str_new_frozen(rb_obj_class(orig), orig);
1507}
1508
1509static VALUE
1510rb_str_new_frozen_String(VALUE orig)
1511{
1512 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1513 return str_new_frozen(rb_cString, orig);
1514}
1515
1516
1517VALUE
1518rb_str_frozen_bare_string(VALUE orig)
1519{
1520 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1521 return str_new_frozen(rb_cString, orig);
1522}
1523
1524VALUE
1525rb_str_tmp_frozen_acquire(VALUE orig)
1526{
1527 if (OBJ_FROZEN_RAW(orig)) return orig;
1528 return str_new_frozen_buffer(0, orig, FALSE);
1529}
1530
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned string is never
 * embedded.
 *
 * `orig` is returned as-is when it is frozen, heap-allocated and not
 * re-embeddable.  A shared `orig` whose root is heap-allocated is delegated
 * to rb_str_tmp_frozen_acquire().  Otherwise a class-less, frozen heap
 * string flagged STR_SHARED_ROOT is created: it either receives a copy of
 * the bytes, or takes over the heap buffer of `orig`, which then becomes a
 * sharer of the new string.
 */
1531VALUE
1532rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1533{
1534 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1535 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1536
1537 VALUE str = str_alloc_heap(0);
1538 OBJ_FREEZE(str);
1539 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1540 FL_SET(str, STR_SHARED_ROOT);
1541
1542 size_t capa = str_capacity(orig, TERM_LEN(orig));
1543
1544 /* If the string is embedded then we want to create a copy that is heap
1545 * allocated. If the string is shared then the shared root must be
1546 * embedded, so we want to create a copy. If the string is a shared root
1547 * then it must be embedded, so we want to create a copy. */
1548 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
/* Room for `capa` bytes plus the terminator is allocated; only `capa` bytes
 * are copied from `orig`. */
1549 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1550 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1551 }
1552 else {
1553 /* orig must be heap allocated and not shared, so we can safely transfer
1554 * the pointer to str. */
1555 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* A STR_NOFREE marking travels with the buffer: it moves from orig to str. */
1556 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1557 RBASIC(orig)->flags &= ~STR_NOFREE;
1558 STR_SET_SHARED(orig, str);
1559 }
1560
1561 RSTRING(str)->len = RSTRING(orig)->len;
1562 RSTRING(str)->as.heap.aux.capa = capa;
1563
1564 return str;
1565}
1566
/*
 * Releases a temporary frozen string `tmp` obtained for `orig`.
 *
 * Nothing happens when `tmp` has a class (it is then not a temporary made
 * for this purpose).  When `orig` is an unfrozen, TMPLOCK-ed, non-shared
 * string whose recorded shared root is `tmp`, and `tmp` is not marked
 * STR_BORROWED, ownership of the buffer is handed back to `orig` and `tmp`
 * is turned into an empty embedded string.
 *
 * NOTE(review): the embedded listing numbers skip 1574 and 1588, so the
 * body of the `STR_EMBED_P(tmp)` branch and one statement after the flag
 * transfer are not visible here; confirm against upstream before relying on
 * this text.
 */
1567void
1568rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1569{
1570 if (RBASIC_CLASS(tmp) != 0)
1571 return;
1572
1573 if (STR_EMBED_P(tmp)) {
1575 }
1576 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1577 !OBJ_FROZEN_RAW(orig)) {
1578 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1579
1580 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1581 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1582 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1583
1584 /* Unshare orig since the root (tmp) only has this one child. */
1585 FL_UNSET_RAW(orig, STR_SHARED);
1586 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1587 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1589
1590 /* Make tmp embedded and empty so it is safe for sweeping. */
1591 STR_SET_EMBED(tmp);
1592 STR_SET_LEN(tmp, 0);
1593 }
1594 }
1595}
1596
1597static VALUE
1598str_new_frozen(VALUE klass, VALUE orig)
1599{
1600 return str_new_frozen_buffer(klass, orig, TRUE);
1601}
1602
/*
 * Turns the heap buffer of `orig` into a shared one: a new heap string of
 * class `klass` takes over the pointer, length and capacity of `orig`, and
 * `orig` becomes a sharer of that new string.
 *
 * `orig` must be neither embedded nor already shared (asserted).
 * Returns the new string.
 */
1603static VALUE
1604heap_str_make_shared(VALUE klass, VALUE orig)
1605{
1606 RUBY_ASSERT(!STR_EMBED_P(orig));
1607 RUBY_ASSERT(!STR_SHARED_P(orig));
1608
1609 VALUE str = str_alloc_heap(klass);
1610 STR_SET_LEN(str, RSTRING_LEN(orig));
1611 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1612 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
/* A STR_NOFREE marking follows the buffer to its new owner. */
1613 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1614 RBASIC(orig)->flags &= ~STR_NOFREE;
1615 STR_SET_SHARED(orig, str);
/* Class-less results start out with STR_BORROWED cleared. */
1616 if (klass == 0)
1617 FL_UNSET_RAW(str, STR_BORROWED);
1618 return str;
1619}
1620
/*
 * Returns a frozen string of class `klass` holding the bytes of `orig`.
 *
 * copy_encoding - when true the encoding and code range of `orig` are
 *                 copied onto the result; when false ASCII-8BIT and a
 *                 one-byte terminator are assumed.
 *
 * Small or embedded strings are copied.  For a shared `orig` the existing
 * root is returned directly when it matches exactly (same span, class and
 * encoding); otherwise a new sharer of that root is created.  For an
 * unshared heap string the buffer is converted into a shared one by
 * heap_str_make_shared().
 *
 * NOTE(review): the embedded listing numbers skip 1642 (after the three
 * assertions in the shared branch); that line is not visible here.
 */
1621static VALUE
1622str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1623{
1624 VALUE str;
1625
1626 long len = RSTRING_LEN(orig);
1627 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1628 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1629
1630 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1631 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1632 RUBY_ASSERT(STR_EMBED_P(str));
1633 }
1634 else {
1635 if (FL_TEST_RAW(orig, STR_SHARED)) {
1636 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* `ofs` / `rest`: bytes of the root before / after the span used by orig. */
1637 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1638 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1639 RUBY_ASSERT(ofs >= 0);
1640 RUBY_ASSERT(rest >= 0);
1641 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1643
1644 if ((ofs > 0) || (rest > 0) ||
1645 (klass != RBASIC(shared)->klass) ||
1646 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1647 str = str_new_shared(klass, shared);
1648 RUBY_ASSERT(!STR_EMBED_P(str));
1649 RSTRING(str)->as.heap.ptr += ofs;
1650 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1651 }
1652 else {
/* The root itself is handed out; a class-less root is flagged as borrowed. */
1653 if (RBASIC_CLASS(shared) == 0)
1654 FL_SET_RAW(shared, STR_BORROWED);
1655 return shared;
1656 }
1657 }
1658 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1659 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1660 STR_SET_EMBED(str);
1661 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1662 STR_SET_LEN(str, RSTRING_LEN(orig));
1663 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1664 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1665 }
1666 else {
1667 str = heap_str_make_shared(klass, orig);
1668 }
1669 }
1670
1671 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1672 OBJ_FREEZE(str);
1673 return str;
1674}
1675
1676VALUE
1677rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1678{
1679 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1680}
1681
1682static VALUE
1683str_new_empty_String(VALUE str)
1684{
1685 VALUE v = rb_str_new(0, 0);
1686 rb_enc_copy(v, str);
1687 return v;
1688}
1689
/* Lower bound, in bytes, that rb_str_init() applies to an explicitly
 * requested capacity. */
1690#define STR_BUF_MIN_SIZE 63
1691
/*
 * Creates an empty String with room for `capa` bytes plus a one-byte
 * terminator: embedded when that fits, otherwise with a freshly allocated
 * heap buffer whose first byte is set to NUL.
 *
 * NOTE(review): the embedded listing numbers jump from 1692 to 1694; the
 * function-name line is absent -- presumably rb_str_buf_new(long capa);
 * confirm against upstream.
 */
1692VALUE
1694{
1695 if (STR_EMBEDDABLE_P(capa, 1)) {
1696 return str_alloc_embed(rb_cString, capa + 1);
1697 }
1698
1699 VALUE str = str_alloc_heap(rb_cString);
1700
1701 RSTRING(str)->as.heap.aux.capa = capa;
1702 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1703 RSTRING(str)->as.heap.ptr[0] = '\0';
1704
1705 return str;
1706}
1707
/*
 * Creates a string buffer sized for, and filled with, the NUL-terminated
 * `ptr`.
 *
 * NOTE(review): the embedded listing numbers jump from 1708 to 1710; the
 * function-name line is absent -- presumably
 * rb_str_buf_new_cstr(const char *ptr); confirm against upstream.
 */
1708VALUE
1710{
1711 VALUE str;
1712 long len = strlen(ptr);
1713
1714 str = rb_str_buf_new(len);
1715 rb_str_buf_cat(str, ptr, len);
1716
1717 return str;
1718}
1719
/*
 * Creates a class-less string of `len` bytes through str_new(0, 0, len).
 *
 * NOTE(review): the embedded listing numbers jump from 1720 to 1722; the
 * function-name line is absent -- presumably rb_str_tmp_new(long len);
 * confirm against upstream.
 */
1720VALUE
1722{
1723 return str_new(0, 0, len);
1724}
1725
/*
 * Releases the storage of `str`: the heap buffer is freed only when the
 * string is neither embedded, shared, nor flagged STR_NOFREE.  A debug
 * counter is bumped for each case.
 *
 * NOTE(review): the embedded listing numbers jump from 1726 to 1728; the
 * function-name line is absent -- presumably rb_str_free(VALUE str);
 * confirm against upstream.
 */
1726void
1728{
1729 if (STR_EMBED_P(str)) {
1730 RB_DEBUG_COUNTER_INC(obj_str_embed);
1731 }
1732 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
/* NOTE(review): both lines below bump obj_str_shared, including the one
 * conditioned on STR_NOFREE -- check whether a separate counter was meant. */
1733 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1734 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1735 }
1736 else {
1737 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1738 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1739 }
1740}
1741
1742size_t
1743rb_str_memsize(VALUE str)
1744{
1745 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1746 return STR_HEAP_SIZE(str);
1747 }
1748 else {
1749 return 0;
1750 }
1751}
1752
/*
 * Converts `str` to a String by calling its to_str method through
 * rb_convert_type_with_id().
 *
 * NOTE(review): the embedded listing numbers jump from 1753 to 1755; the
 * function-name line is absent -- presumably rb_str_to_str(VALUE str);
 * confirm against upstream.
 */
1753VALUE
1755{
1756 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1757}
1758
/* Forward declarations for helpers used before their definitions.
 * str_shared_replace() is defined just below; str_discard() is not defined
 * within this part of the file. */
1759static inline void str_discard(VALUE str);
1760static void str_shared_replace(VALUE str, VALUE str2);
1761
/*
 * Replaces the contents of `str` with those of `str2` via
 * str_shared_replace(); a no-op when both are the same object.
 *
 * NOTE(review): the embedded listing numbers jump from 1762 to 1764; the
 * function-name line is absent -- presumably
 * rb_str_shared_replace(VALUE str, VALUE str2); confirm against upstream.
 */
1762void
1764{
1765 if (str != str2) str_shared_replace(str, str2);
1766}
1767
/*
 * Moves the contents, encoding and code range of `str2` into `str`.
 *
 * The old contents of `str` are discarded first.  When the bytes plus
 * terminator fit into the embedded storage of `str` they are copied and
 * `str2` is left untouched; otherwise `str` takes over the heap buffer (or
 * shared root) of `str2`, and `str2` is reset to an empty embedded string.
 * `str` and `str2` must be different objects (asserted).
 */
1768static void
1769str_shared_replace(VALUE str, VALUE str2)
1770{
1771 rb_encoding *enc;
1772 int cr;
1773 int termlen;
1774
1775 RUBY_ASSERT(str2 != str);
1776 enc = STR_ENC_GET(str2);
1777 cr = ENC_CODERANGE(str2);
1778 str_discard(str);
1779 termlen = rb_enc_mbminlen(enc);
1780
1781 STR_SET_LEN(str, RSTRING_LEN(str2));
1782
1783 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1784 STR_SET_EMBED(str);
1785 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1786 rb_enc_associate(str, enc);
1787 ENC_CODERANGE_SET(str, cr);
1788 }
1789 else {
1790 if (STR_EMBED_P(str2)) {
/* Move str2's embedded bytes into a heap buffer first, so that the pointer
 * hand-over below works the same way for every str2. */
1791 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1792 long len = RSTRING_LEN(str2);
1793 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1794
1795 char *new_ptr = ALLOC_N(char, len + termlen);
1796 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1797 RSTRING(str2)->as.heap.ptr = new_ptr;
1798 STR_SET_LEN(str2, len);
1799 RSTRING(str2)->as.heap.aux.capa = len;
1800 STR_SET_NOEMBED(str2);
1801 }
1802
1803 STR_SET_NOEMBED(str);
1804 FL_UNSET(str, STR_SHARED);
1805 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1806
/* The aux field holds either the shared root or the capacity; carry over
 * whichever str2 has. */
1807 if (FL_TEST(str2, STR_SHARED)) {
1808 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1809 STR_SET_SHARED(str, shared);
1810 }
1811 else {
1812 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1813 }
1814
1815 /* abandon str2 */
1816 STR_SET_EMBED(str2);
1817 RSTRING_PTR(str2)[0] = 0;
1818 STR_SET_LEN(str2, 0);
1819 rb_enc_associate(str, enc);
1820 ENC_CODERANGE_SET(str, cr);
1821 }
1822}
1823
/*
 * Returns `obj` itself when it is a String; otherwise calls `to_s` on it
 * and passes the result through rb_obj_as_string_result().
 *
 * NOTE(review): the embedded listing numbers jump from 1824 to 1826; the
 * function-name line is absent -- presumably rb_obj_as_string(VALUE obj);
 * confirm against upstream.
 */
1824VALUE
1826{
1827 VALUE str;
1828
1829 if (RB_TYPE_P(obj, T_STRING)) {
1830 return obj;
1831 }
1832 str = rb_funcall(obj, idTo_s, 0);
1833 return rb_obj_as_string_result(str, obj);
1834}
1835
1836VALUE
1837rb_obj_as_string_result(VALUE str, VALUE obj)
1838{
1839 if (!RB_TYPE_P(str, T_STRING))
1840 return rb_any_to_s(obj);
1841 return str;
1842}
1843
/*
 * Replaces the contents of `str` with those of `str2`, sharing storage.
 * When `str2` is itself a sharer, `str` is pointed at the same bytes and the
 * same root; otherwise the work is done by str_replace_shared().
 * Returns `str`.
 *
 * NOTE(review): the embedded listing numbers skip 1852 (right after
 * `shared` is read); that line is not visible here.
 */
1844static VALUE
1845str_replace(VALUE str, VALUE str2)
1846{
1847 long len;
1848
1849 len = RSTRING_LEN(str2);
1850 if (STR_SHARED_P(str2)) {
1851 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1853 STR_SET_NOEMBED(str);
1854 STR_SET_LEN(str, len);
1855 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1856 STR_SET_SHARED(str, shared);
1857 rb_enc_cr_str_exact_copy(str, str2);
1858 }
1859 else {
1860 str_replace_shared(str, str2);
1861 }
1862
1863 return str;
1864}
1865
/*
 * Allocates an embedded string object of class `klass` sized for `capa`
 * bytes, using execution context `ec`, and sets its length to 0.
 *
 * NOTE(review): the embedded listing numbers skip 1874, so the remaining
 * arguments and the closing parenthesis of the NEWOBJ_OF(...) call are not
 * visible here; confirm against upstream.
 */
1866static inline VALUE
1867ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1868{
1869 size_t size = rb_str_embed_size(capa);
1870 RUBY_ASSERT(size > 0);
1871 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1872
1873 NEWOBJ_OF(str, struct RString, klass,
1875
1876 str->len = 0;
1877
1878 return (VALUE)str;
1879}
1880
1881static inline VALUE
1882ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1883{
1884 NEWOBJ_OF(str, struct RString, klass,
1885 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1886
1887 str->as.heap.aux.capa = 0;
1888 str->as.heap.ptr = NULL;
1889
1890 return (VALUE)str;
1891}
1892
1893static inline VALUE
1894str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1895{
1896 int encidx = 0;
1897 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1898 encidx = rb_enc_get_index(str);
1899 flags &= ~ENCODING_MASK;
1900 }
1901 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1902 if (encidx) rb_enc_associate_index(dup, encidx);
1903 return dup;
1904}
1905
/* Flag bits read from the source string by the str_duplicate_setup_*()
 * helpers: code range, encoding, and the frozen bit. */
1906static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1907
1908static inline VALUE
1909str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1910{
1911 VALUE flags = FL_TEST_RAW(str, flag_mask);
1912 long len = RSTRING_LEN(str);
1913
1914 RUBY_ASSERT(STR_EMBED_P(dup));
1915 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1916 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1917 STR_SET_LEN(dup, RSTRING_LEN(str));
1918 return str_duplicate_setup_encoding(str, dup, flags);
1919}
1920
/*
 * Makes the heap string `dup` share the bytes of `str`.
 *
 * The shared root is the existing root of `str` when `str` is a sharer;
 * an unfrozen `str` is first replaced by a frozen copy made with
 * str_new_frozen(); otherwise `str` itself serves as the root.  The root is
 * flagged STR_SHARED_ROOT and recorded in `dup`, and the flags are applied
 * through str_duplicate_setup_encoding().
 *
 * NOTE(review): the embedded listing numbers skip 1934 (after the
 * `!STR_SHARED_P(root)` assertion); that line is not visible here.
 */
1921static inline VALUE
1922str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1923{
1924 VALUE flags = FL_TEST_RAW(str, flag_mask);
1925 VALUE root = str;
1926 if (FL_TEST_RAW(str, STR_SHARED)) {
1927 root = RSTRING(str)->as.heap.aux.shared;
1928 }
1929 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1930 root = str = str_new_frozen(klass, str);
/* `str` now names the frozen copy, so its flags are read again. */
1931 flags = FL_TEST_RAW(str, flag_mask);
1932 }
1933 RUBY_ASSERT(!STR_SHARED_P(root));
1935
1936 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1937 FL_SET(root, STR_SHARED_ROOT);
1938 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1939 flags |= RSTRING_NOEMBED | STR_SHARED;
1940
1941 STR_SET_LEN(dup, RSTRING_LEN(str));
1942 return str_duplicate_setup_encoding(str, dup, flags);
1943}
1944
1945static inline VALUE
1946str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1947{
1948 if (STR_EMBED_P(str)) {
1949 return str_duplicate_setup_embed(klass, str, dup);
1950 }
1951 else {
1952 return str_duplicate_setup_heap(klass, str, dup);
1953 }
1954}
1955
1956static inline VALUE
1957str_duplicate(VALUE klass, VALUE str)
1958{
1959 VALUE dup;
1960 if (STR_EMBED_P(str)) {
1961 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1962 }
1963 else {
1964 dup = str_alloc_heap(klass);
1965 }
1966
1967 return str_duplicate_setup(klass, str, dup);
1968}
1969
/*
 * Duplicates `str`, keeping its class.
 *
 * NOTE(review): the embedded listing numbers jump from 1970 to 1972; the
 * function-name line is absent -- presumably rb_str_dup(VALUE str);
 * confirm against upstream.
 */
1970VALUE
1972{
1973 return str_duplicate(rb_obj_class(str), str);
1974}
1975
1976/* :nodoc: */
1977VALUE
1978rb_str_dup_m(VALUE str)
1979{
1980 if (LIKELY(BARE_STRING_P(str))) {
1981 return str_duplicate(rb_cString, str);
1982 }
1983 else {
1984 return rb_obj_dup(str);
1985 }
1986}
1987
/*
 * Fires the string-creation DTrace hook and returns a String duplicate of
 * `str` (the class of the result is always String).
 *
 * NOTE(review): the embedded listing numbers jump from 1988 to 1990; the
 * function-name line is absent -- presumably rb_str_resurrect(VALUE str);
 * confirm against upstream.
 */
1988VALUE
1990{
1991 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1992 return str_duplicate(rb_cString, str);
1993}
1994
1995VALUE
1996rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1997{
1998 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1999 VALUE new_str, klass = rb_cString;
2000
2001 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2002 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2003 str_duplicate_setup_embed(klass, str, new_str);
2004 }
2005 else {
2006 new_str = ec_str_alloc_heap(ec, klass);
2007 str_duplicate_setup_heap(klass, str, new_str);
2008 }
2009 if (chilled) {
2010 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2011 }
2012 return new_str;
2013}
2014
2015VALUE
2016rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2017{
2018 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2019 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2020 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2021 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2022 return rb_str_freeze(str);
2023}
2024
2025/*
2026 * The documentation block below uses an include (instead of inline text)
2027 * because the included text has non-ASCII characters (which are not allowed in a C file).
2028 */
2029
2030/*
2031 *
2032 * call-seq:
2033 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2034 *
2035 * :include: doc/string/new.rdoc
2036 *
2037 */
2038
/*
 * Implements String#initialize: accepts an optional source string plus the
 * `encoding:` and `capacity:` keywords.
 *
 * With `capacity:` the receiver is forced into heap storage of at least
 * STR_BUF_MIN_SIZE bytes (and at least the length of the source), and the
 * source bytes, if any, are copied in.  Without it the receiver is simply
 * replaced by the source.  A given encoding is associated last.
 * Returns `str`.
 *
 * NOTE(review): the embedded listing numbers skip 2107, inside the
 * `if (enc)` block right after rb_enc_associate(); that line is not visible
 * here -- confirm against upstream.
 */
2039static VALUE
2040rb_str_init(int argc, VALUE *argv, VALUE str)
2041{
2042 static ID keyword_ids[2];
2043 VALUE orig, opt, venc, vcapa;
2044 VALUE kwargs[2];
2045 rb_encoding *enc = 0;
2046 int n;
2047
/* Keyword IDs are resolved once and cached in the static array. */
2048 if (!keyword_ids[0]) {
2049 keyword_ids[0] = rb_id_encoding();
2050 CONST_ID(keyword_ids[1], "capacity");
2051 }
2052
2053 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2054 if (!NIL_P(opt)) {
2055 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2056 venc = kwargs[0];
2057 vcapa = kwargs[1];
2058 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2059 enc = rb_to_encoding(venc);
2060 }
2061 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2062 long capa = NUM2LONG(vcapa);
2063 long len = 0;
2064 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2065
2066 if (capa < STR_BUF_MIN_SIZE) {
2067 capa = STR_BUF_MIN_SIZE;
2068 }
2069 if (n == 1) {
2070 StringValue(orig);
2071 len = RSTRING_LEN(orig);
2072 if (capa < len) {
2073 capa = len;
2074 }
/* Initializing from itself: there is nothing to copy later. */
2075 if (orig == str) n = 0;
2076 }
2077 str_modifiable(str);
2078 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2079 /* make noembed always */
2080 const size_t size = (size_t)capa + termlen;
2081 const char *const old_ptr = RSTRING_PTR(str);
2082 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2083 char *new_ptr = ALLOC_N(char, size);
2084 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2085 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2086 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2087 RSTRING(str)->as.heap.ptr = new_ptr;
2088 }
2089 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2090 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2091 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2092 }
2093 STR_SET_LEN(str, len);
2094 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2095 if (n == 1) {
2096 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2097 rb_enc_cr_str_exact_copy(str, orig);
2098 }
2099 FL_SET(str, STR_NOEMBED);
2100 RSTRING(str)->as.heap.aux.capa = capa;
2101 }
2102 else if (n == 1) {
2103 rb_str_replace(str, orig);
2104 }
2105 if (enc) {
2106 rb_enc_associate(str, enc);
2108 }
2109 }
2110 else if (n == 1) {
2111 rb_str_replace(str, orig);
2112 }
2113 return str;
2114}
2115
2116/* :nodoc: */
2117static VALUE
2118rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2119{
2120 if (klass != rb_cString) {
2121 return rb_class_new_instance_pass_kw(argc, argv, klass);
2122 }
2123
2124 static ID keyword_ids[2];
2125 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2126 VALUE kwargs[2];
2127 rb_encoding *enc = NULL;
2128
2129 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2130 if (NIL_P(opt)) {
2131 return rb_class_new_instance_pass_kw(argc, argv, klass);
2132 }
2133
2134 keyword_ids[0] = rb_id_encoding();
2135 CONST_ID(keyword_ids[1], "capacity");
2136 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2137 encoding = kwargs[0];
2138 capacity = kwargs[1];
2139
2140 if (n == 1) {
2141 orig = StringValue(orig);
2142 }
2143 else {
2144 orig = Qnil;
2145 }
2146
2147 if (UNDEF_P(encoding)) {
2148 if (!NIL_P(orig)) {
2149 encoding = rb_obj_encoding(orig);
2150 }
2151 }
2152
2153 if (!UNDEF_P(encoding)) {
2154 enc = rb_to_encoding(encoding);
2155 }
2156
2157 // If capacity is nil, we're basically just duping `orig`.
2158 if (UNDEF_P(capacity)) {
2159 if (NIL_P(orig)) {
2160 VALUE empty_str = str_new(klass, "", 0);
2161 if (enc) {
2162 rb_enc_associate(empty_str, enc);
2163 }
2164 return empty_str;
2165 }
2166 VALUE copy = str_duplicate(klass, orig);
2167 rb_enc_associate(copy, enc);
2168 ENC_CODERANGE_CLEAR(copy);
2169 return copy;
2170 }
2171
2172 long capa = 0;
2173 capa = NUM2LONG(capacity);
2174 if (capa < 0) {
2175 capa = 0;
2176 }
2177
2178 if (!NIL_P(orig)) {
2179 long orig_capa = rb_str_capacity(orig);
2180 if (orig_capa > capa) {
2181 capa = orig_capa;
2182 }
2183 }
2184
2185 VALUE str = str_enc_new(klass, NULL, capa, enc);
2186 STR_SET_LEN(str, 0);
2187 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2188
2189 if (!NIL_P(orig)) {
2190 rb_str_buf_append(str, orig);
2191 }
2192
2193 return str;
2194}
2195
#ifdef NONASCII_MASK
/* A byte leads a UTF-8 character unless it has the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in the machine word at `s`.
 *
 * A leading byte looks like 0xxxxxxx or 11xxxxxx, so for a single byte
 * "bit6 OR NOT bit7" is 1 exactly when the byte is a leading byte.  That
 * test is applied to every byte of the word at once, leaving the answer in
 * bit 0 of each byte; the per-byte bits are then added together.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Bit 0 of each byte now says whether that byte is a leading byte. */
    uintptr_t lead = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

    /* Sum the per-byte bits. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead);
#else
    lead += (lead >> 8);
    lead += (lead >> 16);
# if SIZEOF_VOIDP == 8
    lead += (lead >> 32);
# endif
    return (lead & 0xF);
#endif
}
#endif
2235
2236static inline long
2237enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2238{
2239 long c;
2240 const char *q;
2241
2242 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2243 long diff = (long)(e - p);
2244 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2245 }
2246#ifdef NONASCII_MASK
2247 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2248 uintptr_t len = 0;
2249 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2250 const uintptr_t *s, *t;
2251 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2252 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2253 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2254 while (p < (const char *)s) {
2255 if (is_utf8_lead_byte(*p)) len++;
2256 p++;
2257 }
2258 while (s < t) {
2259 len += count_utf8_lead_bytes_with_word(s);
2260 s++;
2261 }
2262 p = (const char *)s;
2263 }
2264 while (p < e) {
2265 if (is_utf8_lead_byte(*p)) len++;
2266 p++;
2267 }
2268 return (long)len;
2269 }
2270#endif
2271 else if (rb_enc_asciicompat(enc)) {
2272 c = 0;
2273 if (ENC_CODERANGE_CLEAN_P(cr)) {
2274 while (p < e) {
2275 if (ISASCII(*p)) {
2276 q = search_nonascii(p, e);
2277 if (!q)
2278 return c + (e - p);
2279 c += q - p;
2280 p = q;
2281 }
2282 p += rb_enc_fast_mbclen(p, e, enc);
2283 c++;
2284 }
2285 }
2286 else {
2287 while (p < e) {
2288 if (ISASCII(*p)) {
2289 q = search_nonascii(p, e);
2290 if (!q)
2291 return c + (e - p);
2292 c += q - p;
2293 p = q;
2294 }
2295 p += rb_enc_mbclen(p, e, enc);
2296 c++;
2297 }
2298 }
2299 return c;
2300 }
2301
2302 for (c=0; p<e; c++) {
2303 p += rb_enc_mbclen(p, e, enc);
2304 }
2305 return c;
2306}
2307
2308long
2309rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2310{
2311 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2312}
2313
2314/* Computes the character length of [p, e) and reports the detected
2315 * code range through *cr.  The value of *cr on entry is not used.
2316 */
/*
 * Counts the characters in [p, e) under `enc` and stores the code range
 * found while scanning in *cr (which is reset to 0 first).
 *
 * Fixed-width encodings return right after the reset, leaving *cr at 0.
 * Otherwise every well-formed character ORs ENC_CODERANGE_VALID into *cr,
 * and *cr becomes ENC_CODERANGE_7BIT when nothing else was recorded.
 *
 * NOTE(review): the embedded listing numbers skip 2347 and 2363, the first
 * statement of each `else` branch taken for a malformed character --
 * presumably these record a broken code range in *cr; confirm against
 * upstream.
 */
2317long
2318rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2319{
2320 long c;
2321 const char *q;
2322 int ret;
2323
2324 *cr = 0;
2325 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2326 long diff = (long)(e - p);
2327 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2328 }
2329 else if (rb_enc_asciicompat(enc)) {
2330 c = 0;
2331 while (p < e) {
/* Skip a whole run of ASCII bytes at once; each counts as one character. */
2332 if (ISASCII(*p)) {
2333 q = search_nonascii(p, e);
2334 if (!q) {
2335 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2336 return c + (e - p);
2337 }
2338 c += q - p;
2339 p = q;
2340 }
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2342 if (MBCLEN_CHARFOUND_P(ret)) {
2343 *cr |= ENC_CODERANGE_VALID;
2344 p += MBCLEN_CHARFOUND_LEN(ret);
2345 }
2346 else {
/* A malformed byte is stepped over and still counted as one character. */
2348 p++;
2349 }
2350 c++;
2351 }
2352 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2353 return c;
2354 }
2355
2356 for (c=0; p<e; c++) {
2357 ret = rb_enc_precise_mbclen(p, e, enc);
2358 if (MBCLEN_CHARFOUND_P(ret)) {
2359 *cr |= ENC_CODERANGE_VALID;
2360 p += MBCLEN_CHARFOUND_LEN(ret);
2361 }
2362 else {
/* Advance by the minimum character width, clamped to the end of the range. */
2364 if (p + rb_enc_mbminlen(enc) <= e)
2365 p += rb_enc_mbminlen(enc);
2366 else
2367 p = e;
2368 }
2369 }
2370 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2371 return c;
2372}
2373
2374/* enc must be str's enc or rb_enc_check(str, str2) */
2375static long
2376str_strlen(VALUE str, rb_encoding *enc)
2377{
2378 const char *p, *e;
2379 int cr;
2380
2381 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2382 if (!enc) enc = STR_ENC_GET(str);
2383 p = RSTRING_PTR(str);
2384 e = RSTRING_END(str);
2385 cr = ENC_CODERANGE(str);
2386
2387 if (cr == ENC_CODERANGE_UNKNOWN) {
2388 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2389 if (cr) ENC_CODERANGE_SET(str, cr);
2390 return n;
2391 }
2392 else {
2393 return enc_strlen(p, e, enc, cr);
2394 }
2395}
2396
/*
 * Returns the length of `str` in characters, counted under the encoding of
 * `str` itself.
 *
 * NOTE(review): the embedded listing numbers jump from 2397 to 2399; the
 * function-name line is absent -- presumably rb_str_strlen(VALUE str);
 * confirm against upstream.
 */
2397long
2399{
2400 return str_strlen(str, NULL);
2401}
2402
2403/*
2404 * call-seq:
2405 * length -> integer
2406 *
2407 * :include: doc/string/length.rdoc
2408 *
2409 */
2410
/*
 * Returns the character length of `str` as a Ruby Integer.
 *
 * NOTE(review): the embedded listing numbers jump from 2411 to 2413; the
 * function-name line is absent -- presumably rb_str_length(VALUE str);
 * confirm against upstream.
 */
2411VALUE
2413{
2414 return LONG2NUM(str_strlen(str, NULL));
2415}
2416
2417/*
2418 * call-seq:
2419 * bytesize -> integer
2420 *
2421 * :include: doc/string/bytesize.rdoc
2422 *
2423 */
2424
2425VALUE
2426rb_str_bytesize(VALUE str)
2427{
2428 return LONG2NUM(RSTRING_LEN(str));
2429}
2430
2431/*
2432 * call-seq:
2433 * empty? -> true or false
2434 *
2435 * Returns whether the length of +self+ is zero:
2436 *
2437 * 'hello'.empty? # => false
2438 * ' '.empty? # => false
2439 * ''.empty? # => true
2440 *
2441 * Related: see {Querying}[rdoc-ref:String@Querying].
2442 */
2443
2444static VALUE
2445rb_str_empty(VALUE str)
2446{
2447 return RBOOL(RSTRING_LEN(str) == 0);
2448}
2449
2450/*
2451 * call-seq:
2452 * self + other_string -> new_string
2453 *
2454 * Returns a new string containing +other_string+ concatenated to +self+:
2455 *
2456 * 'Hello from ' + self.to_s # => "Hello from main"
2457 *
2458 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2459 */
2460
2461VALUE
2463{
2464 VALUE str3;
2465 rb_encoding *enc;
2466 char *ptr1, *ptr2, *ptr3;
2467 long len1, len2;
2468 int termlen;
2469
2470 StringValue(str2);
2471 enc = rb_enc_check_str(str1, str2);
2472 RSTRING_GETMEM(str1, ptr1, len1);
2473 RSTRING_GETMEM(str2, ptr2, len2);
2474 termlen = rb_enc_mbminlen(enc);
2475 if (len1 > LONG_MAX - len2) {
2476 rb_raise(rb_eArgError, "string size too big");
2477 }
2478 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2479 ptr3 = RSTRING_PTR(str3);
2480 memcpy(ptr3, ptr1, len1);
2481 memcpy(ptr3+len1, ptr2, len2);
2482 TERM_FILL(&ptr3[len1+len2], termlen);
2483
2484 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2486 RB_GC_GUARD(str1);
2487 RB_GC_GUARD(str2);
2488 return str3;
2489}
2490
2491/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2492VALUE
2493rb_str_opt_plus(VALUE str1, VALUE str2)
2494{
2497 long len1, len2;
2498 MAYBE_UNUSED(char) *ptr1, *ptr2;
2499 RSTRING_GETMEM(str1, ptr1, len1);
2500 RSTRING_GETMEM(str2, ptr2, len2);
2501 int enc1 = rb_enc_get_index(str1);
2502 int enc2 = rb_enc_get_index(str2);
2503
2504 if (enc1 < 0) {
2505 return Qundef;
2506 }
2507 else if (enc2 < 0) {
2508 return Qundef;
2509 }
2510 else if (enc1 != enc2) {
2511 return Qundef;
2512 }
2513 else if (len1 > LONG_MAX - len2) {
2514 return Qundef;
2515 }
2516 else {
2517 return rb_str_plus(str1, str2);
2518 }
2519
2520}
2521
2522/*
2523 * call-seq:
2524 * self * n -> new_string
2525 *
2526 * Returns a new string containing +n+ copies of +self+:
2527 *
2528 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2529 * 'No!' * 0 # => ""
2530 *
2531 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2532 */
2533
2534VALUE
2536{
2537 VALUE str2;
2538 long n, len;
2539 char *ptr2;
2540 int termlen;
2541
2542 if (times == INT2FIX(1)) {
2543 return str_duplicate(rb_cString, str);
2544 }
2545 if (times == INT2FIX(0)) {
2546 str2 = str_alloc_embed(rb_cString, 0);
2547 rb_enc_copy(str2, str);
2548 return str2;
2549 }
2550 len = NUM2LONG(times);
2551 if (len < 0) {
2552 rb_raise(rb_eArgError, "negative argument");
2553 }
2554 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2555 if (STR_EMBEDDABLE_P(len, 1)) {
2556 str2 = str_alloc_embed(rb_cString, len + 1);
2557 memset(RSTRING_PTR(str2), 0, len + 1);
2558 }
2559 else {
2560 str2 = str_alloc_heap(rb_cString);
2561 RSTRING(str2)->as.heap.aux.capa = len;
2562 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2563 }
2564 STR_SET_LEN(str2, len);
2565 rb_enc_copy(str2, str);
2566 return str2;
2567 }
2568 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2569 rb_raise(rb_eArgError, "argument too big");
2570 }
2571
2572 len *= RSTRING_LEN(str);
2573 termlen = TERM_LEN(str);
2574 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2575 ptr2 = RSTRING_PTR(str2);
2576 if (len) {
2577 n = RSTRING_LEN(str);
2578 memcpy(ptr2, RSTRING_PTR(str), n);
2579 while (n <= len/2) {
2580 memcpy(ptr2 + n, ptr2, n);
2581 n *= 2;
2582 }
2583 memcpy(ptr2 + n, ptr2, len-n);
2584 }
2585 STR_SET_LEN(str2, len);
2586 TERM_FILL(&ptr2[len], termlen);
2587 rb_enc_cr_str_copy_for_substr(str2, str);
2588
2589 return str2;
2590}
2591
2592/*
2593 * call-seq:
2594 * self % object -> new_string
2595 *
2596 * Returns the result of formatting +object+ into the format specifications
2597 * contained in +self+
2598 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2599 *
2600 * '%05d' % 123 # => "00123"
2601 *
2602 * If +self+ contains multiple format specifications,
2603 * +object+ must be an array or hash containing the objects to be formatted:
2604 *
2605 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2606 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2607 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2608 *
2609 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2610 */
2611
2612static VALUE
2613rb_str_format_m(VALUE str, VALUE arg)
2614{
2615 VALUE tmp = rb_check_array_type(arg);
2616
2617 if (!NIL_P(tmp)) {
2618 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2619 }
2620 return rb_str_format(1, &arg, str);
2621}
2622
2623static inline void
2624rb_check_lockedtmp(VALUE str)
2625{
2626 if (FL_TEST(str, STR_TMPLOCK)) {
2627 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2628 }
2629}
2630
2631// If none of these flags are set, we know we have an modifiable string.
2632// If any is set, we need to do more detailed checks.
2633#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2634static inline void
2635str_modifiable(VALUE str)
2636{
2637 RUBY_ASSERT(ruby_thread_has_gvl_p());
2638
2639 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2640 if (CHILLED_STRING_P(str)) {
2641 CHILLED_STRING_MUTATED(str);
2642 }
2643 rb_check_lockedtmp(str);
2644 rb_check_frozen(str);
2645 }
2646}
2647
2648static inline int
2649str_dependent_p(VALUE str)
2650{
2651 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2652 return FALSE;
2653 }
2654 else {
2655 return TRUE;
2656 }
2657}
2658
2659// If none of these flags are set, we know we have an independent string.
2660// If any is set, we need to do more detailed checks.
2661#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2662static inline int
2663str_independent(VALUE str)
2664{
2665 RUBY_ASSERT(ruby_thread_has_gvl_p());
2666
2667 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2668 str_modifiable(str);
2669 return !str_dependent_p(str);
2670 }
2671 return TRUE;
2672}
2673
2674static void
2675str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2676{
2677 RUBY_ASSERT(ruby_thread_has_gvl_p());
2678
2679 char *ptr;
2680 char *oldptr;
2681 long capa = len + expand;
2682
2683 if (len > capa) len = capa;
2684
2685 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2686 ptr = RSTRING(str)->as.heap.ptr;
2687 STR_SET_EMBED(str);
2688 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2689 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2690 STR_SET_LEN(str, len);
2691 return;
2692 }
2693
2694 ptr = ALLOC_N(char, (size_t)capa + termlen);
2695 oldptr = RSTRING_PTR(str);
2696 if (oldptr) {
2697 memcpy(ptr, oldptr, len);
2698 }
2699 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2700 xfree(oldptr);
2701 }
2702 STR_SET_NOEMBED(str);
2703 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2704 TERM_FILL(ptr + len, termlen);
2705 RSTRING(str)->as.heap.ptr = ptr;
2706 STR_SET_LEN(str, len);
2707 RSTRING(str)->as.heap.aux.capa = capa;
2708}
2709
2710void
2711rb_str_modify(VALUE str)
2712{
2713 if (!str_independent(str))
2714 str_make_independent(str);
2716}
2717
2718void
2720{
2721 RUBY_ASSERT(ruby_thread_has_gvl_p());
2722
2723 int termlen = TERM_LEN(str);
2724 long len = RSTRING_LEN(str);
2725
2726 if (expand < 0) {
2727 rb_raise(rb_eArgError, "negative expanding string size");
2728 }
2729 if (expand >= LONG_MAX - len) {
2730 rb_raise(rb_eArgError, "string size too big");
2731 }
2732
2733 if (!str_independent(str)) {
2734 str_make_independent_expand(str, len, expand, termlen);
2735 }
2736 else if (expand > 0) {
2737 RESIZE_CAPA_TERM(str, len + expand, termlen);
2738 }
2740}
2741
2742/* As rb_str_modify(), but don't clear coderange */
2743static void
2744str_modify_keep_cr(VALUE str)
2745{
2746 if (!str_independent(str))
2747 str_make_independent(str);
2749 /* Force re-scan later */
2751}
2752
2753static inline void
2754str_discard(VALUE str)
2755{
2756 str_modifiable(str);
2757 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2758 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2759 RSTRING(str)->as.heap.ptr = 0;
2760 STR_SET_LEN(str, 0);
2761 }
2762}
2763
2764void
2766{
2767 int encindex = rb_enc_get_index(str);
2768
2769 if (RB_UNLIKELY(encindex == -1)) {
2770 rb_raise(rb_eTypeError, "not encoding capable object");
2771 }
2772
2773 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2774 return;
2775 }
2776
2777 rb_encoding *enc = rb_enc_from_index(encindex);
2778 if (!rb_enc_asciicompat(enc)) {
2779 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2780 }
2781}
2782
2783VALUE
2785{
2786 RUBY_ASSERT(ruby_thread_has_gvl_p());
2787
2788 VALUE s = *ptr;
2789 if (!RB_TYPE_P(s, T_STRING)) {
2790 s = rb_str_to_str(s);
2791 *ptr = s;
2792 }
2793 return s;
2794}
2795
2796char *
2798{
2799 VALUE str = rb_string_value(ptr);
2800 return RSTRING_PTR(str);
2801}
2802
/*
 * Returns 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2811
2812static const char *
2813str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2814{
2815 const char *e = s + len;
2816
2817 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2818 if (zero_filled(s, minlen)) return s;
2819 }
2820 return 0;
2821}
2822
2823static char *
2824str_fill_term(VALUE str, char *s, long len, int termlen)
2825{
2826 /* This function assumes that (capa + termlen) bytes of memory
2827 * is allocated, like many other functions in this file.
2828 */
2829 if (str_dependent_p(str)) {
2830 if (!zero_filled(s + len, termlen))
2831 str_make_independent_expand(str, len, 0L, termlen);
2832 }
2833 else {
2834 TERM_FILL(s + len, termlen);
2835 return s;
2836 }
2837 return RSTRING_PTR(str);
2838}
2839
2840void
2841rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2842{
2843 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2844 long len = RSTRING_LEN(str);
2845
2846 RUBY_ASSERT(capa >= len);
2847 if (capa - len < termlen) {
2848 rb_check_lockedtmp(str);
2849 str_make_independent_expand(str, len, 0L, termlen);
2850 }
2851 else if (str_dependent_p(str)) {
2852 if (termlen > oldtermlen)
2853 str_make_independent_expand(str, len, 0L, termlen);
2854 }
2855 else {
2856 if (!STR_EMBED_P(str)) {
2857 /* modify capa instead of realloc */
2858 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2859 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2860 }
2861 if (termlen > oldtermlen) {
2862 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2863 }
2864 }
2865
2866 return;
2867}
2868
2869static char *
2870str_null_check(VALUE str, int *w)
2871{
2872 char *s = RSTRING_PTR(str);
2873 long len = RSTRING_LEN(str);
2874 rb_encoding *enc = rb_enc_get(str);
2875 const int minlen = rb_enc_mbminlen(enc);
2876
2877 if (minlen > 1) {
2878 *w = 1;
2879 if (str_null_char(s, len, minlen, enc)) {
2880 return NULL;
2881 }
2882 return str_fill_term(str, s, len, minlen);
2883 }
2884 *w = 0;
2885 if (!s || memchr(s, 0, len)) {
2886 return NULL;
2887 }
2888 if (s[len]) {
2889 s = str_fill_term(str, s, len, minlen);
2890 }
2891 return s;
2892}
2893
2894char *
2895rb_str_to_cstr(VALUE str)
2896{
2897 int w;
2898 return str_null_check(str, &w);
2899}
2900
2901char *
2903{
2904 VALUE str = rb_string_value(ptr);
2905 int w;
2906 char *s = str_null_check(str, &w);
2907 if (!s) {
2908 if (w) {
2909 rb_raise(rb_eArgError, "string contains null char");
2910 }
2911 rb_raise(rb_eArgError, "string contains null byte");
2912 }
2913 return s;
2914}
2915
2916char *
2917rb_str_fill_terminator(VALUE str, const int newminlen)
2918{
2919 char *s = RSTRING_PTR(str);
2920 long len = RSTRING_LEN(str);
2921 return str_fill_term(str, s, len, newminlen);
2922}
2923
2924VALUE
2926{
2927 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2928 return str;
2929}
2930
2931/*
2932 * call-seq:
2933 * String.try_convert(object) -> object, new_string, or nil
2934 *
2935 * Attempts to convert the given +object+ to a string.
2936 *
2937 * If +object+ is already a string, returns +object+, unmodified.
2938 *
2939 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2940 * calls <tt>object.to_str</tt> and returns the result.
2941 *
2942 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2943 *
2944 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2945 */
2946static VALUE
2947rb_str_s_try_convert(VALUE dummy, VALUE str)
2948{
2949 return rb_check_string_type(str);
2950}
2951
2952static char*
2953str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2954{
2955 long nth = *nthp;
2956 if (rb_enc_mbmaxlen(enc) == 1) {
2957 p += nth;
2958 }
2959 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2960 p += nth * rb_enc_mbmaxlen(enc);
2961 }
2962 else if (rb_enc_asciicompat(enc)) {
2963 const char *p2, *e2;
2964 int n;
2965
2966 while (p < e && 0 < nth) {
2967 e2 = p + nth;
2968 if (e < e2) {
2969 *nthp = nth;
2970 return (char *)e;
2971 }
2972 if (ISASCII(*p)) {
2973 p2 = search_nonascii(p, e2);
2974 if (!p2) {
2975 nth -= e2 - p;
2976 *nthp = nth;
2977 return (char *)e2;
2978 }
2979 nth -= p2 - p;
2980 p = p2;
2981 }
2982 n = rb_enc_mbclen(p, e, enc);
2983 p += n;
2984 nth--;
2985 }
2986 *nthp = nth;
2987 if (nth != 0) {
2988 return (char *)e;
2989 }
2990 return (char *)p;
2991 }
2992 else {
2993 while (p < e && nth--) {
2994 p += rb_enc_mbclen(p, e, enc);
2995 }
2996 }
2997 if (p > e) p = e;
2998 *nthp = nth;
2999 return (char*)p;
3000}
3001
3002char*
3003rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3004{
3005 return str_nth_len(p, e, &nth, enc);
3006}
3007
3008static char*
3009str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3010{
3011 if (singlebyte)
3012 p += nth;
3013 else {
3014 p = str_nth_len(p, e, &nth, enc);
3015 }
3016 if (!p) return 0;
3017 if (p > e) p = e;
3018 return (char *)p;
3019}
3020
3021/* char offset to byte offset */
3022static long
3023str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3024{
3025 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3026 if (!pp) return e - p;
3027 return pp - p;
3028}
3029
3030long
3031rb_str_offset(VALUE str, long pos)
3032{
3033 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3034 STR_ENC_GET(str), single_byte_optimizable(str));
3035}
3036
3037#ifdef NONASCII_MASK
3038static char *
3039str_utf8_nth(const char *p, const char *e, long *nthp)
3040{
3041 long nth = *nthp;
3042 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3043 const uintptr_t *s, *t;
3044 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3045 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3046 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3047 while (p < (const char *)s) {
3048 if (is_utf8_lead_byte(*p)) nth--;
3049 p++;
3050 }
3051 do {
3052 nth -= count_utf8_lead_bytes_with_word(s);
3053 s++;
3054 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3055 p = (char *)s;
3056 }
3057 while (p < e) {
3058 if (is_utf8_lead_byte(*p)) {
3059 if (nth == 0) break;
3060 nth--;
3061 }
3062 p++;
3063 }
3064 *nthp = nth;
3065 return (char *)p;
3066}
3067
/* Byte distance from p to the position str_utf8_nth() returns for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long residual = nth;
    const char *target = str_utf8_nth(p, e, &residual);

    return target - p;
}
3074#endif
3075
3076/* byte offset to char offset */
3077long
3078rb_str_sublen(VALUE str, long pos)
3079{
3080 if (single_byte_optimizable(str) || pos < 0)
3081 return pos;
3082 else {
3083 char *p = RSTRING_PTR(str);
3084 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3085 }
3086}
3087
3088static VALUE
3089str_subseq(VALUE str, long beg, long len)
3090{
3091 VALUE str2;
3092
3093 RUBY_ASSERT(beg >= 0);
3094 RUBY_ASSERT(len >= 0);
3095 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3096
3097 const int termlen = TERM_LEN(str);
3098 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3099 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3100 RB_GC_GUARD(str);
3101 return str2;
3102 }
3103
3104 str2 = str_alloc_heap(rb_cString);
3105 if (str_embed_capa(str2) >= len + termlen) {
3106 char *ptr2 = RSTRING(str2)->as.embed.ary;
3107 STR_SET_EMBED(str2);
3108 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3109 TERM_FILL(ptr2+len, termlen);
3110
3111 STR_SET_LEN(str2, len);
3112 RB_GC_GUARD(str);
3113 }
3114 else {
3115 str_replace_shared(str2, str);
3116 RUBY_ASSERT(!STR_EMBED_P(str2));
3117 ENC_CODERANGE_CLEAR(str2);
3118 RSTRING(str2)->as.heap.ptr += beg;
3119 if (RSTRING_LEN(str2) > len) {
3120 STR_SET_LEN(str2, len);
3121 }
3122 }
3123
3124 return str2;
3125}
3126
3127VALUE
3128rb_str_subseq(VALUE str, long beg, long len)
3129{
3130 VALUE str2 = str_subseq(str, beg, len);
3131 rb_enc_cr_str_copy_for_substr(str2, str);
3132 return str2;
3133}
3134
3135char *
3136rb_str_subpos(VALUE str, long beg, long *lenp)
3137{
3138 long len = *lenp;
3139 long slen = -1L;
3140 const long blen = RSTRING_LEN(str);
3141 rb_encoding *enc = STR_ENC_GET(str);
3142 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3143
3144 if (len < 0) return 0;
3145 if (beg < 0 && -beg < 0) return 0;
3146 if (!blen) {
3147 len = 0;
3148 }
3149 if (single_byte_optimizable(str)) {
3150 if (beg > blen) return 0;
3151 if (beg < 0) {
3152 beg += blen;
3153 if (beg < 0) return 0;
3154 }
3155 if (len > blen - beg)
3156 len = blen - beg;
3157 if (len < 0) return 0;
3158 p = s + beg;
3159 goto end;
3160 }
3161 if (beg < 0) {
3162 if (len > -beg) len = -beg;
3163 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3164 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3165 beg = -beg;
3166 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3167 p = e;
3168 if (!p) return 0;
3169 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3170 if (!p) return 0;
3171 len = e - p;
3172 goto end;
3173 }
3174 else {
3175 slen = str_strlen(str, enc);
3176 beg += slen;
3177 if (beg < 0) return 0;
3178 p = s + beg;
3179 if (len == 0) goto end;
3180 }
3181 }
3182 else if (beg > 0 && beg > blen) {
3183 return 0;
3184 }
3185 if (len == 0) {
3186 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3187 p = s + beg;
3188 }
3189#ifdef NONASCII_MASK
3190 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3191 enc == rb_utf8_encoding()) {
3192 p = str_utf8_nth(s, e, &beg);
3193 if (beg > 0) return 0;
3194 len = str_utf8_offset(p, e, len);
3195 }
3196#endif
3197 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3198 int char_sz = rb_enc_mbmaxlen(enc);
3199
3200 p = s + beg * char_sz;
3201 if (p > e) {
3202 return 0;
3203 }
3204 else if (len * char_sz > e - p)
3205 len = e - p;
3206 else
3207 len *= char_sz;
3208 }
3209 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3210 if (beg > 0) return 0;
3211 len = 0;
3212 }
3213 else {
3214 len = str_offset(p, e, len, enc, 0);
3215 }
3216 end:
3217 *lenp = len;
3218 RB_GC_GUARD(str);
3219 return p;
3220}
3221
3222static VALUE str_substr(VALUE str, long beg, long len, int empty);
3223
3224VALUE
3225rb_str_substr(VALUE str, long beg, long len)
3226{
3227 return str_substr(str, beg, len, TRUE);
3228}
3229
3230VALUE
3231rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3232{
3233 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3234}
3235
3236static VALUE
3237str_substr(VALUE str, long beg, long len, int empty)
3238{
3239 char *p = rb_str_subpos(str, beg, &len);
3240
3241 if (!p) return Qnil;
3242 if (!len && !empty) return Qnil;
3243
3244 beg = p - RSTRING_PTR(str);
3245
3246 VALUE str2 = str_subseq(str, beg, len);
3247 rb_enc_cr_str_copy_for_substr(str2, str);
3248 return str2;
3249}
3250
3251/* :nodoc: */
3252VALUE
3254{
3255 if (CHILLED_STRING_P(str)) {
3256 FL_UNSET_RAW(str, STR_CHILLED);
3257 }
3258
3259 if (OBJ_FROZEN(str)) return str;
3260 rb_str_resize(str, RSTRING_LEN(str));
3261 return rb_obj_freeze(str);
3262}
3263
3264/*
3265 * call-seq:
3266 * +string -> new_string or self
3267 *
3268 * Returns +self+ if +self+ is not frozen and can be mutated
3269 * without warning issuance.
3270 *
3271 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3272 *
3273 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3274 */
3275static VALUE
3276str_uplus(VALUE str)
3277{
3278 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3279 return rb_str_dup(str);
3280 }
3281 else {
3282 return str;
3283 }
3284}
3285
3286/*
3287 * call-seq:
3288 * -self -> frozen_string
3289 *
3290 * Returns a frozen string equal to +self+.
3291 *
3292 * The returned string is +self+ if and only if all of the following are true:
3293 *
3294 * - +self+ is already frozen.
3295 * - +self+ is an instance of \String (rather than of a subclass of \String)
3296 * - +self+ has no instance variables set on it.
3297 *
3298 * Otherwise, the returned string is a frozen copy of +self+.
3299 *
3300 * Returning +self+, when possible, saves duplicating +self+;
3301 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3302 *
3303 * It may also save duplicating other, already-existing, strings:
3304 *
3305 * s0 = 'foo'
3306 * s1 = 'foo'
3307 * s0.object_id == s1.object_id # => false
3308 * (-s0).object_id == (-s1).object_id # => true
3309 *
3310 * Note that method #-@ is convenient for defining a constant:
3311 *
3312 * FileName = -'config/database.yml'
3313 *
3314 * While its alias #dedup is better suited for chaining:
3315 *
3316 * 'foo'.dedup.gsub!('o')
3317 *
3318 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3319 */
3320static VALUE
3321str_uminus(VALUE str)
3322{
3323 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3324 str = rb_str_dup(str);
3325 }
3326 return rb_fstring(str);
3327}
3328
3329RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3330#define rb_str_dup_frozen rb_str_new_frozen
3331
3332VALUE
3334{
3335 rb_check_frozen(str);
3336 if (FL_TEST(str, STR_TMPLOCK)) {
3337 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3338 }
3339 FL_SET(str, STR_TMPLOCK);
3340 return str;
3341}
3342
3343VALUE
3345{
3346 rb_check_frozen(str);
3347 if (!FL_TEST(str, STR_TMPLOCK)) {
3348 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3349 }
3350 FL_UNSET(str, STR_TMPLOCK);
3351 return str;
3352}
3353
3354VALUE
3355rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3356{
3357 rb_str_locktmp(str);
3358 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3359}
3360
3361void
3363{
3364 RUBY_ASSERT(ruby_thread_has_gvl_p());
3365
3366 long capa;
3367 const int termlen = TERM_LEN(str);
3368
3369 str_modifiable(str);
3370 if (STR_SHARED_P(str)) {
3371 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3372 }
3373 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3374 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3375 }
3376
3377 int cr = ENC_CODERANGE(str);
3378 if (len == 0) {
3379 /* Empty string does not contain non-ASCII */
3381 }
3382 else if (cr == ENC_CODERANGE_UNKNOWN) {
3383 /* Leave unknown. */
3384 }
3385 else if (len > RSTRING_LEN(str)) {
3386 if (ENC_CODERANGE_CLEAN_P(cr)) {
3387 /* Update the coderange regarding the extended part. */
3388 const char *const prev_end = RSTRING_END(str);
3389 const char *const new_end = RSTRING_PTR(str) + len;
3390 rb_encoding *enc = rb_enc_get(str);
3391 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3392 ENC_CODERANGE_SET(str, cr);
3393 }
3394 else if (cr == ENC_CODERANGE_BROKEN) {
3395 /* May be valid now, by appended part. */
3397 }
3398 }
3399 else if (len < RSTRING_LEN(str)) {
3400 if (cr != ENC_CODERANGE_7BIT) {
3401 /* ASCII-only string is keeping after truncated. Valid
3402 * and broken may be invalid or valid, leave unknown. */
3404 }
3405 }
3406
3407 STR_SET_LEN(str, len);
3408 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3409}
3410
3411VALUE
3412rb_str_resize(VALUE str, long len)
3413{
3414 if (len < 0) {
3415 rb_raise(rb_eArgError, "negative string size (or size too big)");
3416 }
3417
3418 int independent = str_independent(str);
3419 long slen = RSTRING_LEN(str);
3420 const int termlen = TERM_LEN(str);
3421
3422 if (slen > len || (termlen != 1 && slen < len)) {
3424 }
3425
3426 {
3427 long capa;
3428 if (STR_EMBED_P(str)) {
3429 if (len == slen) return str;
3430 if (str_embed_capa(str) >= len + termlen) {
3431 STR_SET_LEN(str, len);
3432 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3433 return str;
3434 }
3435 str_make_independent_expand(str, slen, len - slen, termlen);
3436 }
3437 else if (str_embed_capa(str) >= len + termlen) {
3438 char *ptr = STR_HEAP_PTR(str);
3439 STR_SET_EMBED(str);
3440 if (slen > len) slen = len;
3441 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3442 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3443 STR_SET_LEN(str, len);
3444 if (independent) ruby_xfree(ptr);
3445 return str;
3446 }
3447 else if (!independent) {
3448 if (len == slen) return str;
3449 str_make_independent_expand(str, slen, len - slen, termlen);
3450 }
3451 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3452 (capa - len) > (len < 1024 ? len : 1024)) {
3453 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3454 (size_t)len + termlen, STR_HEAP_SIZE(str));
3455 RSTRING(str)->as.heap.aux.capa = len;
3456 }
3457 else if (len == slen) return str;
3458 STR_SET_LEN(str, len);
3459 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3460 }
3461 return str;
3462}
3463
3464static void
3465str_ensure_available_capa(VALUE str, long len)
3466{
3467 str_modify_keep_cr(str);
3468
3469 const int termlen = TERM_LEN(str);
3470 long olen = RSTRING_LEN(str);
3471
3472 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3473 rb_raise(rb_eArgError, "string sizes too big");
3474 }
3475
3476 long total = olen + len;
3477 long capa = str_capacity(str, termlen);
3478
3479 if (capa < total) {
3480 if (total >= LONG_MAX / 2) {
3481 capa = total;
3482 }
3483 while (total > capa) {
3484 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3485 }
3486 RESIZE_CAPA_TERM(str, capa, termlen);
3487 }
3488}
3489
3490static VALUE
3491str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3492{
3493 if (keep_cr) {
3494 str_modify_keep_cr(str);
3495 }
3496 else {
3497 rb_str_modify(str);
3498 }
3499 if (len == 0) return 0;
3500
3501 long total, olen, off = -1;
3502 char *sptr;
3503 const int termlen = TERM_LEN(str);
3504
3505 RSTRING_GETMEM(str, sptr, olen);
3506 if (ptr >= sptr && ptr <= sptr + olen) {
3507 off = ptr - sptr;
3508 }
3509
3510 long capa = str_capacity(str, termlen);
3511
3512 if (olen > LONG_MAX - len) {
3513 rb_raise(rb_eArgError, "string sizes too big");
3514 }
3515 total = olen + len;
3516 if (capa < total) {
3517 if (total >= LONG_MAX / 2) {
3518 capa = total;
3519 }
3520 while (total > capa) {
3521 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3522 }
3523 RESIZE_CAPA_TERM(str, capa, termlen);
3524 sptr = RSTRING_PTR(str);
3525 }
3526 if (off != -1) {
3527 ptr = sptr + off;
3528 }
3529 memcpy(sptr + olen, ptr, len);
3530 STR_SET_LEN(str, total);
3531 TERM_FILL(sptr + total, termlen); /* sentinel */
3532
3533 return str;
3534}
3535
/* Shorthands for str_buf_cat4() with keep_cr == false.  str_buf_cat2()
 * takes the length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3538
3539VALUE
3540rb_str_cat(VALUE str, const char *ptr, long len)
3541{
3542 if (len == 0) return str;
3543 if (len < 0) {
3544 rb_raise(rb_eArgError, "negative string size (or size too big)");
3545 }
3546 return str_buf_cat(str, ptr, len);
3547}
3548
3549VALUE
3550rb_str_cat_cstr(VALUE str, const char *ptr)
3551{
3552 must_not_null(ptr);
3553 return rb_str_buf_cat(str, ptr, strlen(ptr));
3554}
3555
3556static void
3557rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3558{
3559 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3560
3561 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3562 if (UNLIKELY(!str_independent(str))) {
3563 str_make_independent(str);
3564 }
3565
3566 long string_length = -1;
3567 const int null_terminator_length = 1;
3568 char *sptr;
3569 RSTRING_GETMEM(str, sptr, string_length);
3570
3571 // Ensure the resulting string wouldn't be too long.
3572 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3573 rb_raise(rb_eArgError, "string sizes too big");
3574 }
3575
3576 long string_capacity = str_capacity(str, null_terminator_length);
3577
3578 // Get the code range before any modifications since those might clear the code range.
3579 int cr = ENC_CODERANGE(str);
3580
3581 // Check if the string has spare string_capacity to write the new byte.
3582 if (LIKELY(string_capacity >= string_length + 1)) {
3583 // In fast path we can write the new byte and note the string's new length.
3584 sptr[string_length] = byte;
3585 STR_SET_LEN(str, string_length + 1);
3586 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3587 }
3588 else {
3589 // If there's not enough string_capacity, make a call into the general string concatenation function.
3590 str_buf_cat(str, (char *)&byte, 1);
3591 }
3592
3593 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3594 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3595 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3596 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3597 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3598 if (ISASCII(byte)) {
3600 }
3601 else {
3603
3604 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3605 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3606 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3607 }
3608 }
3609 }
3610}
3611
3612RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3613RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3614RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3615
3616static VALUE
3617rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3618 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3619{
3620 int str_encindex = ENCODING_GET(str);
3621 int res_encindex;
3622 int str_cr, res_cr;
3623 rb_encoding *str_enc, *ptr_enc;
3624
3625 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3626
3627 if (str_encindex == ptr_encindex) {
3628 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3629 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3630 }
3631 }
3632 else {
3633 str_enc = rb_enc_from_index(str_encindex);
3634 ptr_enc = rb_enc_from_index(ptr_encindex);
3635 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3636 if (len == 0)
3637 return str;
3638 if (RSTRING_LEN(str) == 0) {
3639 rb_str_buf_cat(str, ptr, len);
3640 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3641 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3642 return str;
3643 }
3644 goto incompatible;
3645 }
3646 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3647 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3648 }
3649 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3650 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3651 str_cr = rb_enc_str_coderange(str);
3652 }
3653 }
3654 }
3655 if (ptr_cr_ret)
3656 *ptr_cr_ret = ptr_cr;
3657
3658 if (str_encindex != ptr_encindex &&
3659 str_cr != ENC_CODERANGE_7BIT &&
3660 ptr_cr != ENC_CODERANGE_7BIT) {
3661 str_enc = rb_enc_from_index(str_encindex);
3662 ptr_enc = rb_enc_from_index(ptr_encindex);
3663 goto incompatible;
3664 }
3665
3666 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3667 res_encindex = str_encindex;
3668 res_cr = ENC_CODERANGE_UNKNOWN;
3669 }
3670 else if (str_cr == ENC_CODERANGE_7BIT) {
3671 if (ptr_cr == ENC_CODERANGE_7BIT) {
3672 res_encindex = str_encindex;
3673 res_cr = ENC_CODERANGE_7BIT;
3674 }
3675 else {
3676 res_encindex = ptr_encindex;
3677 res_cr = ptr_cr;
3678 }
3679 }
3680 else if (str_cr == ENC_CODERANGE_VALID) {
3681 res_encindex = str_encindex;
3682 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3683 res_cr = str_cr;
3684 else
3685 res_cr = ptr_cr;
3686 }
3687 else { /* str_cr == ENC_CODERANGE_BROKEN */
3688 res_encindex = str_encindex;
3689 res_cr = str_cr;
3690 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3691 }
3692
3693 if (len < 0) {
3694 rb_raise(rb_eArgError, "negative string size (or size too big)");
3695 }
3696 str_buf_cat(str, ptr, len);
3697 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3698 return str;
3699
3700 incompatible:
3701 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3702 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3704}
3705
3706VALUE
3707rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3708{
3709 return rb_enc_cr_str_buf_cat(str, ptr, len,
3710 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3711}
3712
3713VALUE
3715{
3716 /* ptr must reference NUL terminated ASCII string. */
3717 int encindex = ENCODING_GET(str);
3718 rb_encoding *enc = rb_enc_from_index(encindex);
3719 if (rb_enc_asciicompat(enc)) {
3720 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3721 encindex, ENC_CODERANGE_7BIT, 0);
3722 }
3723 else {
3724 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3725 while (*ptr) {
3726 unsigned int c = (unsigned char)*ptr;
3727 int len = rb_enc_codelen(c, enc);
3728 rb_enc_mbcput(c, buf, enc);
3729 rb_enc_cr_str_buf_cat(str, buf, len,
3730 encindex, ENC_CODERANGE_VALID, 0);
3731 ptr++;
3732 }
3733 return str;
3734 }
3735}
3736
3737VALUE
3739{
3740 int str2_cr = rb_enc_str_coderange(str2);
3741
3742 if (str_enc_fastpath(str)) {
3743 switch (str2_cr) {
3744 case ENC_CODERANGE_7BIT:
3745 // If RHS is 7bit we can do simple concatenation
3746 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3747 RB_GC_GUARD(str2);
3748 return str;
3750 // If RHS is valid, we can do simple concatenation if encodings are the same
3751 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3752 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3753 int str_cr = ENC_CODERANGE(str);
3754 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3755 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3756 }
3757 RB_GC_GUARD(str2);
3758 return str;
3759 }
3760 }
3761 }
3762
3763 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3764 ENCODING_GET(str2), str2_cr, &str2_cr);
3765
3766 ENC_CODERANGE_SET(str2, str2_cr);
3767
3768 return str;
3769}
3770
3771VALUE
3773{
3774 StringValue(str2);
3775 return rb_str_buf_append(str, str2);
3776}
3777
3778VALUE
3779rb_str_concat_literals(size_t num, const VALUE *strary)
3780{
3781 VALUE str;
3782 size_t i, s = 0;
3783 unsigned long len = 1;
3784
3785 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3786 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3787
3788 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3789 str = rb_str_buf_new(len);
3790 str_enc_copy_direct(str, strary[0]);
3791
3792 for (i = s; i < num; ++i) {
3793 const VALUE v = strary[i];
3794 int encidx = ENCODING_GET(v);
3795
3796 rb_str_buf_append(str, v);
3797 if (encidx != ENCINDEX_US_ASCII) {
3798 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3799 rb_enc_set_index(str, encidx);
3800 }
3801 }
3802 return str;
3803}
3804
3805/*
3806 * call-seq:
3807 * concat(*objects) -> string
3808 *
3809 * :include: doc/string/concat.rdoc
3810 */
3811static VALUE
3812rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3813{
3814 str_modifiable(str);
3815
3816 if (argc == 1) {
3817 return rb_str_concat(str, argv[0]);
3818 }
3819 else if (argc > 1) {
3820 int i;
3821 VALUE arg_str = rb_str_tmp_new(0);
3822 rb_enc_copy(arg_str, str);
3823 for (i = 0; i < argc; i++) {
3824 rb_str_concat(arg_str, argv[i]);
3825 }
3826 rb_str_buf_append(str, arg_str);
3827 }
3828
3829 return str;
3830}
3831
3832/*
3833 * call-seq:
3834 * append_as_bytes(*objects) -> self
3835 *
3836 * Concatenates each object in +objects+ into +self+; returns +self+;
3837 * performs no encoding validation or conversion:
3838 *
3839 * s = 'foo'
3840 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3841 * s.valid_encoding? # => false
3842 * s.append_as_bytes("\xAC 12")
3843 * s.valid_encoding? # => true
3844 *
3845 * When a given object is an integer,
3846 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3848 * appends only the low-order byte (similar to String#setbyte):
3849 *
3850 * s = ""
3851 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3852 * s.bytesize # => 2
3853 *
3854 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3855 */
3856
3857VALUE
3858rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3859{
3860 long needed_capacity = 0;
3861 volatile VALUE t0;
3862 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3863
3864 for (int index = 0; index < argc; index++) {
3865 VALUE obj = argv[index];
3866 enum ruby_value_type type = types[index] = rb_type(obj);
3867 switch (type) {
3868 case T_FIXNUM:
3869 case T_BIGNUM:
3870 needed_capacity++;
3871 break;
3872 case T_STRING:
3873 needed_capacity += RSTRING_LEN(obj);
3874 break;
3875 default:
3876 rb_raise(
3878 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3879 rb_obj_class(obj)
3880 );
3881 break;
3882 }
3883 }
3884
3885 str_ensure_available_capa(str, needed_capacity);
3886 char *sptr = RSTRING_END(str);
3887
3888 for (int index = 0; index < argc; index++) {
3889 VALUE obj = argv[index];
3890 enum ruby_value_type type = types[index];
3891 switch (type) {
3892 case T_FIXNUM:
3893 case T_BIGNUM: {
3894 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3895 char byte = (char)(NUM2INT(obj) & 0xFF);
3896 *sptr = byte;
3897 sptr++;
3898 break;
3899 }
3900 case T_STRING: {
3901 const char *ptr;
3902 long len;
3903 RSTRING_GETMEM(obj, ptr, len);
3904 memcpy(sptr, ptr, len);
3905 sptr += len;
3906 break;
3907 }
3908 default:
3909 rb_bug("append_as_bytes arguments should have been validated");
3910 }
3911 }
3912
3913 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3914 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3915
3916 int cr = ENC_CODERANGE(str);
3917 switch (cr) {
3918 case ENC_CODERANGE_7BIT: {
3919 for (int index = 0; index < argc; index++) {
3920 VALUE obj = argv[index];
3921 enum ruby_value_type type = types[index];
3922 switch (type) {
3923 case T_FIXNUM:
3924 case T_BIGNUM: {
3925 if (!ISASCII(NUM2INT(obj))) {
3926 goto clear_cr;
3927 }
3928 break;
3929 }
3930 case T_STRING: {
3931 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3932 goto clear_cr;
3933 }
3934 break;
3935 }
3936 default:
3937 rb_bug("append_as_bytes arguments should have been validated");
3938 }
3939 }
3940 break;
3941 }
3943 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3944 goto keep_cr;
3945 }
3946 else {
3947 goto clear_cr;
3948 }
3949 break;
3950 default:
3951 goto clear_cr;
3952 break;
3953 }
3954
3955 RB_GC_GUARD(t0);
3956
3957 clear_cr:
3958 // If no fast path was hit, we clear the coderange.
3959 // append_as_bytes is predominently meant to be used in
3960 // buffering situation, hence it's likely the coderange
3961 // will never be scanned, so it's not worth spending time
3962 // precomputing the coderange except for simple and common
3963 // situations.
3965 keep_cr:
3966 return str;
3967}
3968
3969/*
3970 * call-seq:
3971 * self << object -> self
3972 *
3973 * Appends a string representation of +object+ to +self+;
3974 * returns +self+.
3975 *
3976 * If +object+ is a string, appends it to +self+:
3977 *
3978 * s = 'foo'
3979 * s << 'bar' # => "foobar"
3980 * s # => "foobar"
3981 *
3982 * If +object+ is an integer,
3983 * its value is considered a codepoint;
3984 * converts the value to a character before concatenating:
3985 *
3986 * s = 'foo'
3987 * s << 33 # => "foo!"
3988 *
3989 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3990 * and the encoding of +self+ is Encoding::US_ASCII,
3991 * changes the encoding to Encoding::ASCII_8BIT:
3992 *
3993 * s = 'foo'.encode(Encoding::US_ASCII)
3994 * s.encoding # => #<Encoding:US-ASCII>
3995 * s << 0xff # => "foo\xFF"
3996 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3997 *
3998 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3999 *
4000 * s = 'foo'
 *    s.encoding         # => #<Encoding:UTF-8>
4002 * s << 0x00110000 # 1114112 out of char range (RangeError)
4003 * s = 'foo'.encode(Encoding::EUC_JP)
4004 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4005 *
4006 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4007 */
4008VALUE
4010{
4011 unsigned int code;
4012 rb_encoding *enc = STR_ENC_GET(str1);
4013 int encidx;
4014
4015 if (RB_INTEGER_TYPE_P(str2)) {
4016 if (rb_num_to_uint(str2, &code) == 0) {
4017 }
4018 else if (FIXNUM_P(str2)) {
4019 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4020 }
4021 else {
4022 rb_raise(rb_eRangeError, "bignum out of char range");
4023 }
4024 }
4025 else {
4026 return rb_str_append(str1, str2);
4027 }
4028
4029 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4030
4031 if (encidx >= 0) {
4032 rb_str_buf_cat_byte(str1, (unsigned char)code);
4033 }
4034 else {
4035 long pos = RSTRING_LEN(str1);
4036 int cr = ENC_CODERANGE(str1);
4037 int len;
4038 char *buf;
4039
4040 switch (len = rb_enc_codelen(code, enc)) {
4041 case ONIGERR_INVALID_CODE_POINT_VALUE:
4042 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4043 break;
4044 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4045 case 0:
4046 rb_raise(rb_eRangeError, "%u out of char range", code);
4047 break;
4048 }
4049 buf = ALLOCA_N(char, len + 1);
4050 rb_enc_mbcput(code, buf, enc);
4051 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4052 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4053 }
4054 rb_str_resize(str1, pos+len);
4055 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4056 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4058 }
4059 else if (cr == ENC_CODERANGE_BROKEN) {
4061 }
4062 ENC_CODERANGE_SET(str1, cr);
4063 }
4064 return str1;
4065}
4066
4067int
4068rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4069{
4070 int encidx = rb_enc_to_index(enc);
4071
4072 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4073 /* US-ASCII automatically extended to ASCII-8BIT */
4074 if (code > 0xFF) {
4075 rb_raise(rb_eRangeError, "%u out of char range", code);
4076 }
4077 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4078 return ENCINDEX_ASCII_8BIT;
4079 }
4080 return encidx;
4081 }
4082 else {
4083 return -1;
4084 }
4085}
4086
4087/*
4088 * call-seq:
4089 * prepend(*other_strings) -> string
4090 *
4091 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4092 *
4093 * s = 'foo'
4094 * s.prepend('bar', 'baz') # => "barbazfoo"
4095 * s # => "barbazfoo"
4096 *
4097 * Related: String#concat.
4098 */
4099
4100static VALUE
4101rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4102{
4103 str_modifiable(str);
4104
4105 if (argc == 1) {
4106 rb_str_update(str, 0L, 0L, argv[0]);
4107 }
4108 else if (argc > 1) {
4109 int i;
4110 VALUE arg_str = rb_str_tmp_new(0);
4111 rb_enc_copy(arg_str, str);
4112 for (i = 0; i < argc; i++) {
4113 rb_str_append(arg_str, argv[i]);
4114 }
4115 rb_str_update(str, 0L, 0L, arg_str);
4116 }
4117
4118 return str;
4119}
4120
4121st_index_t
4123{
4124 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4125 st_index_t precomputed_hash;
4126 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4127
4128 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4129 return precomputed_hash;
4130 }
4131
4132 return str_do_hash(str);
4133}
4134
4135int
4137{
4138 long len1, len2;
4139 const char *ptr1, *ptr2;
4140 RSTRING_GETMEM(str1, ptr1, len1);
4141 RSTRING_GETMEM(str2, ptr2, len2);
4142 return (len1 != len2 ||
4143 !rb_str_comparable(str1, str2) ||
4144 memcmp(ptr1, ptr2, len1) != 0);
4145}
4146
4147/*
4148 * call-seq:
4149 * hash -> integer
4150 *
4151 * :include: doc/string/hash.rdoc
4152 *
4153 */
4154
4155static VALUE
4156rb_str_hash_m(VALUE str)
4157{
4158 st_index_t hval = rb_str_hash(str);
4159 return ST2FIX(hval);
4160}
4161
/* Smaller of a and b (a when they are equal).  Each argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
4163
4164int
4166{
4167 int idx1, idx2;
4168 int rc1, rc2;
4169
4170 if (RSTRING_LEN(str1) == 0) return TRUE;
4171 if (RSTRING_LEN(str2) == 0) return TRUE;
4172 idx1 = ENCODING_GET(str1);
4173 idx2 = ENCODING_GET(str2);
4174 if (idx1 == idx2) return TRUE;
4175 rc1 = rb_enc_str_coderange(str1);
4176 rc2 = rb_enc_str_coderange(str2);
4177 if (rc1 == ENC_CODERANGE_7BIT) {
4178 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4179 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4180 return TRUE;
4181 }
4182 if (rc2 == ENC_CODERANGE_7BIT) {
4183 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4184 return TRUE;
4185 }
4186 return FALSE;
4187}
4188
4189int
4191{
4192 long len1, len2;
4193 const char *ptr1, *ptr2;
4194 int retval;
4195
4196 if (str1 == str2) return 0;
4197 RSTRING_GETMEM(str1, ptr1, len1);
4198 RSTRING_GETMEM(str2, ptr2, len2);
4199 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4200 if (len1 == len2) {
4201 if (!rb_str_comparable(str1, str2)) {
4202 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4203 return 1;
4204 return -1;
4205 }
4206 return 0;
4207 }
4208 if (len1 > len2) return 1;
4209 return -1;
4210 }
4211 if (retval > 0) return 1;
4212 return -1;
4213}
4214
4215/*
4216 * call-seq:
4217 * self == object -> true or false
4218 *
4219 * Returns whether +object+ is equal to +self+.
4220 *
4221 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4222 *
4223 * s = 'foo'
4224 * s == 'foo' # => true
4225 * s == 'food' # => false
4226 * s == 'FOO' # => false
4227 *
4228 * Returns +false+ if the two strings' encodings are not compatible:
4229 *
4230 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4231 *
4232 * When +object+ is not a string:
4233 *
4234 * - If +object+ responds to method <tt>to_str</tt>,
4235 * <tt>object == self</tt> is called and its return value is returned.
4236 * - If +object+ does not respond to <tt>to_str</tt>,
4237 * +false+ is returned.
4238 *
4239 * Related: {Comparing}[rdoc-ref:String@Comparing].
4240 */
4241
4242VALUE
4244{
4245 if (str1 == str2) return Qtrue;
4246 if (!RB_TYPE_P(str2, T_STRING)) {
4247 if (!rb_respond_to(str2, idTo_str)) {
4248 return Qfalse;
4249 }
4250 return rb_equal(str2, str1);
4251 }
4252 return rb_str_eql_internal(str1, str2);
4253}
4254
4255/*
4256 * call-seq:
4257 * eql?(object) -> true or false
4258 *
4259 * :include: doc/string/eql_p.rdoc
4260 *
4261 */
4262
4263VALUE
4264rb_str_eql(VALUE str1, VALUE str2)
4265{
4266 if (str1 == str2) return Qtrue;
4267 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4268 return rb_str_eql_internal(str1, str2);
4269}
4270
4271/*
4272 * call-seq:
4273 * self <=> other_string -> -1, 0, 1, or nil
4274 *
4275 * Compares +self+ and +other_string+, returning:
4276 *
4277 * - -1 if +other_string+ is larger.
4278 * - 0 if the two are equal.
4279 * - 1 if +other_string+ is smaller.
4280 * - +nil+ if the two are incomparable.
4281 *
4282 * Examples:
4283 *
4284 * 'foo' <=> 'foo' # => 0
4285 * 'foo' <=> 'food' # => -1
4286 * 'food' <=> 'foo' # => 1
4287 * 'FOO' <=> 'foo' # => -1
4288 * 'foo' <=> 'FOO' # => 1
4289 * 'foo' <=> 1 # => nil
4290 *
4291 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4292 */
4293
4294static VALUE
4295rb_str_cmp_m(VALUE str1, VALUE str2)
4296{
4297 int result;
4298 VALUE s = rb_check_string_type(str2);
4299 if (NIL_P(s)) {
4300 return rb_invcmp(str1, str2);
4301 }
4302 result = rb_str_cmp(str1, s);
4303 return INT2FIX(result);
4304}
4305
4306static VALUE str_casecmp(VALUE str1, VALUE str2);
4307static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4308
4309/*
4310 * call-seq:
4311 * casecmp(other_string) -> -1, 0, 1, or nil
4312 *
4313 * Ignoring case, compares +self+ and +other_string+; returns:
4314 *
4315 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4316 * - 0 if the two are equal.
4317 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4318 * - +nil+ if the two are incomparable.
4319 *
4320 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4321 *
4322 * Examples:
4323 *
4324 * 'foo'.casecmp('goo') # => -1
4325 * 'goo'.casecmp('foo') # => 1
4326 * 'foo'.casecmp('food') # => -1
4327 * 'food'.casecmp('foo') # => 1
4328 * 'FOO'.casecmp('foo') # => 0
4329 * 'foo'.casecmp('FOO') # => 0
4330 * 'foo'.casecmp(1) # => nil
4331 *
4332 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4333 */
4334
4335static VALUE
4336rb_str_casecmp(VALUE str1, VALUE str2)
4337{
4338 VALUE s = rb_check_string_type(str2);
4339 if (NIL_P(s)) {
4340 return Qnil;
4341 }
4342 return str_casecmp(str1, s);
4343}
4344
4345static VALUE
4346str_casecmp(VALUE str1, VALUE str2)
4347{
4348 long len;
4349 rb_encoding *enc;
4350 const char *p1, *p1end, *p2, *p2end;
4351
4352 enc = rb_enc_compatible(str1, str2);
4353 if (!enc) {
4354 return Qnil;
4355 }
4356
4357 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4358 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4359 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4360 while (p1 < p1end && p2 < p2end) {
4361 if (*p1 != *p2) {
4362 unsigned int c1 = TOLOWER(*p1 & 0xff);
4363 unsigned int c2 = TOLOWER(*p2 & 0xff);
4364 if (c1 != c2)
4365 return INT2FIX(c1 < c2 ? -1 : 1);
4366 }
4367 p1++;
4368 p2++;
4369 }
4370 }
4371 else {
4372 while (p1 < p1end && p2 < p2end) {
4373 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4374 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4375
4376 if (0 <= c1 && 0 <= c2) {
4377 c1 = TOLOWER(c1);
4378 c2 = TOLOWER(c2);
4379 if (c1 != c2)
4380 return INT2FIX(c1 < c2 ? -1 : 1);
4381 }
4382 else {
4383 int r;
4384 l1 = rb_enc_mbclen(p1, p1end, enc);
4385 l2 = rb_enc_mbclen(p2, p2end, enc);
4386 len = l1 < l2 ? l1 : l2;
4387 r = memcmp(p1, p2, len);
4388 if (r != 0)
4389 return INT2FIX(r < 0 ? -1 : 1);
4390 if (l1 != l2)
4391 return INT2FIX(l1 < l2 ? -1 : 1);
4392 }
4393 p1 += l1;
4394 p2 += l2;
4395 }
4396 }
4397 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4398 if (p1 == p1end) return INT2FIX(-1);
4399 return INT2FIX(1);
4400}
4401
4402/*
4403 * call-seq:
4404 * casecmp?(other_string) -> true, false, or nil
4405 *
4406 * Returns +true+ if +self+ and +other_string+ are equal after
4407 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4408 *
4409 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4410 *
4411 * Examples:
4412 *
4413 * 'foo'.casecmp?('goo') # => false
4414 * 'goo'.casecmp?('foo') # => false
4415 * 'foo'.casecmp?('food') # => false
4416 * 'food'.casecmp?('foo') # => false
4417 * 'FOO'.casecmp?('foo') # => true
4418 * 'foo'.casecmp?('FOO') # => true
4419 * 'foo'.casecmp?(1) # => nil
4420 *
4421 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4422 */
4423
4424static VALUE
4425rb_str_casecmp_p(VALUE str1, VALUE str2)
4426{
4427 VALUE s = rb_check_string_type(str2);
4428 if (NIL_P(s)) {
4429 return Qnil;
4430 }
4431 return str_casecmp_p(str1, s);
4432}
4433
4434static VALUE
4435str_casecmp_p(VALUE str1, VALUE str2)
4436{
4437 rb_encoding *enc;
4438 VALUE folded_str1, folded_str2;
4439 VALUE fold_opt = sym_fold;
4440
4441 enc = rb_enc_compatible(str1, str2);
4442 if (!enc) {
4443 return Qnil;
4444 }
4445
4446 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4447 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4448
4449 return rb_str_eql(folded_str1, folded_str2);
4450}
4451
4452static long
4453strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4454 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4455{
4456 const char *search_start = str_ptr;
4457 long pos, search_len = str_len - offset;
4458
4459 for (;;) {
4460 const char *t;
4461 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4462 if (pos < 0) return pos;
4463 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4464 if (t == search_start + pos) break;
4465 search_len -= t - search_start;
4466 if (search_len <= 0) return -1;
4467 offset += t - search_start;
4468 search_start = t;
4469 }
4470 return pos + offset;
4471}
4472
/*
 * Both return the found index in bytes.  They differ in how rb_strseq_index()
 * interprets the incoming offset: in_byte == 0 for rb_str_index, in_byte == 1
 * for rb_str_byteindex.
 */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4476
4477static long
4478rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4479{
4480 const char *str_ptr, *str_ptr_end, *sub_ptr;
4481 long str_len, sub_len;
4482 rb_encoding *enc;
4483
4484 enc = rb_enc_check(str, sub);
4485 if (is_broken_string(sub)) return -1;
4486
4487 str_ptr = RSTRING_PTR(str);
4488 str_ptr_end = RSTRING_END(str);
4489 str_len = RSTRING_LEN(str);
4490 sub_ptr = RSTRING_PTR(sub);
4491 sub_len = RSTRING_LEN(sub);
4492
4493 if (str_len < sub_len) return -1;
4494
4495 if (offset != 0) {
4496 long str_len_char, sub_len_char;
4497 int single_byte = single_byte_optimizable(str);
4498 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4499 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4500 if (offset < 0) {
4501 offset += str_len_char;
4502 if (offset < 0) return -1;
4503 }
4504 if (str_len_char - offset < sub_len_char) return -1;
4505 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4506 str_ptr += offset;
4507 }
4508 if (sub_len == 0) return offset;
4509
4510 /* need proceed one character at a time */
4511 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4512}
4513
4514
4515/*
4516 * call-seq:
4517 * index(pattern, offset = 0) -> integer or nil
4518 *
4519 * :include: doc/string/index.rdoc
4520 *
4521 */
4522
4523static VALUE
4524rb_str_index_m(int argc, VALUE *argv, VALUE str)
4525{
4526 VALUE sub;
4527 VALUE initpos;
4528 rb_encoding *enc = STR_ENC_GET(str);
4529 long pos;
4530
4531 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4532 long slen = str_strlen(str, enc); /* str's enc */
4533 pos = NUM2LONG(initpos);
4534 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4535 if (RB_TYPE_P(sub, T_REGEXP)) {
4537 }
4538 return Qnil;
4539 }
4540 }
4541 else {
4542 pos = 0;
4543 }
4544
4545 if (RB_TYPE_P(sub, T_REGEXP)) {
4546 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4547 enc, single_byte_optimizable(str));
4548
4549 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4550 VALUE match = rb_backref_get();
4551 struct re_registers *regs = RMATCH_REGS(match);
4552 pos = rb_str_sublen(str, BEG(0));
4553 return LONG2NUM(pos);
4554 }
4555 }
4556 else {
4557 StringValue(sub);
4558 pos = rb_str_index(str, sub, pos);
4559 if (pos >= 0) {
4560 pos = rb_str_sublen(str, pos);
4561 return LONG2NUM(pos);
4562 }
4563 }
4564 return Qnil;
4565}
4566
4567/* Ensure that the given pos is a valid character boundary.
4568 * Note that in this function, "character" means a code point
4569 * (Unicode scalar value), not a grapheme cluster.
4570 */
4571static void
4572str_ensure_byte_pos(VALUE str, long pos)
4573{
4574 if (!single_byte_optimizable(str)) {
4575 const char *s = RSTRING_PTR(str);
4576 const char *e = RSTRING_END(str);
4577 const char *p = s + pos;
4578 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4579 rb_raise(rb_eIndexError,
4580 "offset %ld does not land on character boundary", pos);
4581 }
4582 }
4583}
4584
4585/*
4586 * call-seq:
4587 * byteindex(object, offset = 0) -> integer or nil
4588 *
4589 * Returns the 0-based integer index of a substring of +self+
4590 * specified by +object+ (a string or Regexp) and +offset+,
4591 * or +nil+ if there is no such substring;
4592 * the returned index is the count of _bytes_ (not characters).
4593 *
4594 * When +object+ is a string,
4595 * returns the index of the first found substring equal to +object+:
4596 *
4597 * s = 'foo' # => "foo"
4598 * s.size # => 3 # Three 1-byte characters.
4599 * s.bytesize # => 3 # Three bytes.
4600 * s.byteindex('f') # => 0
4601 * s.byteindex('o') # => 1
4602 * s.byteindex('oo') # => 1
4603 * s.byteindex('ooo') # => nil
4604 *
4605 * When +object+ is a Regexp,
4606 * returns the index of the first found substring matching +object+;
4607 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4608 *
4609 * s = 'foo'
4610 * s.byteindex(/f/) # => 0
4611 * $~ # => #<MatchData "f">
4612 * s.byteindex(/o/) # => 1
4613 * s.byteindex(/oo/) # => 1
4614 * s.byteindex(/ooo/) # => nil
4615 * $~ # => nil
4616 *
4617 * \Integer argument +offset+, if given, specifies the 0-based index
4618 * of the byte where searching is to begin.
4619 *
4620 * When +offset+ is non-negative,
4621 * searching begins at byte position +offset+:
4622 *
4623 * s = 'foo'
4624 * s.byteindex('o', 1) # => 1
4625 * s.byteindex('o', 2) # => 2
4626 * s.byteindex('o', 3) # => nil
4627 *
4628 * When +offset+ is negative, counts backward from the end of +self+:
4629 *
4630 * s = 'foo'
4631 * s.byteindex('o', -1) # => 2
4632 * s.byteindex('o', -2) # => 1
4633 * s.byteindex('o', -3) # => 1
4634 * s.byteindex('o', -4) # => nil
4635 *
4636 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4637 *
4638 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4639 * s.size # => 2 # Two 3-byte characters.
4640 * s.bytesize # => 6 # Six bytes.
4641 * s.byteindex("\uFFFF") # => 0
4642 * s.byteindex("\uFFFF", 1) # Raises IndexError
4643 * s.byteindex("\uFFFF", 2) # Raises IndexError
4644 * s.byteindex("\uFFFF", 3) # => 3
4645 * s.byteindex("\uFFFF", 4) # Raises IndexError
4646 * s.byteindex("\uFFFF", 5) # Raises IndexError
4647 * s.byteindex("\uFFFF", 6) # => nil
4648 *
4649 * Related: see {Querying}[rdoc-ref:String@Querying].
4650 */
4651
4652static VALUE
4653rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4654{
4655 VALUE sub;
4656 VALUE initpos;
4657 long pos;
4658
4659 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4660 long slen = RSTRING_LEN(str);
4661 pos = NUM2LONG(initpos);
4662 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4663 if (RB_TYPE_P(sub, T_REGEXP)) {
4665 }
4666 return Qnil;
4667 }
4668 }
4669 else {
4670 pos = 0;
4671 }
4672
4673 str_ensure_byte_pos(str, pos);
4674
4675 if (RB_TYPE_P(sub, T_REGEXP)) {
4676 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4677 VALUE match = rb_backref_get();
4678 struct re_registers *regs = RMATCH_REGS(match);
4679 pos = BEG(0);
4680 return LONG2NUM(pos);
4681 }
4682 }
4683 else {
4684 StringValue(sub);
4685 pos = rb_str_byteindex(str, sub, pos);
4686 if (pos >= 0) return LONG2NUM(pos);
4687 }
4688 return Qnil;
4689}
4690
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(): scans the search_len bytes at
 * search_str backwards and returns a pointer to the last byte equal to chr,
 * or a null pointer when there is none.
 *
 * As with the libc function, chr is converted to unsigned char before the
 * comparison, so a negative or out-of-range int still matches its byte value.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4703
4704static long
4705str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4706{
4707 char *hit, *adjusted;
4708 int c;
4709 long slen, searchlen;
4710 char *sbeg, *e, *t;
4711
4712 sbeg = RSTRING_PTR(str);
4713 slen = RSTRING_LEN(sub);
4714 if (slen == 0) return s - sbeg;
4715 e = RSTRING_END(str);
4716 t = RSTRING_PTR(sub);
4717 c = *t & 0xff;
4718 searchlen = s - sbeg + 1;
4719
4720 if (memcmp(s, t, slen) == 0) {
4721 return s - sbeg;
4722 }
4723
4724 do {
4725 hit = memrchr(sbeg, c, searchlen);
4726 if (!hit) break;
4727 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4728 if (hit != adjusted) {
4729 searchlen = adjusted - sbeg;
4730 continue;
4731 }
4732 if (memcmp(hit, t, slen) == 0)
4733 return hit - sbeg;
4734 searchlen = adjusted - sbeg;
4735 } while (searchlen > 0);
4736
4737 return -1;
4738}
4739
4740/* found index in byte */
4741static long
4742rb_str_rindex(VALUE str, VALUE sub, long pos)
4743{
4744 long len, slen;
4745 char *sbeg, *s;
4746 rb_encoding *enc;
4747 int singlebyte;
4748
4749 enc = rb_enc_check(str, sub);
4750 if (is_broken_string(sub)) return -1;
4751 singlebyte = single_byte_optimizable(str);
4752 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4753 slen = str_strlen(sub, enc); /* rb_enc_check */
4754
4755 /* substring longer than string */
4756 if (len < slen) return -1;
4757 if (len - pos < slen) pos = len - slen;
4758 if (len == 0) return pos;
4759
4760 sbeg = RSTRING_PTR(str);
4761
4762 if (pos == 0) {
4763 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4764 return 0;
4765 else
4766 return -1;
4767 }
4768
4769 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4770 return str_rindex(str, sub, s, enc);
4771}
4772
4773/*
4774 * call-seq:
4775 * rindex(substring, offset = self.length) -> integer or nil
4776 * rindex(regexp, offset = self.length) -> integer or nil
4777 *
4778 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4779 * or +nil+ if none found:
4780 *
4781 * 'foo'.rindex('f') # => 0
4782 * 'foo'.rindex('o') # => 2
4783 * 'foo'.rindex('oo') # => 1
4784 * 'foo'.rindex('ooo') # => nil
4785 *
4786 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4787 * or +nil+ if none found:
4788 *
4789 * 'foo'.rindex(/f/) # => 0
4790 * 'foo'.rindex(/o/) # => 2
4791 * 'foo'.rindex(/oo/) # => 1
4792 * 'foo'.rindex(/ooo/) # => nil
4793 *
4794 * The _last_ match means starting at the possible last position, not
4795 * the last of longest matches.
4796 *
4797 * 'foo'.rindex(/o+/) # => 2
4798 * $~ #=> #<MatchData "o">
4799 *
4800 * To get the last longest match, needs to combine with negative
4801 * lookbehind.
4802 *
4803 * 'foo'.rindex(/(?<!o)o+/) # => 1
4804 * $~ #=> #<MatchData "oo">
4805 *
 *  Or String#index with negative lookahead.
4807 *
4808 * 'foo'.index(/o+(?!.*o)/) # => 1
4809 * $~ #=> #<MatchData "oo">
4810 *
4811 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4812 * string to _end_ the search:
4813 *
4814 * 'foo'.rindex('o', 0) # => nil
4815 * 'foo'.rindex('o', 1) # => 1
4816 * 'foo'.rindex('o', 2) # => 2
4817 * 'foo'.rindex('o', 3) # => 2
4818 *
4819 * If +offset+ is a negative Integer, the maximum starting position in the
4820 * string to _end_ the search is the sum of the string's length and +offset+:
4821 *
4822 * 'foo'.rindex('o', -1) # => 2
4823 * 'foo'.rindex('o', -2) # => 1
4824 * 'foo'.rindex('o', -3) # => nil
4825 * 'foo'.rindex('o', -4) # => nil
4826 *
4827 * Related: String#index.
4828 */
4829
4830static VALUE
4831rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4832{
4833 VALUE sub;
4834 VALUE initpos;
4835 rb_encoding *enc = STR_ENC_GET(str);
4836 long pos, len = str_strlen(str, enc); /* str's enc */
4837
4838 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4839 pos = NUM2LONG(initpos);
4840 if (pos < 0 && (pos += len) < 0) {
4841 if (RB_TYPE_P(sub, T_REGEXP)) {
4843 }
4844 return Qnil;
4845 }
4846 if (pos > len) pos = len;
4847 }
4848 else {
4849 pos = len;
4850 }
4851
4852 if (RB_TYPE_P(sub, T_REGEXP)) {
4853 /* enc = rb_enc_check(str, sub); */
4854 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4855 enc, single_byte_optimizable(str));
4856
4857 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4858 VALUE match = rb_backref_get();
4859 struct re_registers *regs = RMATCH_REGS(match);
4860 pos = rb_str_sublen(str, BEG(0));
4861 return LONG2NUM(pos);
4862 }
4863 }
4864 else {
4865 StringValue(sub);
4866 pos = rb_str_rindex(str, sub, pos);
4867 if (pos >= 0) {
4868 pos = rb_str_sublen(str, pos);
4869 return LONG2NUM(pos);
4870 }
4871 }
4872 return Qnil;
4873}
4874
4875static long
4876rb_str_byterindex(VALUE str, VALUE sub, long pos)
4877{
4878 long len, slen;
4879 char *sbeg, *s;
4880 rb_encoding *enc;
4881
4882 enc = rb_enc_check(str, sub);
4883 if (is_broken_string(sub)) return -1;
4884 len = RSTRING_LEN(str);
4885 slen = RSTRING_LEN(sub);
4886
4887 /* substring longer than string */
4888 if (len < slen) return -1;
4889 if (len - pos < slen) pos = len - slen;
4890 if (len == 0) return pos;
4891
4892 sbeg = RSTRING_PTR(str);
4893
4894 if (pos == 0) {
4895 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4896 return 0;
4897 else
4898 return -1;
4899 }
4900
4901 s = sbeg + pos;
4902 return str_rindex(str, sub, s, enc);
4903}
4904
4905/*
4906 * call-seq:
4907 * byterindex(object, offset = self.bytesize) -> integer or nil
4908 *
4909 * Returns the 0-based integer index of a substring of +self+
4910 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4911 * or +nil+ if there is no such substring;
4912 * the returned index is the count of _bytes_ (not characters).
4913 *
4914 * When +object+ is a string,
4915 * returns the index of the _last_ found substring equal to +object+:
4916 *
4917 * s = 'foo' # => "foo"
4918 * s.size # => 3 # Three 1-byte characters.
4919 * s.bytesize # => 3 # Three bytes.
4920 * s.byterindex('f') # => 0
 *   s.byterindex('o')   # => 2
 *   s.byterindex('oo')  # => 1
 *   s.byterindex('ooo') # => nil
4924 *
4925 * When +object+ is a Regexp,
4926 * returns the index of the last found substring matching +object+;
4927 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4928 *
4929 * s = 'foo'
4930 * s.byterindex(/f/) # => 0
4931 * $~ # => #<MatchData "f">
4932 * s.byterindex(/o/) # => 2
4933 * s.byterindex(/oo/) # => 1
4934 * s.byterindex(/ooo/) # => nil
4935 * $~ # => nil
4936 *
4937 * The last match means starting at the possible last position,
4938 * not the last of the longest matches:
4939 *
4940 * s = 'foo'
4941 * s.byterindex(/o+/) # => 2
4942 * $~ #=> #<MatchData "o">
4943 *
4944 * To get the last longest match, use a negative lookbehind:
4945 *
4946 * s = 'foo'
4947 * s.byterindex(/(?<!o)o+/) # => 1
4948 * $~ # => #<MatchData "oo">
4949 *
4950 * Or use method #byteindex with negative lookahead:
4951 *
4952 * s = 'foo'
4953 * s.byteindex(/o+(?!.*o)/) # => 1
4954 * $~ #=> #<MatchData "oo">
4955 *
4956 * \Integer argument +offset+, if given, specifies the 0-based index
4957 * of the byte where searching is to end.
4958 *
4959 * When +offset+ is non-negative,
4960 * searching ends at byte position +offset+:
4961 *
4962 * s = 'foo'
4963 * s.byterindex('o', 0) # => nil
4964 * s.byterindex('o', 1) # => 1
4965 * s.byterindex('o', 2) # => 2
4966 * s.byterindex('o', 3) # => 2
4967 *
4968 * When +offset+ is negative, counts backward from the end of +self+:
4969 *
4970 * s = 'foo'
4971 * s.byterindex('o', -1) # => 2
4972 * s.byterindex('o', -2) # => 1
4973 * s.byterindex('o', -3) # => nil
4974 *
4975 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4976 *
4977 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4978 * s.size # => 2 # Two 3-byte characters.
4979 * s.bytesize # => 6 # Six bytes.
4980 * s.byterindex("\uFFFF") # => 3
4981 * s.byterindex("\uFFFF", 1) # Raises IndexError
4982 * s.byterindex("\uFFFF", 2) # Raises IndexError
4983 * s.byterindex("\uFFFF", 3) # => 3
4984 * s.byterindex("\uFFFF", 4) # Raises IndexError
4985 * s.byterindex("\uFFFF", 5) # Raises IndexError
4986 * s.byterindex("\uFFFF", 6) # => nil
4987 *
4988 * Related: see {Querying}[rdoc-ref:String@Querying].
4989 */
4990
4991static VALUE
4992rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4993{
4994 VALUE sub;
4995 VALUE initpos;
4996 long pos, len = RSTRING_LEN(str);
4997
4998 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4999 pos = NUM2LONG(initpos);
5000 if (pos < 0 && (pos += len) < 0) {
5001 if (RB_TYPE_P(sub, T_REGEXP)) {
5003 }
5004 return Qnil;
5005 }
5006 if (pos > len) pos = len;
5007 }
5008 else {
5009 pos = len;
5010 }
5011
5012 str_ensure_byte_pos(str, pos);
5013
5014 if (RB_TYPE_P(sub, T_REGEXP)) {
5015 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5016 VALUE match = rb_backref_get();
5017 struct re_registers *regs = RMATCH_REGS(match);
5018 pos = BEG(0);
5019 return LONG2NUM(pos);
5020 }
5021 }
5022 else {
5023 StringValue(sub);
5024 pos = rb_str_byterindex(str, sub, pos);
5025 if (pos >= 0) return LONG2NUM(pos);
5026 }
5027 return Qnil;
5028}
5029
5030/*
5031 * call-seq:
5032 * self =~ object -> integer or nil
5033 *
5034 * When +object+ is a Regexp, returns the index of the first substring in +self+
5035 * matched by +object+,
5036 * or +nil+ if no match is found;
5037 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5038 *
5039 * 'foo' =~ /f/ # => 0
5040 * $~ # => #<MatchData "f">
5041 * 'foo' =~ /o/ # => 1
5042 * $~ # => #<MatchData "o">
5043 * 'foo' =~ /x/ # => nil
5044 * $~ # => nil
5045 *
5046 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5047 * (see Regexp#=~):
5048 *
5049 * number = nil
5050 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5051 * number # => nil # Not assigned.
5052 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5053 * number # => "9" # Assigned.
5054 *
5055 * If +object+ is not a Regexp, returns the value
5056 * returned by <tt>object =~ self</tt>.
5057 *
5058 * Related: see {Querying}[rdoc-ref:String@Querying].
5059 */
5060
5061static VALUE
5062rb_str_match(VALUE x, VALUE y)
5063{
5064 switch (OBJ_BUILTIN_TYPE(y)) {
5065 case T_STRING:
5066 rb_raise(rb_eTypeError, "type mismatch: String given");
5067
5068 case T_REGEXP:
5069 return rb_reg_match(y, x);
5070
5071 default:
5072 return rb_funcall(y, idEqTilde, 1, x);
5073 }
5074}
5075
5076
5077static VALUE get_pat(VALUE);
5078
5079
5080/*
5081 * call-seq:
5082 * match(pattern, offset = 0) -> matchdata or nil
5083 * match(pattern, offset = 0) {|matchdata| ... } -> object
5084 *
5085 * Creates a MatchData object based on +self+ and the given arguments;
5086 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5087 *
5088 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5089 *
5090 * regexp = Regexp.new(pattern)
5091 *
5092 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5093 * (see Regexp#match):
5094 *
5095 * matchdata = regexp.match(self[offset..])
5096 *
5097 * With no block given, returns the computed +matchdata+ or +nil+:
5098 *
5099 * 'foo'.match('f') # => #<MatchData "f">
5100 * 'foo'.match('o') # => #<MatchData "o">
5101 * 'foo'.match('x') # => nil
5102 * 'foo'.match('f', 1) # => nil
5103 * 'foo'.match('o', 1) # => #<MatchData "o">
5104 *
5105 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5106 * returns the block's return value:
5107 *
5108 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5109 *
5110 * With a block given and +nil+ +matchdata+, does not call the block:
5111 *
5112 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5113 *
5114 * Related: see {Querying}[rdoc-ref:String@Querying].
5115 */
5116
5117static VALUE
5118rb_str_match_m(int argc, VALUE *argv, VALUE str)
5119{
5120 VALUE re, result;
5121 if (argc < 1)
5122 rb_check_arity(argc, 1, 2);
5123 re = argv[0];
5124 argv[0] = str;
5125 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5126 if (!NIL_P(result) && rb_block_given_p()) {
5127 return rb_yield(result);
5128 }
5129 return result;
5130}
5131
5132/*
5133 * call-seq:
5134 * match?(pattern, offset = 0) -> true or false
5135 *
5136 * Returns whether a match is found for +self+ and the given arguments;
5137 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5138 *
5139 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5140 *
5141 * regexp = Regexp.new(pattern)
5142 *
5143 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5144 * +false+ otherwise:
5145 *
5146 * 'foo'.match?(/o/) # => true
5147 * 'foo'.match?('o') # => true
5148 * 'foo'.match?(/x/) # => false
5149 * 'foo'.match?('f', 1) # => false
5150 * 'foo'.match?('o', 1) # => true
5151 *
5152 * Related: see {Querying}[rdoc-ref:String@Querying].
5153 */
5154
5155static VALUE
5156rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5157{
5158 VALUE re;
5159 rb_check_arity(argc, 1, 2);
5160 re = get_pat(argv[0]);
5161 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5162}
5163
/* Result of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the bytes are not a character that can be stepped */
    NEIGHBOR_FOUND,     /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
5169
5170static enum neighbor_char
5171enc_succ_char(char *p, long len, rb_encoding *enc)
5172{
5173 long i;
5174 int l;
5175
5176 if (rb_enc_mbminlen(enc) > 1) {
5177 /* wchar, trivial case */
5178 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5179 if (!MBCLEN_CHARFOUND_P(r)) {
5180 return NEIGHBOR_NOT_CHAR;
5181 }
5182 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5183 l = rb_enc_code_to_mbclen(c, enc);
5184 if (!l) return NEIGHBOR_NOT_CHAR;
5185 if (l != len) return NEIGHBOR_WRAPPED;
5186 rb_enc_mbcput(c, p, enc);
5187 r = rb_enc_precise_mbclen(p, p + len, enc);
5188 if (!MBCLEN_CHARFOUND_P(r)) {
5189 return NEIGHBOR_NOT_CHAR;
5190 }
5191 return NEIGHBOR_FOUND;
5192 }
5193 while (1) {
5194 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5195 p[i] = '\0';
5196 if (i < 0)
5197 return NEIGHBOR_WRAPPED;
5198 ++((unsigned char*)p)[i];
5199 l = rb_enc_precise_mbclen(p, p+len, enc);
5200 if (MBCLEN_CHARFOUND_P(l)) {
5201 l = MBCLEN_CHARFOUND_LEN(l);
5202 if (l == len) {
5203 return NEIGHBOR_FOUND;
5204 }
5205 else {
5206 memset(p+l, 0xff, len-l);
5207 }
5208 }
5209 if (MBCLEN_INVALID_P(l) && i < len-1) {
5210 long len2;
5211 int l2;
5212 for (len2 = len-1; 0 < len2; len2--) {
5213 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5214 if (!MBCLEN_INVALID_P(l2))
5215 break;
5216 }
5217 memset(p+len2+1, 0xff, len-(len2+1));
5218 }
5219 }
5220}
5221
5222static enum neighbor_char
5223enc_pred_char(char *p, long len, rb_encoding *enc)
5224{
5225 long i;
5226 int l;
5227 if (rb_enc_mbminlen(enc) > 1) {
5228 /* wchar, trivial case */
5229 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5230 if (!MBCLEN_CHARFOUND_P(r)) {
5231 return NEIGHBOR_NOT_CHAR;
5232 }
5233 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5234 if (!c) return NEIGHBOR_NOT_CHAR;
5235 --c;
5236 l = rb_enc_code_to_mbclen(c, enc);
5237 if (!l) return NEIGHBOR_NOT_CHAR;
5238 if (l != len) return NEIGHBOR_WRAPPED;
5239 rb_enc_mbcput(c, p, enc);
5240 r = rb_enc_precise_mbclen(p, p + len, enc);
5241 if (!MBCLEN_CHARFOUND_P(r)) {
5242 return NEIGHBOR_NOT_CHAR;
5243 }
5244 return NEIGHBOR_FOUND;
5245 }
5246 while (1) {
5247 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5248 p[i] = '\xff';
5249 if (i < 0)
5250 return NEIGHBOR_WRAPPED;
5251 --((unsigned char*)p)[i];
5252 l = rb_enc_precise_mbclen(p, p+len, enc);
5253 if (MBCLEN_CHARFOUND_P(l)) {
5254 l = MBCLEN_CHARFOUND_LEN(l);
5255 if (l == len) {
5256 return NEIGHBOR_FOUND;
5257 }
5258 else {
5259 memset(p+l, 0, len-l);
5260 }
5261 }
5262 if (MBCLEN_INVALID_P(l) && i < len-1) {
5263 long len2;
5264 int l2;
5265 for (len2 = len-1; 0 < len2; len2--) {
5266 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5267 if (!MBCLEN_INVALID_P(l2))
5268 break;
5269 }
5270 memset(p+len2+1, 0, len-(len2+1));
5271 }
5272 }
5273}
5274
5275/*
5276 overwrite +p+ by succeeding letter in +enc+ and returns
5277 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5278 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5279 assuming each ranges are successive, and mbclen
5280 never change in each ranges.
5281 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5282 character.
5283 */
5284static enum neighbor_char
5285enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5286{
5287 enum neighbor_char ret;
5288 unsigned int c;
5289 int ctype;
5290 int range;
5291 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5292
5293 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5294 int try;
5295 const int max_gaps = 1;
5296
5297 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5298 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5299 ctype = ONIGENC_CTYPE_DIGIT;
5300 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5301 ctype = ONIGENC_CTYPE_ALPHA;
5302 else
5303 return NEIGHBOR_NOT_CHAR;
5304
5305 MEMCPY(save, p, char, len);
5306 for (try = 0; try <= max_gaps; ++try) {
5307 ret = enc_succ_char(p, len, enc);
5308 if (ret == NEIGHBOR_FOUND) {
5309 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5310 if (rb_enc_isctype(c, ctype, enc))
5311 return NEIGHBOR_FOUND;
5312 }
5313 }
5314 MEMCPY(p, save, char, len);
5315 range = 1;
5316 while (1) {
5317 MEMCPY(save, p, char, len);
5318 ret = enc_pred_char(p, len, enc);
5319 if (ret == NEIGHBOR_FOUND) {
5320 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5321 if (!rb_enc_isctype(c, ctype, enc)) {
5322 MEMCPY(p, save, char, len);
5323 break;
5324 }
5325 }
5326 else {
5327 MEMCPY(p, save, char, len);
5328 break;
5329 }
5330 range++;
5331 }
5332 if (range == 1) {
5333 return NEIGHBOR_NOT_CHAR;
5334 }
5335
5336 if (ctype != ONIGENC_CTYPE_DIGIT) {
5337 MEMCPY(carry, p, char, len);
5338 return NEIGHBOR_WRAPPED;
5339 }
5340
5341 MEMCPY(carry, p, char, len);
5342 enc_succ_char(carry, len, enc);
5343 return NEIGHBOR_WRAPPED;
5344}
5345
5346
5347static VALUE str_succ(VALUE str);
5348
5349/*
5350 * call-seq:
5351 * succ -> new_str
5352 *
5353 * Returns the successor to +self+. The successor is calculated by
5354 * incrementing characters.
5355 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
5358 *
5359 * 'THX1138'.succ # => "THX1139"
5360 * '<<koala>>'.succ # => "<<koalb>>"
5361 * '***'.succ # => '**+'
5362 *
5363 * The successor to a digit is another digit, "carrying" to the next-left
5364 * character for a "rollover" from 9 to 0, and prepending another digit
5365 * if necessary:
5366 *
5367 * '00'.succ # => "01"
5368 * '09'.succ # => "10"
5369 * '99'.succ # => "100"
5370 *
5371 * The successor to a letter is another letter of the same case,
5372 * carrying to the next-left character for a rollover,
5373 * and prepending another same-case letter if necessary:
5374 *
5375 * 'aa'.succ # => "ab"
5376 * 'az'.succ # => "ba"
5377 * 'zz'.succ # => "aaa"
5378 * 'AA'.succ # => "AB"
5379 * 'AZ'.succ # => "BA"
5380 * 'ZZ'.succ # => "AAA"
5381 *
5382 * The successor to a non-alphanumeric character is the next character
5383 * in the underlying character set's collating sequence,
5384 * carrying to the next-left character for a rollover,
5385 * and prepending another character if necessary:
5386 *
5387 * s = 0.chr * 3
5388 * s # => "\x00\x00\x00"
5389 * s.succ # => "\x00\x00\x01"
5390 * s = 255.chr * 3
5391 * s # => "\xFF\xFF\xFF"
5392 * s.succ # => "\x01\x00\x00\x00"
5393 *
5394 * Carrying can occur between and among mixtures of alphanumeric characters:
5395 *
5396 * s = 'zz99zz99'
5397 * s.succ # => "aaa00aa00"
5398 * s = '99zz99zz'
5399 * s.succ # => "100aa00aa"
5400 *
5401 * The successor to an empty +String+ is a new empty +String+:
5402 *
5403 * ''.succ # => ""
5404 *
5405 */
5406
5407VALUE
5409{
5410 VALUE str;
5411 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5412 rb_enc_cr_str_copy_for_substr(str, orig);
5413 return str_succ(str);
5414}
5415
5416static VALUE
5417str_succ(VALUE str)
5418{
5419 rb_encoding *enc;
5420 char *sbeg, *s, *e, *last_alnum = 0;
5421 int found_alnum = 0;
5422 long l, slen;
5423 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5424 long carry_pos = 0, carry_len = 1;
5425 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5426
5427 slen = RSTRING_LEN(str);
5428 if (slen == 0) return str;
5429
5430 enc = STR_ENC_GET(str);
5431 sbeg = RSTRING_PTR(str);
5432 s = e = sbeg + slen;
5433
5434 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5435 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5436 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5437 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5438 break;
5439 }
5440 }
5441 l = rb_enc_precise_mbclen(s, e, enc);
5442 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5443 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5444 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5445 switch (neighbor) {
5446 case NEIGHBOR_NOT_CHAR:
5447 continue;
5448 case NEIGHBOR_FOUND:
5449 return str;
5450 case NEIGHBOR_WRAPPED:
5451 last_alnum = s;
5452 break;
5453 }
5454 found_alnum = 1;
5455 carry_pos = s - sbeg;
5456 carry_len = l;
5457 }
5458 if (!found_alnum) { /* str contains no alnum */
5459 s = e;
5460 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5461 enum neighbor_char neighbor;
5462 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5463 l = rb_enc_precise_mbclen(s, e, enc);
5464 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5465 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5466 MEMCPY(tmp, s, char, l);
5467 neighbor = enc_succ_char(tmp, l, enc);
5468 switch (neighbor) {
5469 case NEIGHBOR_FOUND:
5470 MEMCPY(s, tmp, char, l);
5471 return str;
5472 break;
5473 case NEIGHBOR_WRAPPED:
5474 MEMCPY(s, tmp, char, l);
5475 break;
5476 case NEIGHBOR_NOT_CHAR:
5477 break;
5478 }
5479 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5480 /* wrapped to \0...\0. search next valid char. */
5481 enc_succ_char(s, l, enc);
5482 }
5483 if (!rb_enc_asciicompat(enc)) {
5484 MEMCPY(carry, s, char, l);
5485 carry_len = l;
5486 }
5487 carry_pos = s - sbeg;
5488 }
5490 }
5491 RESIZE_CAPA(str, slen + carry_len);
5492 sbeg = RSTRING_PTR(str);
5493 s = sbeg + carry_pos;
5494 memmove(s + carry_len, s, slen - carry_pos);
5495 memmove(s, carry, carry_len);
5496 slen += carry_len;
5497 STR_SET_LEN(str, slen);
5498 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5500 return str;
5501}
5502
5503
5504/*
5505 * call-seq:
5506 * succ! -> self
5507 *
5508 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5509 */
5510
5511static VALUE
5512rb_str_succ_bang(VALUE str)
5513{
5514 rb_str_modify(str);
5515 str_succ(str);
5516 return str;
5517}
5518
/*
 * Returns 1 if each of the first +len+ bytes of +s+ satisfies ISDIGIT,
 * 0 otherwise.  An empty range (len <= 0) yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5528
5529static int
5530str_upto_i(VALUE str, VALUE arg)
5531{
5532 rb_yield(str);
5533 return 0;
5534}
5535
5536/*
5537 * call-seq:
5538 * upto(other_string, exclusive = false) {|string| ... } -> self
5539 * upto(other_string, exclusive = false) -> new_enumerator
5540 *
5541 * With a block given, calls the block with each +String+ value
5542 * returned by successive calls to String#succ;
5543 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5544 * the sequence terminates when value +other_string+ is reached;
5545 * returns +self+:
5546 *
 *   'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
 *
 * Output:
5549 *
5550 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5551 *
5552 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5553 *
5554 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5555 *
5556 * Output:
5557 *
5558 * a8 a9 b0 b1 b2 b3 b4 b5
5559 *
5560 * If +other_string+ would not be reached, does not call the block:
5561 *
5562 * '25'.upto('5') {|s| fail s }
5563 * 'aa'.upto('a') {|s| fail s }
5564 *
5565 * With no block given, returns a new Enumerator:
5566 *
5567 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5568 *
5569 */
5570
5571static VALUE
5572rb_str_upto(int argc, VALUE *argv, VALUE beg)
5573{
5574 VALUE end, exclusive;
5575
5576 rb_scan_args(argc, argv, "11", &end, &exclusive);
5577 RETURN_ENUMERATOR(beg, argc, argv);
5578 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5579}
5580
5581VALUE
5582rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5583{
5584 VALUE current, after_end;
5585 ID succ;
5586 int n, ascii;
5587 rb_encoding *enc;
5588
5589 CONST_ID(succ, "succ");
5590 StringValue(end);
5591 enc = rb_enc_check(beg, end);
5592 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5593 /* single character */
5594 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5595 char c = RSTRING_PTR(beg)[0];
5596 char e = RSTRING_PTR(end)[0];
5597
5598 if (c > e || (excl && c == e)) return beg;
5599 for (;;) {
5600 VALUE str = rb_enc_str_new(&c, 1, enc);
5602 if ((*each)(str, arg)) break;
5603 if (!excl && c == e) break;
5604 c++;
5605 if (excl && c == e) break;
5606 }
5607 return beg;
5608 }
5609 /* both edges are all digits */
5610 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5611 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5612 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5613 VALUE b, e;
5614 int width;
5615
5616 width = RSTRING_LENINT(beg);
5617 b = rb_str_to_inum(beg, 10, FALSE);
5618 e = rb_str_to_inum(end, 10, FALSE);
5619 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5620 long bi = FIX2LONG(b);
5621 long ei = FIX2LONG(e);
5622 rb_encoding *usascii = rb_usascii_encoding();
5623
5624 while (bi <= ei) {
5625 if (excl && bi == ei) break;
5626 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5627 bi++;
5628 }
5629 }
5630 else {
5631 ID op = excl ? '<' : idLE;
5632 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5633
5634 args[0] = INT2FIX(width);
5635 while (rb_funcall(b, op, 1, e)) {
5636 args[1] = b;
5637 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5638 b = rb_funcallv(b, succ, 0, 0);
5639 }
5640 }
5641 return beg;
5642 }
5643 /* normal case */
5644 n = rb_str_cmp(beg, end);
5645 if (n > 0 || (excl && n == 0)) return beg;
5646
5647 after_end = rb_funcallv(end, succ, 0, 0);
5648 current = str_duplicate(rb_cString, beg);
5649 while (!rb_str_equal(current, after_end)) {
5650 VALUE next = Qnil;
5651 if (excl || !rb_str_equal(current, end))
5652 next = rb_funcallv(current, succ, 0, 0);
5653 if ((*each)(current, arg)) break;
5654 if (NIL_P(next)) break;
5655 current = next;
5656 StringValue(current);
5657 if (excl && rb_str_equal(current, end)) break;
5658 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5659 break;
5660 }
5661
5662 return beg;
5663}
5664
5665VALUE
5666rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5667{
5668 VALUE current;
5669 ID succ;
5670
5671 CONST_ID(succ, "succ");
5672 /* both edges are all digits */
5673 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5674 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5675 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5676 int width = RSTRING_LENINT(beg);
5677 b = rb_str_to_inum(beg, 10, FALSE);
5678 if (FIXNUM_P(b)) {
5679 long bi = FIX2LONG(b);
5680 rb_encoding *usascii = rb_usascii_encoding();
5681
5682 while (FIXABLE(bi)) {
5683 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5684 bi++;
5685 }
5686 b = LONG2NUM(bi);
5687 }
5688 args[0] = INT2FIX(width);
5689 while (1) {
5690 args[1] = b;
5691 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5692 b = rb_funcallv(b, succ, 0, 0);
5693 }
5694 }
5695 /* normal case */
5696 current = str_duplicate(rb_cString, beg);
5697 while (1) {
5698 VALUE next = rb_funcallv(current, succ, 0, 0);
5699 if ((*each)(current, arg)) break;
5700 current = next;
5701 StringValue(current);
5702 if (RSTRING_LEN(current) == 0)
5703 break;
5704 }
5705
5706 return beg;
5707}
5708
5709static int
5710include_range_i(VALUE str, VALUE arg)
5711{
5712 VALUE *argp = (VALUE *)arg;
5713 if (!rb_equal(str, *argp)) return 0;
5714 *argp = Qnil;
5715 return 1;
5716}
5717
5718VALUE
5719rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5720{
5721 beg = rb_str_new_frozen(beg);
5722 StringValue(end);
5723 end = rb_str_new_frozen(end);
5724 if (NIL_P(val)) return Qfalse;
5725 val = rb_check_string_type(val);
5726 if (NIL_P(val)) return Qfalse;
5727 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5728 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5729 rb_enc_asciicompat(STR_ENC_GET(val))) {
5730 const char *bp = RSTRING_PTR(beg);
5731 const char *ep = RSTRING_PTR(end);
5732 const char *vp = RSTRING_PTR(val);
5733 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5734 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5735 return Qfalse;
5736 else {
5737 char b = *bp;
5738 char e = *ep;
5739 char v = *vp;
5740
5741 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5742 if (b <= v && v < e) return Qtrue;
5743 return RBOOL(!RTEST(exclusive) && v == e);
5744 }
5745 }
5746 }
5747#if 0
5748 /* both edges are all digits */
5749 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5750 all_digits_p(bp, RSTRING_LEN(beg)) &&
5751 all_digits_p(ep, RSTRING_LEN(end))) {
5752 /* TODO */
5753 }
5754#endif
5755 }
5756 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5757
5758 return RBOOL(NIL_P(val));
5759}
5760
5761static VALUE
5762rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5763{
5764 if (rb_reg_search(re, str, 0, 0) >= 0) {
5765 VALUE match = rb_backref_get();
5766 int nth = rb_reg_backref_number(match, backref);
5767 return rb_reg_nth_match(nth, match);
5768 }
5769 return Qnil;
5770}
5771
5772static VALUE
5773rb_str_aref(VALUE str, VALUE indx)
5774{
5775 long idx;
5776
5777 if (FIXNUM_P(indx)) {
5778 idx = FIX2LONG(indx);
5779 }
5780 else if (RB_TYPE_P(indx, T_REGEXP)) {
5781 return rb_str_subpat(str, indx, INT2FIX(0));
5782 }
5783 else if (RB_TYPE_P(indx, T_STRING)) {
5784 if (rb_str_index(str, indx, 0) != -1)
5785 return str_duplicate(rb_cString, indx);
5786 return Qnil;
5787 }
5788 else {
5789 /* check if indx is Range */
5790 long beg, len = str_strlen(str, NULL);
5791 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5792 case Qfalse:
5793 break;
5794 case Qnil:
5795 return Qnil;
5796 default:
5797 return rb_str_substr(str, beg, len);
5798 }
5799 idx = NUM2LONG(indx);
5800 }
5801
5802 return str_substr(str, idx, 1, FALSE);
5803}
5804
5805
5806/*
5807 * call-seq:
5808 * self[index] -> new_string or nil
5809 * self[start, length] -> new_string or nil
5810 * self[range] -> new_string or nil
5811 * self[regexp, capture = 0] -> new_string or nil
5812 * self[substring] -> new_string or nil
5813 *
5814 * Returns the substring of +self+ specified by the arguments.
5815 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5816 *
5817 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5818 */
5819
5820static VALUE
5821rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5822{
5823 if (argc == 2) {
5824 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5825 return rb_str_subpat(str, argv[0], argv[1]);
5826 }
5827 else {
5828 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5829 }
5830 }
5831 rb_check_arity(argc, 1, 2);
5832 return rb_str_aref(str, argv[0]);
5833}
5834
5835VALUE
5837{
5838 char *ptr = RSTRING_PTR(str);
5839 long olen = RSTRING_LEN(str), nlen;
5840
5841 str_modifiable(str);
5842 if (len > olen) len = olen;
5843 nlen = olen - len;
5844 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5845 char *oldptr = ptr;
5846 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5847 STR_SET_EMBED(str);
5848 ptr = RSTRING(str)->as.embed.ary;
5849 memmove(ptr, oldptr + len, nlen);
5850 if (fl == STR_NOEMBED) xfree(oldptr);
5851 }
5852 else {
5853 if (!STR_SHARED_P(str)) {
5854 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5855 rb_enc_cr_str_exact_copy(shared, str);
5856 OBJ_FREEZE(shared);
5857 }
5858 ptr = RSTRING(str)->as.heap.ptr += len;
5859 }
5860 STR_SET_LEN(str, nlen);
5861
5862 if (!SHARABLE_MIDDLE_SUBSTRING) {
5863 TERM_FILL(ptr + nlen, TERM_LEN(str));
5864 }
5866 return str;
5867}
5868
5869static void
5870rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5871{
5872 char *sptr;
5873 long slen;
5874 int cr;
5875
5876 if (beg == 0 && vlen == 0) {
5877 rb_str_drop_bytes(str, len);
5878 return;
5879 }
5880
5881 str_modify_keep_cr(str);
5882 RSTRING_GETMEM(str, sptr, slen);
5883 if (len < vlen) {
5884 /* expand string */
5885 RESIZE_CAPA(str, slen + vlen - len);
5886 sptr = RSTRING_PTR(str);
5887 }
5888
5890 cr = rb_enc_str_coderange(val);
5891 else
5893
5894 if (vlen != len) {
5895 memmove(sptr + beg + vlen,
5896 sptr + beg + len,
5897 slen - (beg + len));
5898 }
5899 if (vlen < beg && len < 0) {
5900 MEMZERO(sptr + slen, char, -len);
5901 }
5902 if (vlen > 0) {
5903 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5904 }
5905 slen += vlen - len;
5906 STR_SET_LEN(str, slen);
5907 TERM_FILL(&sptr[slen], TERM_LEN(str));
5908 ENC_CODERANGE_SET(str, cr);
5909}
5910
5911static inline void
5912rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5913{
5914 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5915}
5916
5917void
5918rb_str_update(VALUE str, long beg, long len, VALUE val)
5919{
5920 long slen;
5921 char *p, *e;
5922 rb_encoding *enc;
5923 int singlebyte = single_byte_optimizable(str);
5924 int cr;
5925
5926 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5927
5928 StringValue(val);
5929 enc = rb_enc_check(str, val);
5930 slen = str_strlen(str, enc); /* rb_enc_check */
5931
5932 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5933 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5934 }
5935 if (beg < 0) {
5936 beg += slen;
5937 }
5938 RUBY_ASSERT(beg >= 0);
5939 RUBY_ASSERT(beg <= slen);
5940
5941 if (len > slen - beg) {
5942 len = slen - beg;
5943 }
5944 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5945 if (!p) p = RSTRING_END(str);
5946 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5947 if (!e) e = RSTRING_END(str);
5948 /* error check */
5949 beg = p - RSTRING_PTR(str); /* physical position */
5950 len = e - p; /* physical length */
5951 rb_str_update_0(str, beg, len, val);
5952 rb_enc_associate(str, enc);
5954 if (cr != ENC_CODERANGE_BROKEN)
5955 ENC_CODERANGE_SET(str, cr);
5956}
5957
5958static void
5959rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5960{
5961 int nth;
5962 VALUE match;
5963 long start, end, len;
5964 rb_encoding *enc;
5965 struct re_registers *regs;
5966
5967 if (rb_reg_search(re, str, 0, 0) < 0) {
5968 rb_raise(rb_eIndexError, "regexp not matched");
5969 }
5970 match = rb_backref_get();
5971 nth = rb_reg_backref_number(match, backref);
5972 regs = RMATCH_REGS(match);
5973 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5974 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5975 }
5976 if (nth < 0) {
5977 nth += regs->num_regs;
5978 }
5979
5980 start = BEG(nth);
5981 if (start == -1) {
5982 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5983 }
5984 end = END(nth);
5985 len = end - start;
5986 StringValue(val);
5987 enc = rb_enc_check_str(str, val);
5988 rb_str_update_0(str, start, len, val);
5989 rb_enc_associate(str, enc);
5990}
5991
5992static VALUE
5993rb_str_aset(VALUE str, VALUE indx, VALUE val)
5994{
5995 long idx, beg;
5996
5997 switch (TYPE(indx)) {
5998 case T_REGEXP:
5999 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6000 return val;
6001
6002 case T_STRING:
6003 beg = rb_str_index(str, indx, 0);
6004 if (beg < 0) {
6005 rb_raise(rb_eIndexError, "string not matched");
6006 }
6007 beg = rb_str_sublen(str, beg);
6008 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6009 return val;
6010
6011 default:
6012 /* check if indx is Range */
6013 {
6014 long beg, len;
6015 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6016 rb_str_update(str, beg, len, val);
6017 return val;
6018 }
6019 }
6020 /* FALLTHROUGH */
6021
6022 case T_FIXNUM:
6023 idx = NUM2LONG(indx);
6024 rb_str_update(str, idx, 1, val);
6025 return val;
6026 }
6027}
6028
6029/*
6030 * call-seq:
6031 * self[index] = new_string
6032 * self[start, length] = new_string
6033 * self[range] = new_string
6034 * self[regexp, capture = 0] = new_string
6035 * self[substring] = new_string
6036 *
6037 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6038 * See {String Slices}[rdoc-ref:String@String+Slices].
6039 *
6040 * A few examples:
6041 *
6042 * s = 'foo'
6043 * s[2] = 'rtune' # => "rtune"
6044 * s # => "fortune"
6045 * s[1, 5] = 'init' # => "init"
6046 * s # => "finite"
6047 * s[3..4] = 'al' # => "al"
6048 * s # => "finale"
6049 * s[/e$/] = 'ly' # => "ly"
6050 * s # => "finally"
6051 * s['lly'] = 'ncial' # => "ncial"
6052 * s # => "financial"
6053 *
6054 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6055 */
6056
6057static VALUE
6058rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6059{
6060 if (argc == 3) {
6061 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6062 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6063 }
6064 else {
6065 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6066 }
6067 return argv[2];
6068 }
6069 rb_check_arity(argc, 2, 3);
6070 return rb_str_aset(str, argv[0], argv[1]);
6071}
6072
6073/*
6074 * call-seq:
6075 * insert(offset, other_string) -> self
6076 *
6077 * :include: doc/string/insert.rdoc
6078 *
6079 */
6080
6081static VALUE
6082rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6083{
6084 long pos = NUM2LONG(idx);
6085
6086 if (pos == -1) {
6087 return rb_str_append(str, str2);
6088 }
6089 else if (pos < 0) {
6090 pos++;
6091 }
6092 rb_str_update(str, pos, 0, str2);
6093 return str;
6094}
6095
6096
6097/*
6098 * call-seq:
6099 * slice!(index) -> new_string or nil
6100 * slice!(start, length) -> new_string or nil
6101 * slice!(range) -> new_string or nil
6102 * slice!(regexp, capture = 0) -> new_string or nil
6103 * slice!(substring) -> new_string or nil
6104 *
6105 * Removes and returns the substring of +self+ specified by the arguments.
6106 * See {String Slices}[rdoc-ref:String@String+Slices].
6107 *
6108 * A few examples:
6109 *
6110 * string = "This is a string"
6111 * string.slice!(2) #=> "i"
6112 * string.slice!(3..6) #=> " is "
6113 * string.slice!(/s.*t/) #=> "sa st"
6114 * string.slice!("r") #=> "r"
6115 * string #=> "Thing"
6116 *
6117 */
6118
6119static VALUE
6120rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6121{
6122 VALUE result = Qnil;
6123 VALUE indx;
6124 long beg, len = 1;
6125 char *p;
6126
6127 rb_check_arity(argc, 1, 2);
6128 str_modify_keep_cr(str);
6129 indx = argv[0];
6130 if (RB_TYPE_P(indx, T_REGEXP)) {
6131 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6132 VALUE match = rb_backref_get();
6133 struct re_registers *regs = RMATCH_REGS(match);
6134 int nth = 0;
6135 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6136 if ((nth += regs->num_regs) <= 0) return Qnil;
6137 }
6138 else if (nth >= regs->num_regs) return Qnil;
6139 beg = BEG(nth);
6140 len = END(nth) - beg;
6141 goto subseq;
6142 }
6143 else if (argc == 2) {
6144 beg = NUM2LONG(indx);
6145 len = NUM2LONG(argv[1]);
6146 goto num_index;
6147 }
6148 else if (FIXNUM_P(indx)) {
6149 beg = FIX2LONG(indx);
6150 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6151 if (!len) return Qnil;
6152 beg = p - RSTRING_PTR(str);
6153 goto subseq;
6154 }
6155 else if (RB_TYPE_P(indx, T_STRING)) {
6156 beg = rb_str_index(str, indx, 0);
6157 if (beg == -1) return Qnil;
6158 len = RSTRING_LEN(indx);
6159 result = str_duplicate(rb_cString, indx);
6160 goto squash;
6161 }
6162 else {
6163 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6164 case Qnil:
6165 return Qnil;
6166 case Qfalse:
6167 beg = NUM2LONG(indx);
6168 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6169 if (!len) return Qnil;
6170 beg = p - RSTRING_PTR(str);
6171 goto subseq;
6172 default:
6173 goto num_index;
6174 }
6175 }
6176
6177 num_index:
6178 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6179 beg = p - RSTRING_PTR(str);
6180
6181 subseq:
6182 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6183 rb_enc_cr_str_copy_for_substr(result, str);
6184
6185 squash:
6186 if (len > 0) {
6187 if (beg == 0) {
6188 rb_str_drop_bytes(str, len);
6189 }
6190 else {
6191 char *sptr = RSTRING_PTR(str);
6192 long slen = RSTRING_LEN(str);
6193 if (beg + len > slen) /* pathological check */
6194 len = slen - beg;
6195 memmove(sptr + beg,
6196 sptr + beg + len,
6197 slen - (beg + len));
6198 slen -= len;
6199 STR_SET_LEN(str, slen);
6200 TERM_FILL(&sptr[slen], TERM_LEN(str));
6201 }
6202 }
6203 return result;
6204}
6205
6206static VALUE
6207get_pat(VALUE pat)
6208{
6209 VALUE val;
6210
6211 switch (OBJ_BUILTIN_TYPE(pat)) {
6212 case T_REGEXP:
6213 return pat;
6214
6215 case T_STRING:
6216 break;
6217
6218 default:
6219 val = rb_check_string_type(pat);
6220 if (NIL_P(val)) {
6221 Check_Type(pat, T_REGEXP);
6222 }
6223 pat = val;
6224 }
6225
6226 return rb_reg_regcomp(pat);
6227}
6228
6229static VALUE
6230get_pat_quoted(VALUE pat, int check)
6231{
6232 VALUE val;
6233
6234 switch (OBJ_BUILTIN_TYPE(pat)) {
6235 case T_REGEXP:
6236 return pat;
6237
6238 case T_STRING:
6239 break;
6240
6241 default:
6242 val = rb_check_string_type(pat);
6243 if (NIL_P(val)) {
6244 Check_Type(pat, T_REGEXP);
6245 }
6246 pat = val;
6247 }
6248 if (check && is_broken_string(pat)) {
6249 rb_exc_raise(rb_reg_check_preprocess(pat));
6250 }
6251 return pat;
6252}
6253
6254static long
6255rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6256{
6257 if (BUILTIN_TYPE(pat) == T_STRING) {
6258 pos = rb_str_byteindex(str, pat, pos);
6259 if (set_backref_str) {
6260 if (pos >= 0) {
6261 str = rb_str_new_frozen_String(str);
6262 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6263 if (match) {
6264 *match = match_data;
6265 }
6266 }
6267 else {
6269 }
6270 }
6271 return pos;
6272 }
6273 else {
6274 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6275 }
6276}
6277
6278static long
6279rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6280{
6281 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6282}
6283
6284
6285/*
6286 * call-seq:
6287 * sub!(pattern, replacement) -> self or nil
6288 * sub!(pattern) {|match| ... } -> self or nil
6289 *
6290 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6291 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6292 *
6293 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6294 *
6295 * Related: String#sub, String#gsub, String#gsub!.
6296 *
6297 */
6298
6299static VALUE
6300rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6301{
6302 VALUE pat, repl, hash = Qnil;
6303 int iter = 0;
6304 long plen;
6305 int min_arity = rb_block_given_p() ? 1 : 2;
6306 long beg;
6307
6308 rb_check_arity(argc, min_arity, 2);
6309 if (argc == 1) {
6310 iter = 1;
6311 }
6312 else {
6313 repl = argv[1];
6314 hash = rb_check_hash_type(argv[1]);
6315 if (NIL_P(hash)) {
6316 StringValue(repl);
6317 }
6318 }
6319
6320 pat = get_pat_quoted(argv[0], 1);
6321
6322 str_modifiable(str);
6323 beg = rb_pat_search(pat, str, 0, 1);
6324 if (beg >= 0) {
6325 rb_encoding *enc;
6326 int cr = ENC_CODERANGE(str);
6327 long beg0, end0;
6328 VALUE match, match0 = Qnil;
6329 struct re_registers *regs;
6330 char *p, *rp;
6331 long len, rlen;
6332
6333 match = rb_backref_get();
6334 regs = RMATCH_REGS(match);
6335 if (RB_TYPE_P(pat, T_STRING)) {
6336 beg0 = beg;
6337 end0 = beg0 + RSTRING_LEN(pat);
6338 match0 = pat;
6339 }
6340 else {
6341 beg0 = BEG(0);
6342 end0 = END(0);
6343 if (iter) match0 = rb_reg_nth_match(0, match);
6344 }
6345
6346 if (iter || !NIL_P(hash)) {
6347 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6348
6349 if (iter) {
6350 repl = rb_obj_as_string(rb_yield(match0));
6351 }
6352 else {
6353 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6354 repl = rb_obj_as_string(repl);
6355 }
6356 str_mod_check(str, p, len);
6357 rb_check_frozen(str);
6358 }
6359 else {
6360 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6361 }
6362
6363 enc = rb_enc_compatible(str, repl);
6364 if (!enc) {
6365 rb_encoding *str_enc = STR_ENC_GET(str);
6366 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6367 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6368 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6369 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6370 rb_enc_inspect_name(str_enc),
6371 rb_enc_inspect_name(STR_ENC_GET(repl)));
6372 }
6373 enc = STR_ENC_GET(repl);
6374 }
6375 rb_str_modify(str);
6376 rb_enc_associate(str, enc);
6378 int cr2 = ENC_CODERANGE(repl);
6379 if (cr2 == ENC_CODERANGE_BROKEN ||
6380 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6382 else
6383 cr = cr2;
6384 }
6385 plen = end0 - beg0;
6386 rlen = RSTRING_LEN(repl);
6387 len = RSTRING_LEN(str);
6388 if (rlen > plen) {
6389 RESIZE_CAPA(str, len + rlen - plen);
6390 }
6391 p = RSTRING_PTR(str);
6392 if (rlen != plen) {
6393 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6394 }
6395 rp = RSTRING_PTR(repl);
6396 memmove(p + beg0, rp, rlen);
6397 len += rlen - plen;
6398 STR_SET_LEN(str, len);
6399 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6400 ENC_CODERANGE_SET(str, cr);
6401
6402 RB_GC_GUARD(match);
6403
6404 return str;
6405 }
6406 return Qnil;
6407}
6408
6409
6410/*
6411 * call-seq:
6412 * sub(pattern, replacement) -> new_string
6413 * sub(pattern) {|match| ... } -> new_string
6414 *
6415 * Returns a copy of +self+ with only the first occurrence
6416 * (not all occurrences) of the given +pattern+ replaced.
6417 *
6418 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6419 *
6420 * Related: String#sub!, String#gsub, String#gsub!.
6421 *
6422 */
6423
6424static VALUE
6425rb_str_sub(int argc, VALUE *argv, VALUE str)
6426{
6427 str = str_duplicate(rb_cString, str);
6428 rb_str_sub_bang(argc, argv, str);
6429 return str;
6430}
6431
6432static VALUE
6433str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6434{
6435 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6436 long beg, beg0, end0;
6437 long offset, blen, slen, len, last;
6438 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6439 char *sp, *cp;
6440 int need_backref_str = -1;
6441 rb_encoding *str_enc;
6442
6443 switch (argc) {
6444 case 1:
6445 RETURN_ENUMERATOR(str, argc, argv);
6446 mode = ITER;
6447 break;
6448 case 2:
6449 repl = argv[1];
6450 hash = rb_check_hash_type(argv[1]);
6451 if (NIL_P(hash)) {
6452 StringValue(repl);
6453 }
6454 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6455 mode = FAST_MAP;
6456 }
6457 else {
6458 mode = MAP;
6459 }
6460 break;
6461 default:
6462 rb_error_arity(argc, 1, 2);
6463 }
6464
6465 pat = get_pat_quoted(argv[0], 1);
6466 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6467
6468 if (beg < 0) {
6469 if (bang) return Qnil; /* no match, no substitution */
6470 return str_duplicate(rb_cString, str);
6471 }
6472
6473 offset = 0;
6474 blen = RSTRING_LEN(str) + 30; /* len + margin */
6475 dest = rb_str_buf_new(blen);
6476 sp = RSTRING_PTR(str);
6477 slen = RSTRING_LEN(str);
6478 cp = sp;
6479 str_enc = STR_ENC_GET(str);
6480 rb_enc_associate(dest, str_enc);
6481 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6482
6483 do {
6484 struct re_registers *regs = RMATCH_REGS(match);
6485 if (RB_TYPE_P(pat, T_STRING)) {
6486 beg0 = beg;
6487 end0 = beg0 + RSTRING_LEN(pat);
6488 match0 = pat;
6489 }
6490 else {
6491 beg0 = BEG(0);
6492 end0 = END(0);
6493 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6494 }
6495
6496 if (mode != STR) {
6497 if (mode == ITER) {
6498 val = rb_obj_as_string(rb_yield(match0));
6499 }
6500 else {
6501 struct RString fake_str;
6502 VALUE key;
6503 if (mode == FAST_MAP) {
6504 // It is safe to use a fake_str here because we established that it won't escape,
6505 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6506 // default proc.
6507 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6508 }
6509 else {
6510 key = rb_str_subseq(str, beg0, end0 - beg0);
6511 }
6512 val = rb_hash_aref(hash, key);
6513 val = rb_obj_as_string(val);
6514 }
6515 str_mod_check(str, sp, slen);
6516 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6517 rb_raise(rb_eRuntimeError, "block should not cheat");
6518 }
6519 }
6520 else if (need_backref_str) {
6521 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6522 if (need_backref_str < 0) {
6523 need_backref_str = val != repl;
6524 }
6525 }
6526 else {
6527 val = repl;
6528 }
6529
6530 len = beg0 - offset; /* copy pre-match substr */
6531 if (len) {
6532 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6533 }
6534
6535 rb_str_buf_append(dest, val);
6536
6537 last = offset;
6538 offset = end0;
6539 if (beg0 == end0) {
6540 /*
6541 * Always consume at least one character of the input string
6542 * in order to prevent infinite loops.
6543 */
6544 if (RSTRING_LEN(str) <= end0) break;
6545 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6546 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6547 offset = end0 + len;
6548 }
6549 cp = RSTRING_PTR(str) + offset;
6550 if (offset > RSTRING_LEN(str)) break;
6551
6552 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6553 if (mode != FAST_MAP && mode != STR) {
6554 match = Qnil;
6555 }
6556 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6557
6558 RB_GC_GUARD(match);
6559 } while (beg >= 0);
6560
6561 if (RSTRING_LEN(str) > offset) {
6562 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6563 }
6564 rb_pat_search0(pat, str, last, 1, &match);
6565 if (bang) {
6566 str_shared_replace(str, dest);
6567 }
6568 else {
6569 str = dest;
6570 }
6571
6572 return str;
6573}
6574
6575
6576/*
6577 * call-seq:
6578 * gsub!(pattern, replacement) -> self or nil
6579 * gsub!(pattern) {|match| ... } -> self or nil
6580 * gsub!(pattern) -> an_enumerator
6581 *
6582 * Like String#gsub, except that:
6583 *
6584 * - Performs substitutions in +self+ (not in a copy of +self+).
 * - Returns +self+ if any substitution is performed, +nil+ otherwise.
6586 *
6587 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6588 */
6589
6590static VALUE
6591rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6592{
6593 str_modify_keep_cr(str);
6594 return str_gsub(argc, argv, str, 1);
6595}
6596
6597
6598/*
6599 * call-seq:
6600 * gsub(pattern, replacement) -> new_string
6601 * gsub(pattern) {|match| ... } -> new_string
6602 * gsub(pattern) -> enumerator
6603 *
6604 * Returns a copy of +self+ with zero or more substrings replaced.
6605 *
6606 * Argument +pattern+ may be a string or a Regexp;
6607 * argument +replacement+ may be a string or a Hash.
6608 * Varying types for the argument values makes this method very versatile.
6609 *
6610 * Below are some simple examples;
6611 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6612 *
6613 * With arguments +pattern+ and string +replacement+ given,
6614 * replaces each matching substring with the given +replacement+ string:
6615 *
6616 * s = 'abracadabra'
6617 * s.gsub('ab', 'AB') # => "ABracadABra"
6618 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6619 *
6620 * With arguments +pattern+ and hash +replacement+ given,
6621 * replaces each matching substring with a value from the given +replacement+ hash,
6622 * or removes it:
6623 *
6624 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6625 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6626 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6627 *
6628 * With argument +pattern+ and a block given,
6629 * calls the block with each matching substring;
6630 * replaces that substring with the block's return value:
6631 *
6632 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6633 * # => "ABrACADABrA"
6634 *
6635 * With argument +pattern+ and no block given,
6636 * returns a new Enumerator.
6637 *
6638 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6639 */
6640
6641static VALUE
6642rb_str_gsub(int argc, VALUE *argv, VALUE str)
6643{
6644 return str_gsub(argc, argv, str, 0);
6645}
6646
6647
6648/*
6649 * call-seq:
6650 * replace(other_string) -> self
6651 *
6652 * Replaces the contents of +self+ with the contents of +other_string+:
6653 *
6654 * s = 'foo' # => "foo"
6655 * s.replace('bar') # => "bar"
6656 *
6657 */
6658
6659VALUE
6661{
6662 str_modifiable(str);
6663 if (str == str2) return str;
6664
6665 StringValue(str2);
6666 str_discard(str);
6667 return str_replace(str, str2);
6668}
6669
6670/*
6671 * call-seq:
6672 * clear -> self
6673 *
6674 * Removes the contents of +self+:
6675 *
6676 * s = 'foo'
6677 * s.clear # => ""
6678 * s # => ""
6679 *
6680 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6681 */
6682
6683static VALUE
6684rb_str_clear(VALUE str)
6685{
6686 str_discard(str);
6687 STR_SET_EMBED(str);
6688 STR_SET_LEN(str, 0);
6689 RSTRING_PTR(str)[0] = 0;
6690 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6692 else
6694 return str;
6695}
6696
6697/*
6698 * call-seq:
6699 * chr -> string
6700 *
6701 * :include: doc/string/chr.rdoc
6702 *
6703 */
6704
6705static VALUE
6706rb_str_chr(VALUE str)
6707{
6708 return rb_str_substr(str, 0, 1);
6709}
6710
6711/*
6712 * call-seq:
6713 * getbyte(index) -> integer or nil
6714 *
6715 * :include: doc/string/getbyte.rdoc
6716 *
6717 */
6718VALUE
6719rb_str_getbyte(VALUE str, VALUE index)
6720{
6721 long pos = NUM2LONG(index);
6722
6723 if (pos < 0)
6724 pos += RSTRING_LEN(str);
6725 if (pos < 0 || RSTRING_LEN(str) <= pos)
6726 return Qnil;
6727
6728 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6729}
6730
6731/*
6732 * call-seq:
6733 * setbyte(index, integer) -> integer
6734 *
6735 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6736 *
6737 * s = 'abcde' # => "abcde"
6738 * s.setbyte(0, 98) # => 98
6739 * s # => "bbcde"
6740 *
6741 * Related: String#getbyte.
6742 */
6743VALUE
6744rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6745{
6746 long pos = NUM2LONG(index);
6747 long len = RSTRING_LEN(str);
6748 char *ptr, *head, *left = 0;
6749 rb_encoding *enc;
6750 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6751
6752 if (pos < -len || len <= pos)
6753 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6754 if (pos < 0)
6755 pos += len;
6756
6757 VALUE v = rb_to_int(value);
6758 VALUE w = rb_int_and(v, INT2FIX(0xff));
6759 char byte = (char)(NUM2INT(w) & 0xFF);
6760
6761 if (!str_independent(str))
6762 str_make_independent(str);
6763 enc = STR_ENC_GET(str);
6764 head = RSTRING_PTR(str);
6765 ptr = &head[pos];
6766 if (!STR_EMBED_P(str)) {
6767 cr = ENC_CODERANGE(str);
6768 switch (cr) {
6769 case ENC_CODERANGE_7BIT:
6770 left = ptr;
6771 *ptr = byte;
6772 if (ISASCII(byte)) goto end;
6773 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6774 if (!MBCLEN_CHARFOUND_P(nlen))
6776 else
6778 goto end;
6780 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6781 width = rb_enc_precise_mbclen(left, head+len, enc);
6782 *ptr = byte;
6783 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6784 if (!MBCLEN_CHARFOUND_P(nlen))
6786 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6788 goto end;
6789 }
6790 }
6792 *ptr = byte;
6793
6794 end:
6795 return value;
6796}
6797
6798static VALUE
6799str_byte_substr(VALUE str, long beg, long len, int empty)
6800{
6801 long n = RSTRING_LEN(str);
6802
6803 if (beg > n || len < 0) return Qnil;
6804 if (beg < 0) {
6805 beg += n;
6806 if (beg < 0) return Qnil;
6807 }
6808 if (len > n - beg)
6809 len = n - beg;
6810 if (len <= 0) {
6811 if (!empty) return Qnil;
6812 len = 0;
6813 }
6814
6815 VALUE str2 = str_subseq(str, beg, len);
6816
6817 str_enc_copy_direct(str2, str);
6818
6819 if (RSTRING_LEN(str2) == 0) {
6820 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6822 else
6824 }
6825 else {
6826 switch (ENC_CODERANGE(str)) {
6827 case ENC_CODERANGE_7BIT:
6829 break;
6830 default:
6832 break;
6833 }
6834 }
6835
6836 return str2;
6837}
6838
6839VALUE
6840rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6841{
6842 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6843}
6844
6845static VALUE
6846str_byte_aref(VALUE str, VALUE indx)
6847{
6848 long idx;
6849 if (FIXNUM_P(indx)) {
6850 idx = FIX2LONG(indx);
6851 }
6852 else {
6853 /* check if indx is Range */
6854 long beg, len = RSTRING_LEN(str);
6855
6856 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6857 case Qfalse:
6858 break;
6859 case Qnil:
6860 return Qnil;
6861 default:
6862 return str_byte_substr(str, beg, len, TRUE);
6863 }
6864
6865 idx = NUM2LONG(indx);
6866 }
6867 return str_byte_substr(str, idx, 1, FALSE);
6868}
6869
6870/*
6871 * call-seq:
6872 * byteslice(offset, length = 1) -> string or nil
6873 * byteslice(range) -> string or nil
6874 *
6875 * :include: doc/string/byteslice.rdoc
6876 */
6877
6878static VALUE
6879rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6880{
6881 if (argc == 2) {
6882 long beg = NUM2LONG(argv[0]);
6883 long len = NUM2LONG(argv[1]);
6884 return str_byte_substr(str, beg, len, TRUE);
6885 }
6886 rb_check_arity(argc, 1, 2);
6887 return str_byte_aref(str, argv[0]);
6888}
6889
6890static void
6891str_check_beg_len(VALUE str, long *beg, long *len)
6892{
6893 long end, slen = RSTRING_LEN(str);
6894
6895 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6896 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6897 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6898 }
6899 if (*beg < 0) {
6900 *beg += slen;
6901 }
6902 RUBY_ASSERT(*beg >= 0);
6903 RUBY_ASSERT(*beg <= slen);
6904
6905 if (*len > slen - *beg) {
6906 *len = slen - *beg;
6907 }
6908 end = *beg + *len;
6909 str_ensure_byte_pos(str, *beg);
6910 str_ensure_byte_pos(str, end);
6911}
6912
6913/*
6914 * call-seq:
6915 * bytesplice(offset, length, str) -> self
6916 * bytesplice(offset, length, str, str_offset, str_length) -> self
6917 * bytesplice(range, str) -> self
6918 * bytesplice(range, str, str_range) -> self
6919 *
6920 * :include: doc/string/bytesplice.rdoc
6921 */
6922
6923static VALUE
6924rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6925{
6926 long beg, len, vbeg, vlen;
6927 VALUE val;
6928 int cr;
6929
6930 rb_check_arity(argc, 2, 5);
6931 if (!(argc == 2 || argc == 3 || argc == 5)) {
6932 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6933 }
6934 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6935 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6936 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6937 rb_builtin_class_name(argv[0]));
6938 }
6939 val = argv[1];
6940 StringValue(val);
6941 if (argc == 2) {
6942 /* bytesplice(range, str) */
6943 vbeg = 0;
6944 vlen = RSTRING_LEN(val);
6945 }
6946 else {
6947 /* bytesplice(range, str, str_range) */
6948 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6949 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6950 rb_builtin_class_name(argv[2]));
6951 }
6952 }
6953 }
6954 else {
6955 beg = NUM2LONG(argv[0]);
6956 len = NUM2LONG(argv[1]);
6957 val = argv[2];
6958 StringValue(val);
6959 if (argc == 3) {
6960 /* bytesplice(index, length, str) */
6961 vbeg = 0;
6962 vlen = RSTRING_LEN(val);
6963 }
6964 else {
6965 /* bytesplice(index, length, str, str_index, str_length) */
6966 vbeg = NUM2LONG(argv[3]);
6967 vlen = NUM2LONG(argv[4]);
6968 }
6969 }
6970 str_check_beg_len(str, &beg, &len);
6971 str_check_beg_len(val, &vbeg, &vlen);
6972 str_modify_keep_cr(str);
6973
6974 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6975 rb_enc_associate(str, rb_enc_check(str, val));
6976 }
6977
6978 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6980 if (cr != ENC_CODERANGE_BROKEN)
6981 ENC_CODERANGE_SET(str, cr);
6982 return str;
6983}
6984
6985/*
6986 * call-seq:
6987 * reverse -> string
6988 *
6989 * Returns a new string with the characters from +self+ in reverse order.
6990 *
6991 * 'stressed'.reverse # => "desserts"
6992 *
6993 */
6994
6995static VALUE
6996rb_str_reverse(VALUE str)
6997{
6998 rb_encoding *enc;
6999 VALUE rev;
7000 char *s, *e, *p;
7001 int cr;
7002
7003 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7004 enc = STR_ENC_GET(str);
7005 rev = rb_str_new(0, RSTRING_LEN(str));
7006 s = RSTRING_PTR(str); e = RSTRING_END(str);
7007 p = RSTRING_END(rev);
7008 cr = ENC_CODERANGE(str);
7009
7010 if (RSTRING_LEN(str) > 1) {
7011 if (single_byte_optimizable(str)) {
7012 while (s < e) {
7013 *--p = *s++;
7014 }
7015 }
7016 else if (cr == ENC_CODERANGE_VALID) {
7017 while (s < e) {
7018 int clen = rb_enc_fast_mbclen(s, e, enc);
7019
7020 p -= clen;
7021 memcpy(p, s, clen);
7022 s += clen;
7023 }
7024 }
7025 else {
7026 cr = rb_enc_asciicompat(enc) ?
7028 while (s < e) {
7029 int clen = rb_enc_mbclen(s, e, enc);
7030
7031 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7032 p -= clen;
7033 memcpy(p, s, clen);
7034 s += clen;
7035 }
7036 }
7037 }
7038 STR_SET_LEN(rev, RSTRING_LEN(str));
7039 str_enc_copy_direct(rev, str);
7040 ENC_CODERANGE_SET(rev, cr);
7041
7042 return rev;
7043}
7044
7045
7046/*
7047 * call-seq:
7048 * reverse! -> self
7049 *
7050 * Returns +self+ with its characters reversed:
7051 *
7052 * s = 'stressed'
7053 * s.reverse! # => "desserts"
7054 * s # => "desserts"
7055 *
7056 */
7057
7058static VALUE
7059rb_str_reverse_bang(VALUE str)
7060{
7061 if (RSTRING_LEN(str) > 1) {
7062 if (single_byte_optimizable(str)) {
7063 char *s, *e, c;
7064
7065 str_modify_keep_cr(str);
7066 s = RSTRING_PTR(str);
7067 e = RSTRING_END(str) - 1;
7068 while (s < e) {
7069 c = *s;
7070 *s++ = *e;
7071 *e-- = c;
7072 }
7073 }
7074 else {
7075 str_shared_replace(str, rb_str_reverse(str));
7076 }
7077 }
7078 else {
7079 str_modify_keep_cr(str);
7080 }
7081 return str;
7082}
7083
7084
7085/*
7086 * call-seq:
7087 * include?(other_string) -> true or false
7088 *
7089 * Returns whether +self+ contains +other_string+:
7090 *
7091 * s = 'bar'
7092 * s.include?('ba') # => true
7093 * s.include?('ar') # => true
7094 * s.include?('bar') # => true
7095 * s.include?('a') # => true
7096 * s.include?('') # => true
7097 * s.include?('foo') # => false
7098 *
7099 * Related: see {Querying}[rdoc-ref:String@Querying].
7100 */
7101
7102VALUE
7103rb_str_include(VALUE str, VALUE arg)
7104{
7105 long i;
7106
7107 StringValue(arg);
7108 i = rb_str_index(str, arg, 0);
7109
7110 return RBOOL(i != -1);
7111}
7112
7113
7114/*
7115 * call-seq:
7116 * to_i(base = 10) -> integer
7117 *
7118 * Returns the result of interpreting leading characters in +self+
7119 * as an integer in the given +base+ (which must be in (0, 2..36)):
7120 *
7121 * '123456'.to_i # => 123456
7122 * '123def'.to_i(16) # => 1195503
7123 *
 * With +base+ zero, +self+ may contain leading characters
7125 * to specify the actual base:
7126 *
7127 * '123def'.to_i(0) # => 123
7128 * '0123def'.to_i(0) # => 83
7129 * '0b123def'.to_i(0) # => 1
7130 * '0o123def'.to_i(0) # => 83
7131 * '0d123def'.to_i(0) # => 123
7132 * '0x123def'.to_i(0) # => 1195503
7133 *
7134 * Characters past a leading valid number (in the given +base+) are ignored:
7135 *
7136 * '12.345'.to_i # => 12
7137 * '12345'.to_i(2) # => 1
7138 *
7139 * Returns zero if there is no leading valid number:
7140 *
7141 * 'abcdef'.to_i # => 0
7142 * '2'.to_i(2) # => 0
7143 *
7144 */
7145
7146static VALUE
7147rb_str_to_i(int argc, VALUE *argv, VALUE str)
7148{
7149 int base = 10;
7150
7151 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7152 rb_raise(rb_eArgError, "invalid radix %d", base);
7153 }
7154 return rb_str_to_inum(str, base, FALSE);
7155}
7156
7157
7158/*
7159 * call-seq:
7160 * to_f -> float
7161 *
7162 * Returns the result of interpreting leading characters in +self+ as a Float:
7163 *
7164 * '3.14159'.to_f # => 3.14159
7165 * '1.234e-2'.to_f # => 0.01234
7166 *
 * Characters past a leading valid number are ignored:
7168 *
7169 * '3.14 (pi to two places)'.to_f # => 3.14
7170 *
7171 * Returns zero if there is no leading valid number:
7172 *
7173 * 'abcdef'.to_f # => 0.0
7174 *
7175 */
7176
7177static VALUE
7178rb_str_to_f(VALUE str)
7179{
7180 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7181}
7182
7183
7184/*
7185 * call-seq:
7186 * to_s -> self or string
7187 *
7188 * Returns +self+ if +self+ is a +String+,
 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7190 */
7191
7192static VALUE
7193rb_str_to_s(VALUE str)
7194{
7195 if (rb_obj_class(str) != rb_cString) {
7196 return str_duplicate(rb_cString, str);
7197 }
7198 return str;
7199}
7200
#if 0
/* Compiled out: appends code point +c+, encoded in +enc+, to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7212
7213#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7214
7215int
7216rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7217{
7218 char buf[CHAR_ESC_LEN + 1];
7219 int l;
7220
7221#if SIZEOF_INT > 4
7222 c &= 0xffffffff;
7223#endif
7224 if (unicode_p) {
7225 if (c < 0x7F && ISPRINT(c)) {
7226 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7227 }
7228 else if (c < 0x10000) {
7229 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7230 }
7231 else {
7232 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7233 }
7234 }
7235 else {
7236 if (c < 0x100) {
7237 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7238 }
7239 else {
7240 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7241 }
7242 }
7243 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7244 rb_str_buf_cat(result, buf, l);
7245 return l;
7246}
7247
/*
 * Returns the backslash escape used for character code +c+, or NULL when
 * +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    /* escapes for the codes up to ESC (033), indexed by character code */
    static const char *const low_escapes[] = {
        ['\0']   = "\\0",
        ['\007'] = "\\a",
        ['\010'] = "\\b",
        ['\t']   = "\\t",
        ['\n']   = "\\n",
        ['\013'] = "\\v",
        ['\f']   = "\\f",
        ['\r']   = "\\r",
        ['\033'] = "\\e",
    };
    const int table_len = (int)(sizeof(low_escapes) / sizeof(low_escapes[0]));

    if (c == '\x7f') {
        return "\\c?";
    }
    if (c < 0 || c >= table_len) {
        return NULL;
    }
    /* slots without an initializer are NULL, i.e. "no escape" */
    return low_escapes[c];
}
7265
7266VALUE
7267rb_str_escape(VALUE str)
7268{
7269 int encidx = ENCODING_GET(str);
7270 rb_encoding *enc = rb_enc_from_index(encidx);
7271 const char *p = RSTRING_PTR(str);
7272 const char *pend = RSTRING_END(str);
7273 const char *prev = p;
7274 char buf[CHAR_ESC_LEN + 1];
7275 VALUE result = rb_str_buf_new(0);
7276 int unicode_p = rb_enc_unicode_p(enc);
7277 int asciicompat = rb_enc_asciicompat(enc);
7278
7279 while (p < pend) {
7280 unsigned int c;
7281 const char *cc;
7282 int n = rb_enc_precise_mbclen(p, pend, enc);
7283 if (!MBCLEN_CHARFOUND_P(n)) {
7284 if (p > prev) str_buf_cat(result, prev, p - prev);
7285 n = rb_enc_mbminlen(enc);
7286 if (pend < p + n)
7287 n = (int)(pend - p);
7288 while (n--) {
7289 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7290 str_buf_cat(result, buf, strlen(buf));
7291 prev = ++p;
7292 }
7293 continue;
7294 }
7295 n = MBCLEN_CHARFOUND_LEN(n);
7296 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7297 p += n;
7298 cc = ruby_escaped_char(c);
7299 if (cc) {
7300 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7301 str_buf_cat(result, cc, strlen(cc));
7302 prev = p;
7303 }
7304 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7305 }
7306 else {
7307 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7308 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7309 prev = p;
7310 }
7311 }
7312 if (p > prev) str_buf_cat(result, prev, p - prev);
7313 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7314
7315 return result;
7316}
7317
7318/*
7319 * call-seq:
7320 * inspect -> string
7321 *
7322 * :include: doc/string/inspect.rdoc
7323 *
7324 */
7325
7326VALUE
7328{
7329 int encidx = ENCODING_GET(str);
7330 rb_encoding *enc = rb_enc_from_index(encidx);
7331 const char *p, *pend, *prev;
7332 char buf[CHAR_ESC_LEN + 1];
7333 VALUE result = rb_str_buf_new(0);
7334 rb_encoding *resenc = rb_default_internal_encoding();
7335 int unicode_p = rb_enc_unicode_p(enc);
7336 int asciicompat = rb_enc_asciicompat(enc);
7337
7338 if (resenc == NULL) resenc = rb_default_external_encoding();
7339 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7340 rb_enc_associate(result, resenc);
7341 str_buf_cat2(result, "\"");
7342
7343 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7344 prev = p;
7345 while (p < pend) {
7346 unsigned int c, cc;
7347 int n;
7348
7349 n = rb_enc_precise_mbclen(p, pend, enc);
7350 if (!MBCLEN_CHARFOUND_P(n)) {
7351 if (p > prev) str_buf_cat(result, prev, p - prev);
7352 n = rb_enc_mbminlen(enc);
7353 if (pend < p + n)
7354 n = (int)(pend - p);
7355 while (n--) {
7356 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7357 str_buf_cat(result, buf, strlen(buf));
7358 prev = ++p;
7359 }
7360 continue;
7361 }
7362 n = MBCLEN_CHARFOUND_LEN(n);
7363 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7364 p += n;
7365 if ((asciicompat || unicode_p) &&
7366 (c == '"'|| c == '\\' ||
7367 (c == '#' &&
7368 p < pend &&
7369 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7370 (cc = rb_enc_codepoint(p,pend,enc),
7371 (cc == '$' || cc == '@' || cc == '{'))))) {
7372 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7373 str_buf_cat2(result, "\\");
7374 if (asciicompat || enc == resenc) {
7375 prev = p - n;
7376 continue;
7377 }
7378 }
7379 switch (c) {
7380 case '\n': cc = 'n'; break;
7381 case '\r': cc = 'r'; break;
7382 case '\t': cc = 't'; break;
7383 case '\f': cc = 'f'; break;
7384 case '\013': cc = 'v'; break;
7385 case '\010': cc = 'b'; break;
7386 case '\007': cc = 'a'; break;
7387 case 033: cc = 'e'; break;
7388 default: cc = 0; break;
7389 }
7390 if (cc) {
7391 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7392 buf[0] = '\\';
7393 buf[1] = (char)cc;
7394 str_buf_cat(result, buf, 2);
7395 prev = p;
7396 continue;
7397 }
7398 /* The special casing of 0x85 (NEXT_LINE) here is because
7399 * Oniguruma historically treats it as printable, but it
7400 * doesn't match the print POSIX bracket class or character
7401 * property in regexps.
7402 *
7403 * See Ruby Bug #16842 for details:
7404 * https://bugs.ruby-lang.org/issues/16842
7405 */
7406 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7407 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7408 continue;
7409 }
7410 else {
7411 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7412 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7413 prev = p;
7414 continue;
7415 }
7416 }
7417 if (p > prev) str_buf_cat(result, prev, p - prev);
7418 str_buf_cat2(result, "\"");
7419
7420 return result;
7421}
7422
7423#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7424
7425/*
7426 * call-seq:
7427 * dump -> new_string
7428 *
7429 * :include: doc/string/dump.rdoc
7430 *
7431 */
7432
7433VALUE
7435{
7436 int encidx = rb_enc_get_index(str);
7437 rb_encoding *enc = rb_enc_from_index(encidx);
7438 long len;
7439 const char *p, *pend;
7440 char *q, *qend;
7441 VALUE result;
7442 int u8 = (encidx == rb_utf8_encindex());
7443 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7444
7445 len = 2; /* "" */
7446 if (!rb_enc_asciicompat(enc)) {
7447 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7448 len += strlen(enc->name);
7449 }
7450
7451 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7452 while (p < pend) {
7453 int clen;
7454 unsigned char c = *p++;
7455
7456 switch (c) {
7457 case '"': case '\\':
7458 case '\n': case '\r':
7459 case '\t': case '\f':
7460 case '\013': case '\010': case '\007': case '\033':
7461 clen = 2;
7462 break;
7463
7464 case '#':
7465 clen = IS_EVSTR(p, pend) ? 2 : 1;
7466 break;
7467
7468 default:
7469 if (ISPRINT(c)) {
7470 clen = 1;
7471 }
7472 else {
7473 if (u8 && c > 0x7F) { /* \u notation */
7474 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7475 if (MBCLEN_CHARFOUND_P(n)) {
7476 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7477 if (cc <= 0xFFFF)
7478 clen = 6; /* \uXXXX */
7479 else if (cc <= 0xFFFFF)
7480 clen = 9; /* \u{XXXXX} */
7481 else
7482 clen = 10; /* \u{XXXXXX} */
7483 p += MBCLEN_CHARFOUND_LEN(n)-1;
7484 break;
7485 }
7486 }
7487 clen = 4; /* \xNN */
7488 }
7489 break;
7490 }
7491
7492 if (clen > LONG_MAX - len) {
7493 rb_raise(rb_eRuntimeError, "string size too big");
7494 }
7495 len += clen;
7496 }
7497
7498 result = rb_str_new(0, len);
7499 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7500 q = RSTRING_PTR(result); qend = q + len + 1;
7501
7502 *q++ = '"';
7503 while (p < pend) {
7504 unsigned char c = *p++;
7505
7506 if (c == '"' || c == '\\') {
7507 *q++ = '\\';
7508 *q++ = c;
7509 }
7510 else if (c == '#') {
7511 if (IS_EVSTR(p, pend)) *q++ = '\\';
7512 *q++ = '#';
7513 }
7514 else if (c == '\n') {
7515 *q++ = '\\';
7516 *q++ = 'n';
7517 }
7518 else if (c == '\r') {
7519 *q++ = '\\';
7520 *q++ = 'r';
7521 }
7522 else if (c == '\t') {
7523 *q++ = '\\';
7524 *q++ = 't';
7525 }
7526 else if (c == '\f') {
7527 *q++ = '\\';
7528 *q++ = 'f';
7529 }
7530 else if (c == '\013') {
7531 *q++ = '\\';
7532 *q++ = 'v';
7533 }
7534 else if (c == '\010') {
7535 *q++ = '\\';
7536 *q++ = 'b';
7537 }
7538 else if (c == '\007') {
7539 *q++ = '\\';
7540 *q++ = 'a';
7541 }
7542 else if (c == '\033') {
7543 *q++ = '\\';
7544 *q++ = 'e';
7545 }
7546 else if (ISPRINT(c)) {
7547 *q++ = c;
7548 }
7549 else {
7550 *q++ = '\\';
7551 if (u8) {
7552 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7553 if (MBCLEN_CHARFOUND_P(n)) {
7554 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7555 p += n;
7556 if (cc <= 0xFFFF)
7557 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7558 else
7559 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7560 q += strlen(q);
7561 continue;
7562 }
7563 }
7564 snprintf(q, qend-q, "x%02X", c);
7565 q += 3;
7566 }
7567 }
7568 *q++ = '"';
7569 *q = '\0';
7570 if (!rb_enc_asciicompat(enc)) {
7571 snprintf(q, qend-q, nonascii_suffix, enc->name);
7572 encidx = rb_ascii8bit_encindex();
7573 }
7574 /* result from dump is ASCII */
7575 rb_enc_associate_index(result, encidx);
7577 return result;
7578}
7579
/*
 * Maps the letter following a backslash in a dumped string to the
 * control character it stands for ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns -1 for any other letter, so that control never runs off the
 * end of this non-void function.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* not a known single-letter escape */
    return -1;
}
7603
7604static void
7605undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7606{
7607 const char *s = *ss;
7608 unsigned int c;
7609 int codelen;
7610 size_t hexlen;
7611 unsigned char buf[6];
7612 static rb_encoding *enc_utf8 = NULL;
7613
7614 switch (*s) {
7615 case '\\':
7616 case '"':
7617 case '#':
7618 rb_str_cat(undumped, s, 1); /* cat itself */
7619 s++;
7620 break;
7621 case 'n':
7622 case 'r':
7623 case 't':
7624 case 'f':
7625 case 'v':
7626 case 'b':
7627 case 'a':
7628 case 'e':
7629 *buf = unescape_ascii(*s);
7630 rb_str_cat(undumped, (char *)buf, 1);
7631 s++;
7632 break;
7633 case 'u':
7634 if (*binary) {
7635 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7636 }
7637 *utf8 = true;
7638 if (++s >= s_end) {
7639 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7640 }
7641 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7642 if (*penc != enc_utf8) {
7643 *penc = enc_utf8;
7644 rb_enc_associate(undumped, enc_utf8);
7645 }
7646 if (*s == '{') { /* handle \u{...} form */
7647 s++;
7648 for (;;) {
7649 if (s >= s_end) {
7650 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7651 }
7652 if (*s == '}') {
7653 s++;
7654 break;
7655 }
7656 if (ISSPACE(*s)) {
7657 s++;
7658 continue;
7659 }
7660 c = scan_hex(s, s_end-s, &hexlen);
7661 if (hexlen == 0 || hexlen > 6) {
7662 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7663 }
7664 if (c > 0x10ffff) {
7665 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7666 }
7667 if (0xd800 <= c && c <= 0xdfff) {
7668 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7669 }
7670 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7671 rb_str_cat(undumped, (char *)buf, codelen);
7672 s += hexlen;
7673 }
7674 }
7675 else { /* handle \uXXXX form */
7676 c = scan_hex(s, 4, &hexlen);
7677 if (hexlen != 4) {
7678 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7679 }
7680 if (0xd800 <= c && c <= 0xdfff) {
7681 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7682 }
7683 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7684 rb_str_cat(undumped, (char *)buf, codelen);
7685 s += hexlen;
7686 }
7687 break;
7688 case 'x':
7689 if (*utf8) {
7690 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7691 }
7692 *binary = true;
7693 if (++s >= s_end) {
7694 rb_raise(rb_eRuntimeError, "invalid hex escape");
7695 }
7696 *buf = scan_hex(s, 2, &hexlen);
7697 if (hexlen != 2) {
7698 rb_raise(rb_eRuntimeError, "invalid hex escape");
7699 }
7700 rb_str_cat(undumped, (char *)buf, 1);
7701 s += hexlen;
7702 break;
7703 default:
7704 rb_str_cat(undumped, s-1, 2);
7705 s++;
7706 }
7707
7708 *ss = s;
7709}
7710
7711static VALUE rb_str_is_ascii_only_p(VALUE str);
7712
7713/*
7714 * call-seq:
7715 * undump -> string
7716 *
7717 * Returns an unescaped version of +self+:
7718 *
7719 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7720 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7721 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7722 * s_undumped == s_orig # => true
7723 *
7724 * Related: String#dump (inverse of String#undump).
7725 *
7726 */
7727
7728static VALUE
7729str_undump(VALUE str)
7730{
7731 const char *s = RSTRING_PTR(str);
7732 const char *s_end = RSTRING_END(str);
7733 rb_encoding *enc = rb_enc_get(str);
7734 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7735 bool utf8 = false;
7736 bool binary = false;
7737 int w;
7738
7740 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7741 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7742 }
7743 if (!str_null_check(str, &w)) {
7744 rb_raise(rb_eRuntimeError, "string contains null byte");
7745 }
7746 if (RSTRING_LEN(str) < 2) goto invalid_format;
7747 if (*s != '"') goto invalid_format;
7748
7749 /* strip '"' at the start */
7750 s++;
7751
7752 for (;;) {
7753 if (s >= s_end) {
7754 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7755 }
7756
7757 if (*s == '"') {
7758 /* epilogue */
7759 s++;
7760 if (s == s_end) {
7761 /* ascii compatible dumped string */
7762 break;
7763 }
7764 else {
7765 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7766 static const char dup_suffix[] = ".dup";
7767 const char *encname;
7768 int encidx;
7769 ptrdiff_t size;
7770
7771 /* check separately for strings dumped by older versions */
7772 size = sizeof(dup_suffix) - 1;
7773 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7774
7775 size = sizeof(force_encoding_suffix) - 1;
7776 if (s_end - s <= size) goto invalid_format;
7777 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7778 s += size;
7779
7780 if (utf8) {
7781 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7782 }
7783
7784 encname = s;
7785 s = memchr(s, '"', s_end-s);
7786 size = s - encname;
7787 if (!s) goto invalid_format;
7788 if (s_end - s != 2) goto invalid_format;
7789 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7790
7791 encidx = rb_enc_find_index2(encname, (long)size);
7792 if (encidx < 0) {
7793 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7794 }
7795 rb_enc_associate_index(undumped, encidx);
7796 }
7797 break;
7798 }
7799
7800 if (*s == '\\') {
7801 s++;
7802 if (s >= s_end) {
7803 rb_raise(rb_eRuntimeError, "invalid escape");
7804 }
7805 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7806 }
7807 else {
7808 rb_str_cat(undumped, s++, 1);
7809 }
7810 }
7811
7812 RB_GC_GUARD(str);
7813
7814 return undumped;
7815invalid_format:
7816 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7817}
7818
7819static void
7820rb_str_check_dummy_enc(rb_encoding *enc)
7821{
7822 if (rb_enc_dummy_p(enc)) {
7823 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7824 rb_enc_name(enc));
7825 }
7826}
7827
7828static rb_encoding *
7829str_true_enc(VALUE str)
7830{
7831 rb_encoding *enc = STR_ENC_GET(str);
7832 rb_str_check_dummy_enc(enc);
7833 return enc;
7834}
7835
7836static OnigCaseFoldType
7837check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7838{
7839 if (argc==0)
7840 return flags;
7841 if (argc>2)
7842 rb_raise(rb_eArgError, "too many options");
7843 if (argv[0]==sym_turkic) {
7844 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7845 if (argc==2) {
7846 if (argv[1]==sym_lithuanian)
7847 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7848 else
7849 rb_raise(rb_eArgError, "invalid second option");
7850 }
7851 }
7852 else if (argv[0]==sym_lithuanian) {
7853 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7854 if (argc==2) {
7855 if (argv[1]==sym_turkic)
7856 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7857 else
7858 rb_raise(rb_eArgError, "invalid second option");
7859 }
7860 }
7861 else if (argc>1)
7862 rb_raise(rb_eArgError, "too many options");
7863 else if (argv[0]==sym_ascii)
7864 flags |= ONIGENC_CASE_ASCII_ONLY;
7865 else if (argv[0]==sym_fold) {
7866 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7867 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7868 else
7869 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7870 }
7871 else
7872 rb_raise(rb_eArgError, "invalid option");
7873 return flags;
7874}
7875
7876static inline bool
7877case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7878{
7879 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7880 return true;
7881 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7882}
7883
/* 20 extra bytes should be long enough to absorb any kind of single character length increase */
7885#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7886#ifndef CASEMAP_DEBUG
7887# define CASEMAP_DEBUG 0
7888#endif
7889
7890struct mapping_buffer;
7891typedef struct mapping_buffer {
7892 size_t capa;
7893 size_t used;
7894 struct mapping_buffer *next;
7895 OnigUChar space[FLEX_ARY_LEN];
7897
7898static void
7899mapping_buffer_free(void *p)
7900{
7901 mapping_buffer *previous_buffer;
7902 mapping_buffer *current_buffer = p;
7903 while (current_buffer) {
7904 previous_buffer = current_buffer;
7905 current_buffer = current_buffer->next;
7906 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7907 }
7908}
7909
7910static const rb_data_type_t mapping_buffer_type = {
7911 "mapping_buffer",
7912 {0, mapping_buffer_free,},
7913 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7914};
7915
7916static VALUE
7917rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7918{
7919 VALUE target;
7920
7921 const OnigUChar *source_current, *source_end;
7922 int target_length = 0;
7923 VALUE buffer_anchor;
7924 mapping_buffer *current_buffer = 0;
7925 mapping_buffer **pre_buffer;
7926 size_t buffer_count = 0;
7927 int buffer_length_or_invalid;
7928
7929 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7930
7931 source_current = (OnigUChar*)RSTRING_PTR(source);
7932 source_end = (OnigUChar*)RSTRING_END(source);
7933
7934 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7935 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7936 while (source_current < source_end) {
7937 /* increase multiplier using buffer count to converge quickly */
7938 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7939 if (CASEMAP_DEBUG) {
7940 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7941 }
7942 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7943 *pre_buffer = current_buffer;
7944 pre_buffer = &current_buffer->next;
7945 current_buffer->next = NULL;
7946 current_buffer->capa = capa;
7947 buffer_length_or_invalid = enc->case_map(flags,
7948 &source_current, source_end,
7949 current_buffer->space,
7950 current_buffer->space+current_buffer->capa,
7951 enc);
7952 if (buffer_length_or_invalid < 0) {
7953 current_buffer = DATA_PTR(buffer_anchor);
7954 DATA_PTR(buffer_anchor) = 0;
7955 mapping_buffer_free(current_buffer);
7956 rb_raise(rb_eArgError, "input string invalid");
7957 }
7958 target_length += current_buffer->used = buffer_length_or_invalid;
7959 }
7960 if (CASEMAP_DEBUG) {
7961 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7962 }
7963
7964 if (buffer_count==1) {
7965 target = rb_str_new((const char*)current_buffer->space, target_length);
7966 }
7967 else {
7968 char *target_current;
7969
7970 target = rb_str_new(0, target_length);
7971 target_current = RSTRING_PTR(target);
7972 current_buffer = DATA_PTR(buffer_anchor);
7973 while (current_buffer) {
7974 memcpy(target_current, current_buffer->space, current_buffer->used);
7975 target_current += current_buffer->used;
7976 current_buffer = current_buffer->next;
7977 }
7978 }
7979 current_buffer = DATA_PTR(buffer_anchor);
7980 DATA_PTR(buffer_anchor) = 0;
7981 mapping_buffer_free(current_buffer);
7982
7983 RB_GC_GUARD(buffer_anchor);
7984
7985 /* TODO: check about string terminator character */
7986 str_enc_copy_direct(target, source);
7987 /*ENC_CODERANGE_SET(mapped, cr);*/
7988
7989 return target;
7990}
7991
7992static VALUE
7993rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7994{
7995 const OnigUChar *source_current, *source_end;
7996 OnigUChar *target_current, *target_end;
7997 long old_length = RSTRING_LEN(source);
7998 int length_or_invalid;
7999
8000 if (old_length == 0) return Qnil;
8001
8002 source_current = (OnigUChar*)RSTRING_PTR(source);
8003 source_end = (OnigUChar*)RSTRING_END(source);
8004 if (source == target) {
8005 target_current = (OnigUChar*)source_current;
8006 target_end = (OnigUChar*)source_end;
8007 }
8008 else {
8009 target_current = (OnigUChar*)RSTRING_PTR(target);
8010 target_end = (OnigUChar*)RSTRING_END(target);
8011 }
8012
8013 length_or_invalid = onigenc_ascii_only_case_map(flags,
8014 &source_current, source_end,
8015 target_current, target_end, enc);
8016 if (length_or_invalid < 0)
8017 rb_raise(rb_eArgError, "input string invalid");
8018 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8019 fprintf(stderr, "problem with rb_str_ascii_casemap"
8020 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8021 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8022 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8023 }
8024
8025 str_enc_copy(target, source);
8026
8027 return target;
8028}
8029
8030static bool
8031upcase_single(VALUE str)
8032{
8033 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8034 bool modified = false;
8035
8036 while (s < send) {
8037 unsigned int c = *(unsigned char*)s;
8038
8039 if ('a' <= c && c <= 'z') {
8040 *s = 'A' + (c - 'a');
8041 modified = true;
8042 }
8043 s++;
8044 }
8045 return modified;
8046}
8047
8048/*
8049 * call-seq:
8050 * upcase!(mapping) -> self or nil
8051 *
8052 * Upcases the characters in +self+;
8053 * returns +self+ if any changes were made, +nil+ otherwise:
8054 *
8055 * s = 'Hello World!' # => "Hello World!"
8056 * s.upcase! # => "HELLO WORLD!"
8057 * s # => "HELLO WORLD!"
8058 * s.upcase! # => nil
8059 *
8060 * The casing may be affected by the given +mapping+;
8061 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8062 *
8063 * Related: String#upcase, String#downcase, String#downcase!.
8064 *
8065 */
8066
8067static VALUE
8068rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8069{
8070 rb_encoding *enc;
8071 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8072
8073 flags = check_case_options(argc, argv, flags);
8074 str_modify_keep_cr(str);
8075 enc = str_true_enc(str);
8076 if (case_option_single_p(flags, enc, str)) {
8077 if (upcase_single(str))
8078 flags |= ONIGENC_CASE_MODIFIED;
8079 }
8080 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8081 rb_str_ascii_casemap(str, str, &flags, enc);
8082 else
8083 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8084
8085 if (ONIGENC_CASE_MODIFIED&flags) return str;
8086 return Qnil;
8087}
8088
8089
8090/*
8091 * call-seq:
8092 * upcase(mapping) -> string
8093 *
8094 * Returns a string containing the upcased characters in +self+:
8095 *
8096 * s = 'Hello World!' # => "Hello World!"
8097 * s.upcase # => "HELLO WORLD!"
8098 *
8099 * The casing may be affected by the given +mapping+;
8100 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8101 *
8102 * Related: String#upcase!, String#downcase, String#downcase!.
8103 *
8104 */
8105
8106static VALUE
8107rb_str_upcase(int argc, VALUE *argv, VALUE str)
8108{
8109 rb_encoding *enc;
8110 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8111 VALUE ret;
8112
8113 flags = check_case_options(argc, argv, flags);
8114 enc = str_true_enc(str);
8115 if (case_option_single_p(flags, enc, str)) {
8116 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8117 str_enc_copy_direct(ret, str);
8118 upcase_single(ret);
8119 }
8120 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8121 ret = rb_str_new(0, RSTRING_LEN(str));
8122 rb_str_ascii_casemap(str, ret, &flags, enc);
8123 }
8124 else {
8125 ret = rb_str_casemap(str, &flags, enc);
8126 }
8127
8128 return ret;
8129}
8130
8131static bool
8132downcase_single(VALUE str)
8133{
8134 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8135 bool modified = false;
8136
8137 while (s < send) {
8138 unsigned int c = *(unsigned char*)s;
8139
8140 if ('A' <= c && c <= 'Z') {
8141 *s = 'a' + (c - 'A');
8142 modified = true;
8143 }
8144 s++;
8145 }
8146
8147 return modified;
8148}
8149
8150/*
8151 * call-seq:
8152 * downcase!(mapping) -> self or nil
8153 *
8154 * Like String#downcase, except that:
8155 *
8156 * - Changes character casings in +self+ (not in a copy of +self+).
8157 * - Returns +self+ if any changes are made, +nil+ otherwise.
8158 *
8159 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8160 */
8161
8162static VALUE
8163rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8164{
8165 rb_encoding *enc;
8166 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8167
8168 flags = check_case_options(argc, argv, flags);
8169 str_modify_keep_cr(str);
8170 enc = str_true_enc(str);
8171 if (case_option_single_p(flags, enc, str)) {
8172 if (downcase_single(str))
8173 flags |= ONIGENC_CASE_MODIFIED;
8174 }
8175 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8176 rb_str_ascii_casemap(str, str, &flags, enc);
8177 else
8178 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8179
8180 if (ONIGENC_CASE_MODIFIED&flags) return str;
8181 return Qnil;
8182}
8183
8184
8185/*
8186 * call-seq:
8187 * downcase(mapping) -> string
8188 *
8189 * :include: doc/string/downcase.rdoc
8190 *
8191 */
8192
8193static VALUE
8194rb_str_downcase(int argc, VALUE *argv, VALUE str)
8195{
8196 rb_encoding *enc;
8197 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8198 VALUE ret;
8199
8200 flags = check_case_options(argc, argv, flags);
8201 enc = str_true_enc(str);
8202 if (case_option_single_p(flags, enc, str)) {
8203 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8204 str_enc_copy_direct(ret, str);
8205 downcase_single(ret);
8206 }
8207 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8208 ret = rb_str_new(0, RSTRING_LEN(str));
8209 rb_str_ascii_casemap(str, ret, &flags, enc);
8210 }
8211 else {
8212 ret = rb_str_casemap(str, &flags, enc);
8213 }
8214
8215 return ret;
8216}
8217
8218
8219/*
8220 * call-seq:
8221 * capitalize!(mapping = :ascii) -> self or nil
8222 *
8223 * Like String#capitalize, except that:
8224 *
8225 * - Changes character casings in +self+ (not in a copy of +self+).
8226 * - Returns +self+ if any changes are made, +nil+ otherwise.
8227 *
8228 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8229 */
8230
8231static VALUE
8232rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8233{
8234 rb_encoding *enc;
8235 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8236
8237 flags = check_case_options(argc, argv, flags);
8238 str_modify_keep_cr(str);
8239 enc = str_true_enc(str);
8240 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8241 if (flags&ONIGENC_CASE_ASCII_ONLY)
8242 rb_str_ascii_casemap(str, str, &flags, enc);
8243 else
8244 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8245
8246 if (ONIGENC_CASE_MODIFIED&flags) return str;
8247 return Qnil;
8248}
8249
8250
8251/*
8252 * call-seq:
8253 * capitalize(mapping = :ascii) -> string
8254 *
8255 * Returns a string containing the characters in +self+,
8256 * each with possibly changed case:
8257 *
8258 * - The first character is upcased.
8259 * - All other characters are downcased.
8260 *
8261 * Examples:
8262 *
8263 * 'hello world'.capitalize # => "Hello world"
8264 * 'HELLO WORLD'.capitalize # => "Hello world"
8265 *
8266 * Some characters do not have upcase and downcase, and so are not changed;
8267 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8268 *
8269 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8270 *
8271 * The casing is affected by the given +mapping+,
8272 * which may be +:ascii+, +:fold+, or +:turkic+;
8273 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8274 *
8275 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8276 */
8277
8278static VALUE
8279rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8280{
8281 rb_encoding *enc;
8282 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8283 VALUE ret;
8284
8285 flags = check_case_options(argc, argv, flags);
8286 enc = str_true_enc(str);
8287 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8288 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8289 ret = rb_str_new(0, RSTRING_LEN(str));
8290 rb_str_ascii_casemap(str, ret, &flags, enc);
8291 }
8292 else {
8293 ret = rb_str_casemap(str, &flags, enc);
8294 }
8295 return ret;
8296}
8297
8298
8299/*
8300 * call-seq:
8301 * swapcase!(mapping) -> self or nil
8302 *
8303 * Upcases each lowercase character in +self+;
8304 * downcases uppercase character;
8305 * returns +self+ if any changes were made, +nil+ otherwise:
8306 *
8307 * s = 'Hello World!' # => "Hello World!"
8308 * s.swapcase! # => "hELLO wORLD!"
8309 * s # => "hELLO wORLD!"
8310 * ''.swapcase! # => nil
8311 *
8312 * The casing may be affected by the given +mapping+;
8313 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8314 *
8315 * Related: String#swapcase.
8316 *
8317 */
8318
8319static VALUE
8320rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8321{
8322 rb_encoding *enc;
8323 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8324
8325 flags = check_case_options(argc, argv, flags);
8326 str_modify_keep_cr(str);
8327 enc = str_true_enc(str);
8328 if (flags&ONIGENC_CASE_ASCII_ONLY)
8329 rb_str_ascii_casemap(str, str, &flags, enc);
8330 else
8331 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8332
8333 if (ONIGENC_CASE_MODIFIED&flags) return str;
8334 return Qnil;
8335}
8336
8337
8338/*
8339 * call-seq:
8340 * swapcase(mapping) -> string
8341 *
8342 * Returns a string containing the characters in +self+, with cases reversed;
8343 * each uppercase character is downcased;
8344 * each lowercase character is upcased:
8345 *
8346 * s = 'Hello World!' # => "Hello World!"
8347 * s.swapcase # => "hELLO wORLD!"
8348 *
8349 * The casing may be affected by the given +mapping+;
8350 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8351 *
8352 * Related: String#swapcase!.
8353 *
8354 */
8355
8356static VALUE
8357rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8358{
8359 rb_encoding *enc;
8360 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8361 VALUE ret;
8362
8363 flags = check_case_options(argc, argv, flags);
8364 enc = str_true_enc(str);
8365 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8366 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8367 ret = rb_str_new(0, RSTRING_LEN(str));
8368 rb_str_ascii_casemap(str, ret, &flags, enc);
8369 }
8370 else {
8371 ret = rb_str_casemap(str, &flags, enc);
8372 }
8373 return ret;
8374}
8375
8376typedef unsigned char *USTR;
8377
/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while a range such as "a-z" is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* current read position in the specification */
    char *pend;         /* end of the specification */
};
8383
8384static unsigned int
8385trnext(struct tr *t, rb_encoding *enc)
8386{
8387 int n;
8388
8389 for (;;) {
8390 nextpart:
8391 if (!t->gen) {
8392 if (t->p == t->pend) return -1;
8393 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8394 t->p += n;
8395 }
8396 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8397 t->p += n;
8398 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8399 t->p += n;
8400 if (t->p < t->pend) {
8401 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8402 t->p += n;
8403 if (t->now > c) {
8404 if (t->now < 0x80 && c < 0x80) {
8405 rb_raise(rb_eArgError,
8406 "invalid range \"%c-%c\" in string transliteration",
8407 t->now, c);
8408 }
8409 else {
8410 rb_raise(rb_eArgError, "invalid range in string transliteration");
8411 }
8412 continue; /* not reached */
8413 }
8414 else if (t->now < c) {
8415 t->gen = 1;
8416 t->max = c;
8417 }
8418 }
8419 }
8420 return t->now;
8421 }
8422 else {
8423 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8424 if (t->now == t->max) {
8425 t->gen = 0;
8426 goto nextpart;
8427 }
8428 }
8429 if (t->now < t->max) {
8430 return t->now;
8431 }
8432 else {
8433 t->gen = 0;
8434 return t->max;
8435 }
8436 }
8437 }
8438}
8439
8440static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8441
8442static VALUE
8443tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8444{
8445 const unsigned int errc = -1;
8446 unsigned int trans[256];
8447 rb_encoding *enc, *e1, *e2;
8448 struct tr trsrc, trrepl;
8449 int cflag = 0;
8450 unsigned int c, c0, last = 0;
8451 int modify = 0, i, l;
8452 unsigned char *s, *send;
8453 VALUE hash = 0;
8454 int singlebyte = single_byte_optimizable(str);
8455 int termlen;
8456 int cr;
8457
8458#define CHECK_IF_ASCII(c) \
8459 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8460 (cr = ENC_CODERANGE_VALID) : 0)
8461
8462 StringValue(src);
8463 StringValue(repl);
8464 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8465 if (RSTRING_LEN(repl) == 0) {
8466 return rb_str_delete_bang(1, &src, str);
8467 }
8468
8469 cr = ENC_CODERANGE(str);
8470 e1 = rb_enc_check(str, src);
8471 e2 = rb_enc_check(str, repl);
8472 if (e1 == e2) {
8473 enc = e1;
8474 }
8475 else {
8476 enc = rb_enc_check(src, repl);
8477 }
8478 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8479 if (RSTRING_LEN(src) > 1 &&
8480 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8481 trsrc.p + l < trsrc.pend) {
8482 cflag = 1;
8483 trsrc.p += l;
8484 }
8485 trrepl.p = RSTRING_PTR(repl);
8486 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8487 trsrc.gen = trrepl.gen = 0;
8488 trsrc.now = trrepl.now = 0;
8489 trsrc.max = trrepl.max = 0;
8490
8491 if (cflag) {
8492 for (i=0; i<256; i++) {
8493 trans[i] = 1;
8494 }
8495 while ((c = trnext(&trsrc, enc)) != errc) {
8496 if (c < 256) {
8497 trans[c] = errc;
8498 }
8499 else {
8500 if (!hash) hash = rb_hash_new();
8501 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8502 }
8503 }
8504 while ((c = trnext(&trrepl, enc)) != errc)
8505 /* retrieve last replacer */;
8506 last = trrepl.now;
8507 for (i=0; i<256; i++) {
8508 if (trans[i] != errc) {
8509 trans[i] = last;
8510 }
8511 }
8512 }
8513 else {
8514 unsigned int r;
8515
8516 for (i=0; i<256; i++) {
8517 trans[i] = errc;
8518 }
8519 while ((c = trnext(&trsrc, enc)) != errc) {
8520 r = trnext(&trrepl, enc);
8521 if (r == errc) r = trrepl.now;
8522 if (c < 256) {
8523 trans[c] = r;
8524 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8525 }
8526 else {
8527 if (!hash) hash = rb_hash_new();
8528 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8529 }
8530 }
8531 }
8532
8533 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8534 cr = ENC_CODERANGE_7BIT;
8535 str_modify_keep_cr(str);
8536 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8537 termlen = rb_enc_mbminlen(enc);
8538 if (sflag) {
8539 int clen, tlen;
8540 long offset, max = RSTRING_LEN(str);
8541 unsigned int save = -1;
8542 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8543
8544 while (s < send) {
8545 int may_modify = 0;
8546
8547 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8548 if (!MBCLEN_CHARFOUND_P(r)) {
8549 xfree(buf);
8550 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8551 }
8552 clen = MBCLEN_CHARFOUND_LEN(r);
8553 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8554
8555 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8556
8557 s += clen;
8558 if (c < 256) {
8559 c = trans[c];
8560 }
8561 else if (hash) {
8562 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8563 if (NIL_P(tmp)) {
8564 if (cflag) c = last;
8565 else c = errc;
8566 }
8567 else if (cflag) c = errc;
8568 else c = NUM2INT(tmp);
8569 }
8570 else {
8571 c = errc;
8572 }
8573 if (c != (unsigned int)-1) {
8574 if (save == c) {
8575 CHECK_IF_ASCII(c);
8576 continue;
8577 }
8578 save = c;
8579 tlen = rb_enc_codelen(c, enc);
8580 modify = 1;
8581 }
8582 else {
8583 save = -1;
8584 c = c0;
8585 if (enc != e1) may_modify = 1;
8586 }
8587 if ((offset = t - buf) + tlen > max) {
8588 size_t MAYBE_UNUSED(old) = max + termlen;
8589 max = offset + tlen + (send - s);
8590 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8591 t = buf + offset;
8592 }
8593 rb_enc_mbcput(c, t, enc);
8594 if (may_modify && memcmp(s, t, tlen) != 0) {
8595 modify = 1;
8596 }
8597 CHECK_IF_ASCII(c);
8598 t += tlen;
8599 }
8600 if (!STR_EMBED_P(str)) {
8601 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8602 }
8603 TERM_FILL((char *)t, termlen);
8604 RSTRING(str)->as.heap.ptr = (char *)buf;
8605 STR_SET_LEN(str, t - buf);
8606 STR_SET_NOEMBED(str);
8607 RSTRING(str)->as.heap.aux.capa = max;
8608 }
8609 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8610 while (s < send) {
8611 c = (unsigned char)*s;
8612 if (trans[c] != errc) {
8613 if (!cflag) {
8614 c = trans[c];
8615 *s = c;
8616 modify = 1;
8617 }
8618 else {
8619 *s = last;
8620 modify = 1;
8621 }
8622 }
8623 CHECK_IF_ASCII(c);
8624 s++;
8625 }
8626 }
8627 else {
8628 int clen, tlen;
8629 long offset, max = (long)((send - s) * 1.2);
8630 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8631
8632 while (s < send) {
8633 int may_modify = 0;
8634
8635 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8636 if (!MBCLEN_CHARFOUND_P(r)) {
8637 xfree(buf);
8638 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8639 }
8640 clen = MBCLEN_CHARFOUND_LEN(r);
8641 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8642
8643 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8644
8645 if (c < 256) {
8646 c = trans[c];
8647 }
8648 else if (hash) {
8649 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8650 if (NIL_P(tmp)) {
8651 if (cflag) c = last;
8652 else c = errc;
8653 }
8654 else if (cflag) c = errc;
8655 else c = NUM2INT(tmp);
8656 }
8657 else {
8658 c = cflag ? last : errc;
8659 }
8660 if (c != errc) {
8661 tlen = rb_enc_codelen(c, enc);
8662 modify = 1;
8663 }
8664 else {
8665 c = c0;
8666 if (enc != e1) may_modify = 1;
8667 }
8668 if ((offset = t - buf) + tlen > max) {
8669 size_t MAYBE_UNUSED(old) = max + termlen;
8670 max = offset + tlen + (long)((send - s) * 1.2);
8671 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8672 t = buf + offset;
8673 }
8674 if (s != t) {
8675 rb_enc_mbcput(c, t, enc);
8676 if (may_modify && memcmp(s, t, tlen) != 0) {
8677 modify = 1;
8678 }
8679 }
8680 CHECK_IF_ASCII(c);
8681 s += clen;
8682 t += tlen;
8683 }
8684 if (!STR_EMBED_P(str)) {
8685 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8686 }
8687 TERM_FILL((char *)t, termlen);
8688 RSTRING(str)->as.heap.ptr = (char *)buf;
8689 STR_SET_LEN(str, t - buf);
8690 STR_SET_NOEMBED(str);
8691 RSTRING(str)->as.heap.aux.capa = max;
8692 }
8693
8694 if (modify) {
8695 if (cr != ENC_CODERANGE_BROKEN)
8696 ENC_CODERANGE_SET(str, cr);
8697 rb_enc_associate(str, enc);
8698 return str;
8699 }
8700 return Qnil;
8701}
8702
8703
8704/*
8705 * call-seq:
8706 * tr!(selector, replacements) -> self or nil
8707 *
8708 * Like String#tr, but modifies +self+ in place.
8709 * Returns +self+ if any changes were made, +nil+ otherwise.
8710 *
8711 */
8712
8713static VALUE
8714rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8715{
8716 return tr_trans(str, src, repl, 0);
8717}
8718
8719
8720/*
8721 * call-seq:
8722 * tr(selector, replacements) -> new_string
8723 *
8724 * Returns a copy of +self+ with each character specified by string +selector+
8725 * translated to the corresponding character in string +replacements+.
8726 * The correspondence is _positional_:
8727 *
8728 * - Each occurrence of the first character specified by +selector+
8729 * is translated to the first character in +replacements+.
8730 * - Each occurrence of the second character specified by +selector+
8731 * is translated to the second character in +replacements+.
8732 * - And so on.
8733 *
8734 * Example:
8735 *
8736 * 'hello'.tr('el', 'ip') #=> "hippo"
8737 *
8738 * If +replacements+ is shorter than +selector+,
8739 * it is implicitly padded with its own last character:
8740 *
8741 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8742 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8743 *
8744 * Arguments +selector+ and +replacements+ must be valid character selectors
8745 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8746 * and may use any of its valid forms, including negation, ranges, and escaping:
8747 *
8748 * # Negation.
8749 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8750 * # Ranges.
8751 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8752 * # Escapes.
8753 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8754 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8755 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8756 *
8757 */
8758
8759static VALUE
8760rb_str_tr(VALUE str, VALUE src, VALUE repl)
8761{
8762 str = str_duplicate(rb_cString, str);
8763 tr_trans(str, src, repl, 0);
8764 return str;
8765}
8766
8767#define TR_TABLE_MAX (UCHAR_MAX+1)
8768#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8769static void
8770tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8771 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8772{
8773 const unsigned int errc = -1;
8774 char buf[TR_TABLE_MAX];
8775 struct tr tr;
8776 unsigned int c;
8777 VALUE table = 0, ptable = 0;
8778 int i, l, cflag = 0;
8779
8780 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8781 tr.gen = tr.now = tr.max = 0;
8782
8783 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8784 cflag = 1;
8785 tr.p += l;
8786 }
8787 if (first) {
8788 for (i=0; i<TR_TABLE_MAX; i++) {
8789 stable[i] = 1;
8790 }
8791 stable[TR_TABLE_MAX] = cflag;
8792 }
8793 else if (stable[TR_TABLE_MAX] && !cflag) {
8794 stable[TR_TABLE_MAX] = 0;
8795 }
8796 for (i=0; i<TR_TABLE_MAX; i++) {
8797 buf[i] = cflag;
8798 }
8799
8800 while ((c = trnext(&tr, enc)) != errc) {
8801 if (c < TR_TABLE_MAX) {
8802 buf[(unsigned char)c] = !cflag;
8803 }
8804 else {
8805 VALUE key = UINT2NUM(c);
8806
8807 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8808 if (cflag) {
8809 ptable = *ctablep;
8810 table = ptable ? ptable : rb_hash_new();
8811 *ctablep = table;
8812 }
8813 else {
8814 table = rb_hash_new();
8815 ptable = *tablep;
8816 *tablep = table;
8817 }
8818 }
8819 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8820 rb_hash_aset(table, key, Qtrue);
8821 }
8822 }
8823 }
8824 for (i=0; i<TR_TABLE_MAX; i++) {
8825 stable[i] = stable[i] && buf[i];
8826 }
8827 if (!table && !cflag) {
8828 *tablep = 0;
8829 }
8830}
8831
8832
8833static int
8834tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8835{
8836 if (c < TR_TABLE_MAX) {
8837 return table[c] != 0;
8838 }
8839 else {
8840 VALUE v = UINT2NUM(c);
8841
8842 if (del) {
8843 if (!NIL_P(rb_hash_lookup(del, v)) &&
8844 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8845 return TRUE;
8846 }
8847 }
8848 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8849 return FALSE;
8850 }
8851 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8852 }
8853}
8854
8855/*
8856 * call-seq:
8857 * delete!(*selectors) -> self or nil
8858 *
8859 * Like String#delete, but modifies +self+ in place;
8860 * returns +self+ if any characters were deleted, +nil+ otherwise.
8861 *
8862 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8863 */
8864
8865static VALUE
8866rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8867{
8868 char squeez[TR_TABLE_SIZE];
8869 rb_encoding *enc = 0;
8870 char *s, *send, *t;
8871 VALUE del = 0, nodel = 0;
8872 int modify = 0;
8873 int i, ascompat, cr;
8874
8875 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8877 for (i=0; i<argc; i++) {
8878 VALUE s = argv[i];
8879
8880 StringValue(s);
8881 enc = rb_enc_check(str, s);
8882 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8883 }
8884
8885 str_modify_keep_cr(str);
8886 ascompat = rb_enc_asciicompat(enc);
8887 s = t = RSTRING_PTR(str);
8888 send = RSTRING_END(str);
8889 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8890 while (s < send) {
8891 unsigned int c;
8892 int clen;
8893
8894 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8895 if (squeez[c]) {
8896 modify = 1;
8897 }
8898 else {
8899 if (t != s) *t = c;
8900 t++;
8901 }
8902 s++;
8903 }
8904 else {
8905 c = rb_enc_codepoint_len(s, send, &clen, enc);
8906
8907 if (tr_find(c, squeez, del, nodel)) {
8908 modify = 1;
8909 }
8910 else {
8911 if (t != s) rb_enc_mbcput(c, t, enc);
8912 t += clen;
8914 }
8915 s += clen;
8916 }
8917 }
8918 TERM_FILL(t, TERM_LEN(str));
8919 STR_SET_LEN(str, t - RSTRING_PTR(str));
8920 ENC_CODERANGE_SET(str, cr);
8921
8922 if (modify) return str;
8923 return Qnil;
8924}
8925
8926
8927/*
8928 * call-seq:
8929 * delete(*selectors) -> new_string
8930 *
8931 * :include: doc/string/delete.rdoc
8932 *
8933 */
8934
8935static VALUE
8936rb_str_delete(int argc, VALUE *argv, VALUE str)
8937{
8938 str = str_duplicate(rb_cString, str);
8939 rb_str_delete_bang(argc, argv, str);
8940 return str;
8941}
8942
8943
8944/*
8945 * call-seq:
8946 * squeeze!(*selectors) -> self or nil
8947 *
8948 * Like String#squeeze, but modifies +self+ in place.
8949 * Returns +self+ if any changes were made, +nil+ otherwise.
8950 */
8951
8952static VALUE
8953rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8954{
8955 char squeez[TR_TABLE_SIZE];
8956 rb_encoding *enc = 0;
8957 VALUE del = 0, nodel = 0;
8958 unsigned char *s, *send, *t;
8959 int i, modify = 0;
8960 int ascompat, singlebyte = single_byte_optimizable(str);
8961 unsigned int save;
8962
8963 if (argc == 0) {
8964 enc = STR_ENC_GET(str);
8965 }
8966 else {
8967 for (i=0; i<argc; i++) {
8968 VALUE s = argv[i];
8969
8970 StringValue(s);
8971 enc = rb_enc_check(str, s);
8972 if (singlebyte && !single_byte_optimizable(s))
8973 singlebyte = 0;
8974 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8975 }
8976 }
8977
8978 str_modify_keep_cr(str);
8979 s = t = (unsigned char *)RSTRING_PTR(str);
8980 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8981 send = (unsigned char *)RSTRING_END(str);
8982 save = -1;
8983 ascompat = rb_enc_asciicompat(enc);
8984
8985 if (singlebyte) {
8986 while (s < send) {
8987 unsigned int c = *s++;
8988 if (c != save || (argc > 0 && !squeez[c])) {
8989 *t++ = save = c;
8990 }
8991 }
8992 }
8993 else {
8994 while (s < send) {
8995 unsigned int c;
8996 int clen;
8997
8998 if (ascompat && (c = *s) < 0x80) {
8999 if (c != save || (argc > 0 && !squeez[c])) {
9000 *t++ = save = c;
9001 }
9002 s++;
9003 }
9004 else {
9005 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9006
9007 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9008 if (t != s) rb_enc_mbcput(c, t, enc);
9009 save = c;
9010 t += clen;
9011 }
9012 s += clen;
9013 }
9014 }
9015 }
9016
9017 TERM_FILL((char *)t, TERM_LEN(str));
9018 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9019 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9020 modify = 1;
9021 }
9022
9023 if (modify) return str;
9024 return Qnil;
9025}
9026
9027
9028/*
9029 * call-seq:
9030 * squeeze(*selectors) -> new_string
9031 *
9032 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9033 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9034 *
9035 * "Squeezed" means that each multiple-character run of a selected character
9036 * is squeezed down to a single character;
9037 * with no arguments given, squeezes all characters:
9038 *
9039 * "yellow moon".squeeze #=> "yelow mon"
9040 * " now is the".squeeze(" ") #=> " now is the"
9041 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9042 *
9043 */
9044
9045static VALUE
9046rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9047{
9048 str = str_duplicate(rb_cString, str);
9049 rb_str_squeeze_bang(argc, argv, str);
9050 return str;
9051}
9052
9053
9054/*
9055 * call-seq:
9056 * tr_s!(selector, replacements) -> self or nil
9057 *
9058 * Like String#tr_s, but modifies +self+ in place.
9059 * Returns +self+ if any changes were made, +nil+ otherwise.
9060 *
9061 * Related: String#squeeze!.
9062 */
9063
9064static VALUE
9065rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9066{
9067 return tr_trans(str, src, repl, 1);
9068}
9069
9070
9071/*
9072 * call-seq:
9073 * tr_s(selector, replacements) -> string
9074 *
9075 * Like String#tr, but also squeezes the modified portions of the translated string;
9076 * returns a new string (translated and squeezed).
9077 *
9078 * 'hello'.tr_s('l', 'r') #=> "hero"
9079 * 'hello'.tr_s('el', '-') #=> "h-o"
9080 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9081 *
9082 * Related: String#squeeze.
9083 *
9084 */
9085
9086static VALUE
9087rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9088{
9089 str = str_duplicate(rb_cString, str);
9090 tr_trans(str, src, repl, 1);
9091 return str;
9092}
9093
9094
9095/*
9096 * call-seq:
9097 * count(*selectors) -> integer
9098 *
9099 * :include: doc/string/count.rdoc
9100 */
9101
9102static VALUE
9103rb_str_count(int argc, VALUE *argv, VALUE str)
9104{
9105 char table[TR_TABLE_SIZE];
9106 rb_encoding *enc = 0;
9107 VALUE del = 0, nodel = 0, tstr;
9108 char *s, *send;
9109 int i;
9110 int ascompat;
9111 size_t n = 0;
9112
9114
9115 tstr = argv[0];
9116 StringValue(tstr);
9117 enc = rb_enc_check(str, tstr);
9118 if (argc == 1) {
9119 const char *ptstr;
9120 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9121 (ptstr = RSTRING_PTR(tstr),
9122 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9123 !is_broken_string(str)) {
9124 int clen;
9125 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9126
9127 s = RSTRING_PTR(str);
9128 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9129 send = RSTRING_END(str);
9130 while (s < send) {
9131 if (*(unsigned char*)s++ == c) n++;
9132 }
9133 return SIZET2NUM(n);
9134 }
9135 }
9136
9137 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9138 for (i=1; i<argc; i++) {
9139 tstr = argv[i];
9140 StringValue(tstr);
9141 enc = rb_enc_check(str, tstr);
9142 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9143 }
9144
9145 s = RSTRING_PTR(str);
9146 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9147 send = RSTRING_END(str);
9148 ascompat = rb_enc_asciicompat(enc);
9149 while (s < send) {
9150 unsigned int c;
9151
9152 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9153 if (table[c]) {
9154 n++;
9155 }
9156 s++;
9157 }
9158 else {
9159 int clen;
9160 c = rb_enc_codepoint_len(s, send, &clen, enc);
9161 if (tr_find(c, table, del, nodel)) {
9162 n++;
9163 }
9164 s += clen;
9165 }
9166 }
9167
9168 return SIZET2NUM(n);
9169}
9170
9171static VALUE
9172rb_fs_check(VALUE val)
9173{
9174 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9175 val = rb_check_string_type(val);
9176 if (NIL_P(val)) return 0;
9177 }
9178 return val;
9179}
9180
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (HT, LF, VT, FF, CR and SPACE), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* HT */
    [0x0a] = 1, /* LF */
    [0x0b] = 1, /* VT */
    [0x0c] = 1, /* FF */
    [0x0d] = 1, /* CR */
    [0x20] = 1, /* SPACE */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9201
9202static long
9203split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9204{
9205 if (empty_count >= 0 && len == 0) {
9206 return empty_count + 1;
9207 }
9208 if (empty_count > 0) {
9209 /* make different substrings */
9210 if (result) {
9211 do {
9212 rb_ary_push(result, str_new_empty_String(str));
9213 } while (--empty_count > 0);
9214 }
9215 else {
9216 do {
9217 rb_yield(str_new_empty_String(str));
9218 } while (--empty_count > 0);
9219 }
9220 }
9221 str = rb_str_subseq(str, beg, len);
9222 if (result) {
9223 rb_ary_push(result, str);
9224 }
9225 else {
9226 rb_yield(str);
9227 }
9228 return empty_count;
9229}
9230
/* Strategy used by rb_str_split_m to cut the receiver into fields. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9234
9235static split_type_t
9236literal_split_pattern(VALUE spat, split_type_t default_type)
9237{
9238 rb_encoding *enc = STR_ENC_GET(spat);
9239 const char *ptr;
9240 long len;
9241 RSTRING_GETMEM(spat, ptr, len);
9242 if (len == 0) {
9243 /* Special case - split into chars */
9244 return SPLIT_TYPE_CHARS;
9245 }
9246 else if (rb_enc_asciicompat(enc)) {
9247 if (len == 1 && ptr[0] == ' ') {
9248 return SPLIT_TYPE_AWK;
9249 }
9250 }
9251 else {
9252 int l;
9253 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9254 return SPLIT_TYPE_AWK;
9255 }
9256 }
9257 return default_type;
9258}
9259
9260/*
9261 * call-seq:
9262 * split(field_sep = $;, limit = 0) -> array
9263 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9264 *
9265 * :include: doc/string/split.rdoc
9266 *
9267 */
9268
9269static VALUE
9270rb_str_split_m(int argc, VALUE *argv, VALUE str)
9271{
9272 rb_encoding *enc;
9273 VALUE spat;
9274 VALUE limit;
9275 split_type_t split_type;
9276 long beg, end, i = 0, empty_count = -1;
9277 int lim = 0;
9278 VALUE result, tmp;
9279
9280 result = rb_block_given_p() ? Qfalse : Qnil;
9281 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9282 lim = NUM2INT(limit);
9283 if (lim <= 0) limit = Qnil;
9284 else if (lim == 1) {
9285 if (RSTRING_LEN(str) == 0)
9286 return result ? rb_ary_new2(0) : str;
9287 tmp = str_duplicate(rb_cString, str);
9288 if (!result) {
9289 rb_yield(tmp);
9290 return str;
9291 }
9292 return rb_ary_new3(1, tmp);
9293 }
9294 i = 1;
9295 }
9296 if (NIL_P(limit) && !lim) empty_count = 0;
9297
9298 enc = STR_ENC_GET(str);
9299 split_type = SPLIT_TYPE_REGEXP;
9300 if (!NIL_P(spat)) {
9301 spat = get_pat_quoted(spat, 0);
9302 }
9303 else if (NIL_P(spat = rb_fs)) {
9304 split_type = SPLIT_TYPE_AWK;
9305 }
9306 else if (!(spat = rb_fs_check(spat))) {
9307 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9308 }
9309 else {
9310 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9311 }
9312 if (split_type != SPLIT_TYPE_AWK) {
9313 switch (BUILTIN_TYPE(spat)) {
9314 case T_REGEXP:
9315 rb_reg_options(spat); /* check if uninitialized */
9316 tmp = RREGEXP_SRC(spat);
9317 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9318 if (split_type == SPLIT_TYPE_AWK) {
9319 spat = tmp;
9320 split_type = SPLIT_TYPE_STRING;
9321 }
9322 break;
9323
9324 case T_STRING:
9325 mustnot_broken(spat);
9326 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9327 break;
9328
9329 default:
9331 }
9332 }
9333
9334#define SPLIT_STR(beg, len) ( \
9335 empty_count = split_string(result, str, beg, len, empty_count), \
9336 str_mod_check(str, str_start, str_len))
9337
9338 beg = 0;
9339 char *ptr = RSTRING_PTR(str);
9340 char *const str_start = ptr;
9341 const long str_len = RSTRING_LEN(str);
9342 char *const eptr = str_start + str_len;
9343 if (split_type == SPLIT_TYPE_AWK) {
9344 char *bptr = ptr;
9345 int skip = 1;
9346 unsigned int c;
9347
9348 if (result) result = rb_ary_new();
9349 end = beg;
9350 if (is_ascii_string(str)) {
9351 while (ptr < eptr) {
9352 c = (unsigned char)*ptr++;
9353 if (skip) {
9354 if (ascii_isspace(c)) {
9355 beg = ptr - bptr;
9356 }
9357 else {
9358 end = ptr - bptr;
9359 skip = 0;
9360 if (!NIL_P(limit) && lim <= i) break;
9361 }
9362 }
9363 else if (ascii_isspace(c)) {
9364 SPLIT_STR(beg, end-beg);
9365 skip = 1;
9366 beg = ptr - bptr;
9367 if (!NIL_P(limit)) ++i;
9368 }
9369 else {
9370 end = ptr - bptr;
9371 }
9372 }
9373 }
9374 else {
9375 while (ptr < eptr) {
9376 int n;
9377
9378 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9379 ptr += n;
9380 if (skip) {
9381 if (rb_isspace(c)) {
9382 beg = ptr - bptr;
9383 }
9384 else {
9385 end = ptr - bptr;
9386 skip = 0;
9387 if (!NIL_P(limit) && lim <= i) break;
9388 }
9389 }
9390 else if (rb_isspace(c)) {
9391 SPLIT_STR(beg, end-beg);
9392 skip = 1;
9393 beg = ptr - bptr;
9394 if (!NIL_P(limit)) ++i;
9395 }
9396 else {
9397 end = ptr - bptr;
9398 }
9399 }
9400 }
9401 }
9402 else if (split_type == SPLIT_TYPE_STRING) {
9403 char *substr_start = ptr;
9404 char *sptr = RSTRING_PTR(spat);
9405 long slen = RSTRING_LEN(spat);
9406
9407 if (result) result = rb_ary_new();
9408 mustnot_broken(str);
9409 enc = rb_enc_check(str, spat);
9410 while (ptr < eptr &&
9411 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9412 /* Check we are at the start of a char */
9413 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9414 if (t != ptr + end) {
9415 ptr = t;
9416 continue;
9417 }
9418 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9419 str_mod_check(spat, sptr, slen);
9420 ptr += end + slen;
9421 substr_start = ptr;
9422 if (!NIL_P(limit) && lim <= ++i) break;
9423 }
9424 beg = ptr - str_start;
9425 }
9426 else if (split_type == SPLIT_TYPE_CHARS) {
9427 int n;
9428
9429 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9430 mustnot_broken(str);
9431 enc = rb_enc_get(str);
9432 while (ptr < eptr &&
9433 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9434 SPLIT_STR(ptr - str_start, n);
9435 ptr += n;
9436 if (!NIL_P(limit) && lim <= ++i) break;
9437 }
9438 beg = ptr - str_start;
9439 }
9440 else {
9441 if (result) result = rb_ary_new();
9442 long len = RSTRING_LEN(str);
9443 long start = beg;
9444 long idx;
9445 int last_null = 0;
9446 struct re_registers *regs;
9447 VALUE match = 0;
9448
9449 for (; rb_reg_search(spat, str, start, 0) >= 0;
9450 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9451 match = rb_backref_get();
9452 if (!result) rb_match_busy(match);
9453 regs = RMATCH_REGS(match);
9454 end = BEG(0);
9455 if (start == end && BEG(0) == END(0)) {
9456 if (!ptr) {
9457 SPLIT_STR(0, 0);
9458 break;
9459 }
9460 else if (last_null == 1) {
9461 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9462 beg = start;
9463 }
9464 else {
9465 if (start == len)
9466 start++;
9467 else
9468 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9469 last_null = 1;
9470 continue;
9471 }
9472 }
9473 else {
9474 SPLIT_STR(beg, end-beg);
9475 beg = start = END(0);
9476 }
9477 last_null = 0;
9478
9479 for (idx=1; idx < regs->num_regs; idx++) {
9480 if (BEG(idx) == -1) continue;
9481 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9482 }
9483 if (!NIL_P(limit) && lim <= ++i) break;
9484 }
9485 if (match) rb_match_unbusy(match);
9486 }
9487 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9488 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9489 }
9490
9491 return result ? result : str;
9492}
9493
9494VALUE
9495rb_str_split(VALUE str, const char *sep0)
9496{
9497 VALUE sep;
9498
9499 StringValue(str);
9500 sep = rb_str_new_cstr(sep0);
9501 return rb_str_split_m(1, &sep, str);
9502}
9503
9504#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9505
9506static inline int
9507enumerator_element(VALUE ary, VALUE e)
9508{
9509 if (ary) {
9510 rb_ary_push(ary, e);
9511 return 0;
9512 }
9513 else {
9514 rb_yield(e);
9515 return 1;
9516 }
9517}
9518
9519#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9520
9521static const char *
9522chomp_newline(const char *p, const char *e, rb_encoding *enc)
9523{
9524 const char *prev = rb_enc_prev_char(p, e, e, enc);
9525 if (rb_enc_is_newline(prev, e, enc)) {
9526 e = prev;
9527 prev = rb_enc_prev_char(p, e, e, enc);
9528 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9529 e = prev;
9530 }
9531 return e;
9532}
9533
9534static VALUE
9535get_rs(void)
9536{
9537 VALUE rs = rb_rs;
9538 if (!NIL_P(rs) &&
9539 (!RB_TYPE_P(rs, T_STRING) ||
9540 RSTRING_LEN(rs) != 1 ||
9541 RSTRING_PTR(rs)[0] != '\n')) {
9542 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9543 }
9544 return rs;
9545}
9546
9547#define rb_rs get_rs()
9548
9549static VALUE
9550rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9551{
9552 rb_encoding *enc;
9553 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9554 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9555 long pos, len, rslen;
9556 int rsnewline = 0;
9557
9558 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9559 rs = rb_rs;
9560 if (!NIL_P(opts)) {
9561 static ID keywords[1];
9562 if (!keywords[0]) {
9563 keywords[0] = rb_intern_const("chomp");
9564 }
9565 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9566 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9567 }
9568
9569 if (NIL_P(rs)) {
9570 if (!ENUM_ELEM(ary, str)) {
9571 return ary;
9572 }
9573 else {
9574 return orig;
9575 }
9576 }
9577
9578 if (!RSTRING_LEN(str)) goto end;
9579 str = rb_str_new_frozen(str);
9580 ptr = subptr = RSTRING_PTR(str);
9581 pend = RSTRING_END(str);
9582 len = RSTRING_LEN(str);
9583 StringValue(rs);
9584 rslen = RSTRING_LEN(rs);
9585
9586 if (rs == rb_default_rs)
9587 enc = rb_enc_get(str);
9588 else
9589 enc = rb_enc_check(str, rs);
9590
9591 if (rslen == 0) {
9592 /* paragraph mode */
9593 int n;
9594 const char *eol = NULL;
9595 subend = subptr;
9596 while (subend < pend) {
9597 long chomp_rslen = 0;
9598 do {
9599 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9600 n = 0;
9601 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9602 if (rb_enc_is_newline(subend + n, pend, enc)) {
9603 if (eol == subend) break;
9604 subend += rslen;
9605 if (subptr) {
9606 eol = subend;
9607 chomp_rslen = -rslen;
9608 }
9609 }
9610 else {
9611 if (!subptr) subptr = subend;
9612 subend += rslen;
9613 }
9614 rslen = 0;
9615 } while (subend < pend);
9616 if (!subptr) break;
9617 if (rslen == 0) chomp_rslen = 0;
9618 line = rb_str_subseq(str, subptr - ptr,
9619 subend - subptr + (chomp ? chomp_rslen : rslen));
9620 if (ENUM_ELEM(ary, line)) {
9621 str_mod_check(str, ptr, len);
9622 }
9623 subptr = eol = NULL;
9624 }
9625 goto end;
9626 }
9627 else {
9628 rsptr = RSTRING_PTR(rs);
9629 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9630 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9631 rsnewline = 1;
9632 }
9633 }
9634
9635 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9636 rs = rb_str_new(rsptr, rslen);
9637 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9638 rsptr = RSTRING_PTR(rs);
9639 rslen = RSTRING_LEN(rs);
9640 }
9641
9642 while (subptr < pend) {
9643 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9644 if (pos < 0) break;
9645 hit = subptr + pos;
9646 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9647 if (hit != adjusted) {
9648 subptr = adjusted;
9649 continue;
9650 }
9651 subend = hit += rslen;
9652 if (chomp) {
9653 if (rsnewline) {
9654 subend = chomp_newline(subptr, subend, enc);
9655 }
9656 else {
9657 subend -= rslen;
9658 }
9659 }
9660 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9661 if (ENUM_ELEM(ary, line)) {
9662 str_mod_check(str, ptr, len);
9663 }
9664 subptr = hit;
9665 }
9666
9667 if (subptr != pend) {
9668 if (chomp) {
9669 if (rsnewline) {
9670 pend = chomp_newline(subptr, pend, enc);
9671 }
9672 else if (pend - subptr >= rslen &&
9673 memcmp(pend - rslen, rsptr, rslen) == 0) {
9674 pend -= rslen;
9675 }
9676 }
9677 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9678 ENUM_ELEM(ary, line);
9679 RB_GC_GUARD(str);
9680 }
9681
9682 end:
9683 if (ary)
9684 return ary;
9685 else
9686 return orig;
9687}
9688
9689/*
9690 * call-seq:
9691 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9692 * each_line(record_separator = $/, chomp: false) -> enumerator
9693 *
9694 * :include: doc/string/each_line.rdoc
9695 *
9696 */
9697
9698static VALUE
9699rb_str_each_line(int argc, VALUE *argv, VALUE str)
9700{
9701 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9702 return rb_str_enumerate_lines(argc, argv, str, 0);
9703}
9704
9705/*
9706 * call-seq:
9707 * lines(record_separator = $/, chomp: false) -> array_of_strings
9708 *
9709 * Returns substrings ("lines") of +self+
9710 * according to the given arguments:
9711 *
9712 * s = <<~EOT
9713 * This is the first line.
9714 * This is line two.
9715 *
9716 * This is line four.
9717 * This is line five.
9718 * EOT
9719 *
9720 * With the default argument values:
9721 *
9722 * $/ # => "\n"
9723 * s.lines
9724 * # =>
9725 * ["This is the first line.\n",
9726 * "This is line two.\n",
9727 * "\n",
9728 * "This is line four.\n",
9729 * "This is line five.\n"]
9730 *
9731 * With a different +record_separator+:
9732 *
9733 * record_separator = ' is '
9734 * s.lines(record_separator)
9735 * # =>
9736 * ["This is ",
9737 * "the first line.\nThis is ",
9738 * "line two.\n\nThis is ",
9739 * "line four.\nThis is ",
9740 * "line five.\n"]
9741 *
9742 * With keyword argument +chomp+ as +true+,
9743 * removes the trailing newline from each line:
9744 *
9745 * s.lines(chomp: true)
9746 * # =>
9747 * ["This is the first line.",
9748 * "This is line two.",
9749 * "",
9750 * "This is line four.",
9751 * "This is line five."]
9752 *
9753 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9754 */
9755
9756static VALUE
9757rb_str_lines(int argc, VALUE *argv, VALUE str)
9758{
9759 VALUE ary = WANTARRAY("lines", 0);
9760 return rb_str_enumerate_lines(argc, argv, str, ary);
9761}
9762
9763static VALUE
9764rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9765{
9766 return LONG2FIX(RSTRING_LEN(str));
9767}
9768
9769static VALUE
9770rb_str_enumerate_bytes(VALUE str, VALUE ary)
9771{
9772 long i;
9773
9774 for (i=0; i<RSTRING_LEN(str); i++) {
9775 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9776 }
9777 if (ary)
9778 return ary;
9779 else
9780 return str;
9781}
9782
9783/*
9784 * call-seq:
9785 * each_byte {|byte| ... } -> self
9786 * each_byte -> enumerator
9787 *
9788 * :include: doc/string/each_byte.rdoc
9789 *
9790 */
9791
9792static VALUE
9793rb_str_each_byte(VALUE str)
9794{
9795 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9796 return rb_str_enumerate_bytes(str, 0);
9797}
9798
9799/*
9800 * call-seq:
9801 * bytes -> array_of_bytes
9802 *
9803 * :include: doc/string/bytes.rdoc
9804 *
9805 */
9806
9807static VALUE
9808rb_str_bytes(VALUE str)
9809{
9810 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9811 return rb_str_enumerate_bytes(str, ary);
9812}
9813
9814static VALUE
9815rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9816{
9817 return rb_str_length(str);
9818}
9819
9820static VALUE
9821rb_str_enumerate_chars(VALUE str, VALUE ary)
9822{
9823 VALUE orig = str;
9824 long i, len, n;
9825 const char *ptr;
9826 rb_encoding *enc;
9827
9828 str = rb_str_new_frozen(str);
9829 ptr = RSTRING_PTR(str);
9830 len = RSTRING_LEN(str);
9831 enc = rb_enc_get(str);
9832
9834 for (i = 0; i < len; i += n) {
9835 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9836 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9837 }
9838 }
9839 else {
9840 for (i = 0; i < len; i += n) {
9841 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9842 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9843 }
9844 }
9845 RB_GC_GUARD(str);
9846 if (ary)
9847 return ary;
9848 else
9849 return orig;
9850}
9851
9852/*
9853 * call-seq:
9854 * each_char {|char| ... } -> self
9855 * each_char -> enumerator
9856 *
9857 * :include: doc/string/each_char.rdoc
9858 *
9859 */
9860
9861static VALUE
9862rb_str_each_char(VALUE str)
9863{
9864 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9865 return rb_str_enumerate_chars(str, 0);
9866}
9867
9868/*
9869 * call-seq:
9870 * chars -> array_of_characters
9871 *
9872 * :include: doc/string/chars.rdoc
9873 *
9874 */
9875
9876static VALUE
9877rb_str_chars(VALUE str)
9878{
9879 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9880 return rb_str_enumerate_chars(str, ary);
9881}
9882
9883static VALUE
9884rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9885{
9886 VALUE orig = str;
9887 int n;
9888 unsigned int c;
9889 const char *ptr, *end;
9890 rb_encoding *enc;
9891
9892 if (single_byte_optimizable(str))
9893 return rb_str_enumerate_bytes(str, ary);
9894
9895 str = rb_str_new_frozen(str);
9896 ptr = RSTRING_PTR(str);
9897 end = RSTRING_END(str);
9898 enc = STR_ENC_GET(str);
9899
9900 while (ptr < end) {
9901 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9902 ENUM_ELEM(ary, UINT2NUM(c));
9903 ptr += n;
9904 }
9905 RB_GC_GUARD(str);
9906 if (ary)
9907 return ary;
9908 else
9909 return orig;
9910}
9911
9912/*
9913 * call-seq:
9914 * each_codepoint {|codepoint| ... } -> self
9915 * each_codepoint -> enumerator
9916 *
9917 * :include: doc/string/each_codepoint.rdoc
9918 *
9919 */
9920
9921static VALUE
9922rb_str_each_codepoint(VALUE str)
9923{
9924 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9925 return rb_str_enumerate_codepoints(str, 0);
9926}
9927
9928/*
9929 * call-seq:
9930 * codepoints -> array_of_integers
9931 *
9932 * :include: doc/string/codepoints.rdoc
9933 *
9934 */
9935
9936static VALUE
9937rb_str_codepoints(VALUE str)
9938{
9939 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9940 return rb_str_enumerate_codepoints(str, ary);
9941}
9942
9943static regex_t *
9944get_reg_grapheme_cluster(rb_encoding *enc)
9945{
9946 int encidx = rb_enc_to_index(enc);
9947
9948 const OnigUChar source_ascii[] = "\\X";
9949 const OnigUChar *source = source_ascii;
9950 size_t source_len = sizeof(source_ascii) - 1;
9951
9952 switch (encidx) {
9953#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9954#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9955#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9956#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9957#define CASE_UTF(e) \
9958 case ENCINDEX_UTF_##e: { \
9959 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9960 source = source_UTF_##e; \
9961 source_len = sizeof(source_UTF_##e); \
9962 break; \
9963 }
9964 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9965#undef CASE_UTF
9966#undef CHARS_16BE
9967#undef CHARS_16LE
9968#undef CHARS_32BE
9969#undef CHARS_32LE
9970 }
9971
9972 regex_t *reg_grapheme_cluster;
9973 OnigErrorInfo einfo;
9974 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9975 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9976 if (r) {
9977 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9978 onig_error_code_to_str(message, r, &einfo);
9979 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9980 }
9981
9982 return reg_grapheme_cluster;
9983}
9984
9985static regex_t *
9986get_cached_reg_grapheme_cluster(rb_encoding *enc)
9987{
9988 int encidx = rb_enc_to_index(enc);
9989 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9990
9991 if (encidx == rb_utf8_encindex()) {
9992 if (!reg_grapheme_cluster_utf8) {
9993 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9994 }
9995
9996 return reg_grapheme_cluster_utf8;
9997 }
9998
9999 return NULL;
10000}
10001
10002static VALUE
10003rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10004{
10005 size_t grapheme_cluster_count = 0;
10006 rb_encoding *enc = get_encoding(str);
10007 const char *ptr, *end;
10008
10009 if (!rb_enc_unicode_p(enc)) {
10010 return rb_str_length(str);
10011 }
10012
10013 bool cached_reg_grapheme_cluster = true;
10014 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10015 if (!reg_grapheme_cluster) {
10016 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10017 cached_reg_grapheme_cluster = false;
10018 }
10019
10020 ptr = RSTRING_PTR(str);
10021 end = RSTRING_END(str);
10022
10023 while (ptr < end) {
10024 OnigPosition len = onig_match(reg_grapheme_cluster,
10025 (const OnigUChar *)ptr, (const OnigUChar *)end,
10026 (const OnigUChar *)ptr, NULL, 0);
10027 if (len <= 0) break;
10028 grapheme_cluster_count++;
10029 ptr += len;
10030 }
10031
10032 if (!cached_reg_grapheme_cluster) {
10033 onig_free(reg_grapheme_cluster);
10034 }
10035
10036 return SIZET2NUM(grapheme_cluster_count);
10037}
10038
10039static VALUE
10040rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10041{
10042 VALUE orig = str;
10043 rb_encoding *enc = get_encoding(str);
10044 const char *ptr0, *ptr, *end;
10045
10046 if (!rb_enc_unicode_p(enc)) {
10047 return rb_str_enumerate_chars(str, ary);
10048 }
10049
10050 if (!ary) str = rb_str_new_frozen(str);
10051
10052 bool cached_reg_grapheme_cluster = true;
10053 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10054 if (!reg_grapheme_cluster) {
10055 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10056 cached_reg_grapheme_cluster = false;
10057 }
10058
10059 ptr0 = ptr = RSTRING_PTR(str);
10060 end = RSTRING_END(str);
10061
10062 while (ptr < end) {
10063 OnigPosition len = onig_match(reg_grapheme_cluster,
10064 (const OnigUChar *)ptr, (const OnigUChar *)end,
10065 (const OnigUChar *)ptr, NULL, 0);
10066 if (len <= 0) break;
10067 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10068 ptr += len;
10069 }
10070
10071 if (!cached_reg_grapheme_cluster) {
10072 onig_free(reg_grapheme_cluster);
10073 }
10074
10075 RB_GC_GUARD(str);
10076 if (ary)
10077 return ary;
10078 else
10079 return orig;
10080}
10081
10082/*
10083 * call-seq:
10084 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
10085 * each_grapheme_cluster -> enumerator
10086 *
10087 * :include: doc/string/each_grapheme_cluster.rdoc
10088 *
10089 */
10090
10091static VALUE
10092rb_str_each_grapheme_cluster(VALUE str)
10093{
10094 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10095 return rb_str_enumerate_grapheme_clusters(str, 0);
10096}
10097
10098/*
10099 * call-seq:
10100 * grapheme_clusters -> array_of_grapheme_clusters
10101 *
10102 * :include: doc/string/grapheme_clusters.rdoc
10103 *
10104 */
10105
10106static VALUE
10107rb_str_grapheme_clusters(VALUE str)
10108{
10109 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10110 return rb_str_enumerate_grapheme_clusters(str, ary);
10111}
10112
10113static long
10114chopped_length(VALUE str)
10115{
10116 rb_encoding *enc = STR_ENC_GET(str);
10117 const char *p, *p2, *beg, *end;
10118
10119 beg = RSTRING_PTR(str);
10120 end = beg + RSTRING_LEN(str);
10121 if (beg >= end) return 0;
10122 p = rb_enc_prev_char(beg, end, end, enc);
10123 if (!p) return 0;
10124 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10125 p2 = rb_enc_prev_char(beg, p, end, enc);
10126 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10127 }
10128 return p - beg;
10129}
10130
10131/*
10132 * call-seq:
10133 * chop! -> self or nil
10134 *
10135 * Like String#chop, except that:
10136 *
10137 * - Removes trailing characters from +self+ (not from a copy of +self+).
10138 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10139 *
10140 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10141 */
10142
10143static VALUE
10144rb_str_chop_bang(VALUE str)
10145{
10146 str_modify_keep_cr(str);
10147 if (RSTRING_LEN(str) > 0) {
10148 long len;
10149 len = chopped_length(str);
10150 STR_SET_LEN(str, len);
10151 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10152 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10154 }
10155 return str;
10156 }
10157 return Qnil;
10158}
10159
10160
10161/*
10162 * call-seq:
10163 * chop -> new_string
10164 *
10165 * :include: doc/string/chop.rdoc
10166 *
10167 */
10168
10169static VALUE
10170rb_str_chop(VALUE str)
10171{
10172 return rb_str_subseq(str, 0, chopped_length(str));
10173}
10174
10175static long
10176smart_chomp(VALUE str, const char *e, const char *p)
10177{
10178 rb_encoding *enc = rb_enc_get(str);
10179 if (rb_enc_mbminlen(enc) > 1) {
10180 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10181 if (rb_enc_is_newline(pp, e, enc)) {
10182 e = pp;
10183 }
10184 pp = e - rb_enc_mbminlen(enc);
10185 if (pp >= p) {
10186 pp = rb_enc_left_char_head(p, pp, e, enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10188 e = pp;
10189 }
10190 }
10191 }
10192 else {
10193 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10194 case '\n':
10195 if (--e > p && *(e-1) == '\r') {
10196 --e;
10197 }
10198 break;
10199 case '\r':
10200 --e;
10201 break;
10202 }
10203 }
10204 return e - p;
10205}
10206
10207static long
10208chompped_length(VALUE str, VALUE rs)
10209{
10210 rb_encoding *enc;
10211 int newline;
10212 char *pp, *e, *rsptr;
10213 long rslen;
10214 char *const p = RSTRING_PTR(str);
10215 long len = RSTRING_LEN(str);
10216
10217 if (len == 0) return 0;
10218 e = p + len;
10219 if (rs == rb_default_rs) {
10220 return smart_chomp(str, e, p);
10221 }
10222
10223 enc = rb_enc_get(str);
10224 RSTRING_GETMEM(rs, rsptr, rslen);
10225 if (rslen == 0) {
10226 if (rb_enc_mbminlen(enc) > 1) {
10227 while (e > p) {
10228 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10229 if (!rb_enc_is_newline(pp, e, enc)) break;
10230 e = pp;
10231 pp -= rb_enc_mbminlen(enc);
10232 if (pp >= p) {
10233 pp = rb_enc_left_char_head(p, pp, e, enc);
10234 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10235 e = pp;
10236 }
10237 }
10238 }
10239 }
10240 else {
10241 while (e > p && *(e-1) == '\n') {
10242 --e;
10243 if (e > p && *(e-1) == '\r')
10244 --e;
10245 }
10246 }
10247 return e - p;
10248 }
10249 if (rslen > len) return len;
10250
10251 enc = rb_enc_get(rs);
10252 newline = rsptr[rslen-1];
10253 if (rslen == rb_enc_mbminlen(enc)) {
10254 if (rslen == 1) {
10255 if (newline == '\n')
10256 return smart_chomp(str, e, p);
10257 }
10258 else {
10259 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10260 return smart_chomp(str, e, p);
10261 }
10262 }
10263
10264 enc = rb_enc_check(str, rs);
10265 if (is_broken_string(rs)) {
10266 return len;
10267 }
10268 pp = e - rslen;
10269 if (p[len-1] == newline &&
10270 (rslen <= 1 ||
10271 memcmp(rsptr, pp, rslen) == 0)) {
10272 if (at_char_boundary(p, pp, e, enc))
10273 return len - rslen;
10274 RB_GC_GUARD(rs);
10275 }
10276 return len;
10277}
10278
10284static VALUE
10285chomp_rs(int argc, const VALUE *argv)
10286{
10287 rb_check_arity(argc, 0, 1);
10288 if (argc > 0) {
10289 VALUE rs = argv[0];
10290 if (!NIL_P(rs)) StringValue(rs);
10291 return rs;
10292 }
10293 else {
10294 return rb_rs;
10295 }
10296}
10297
10298VALUE
10299rb_str_chomp_string(VALUE str, VALUE rs)
10300{
10301 long olen = RSTRING_LEN(str);
10302 long len = chompped_length(str, rs);
10303 if (len >= olen) return Qnil;
10304 str_modify_keep_cr(str);
10305 STR_SET_LEN(str, len);
10306 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10307 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10309 }
10310 return str;
10311}
10312
10313/*
10314 * call-seq:
10315 * chomp!(line_sep = $/) -> self or nil
10316 *
10317 * Like String#chomp, except that:
10318 *
10319 * - Removes trailing characters from +self+ (not from a copy of +self+).
10320 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10321 *
10322 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10323 */
10324
10325static VALUE
10326rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10327{
10328 VALUE rs;
10329 str_modifiable(str);
10330 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10331 rs = chomp_rs(argc, argv);
10332 if (NIL_P(rs)) return Qnil;
10333 return rb_str_chomp_string(str, rs);
10334}
10335
10336
10337/*
10338 * call-seq:
10339 * chomp(line_sep = $/) -> new_string
10340 *
10341 * :include: doc/string/chomp.rdoc
10342 *
10343 */
10344
10345static VALUE
10346rb_str_chomp(int argc, VALUE *argv, VALUE str)
10347{
10348 VALUE rs = chomp_rs(argc, argv);
10349 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10350 return rb_str_subseq(str, 0, chompped_length(str, rs));
10351}
10352
10353static long
10354lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10355{
10356 const char *const start = s;
10357
10358 if (!s || s >= e) return 0;
10359
10360 /* remove spaces at head */
10361 if (single_byte_optimizable(str)) {
10362 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10363 }
10364 else {
10365 while (s < e) {
10366 int n;
10367 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10368
10369 if (cc && !rb_isspace(cc)) break;
10370 s += n;
10371 }
10372 }
10373 return s - start;
10374}
10375
10376/*
10377 * call-seq:
10378 * lstrip! -> self or nil
10379 *
10380 * Like String#lstrip, except that:
10381 *
10382 * - Performs stripping in +self+ (not in a copy of +self+).
10383 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10384 *
10385 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10386 */
10387
10388static VALUE
10389rb_str_lstrip_bang(VALUE str)
10390{
10391 rb_encoding *enc;
10392 char *start, *s;
10393 long olen, loffset;
10394
10395 str_modify_keep_cr(str);
10396 enc = STR_ENC_GET(str);
10397 RSTRING_GETMEM(str, start, olen);
10398 loffset = lstrip_offset(str, start, start+olen, enc);
10399 if (loffset > 0) {
10400 long len = olen-loffset;
10401 s = start + loffset;
10402 memmove(start, s, len);
10403 STR_SET_LEN(str, len);
10404 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10405 return str;
10406 }
10407 return Qnil;
10408}
10409
10410
10411/*
10412 * call-seq:
10413 * lstrip -> new_string
10414 *
10415 * Returns a copy of +self+ with leading whitespace removed;
10416 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10417 *
10418 * whitespace = "\x00\t\n\v\f\r "
10419 * s = whitespace + 'abc' + whitespace
10420 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10421 * s.lstrip
10422 * # => "abc\u0000\t\n\v\f\r "
10423 *
10424 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10425 */
10426
10427static VALUE
10428rb_str_lstrip(VALUE str)
10429{
10430 char *start;
10431 long len, loffset;
10432 RSTRING_GETMEM(str, start, len);
10433 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10434 if (loffset <= 0) return str_duplicate(rb_cString, str);
10435 return rb_str_subseq(str, loffset, len - loffset);
10436}
10437
10438static long
10439rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10440{
10441 const char *t;
10442
10443 rb_str_check_dummy_enc(enc);
10445 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10446 }
10447 if (!s || s >= e) return 0;
10448 t = e;
10449
10450 /* remove trailing spaces or '\0's */
10451 if (single_byte_optimizable(str)) {
10452 unsigned char c;
10453 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10454 }
10455 else {
10456 char *tp;
10457
10458 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10459 unsigned int c = rb_enc_codepoint(tp, e, enc);
10460 if (c && !rb_isspace(c)) break;
10461 t = tp;
10462 }
10463 }
10464 return e - t;
10465}
10466
10467/*
10468 * call-seq:
10469 * rstrip! -> self or nil
10470 *
10471 * Like String#rstrip, except that any modifications are made in +self+;
10472 * returns +self+ if any modifications are made, +nil+ otherwise.
10473 *
10474 * Related: String#lstrip!, String#strip!.
10475 */
10476
10477static VALUE
10478rb_str_rstrip_bang(VALUE str)
10479{
10480 rb_encoding *enc;
10481 char *start;
10482 long olen, roffset;
10483
10484 str_modify_keep_cr(str);
10485 enc = STR_ENC_GET(str);
10486 RSTRING_GETMEM(str, start, olen);
10487 roffset = rstrip_offset(str, start, start+olen, enc);
10488 if (roffset > 0) {
10489 long len = olen - roffset;
10490
10491 STR_SET_LEN(str, len);
10492 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10493 return str;
10494 }
10495 return Qnil;
10496}
10497
10498
10499/*
10500 * call-seq:
10501 * rstrip -> new_string
10502 *
10503 * Returns a copy of the receiver with trailing whitespace removed;
10504 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10505 *
10506 * whitespace = "\x00\t\n\v\f\r "
10507 * s = whitespace + 'abc' + whitespace
10508 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10509 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10510 *
10511 * Related: String#lstrip, String#strip.
10512 */
10513
10514static VALUE
10515rb_str_rstrip(VALUE str)
10516{
10517 rb_encoding *enc;
10518 char *start;
10519 long olen, roffset;
10520
10521 enc = STR_ENC_GET(str);
10522 RSTRING_GETMEM(str, start, olen);
10523 roffset = rstrip_offset(str, start, start+olen, enc);
10524
10525 if (roffset <= 0) return str_duplicate(rb_cString, str);
10526 return rb_str_subseq(str, 0, olen-roffset);
10527}
10528
10529
10530/*
10531 * call-seq:
10532 * strip! -> self or nil
10533 *
10534 * Like String#strip, except that any modifications are made in +self+;
10535 * returns +self+ if any modifications are made, +nil+ otherwise.
10536 *
10537 * Related: String#lstrip!, String#rstrip!.
10538 */
10539
10540static VALUE
10541rb_str_strip_bang(VALUE str)
10542{
10543 char *start;
10544 long olen, loffset, roffset;
10545 rb_encoding *enc;
10546
10547 str_modify_keep_cr(str);
10548 enc = STR_ENC_GET(str);
10549 RSTRING_GETMEM(str, start, olen);
10550 loffset = lstrip_offset(str, start, start+olen, enc);
10551 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10552
10553 if (loffset > 0 || roffset > 0) {
10554 long len = olen-roffset;
10555 if (loffset > 0) {
10556 len -= loffset;
10557 memmove(start, start + loffset, len);
10558 }
10559 STR_SET_LEN(str, len);
10560 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10561 return str;
10562 }
10563 return Qnil;
10564}
10565
10566
10567/*
10568 * call-seq:
10569 * strip -> new_string
10570 *
10571 * Returns a copy of the receiver with leading and trailing whitespace removed;
10572 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10573 *
10574 * whitespace = "\x00\t\n\v\f\r "
10575 * s = whitespace + 'abc' + whitespace
10576 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10577 * s.strip # => "abc"
10578 *
10579 * Related: String#lstrip, String#rstrip.
10580 */
10581
10582static VALUE
10583rb_str_strip(VALUE str)
10584{
10585 char *start;
10586 long olen, loffset, roffset;
10587 rb_encoding *enc = STR_ENC_GET(str);
10588
10589 RSTRING_GETMEM(str, start, olen);
10590 loffset = lstrip_offset(str, start, start+olen, enc);
10591 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10592
10593 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10594 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10595}
10596
10597static VALUE
10598scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10599{
10600 VALUE result = Qnil;
10601 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10602 if (pos >= 0) {
10603 VALUE match;
10604 struct re_registers *regs;
10605 if (BUILTIN_TYPE(pat) == T_STRING) {
10606 regs = NULL;
10607 end = pos + RSTRING_LEN(pat);
10608 }
10609 else {
10610 match = rb_backref_get();
10611 regs = RMATCH_REGS(match);
10612 pos = BEG(0);
10613 end = END(0);
10614 }
10615
10616 if (pos == end) {
10617 rb_encoding *enc = STR_ENC_GET(str);
10618 /*
10619 * Always consume at least one character of the input string
10620 */
10621 if (RSTRING_LEN(str) > end)
10622 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10623 RSTRING_END(str), enc);
10624 else
10625 *start = end + 1;
10626 }
10627 else {
10628 *start = end;
10629 }
10630
10631 if (!regs || regs->num_regs == 1) {
10632 result = rb_str_subseq(str, pos, end - pos);
10633 return result;
10634 }
10635 else {
10636 result = rb_ary_new2(regs->num_regs);
10637 for (int i = 1; i < regs->num_regs; i++) {
10638 VALUE s = Qnil;
10639 if (BEG(i) >= 0) {
10640 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10641 }
10642
10643 rb_ary_push(result, s);
10644 }
10645 }
10646
10647 RB_GC_GUARD(match);
10648 }
10649
10650 return result;
10651}
10652
10653
10654/*
10655 * call-seq:
10656 * scan(string_or_regexp) -> array
10657 * scan(string_or_regexp) {|matches| ... } -> self
10658 *
10659 * Matches a pattern against +self+; the pattern is:
10660 *
10661 * - +string_or_regexp+ itself, if it is a Regexp.
10662 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10663 *
10664 * Iterates through +self+, generating a collection of matching results:
10665 *
10666 * - If the pattern contains no groups, each result is the
10667 * matched string, <code>$&</code>.
10668 * - If the pattern contains groups, each result is an array
10669 * containing one entry per group.
10670 *
10671 * With no block given, returns an array of the results:
10672 *
10673 * s = 'cruel world'
10674 * s.scan(/\w+/) # => ["cruel", "world"]
10675 * s.scan(/.../) # => ["cru", "el ", "wor"]
10676 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10677 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10678 *
10679 * With a block given, calls the block with each result; returns +self+:
10680 *
10681 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10682 * print "\n"
10683 * s.scan(/(.)(.)/) {|x,y| print y, x }
10684 * print "\n"
10685 *
10686 * Output:
10687 *
10688 * <<cruel>> <<world>>
10689 * rceu lowlr
10690 *
10691 */
10692
10693static VALUE
10694rb_str_scan(VALUE str, VALUE pat)
10695{
10696 VALUE result;
10697 long start = 0;
10698 long last = -1, prev = 0;
10699 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10700
10701 pat = get_pat_quoted(pat, 1);
10702 mustnot_broken(str);
10703 if (!rb_block_given_p()) {
10704 VALUE ary = rb_ary_new();
10705
10706 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10707 last = prev;
10708 prev = start;
10709 rb_ary_push(ary, result);
10710 }
10711 if (last >= 0) rb_pat_search(pat, str, last, 1);
10712 else rb_backref_set(Qnil);
10713 return ary;
10714 }
10715
10716 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10717 last = prev;
10718 prev = start;
10719 rb_yield(result);
10720 str_mod_check(str, p, len);
10721 }
10722 if (last >= 0) rb_pat_search(pat, str, last, 1);
10723 return str;
10724}
10725
10726
10727/*
10728 * call-seq:
10729 * hex -> integer
10730 *
10731 * Interprets the leading substring of +self+ as hexadecimal;
10732 * returns its integer value:
10733 *
10734 * '0xFFFF'.hex # => 65535
10735 * 'FFzzzFF'.hex # => 255 # Hex ends at first non-hex character, 'z'.
10736 * 'ffzzzFF'.hex # => 255 # Case does not matter.
10737 * '-FFzzzFF'.hex # => -255 # May have leading '-'.
10738 * '0xFFzzzFF'.hex # => 255 # May have leading '0x'.
10739 * '-0xFFzzzFF'.hex # => -255 # May have leading '-0x'.
10740 *
10741 * Returns zero if there is no such leading substring:
10742 *
10743 * 'zzz'.hex # => 0
10744 *
10745 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10746 */
10747
10748static VALUE
10749rb_str_hex(VALUE str)
10750{
10751 return rb_str_to_inum(str, 16, FALSE);
10752}
10753
10754
10755/*
10756 * call-seq:
10757 * oct -> integer
10758 *
10759 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10760 * returns their value as an integer.
10761 *
10762 * In brief:
10763 *
10764 * # Interpreted as octal.
10765 * '777'.oct # => 511
10766 * '777x'.oct # => 511
10767 * '0777'.oct # => 511
10768 * '0o777'.oct # => 511
10769 * '-777'.oct # => -511
10770 * # Not interpreted as octal.
10771 * '0b111'.oct # => 7 # Interpreted as binary.
10772 * '0d999'.oct # => 999 # Interpreted as decimal.
10773 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10774 *
10775 * The leading substring is interpreted as octal when it begins with:
10776 *
10777 * - One or more characters representing octal digits
10778 * (each in the range <tt>'0'..'7'</tt>);
10779 * the string to be interpreted ends at the first character that does not represent an octal digit:
10780 *
10781 * '7'.oct # => 7
10782 * '11'.oct # => 9
10783 * '777'.oct # => 511
10784 * '0777'.oct # => 511
10785 * '7778'.oct # => 511
10786 * '777x'.oct # => 511
10787 *
10788 * - <tt>'0o'</tt>, followed by one or more octal digits:
10789 *
10790 * '0o777'.oct # => 511
10791 * '0o7778'.oct # => 511
10792 *
10793 * The leading substring is _not_ interpreted as octal when it begins with:
10794 *
10795 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10796 * (each in the range <tt>'0'..'1'</tt>);
10797 * the string to be interpreted ends at the first character that does not represent a binary digit.
10798 * the string is interpreted as binary digits (base 2):
10799 *
10800 * '0b111'.oct # => 7
10801 * '0b1112'.oct # => 7
10802 *
10803 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10804 * (each in the range <tt>'0'..'9'</tt>);
10805 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10806 * the string is interpreted as decimal digits (base 10):
10807 *
10808 * '0d999'.oct # => 999
10809 * '0d999x'.oct # => 999
10810 *
10811 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10812 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10813 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10814 * the string is interpreted as hexadecimal digits (base 16):
10815 *
10816 * '0xfff'.oct # => 4095
10817 * '0xfffg'.oct # => 4095
10818 *
10819 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10820 *
10821 * '-777'.oct # => -511
10822 * '-0777'.oct # => -511
10823 * '-0b111'.oct # => -7
10824 * '-0xfff'.oct # => -4095
10825 *
10826 * For any substring not described above, returns zero:
10827 *
10828 * 'foo'.oct # => 0
10829 * ''.oct # => 0
10830 *
10831 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10832 */
10833
10834static VALUE
10835rb_str_oct(VALUE str)
10836{
10837 return rb_str_to_inum(str, -8, FALSE);
10838}
10839
10840#ifndef HAVE_CRYPT_R
10841# include "ruby/thread_native.h"
10842# include "ruby/atomic.h"
10843
10844static struct {
10845 rb_nativethread_lock_t lock;
10846} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10847#endif
10848
10849/*
10850 * call-seq:
10851 * crypt(salt_str) -> new_string
10852 *
10853 * Returns the string generated by calling <code>crypt(3)</code>
10854 * standard library function with <code>str</code> and
10855 * <code>salt_str</code>, in this order, as its arguments. Please do
10856 * not use this method any longer. It is legacy; provided only for
10857 * backward compatibility with ruby scripts in earlier days. It is
10858 * bad to use in contemporary programs for several reasons:
10859 *
10860 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10861 * run. The generated string lacks data portability.
10862 *
10863 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10864 * (i.e. silently ends up in unexpected results).
10865 *
10866 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10867 * thread safe.
10868 *
10869 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10870 * very very weak. According to its manpage, Linux's traditional
10871 * <code>crypt(3)</code> output has only 2**56 variations; too
10872 * easy to brute force today. And this is the default behaviour.
10873 *
10874 * * In order to make things robust some OSes implement so-called
10875 * "modular" usage. To go through, you have to do a complex
10876 * build-up of the <code>salt_str</code> parameter, by hand.
10877 * Failure in generation of a proper salt string tends not to
10878 * yield any errors; typos in parameters are normally not
10879 * detectable.
10880 *
10881 * * For instance, in the following example, the second invocation
10882 * of String#crypt is wrong; it has a typo in "round=" (lacks
10883 * "s"). However the call does not fail and something unexpected
10884 * is generated.
10885 *
10886 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10887 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10888 *
10889 * * Even in the "modular" mode, some hash functions are considered
10890 * archaic and no longer recommended at all; for instance module
10891 * <code>$1$</code> is officially abandoned by its author: see
10892 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10893 * instance module <code>$3$</code> is considered completely
10894 * broken: see the manpage of FreeBSD.
10895 *
10896 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10897 * written above, <code>crypt(3)</code> on Mac OS never fails.
10898 * This means even if you build up a proper salt string it
10899 * generates a traditional DES hash anyways, and there is no way
10900 * for you to be aware of.
10901 *
10902 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10903 *
10904 * If for some reason you cannot migrate to other secure contemporary
10905 * password hashing algorithms, install the string-crypt gem and
10906 * <code>require 'string/crypt'</code> to continue using it.
10907 */
10908
10909static VALUE
10910rb_str_crypt(VALUE str, VALUE salt)
10911{
10912#ifdef HAVE_CRYPT_R
10913 VALUE databuf;
10914 struct crypt_data *data;
10915# define CRYPT_END() ALLOCV_END(databuf)
10916#else
10917 char *tmp_buf;
10918 extern char *crypt(const char *, const char *);
10919# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10920#endif
10921 VALUE result;
10922 const char *s, *saltp;
10923 char *res;
10924#ifdef BROKEN_CRYPT
10925 char salt_8bit_clean[3];
10926#endif
10927
10928 StringValue(salt);
10929 mustnot_wchar(str);
10930 mustnot_wchar(salt);
10931 s = StringValueCStr(str);
10932 saltp = RSTRING_PTR(salt);
10933 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10934 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10935 }
10936
10937#ifdef BROKEN_CRYPT
10938 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10939 salt_8bit_clean[0] = saltp[0] & 0x7f;
10940 salt_8bit_clean[1] = saltp[1] & 0x7f;
10941 salt_8bit_clean[2] = '\0';
10942 saltp = salt_8bit_clean;
10943 }
10944#endif
10945#ifdef HAVE_CRYPT_R
10946 data = ALLOCV(databuf, sizeof(struct crypt_data));
10947# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10948 data->initialized = 0;
10949# endif
10950 res = crypt_r(s, saltp, data);
10951#else
10952 rb_nativethread_lock_lock(&crypt_mutex.lock);
10953 res = crypt(s, saltp);
10954#endif
10955 if (!res) {
10956 int err = errno;
10957 CRYPT_END();
10958 rb_syserr_fail(err, "crypt");
10959 }
10960#ifdef HAVE_CRYPT_R
10961 result = rb_str_new_cstr(res);
10962 CRYPT_END();
10963#else
10964 // We need to copy this buffer because it's static and we need to unlock the mutex
10965 // before allocating a new object (the string to be returned). If we allocate while
10966 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10967 // if other ractors are waiting on this lock.
10968 size_t res_size = strlen(res)+1;
10969 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10970 memcpy(tmp_buf, res, res_size);
10971 res = tmp_buf;
10972 CRYPT_END();
10973 result = rb_str_new_cstr(res);
10974#endif
10975 return result;
10976}
10977
10978
10979/*
10980 * call-seq:
10981 * ord -> integer
10982 *
10983 * :include: doc/string/ord.rdoc
10984 *
10985 */
10986
10987static VALUE
10988rb_str_ord(VALUE s)
10989{
10990 unsigned int c;
10991
10992 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10993 return UINT2NUM(c);
10994}
10995/*
10996 * call-seq:
10997 * sum(n = 16) -> integer
10998 *
10999 * :include: doc/string/sum.rdoc
11000 *
11001 */
11002
11003static VALUE
11004rb_str_sum(int argc, VALUE *argv, VALUE str)
11005{
11006 int bits = 16;
11007 char *ptr, *p, *pend;
11008 long len;
11009 VALUE sum = INT2FIX(0);
11010 unsigned long sum0 = 0;
11011
11012 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11013 bits = 0;
11014 }
11015 ptr = p = RSTRING_PTR(str);
11016 len = RSTRING_LEN(str);
11017 pend = p + len;
11018
11019 while (p < pend) {
11020 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11021 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11022 str_mod_check(str, ptr, len);
11023 sum0 = 0;
11024 }
11025 sum0 += (unsigned char)*p;
11026 p++;
11027 }
11028
11029 if (bits == 0) {
11030 if (sum0) {
11031 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11032 }
11033 }
11034 else {
11035 if (sum == INT2FIX(0)) {
11036 if (bits < (int)sizeof(long)*CHAR_BIT) {
11037 sum0 &= (((unsigned long)1)<<bits)-1;
11038 }
11039 sum = LONG2FIX(sum0);
11040 }
11041 else {
11042 VALUE mod;
11043
11044 if (sum0) {
11045 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11046 }
11047
11048 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11049 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11050 sum = rb_funcall(sum, '&', 1, mod);
11051 }
11052 }
11053 return sum;
11054}
11055
11056static VALUE
11057rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11058{
11059 rb_encoding *enc;
11060 VALUE w;
11061 long width, len, flen = 1, fclen = 1;
11062 VALUE res;
11063 char *p;
11064 const char *f = " ";
11065 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11066 VALUE pad;
11067 int singlebyte = 1, cr;
11068 int termlen;
11069
11070 rb_scan_args(argc, argv, "11", &w, &pad);
11071 enc = STR_ENC_GET(str);
11072 termlen = rb_enc_mbminlen(enc);
11073 width = NUM2LONG(w);
11074 if (argc == 2) {
11075 StringValue(pad);
11076 enc = rb_enc_check(str, pad);
11077 f = RSTRING_PTR(pad);
11078 flen = RSTRING_LEN(pad);
11079 fclen = str_strlen(pad, enc); /* rb_enc_check */
11080 singlebyte = single_byte_optimizable(pad);
11081 if (flen == 0 || fclen == 0) {
11082 rb_raise(rb_eArgError, "zero width padding");
11083 }
11084 }
11085 len = str_strlen(str, enc); /* rb_enc_check */
11086 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11087 n = width - len;
11088 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11089 rlen = n - llen;
11090 cr = ENC_CODERANGE(str);
11091 if (flen > 1) {
11092 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11093 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11094 }
11095 size = RSTRING_LEN(str);
11096 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11097 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11098 (len += llen2 + rlen2) >= LONG_MAX - size) {
11099 rb_raise(rb_eArgError, "argument too big");
11100 }
11101 len += size;
11102 res = str_enc_new(rb_cString, 0, len, enc);
11103 p = RSTRING_PTR(res);
11104 if (flen <= 1) {
11105 memset(p, *f, llen);
11106 p += llen;
11107 }
11108 else {
11109 while (llen >= fclen) {
11110 memcpy(p,f,flen);
11111 p += flen;
11112 llen -= fclen;
11113 }
11114 if (llen > 0) {
11115 memcpy(p, f, llen2);
11116 p += llen2;
11117 }
11118 }
11119 memcpy(p, RSTRING_PTR(str), size);
11120 p += size;
11121 if (flen <= 1) {
11122 memset(p, *f, rlen);
11123 p += rlen;
11124 }
11125 else {
11126 while (rlen >= fclen) {
11127 memcpy(p,f,flen);
11128 p += flen;
11129 rlen -= fclen;
11130 }
11131 if (rlen > 0) {
11132 memcpy(p, f, rlen2);
11133 p += rlen2;
11134 }
11135 }
11136 TERM_FILL(p, termlen);
11137 STR_SET_LEN(res, p-RSTRING_PTR(res));
11138
11139 if (argc == 2)
11140 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11141 if (cr != ENC_CODERANGE_BROKEN)
11142 ENC_CODERANGE_SET(res, cr);
11143
11144 RB_GC_GUARD(pad);
11145 return res;
11146}
11147
11148
11149/*
11150 * call-seq:
11151 * ljust(width, pad_string = ' ') -> new_string
11152 *
11153 * :include: doc/string/ljust.rdoc
11154 *
11155 */
11156
11157static VALUE
11158rb_str_ljust(int argc, VALUE *argv, VALUE str)
11159{
11160 return rb_str_justify(argc, argv, str, 'l');
11161}
11162
11163/*
11164 * call-seq:
11165 * rjust(size, pad_string = ' ') -> new_string
11166 *
11167 * :include: doc/string/rjust.rdoc
11168 *
11169 * Related: String#ljust, String#center.
11170 *
11171 */
11172
11173static VALUE
11174rb_str_rjust(int argc, VALUE *argv, VALUE str)
11175{
11176 return rb_str_justify(argc, argv, str, 'r');
11177}
11178
11179
11180/*
11181 * call-seq:
11182 * center(size, pad_string = ' ') -> new_string
11183 *
11184 * :include: doc/string/center.rdoc
11185 *
11186 */
11187
11188static VALUE
11189rb_str_center(int argc, VALUE *argv, VALUE str)
11190{
11191 return rb_str_justify(argc, argv, str, 'c');
11192}
11193
11194/*
11195 * call-seq:
11196 * partition(string_or_regexp) -> [head, match, tail]
11197 *
11198 * :include: doc/string/partition.rdoc
11199 *
11200 */
11201
11202static VALUE
11203rb_str_partition(VALUE str, VALUE sep)
11204{
11205 long pos;
11206
11207 sep = get_pat_quoted(sep, 0);
11208 if (RB_TYPE_P(sep, T_REGEXP)) {
11209 if (rb_reg_search(sep, str, 0, 0) < 0) {
11210 goto failed;
11211 }
11212 VALUE match = rb_backref_get();
11213 struct re_registers *regs = RMATCH_REGS(match);
11214
11215 pos = BEG(0);
11216 sep = rb_str_subseq(str, pos, END(0) - pos);
11217 }
11218 else {
11219 pos = rb_str_index(str, sep, 0);
11220 if (pos < 0) goto failed;
11221 }
11222 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11223 sep,
11224 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11225 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11226
11227 failed:
11228 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11229}
11230
11231/*
11232 * call-seq:
11233 * rpartition(sep) -> [head, match, tail]
11234 *
11235 * :include: doc/string/rpartition.rdoc
11236 *
11237 */
11238
11239static VALUE
11240rb_str_rpartition(VALUE str, VALUE sep)
11241{
11242 long pos = RSTRING_LEN(str);
11243
11244 sep = get_pat_quoted(sep, 0);
11245 if (RB_TYPE_P(sep, T_REGEXP)) {
11246 if (rb_reg_search(sep, str, pos, 1) < 0) {
11247 goto failed;
11248 }
11249 VALUE match = rb_backref_get();
11250 struct re_registers *regs = RMATCH_REGS(match);
11251
11252 pos = BEG(0);
11253 sep = rb_str_subseq(str, pos, END(0) - pos);
11254 }
11255 else {
11256 pos = rb_str_sublen(str, pos);
11257 pos = rb_str_rindex(str, sep, pos);
11258 if (pos < 0) {
11259 goto failed;
11260 }
11261 }
11262
11263 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11264 sep,
11265 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11266 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11267 failed:
11268 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11269}
11270
11271/*
11272 * call-seq:
11273 * start_with?(*string_or_regexp) -> true or false
11274 *
11275 * :include: doc/string/start_with_p.rdoc
11276 *
11277 */
11278
11279static VALUE
11280rb_str_start_with(int argc, VALUE *argv, VALUE str)
11281{
11282 int i;
11283
11284 for (i=0; i<argc; i++) {
11285 VALUE tmp = argv[i];
11286 if (RB_TYPE_P(tmp, T_REGEXP)) {
11287 if (rb_reg_start_with_p(tmp, str))
11288 return Qtrue;
11289 }
11290 else {
11291 const char *p, *s, *e;
11292 long slen, tlen;
11293 rb_encoding *enc;
11294
11295 StringValue(tmp);
11296 enc = rb_enc_check(str, tmp);
11297 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11298 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11299 p = RSTRING_PTR(str);
11300 e = p + slen;
11301 s = p + tlen;
11302 if (!at_char_right_boundary(p, s, e, enc))
11303 continue;
11304 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11305 return Qtrue;
11306 }
11307 }
11308 return Qfalse;
11309}
11310
11311/*
11312 * call-seq:
11313 * end_with?(*strings) -> true or false
11314 *
11315 * :include: doc/string/end_with_p.rdoc
11316 *
11317 */
11318
11319static VALUE
11320rb_str_end_with(int argc, VALUE *argv, VALUE str)
11321{
11322 int i;
11323
11324 for (i=0; i<argc; i++) {
11325 VALUE tmp = argv[i];
11326 const char *p, *s, *e;
11327 long slen, tlen;
11328 rb_encoding *enc;
11329
11330 StringValue(tmp);
11331 enc = rb_enc_check(str, tmp);
11332 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11333 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11334 p = RSTRING_PTR(str);
11335 e = p + slen;
11336 s = e - tlen;
11337 if (!at_char_boundary(p, s, e, enc))
11338 continue;
11339 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11340 return Qtrue;
11341 }
11342 return Qfalse;
11343}
11344
11354static long
11355deleted_prefix_length(VALUE str, VALUE prefix)
11356{
11357 const char *strptr, *prefixptr;
11358 long olen, prefixlen;
11359 rb_encoding *enc = rb_enc_get(str);
11360
11361 StringValue(prefix);
11362
11363 if (!is_broken_string(prefix) ||
11364 !rb_enc_asciicompat(enc) ||
11365 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11366 enc = rb_enc_check(str, prefix);
11367 }
11368
11369 /* return 0 if not start with prefix */
11370 prefixlen = RSTRING_LEN(prefix);
11371 if (prefixlen <= 0) return 0;
11372 olen = RSTRING_LEN(str);
11373 if (olen < prefixlen) return 0;
11374 strptr = RSTRING_PTR(str);
11375 prefixptr = RSTRING_PTR(prefix);
11376 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11377 if (is_broken_string(prefix)) {
11378 if (!is_broken_string(str)) {
11379 /* prefix in a valid string cannot be broken */
11380 return 0;
11381 }
11382 const char *strend = strptr + olen;
11383 const char *after_prefix = strptr + prefixlen;
11384 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11385 /* prefix does not end at char-boundary */
11386 return 0;
11387 }
11388 }
11389 /* prefix part in `str` also should be valid. */
11390
11391 return prefixlen;
11392}
11393
11394/*
11395 * call-seq:
11396 * delete_prefix!(prefix) -> self or nil
11397 *
11398 * Like String#delete_prefix, except that +self+ is modified in place;
11399 * returns +self+ if the prefix is removed, +nil+ otherwise.
11400 *
11401 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11402 */
11403
11404static VALUE
11405rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11406{
11407 long prefixlen;
11408 str_modify_keep_cr(str);
11409
11410 prefixlen = deleted_prefix_length(str, prefix);
11411 if (prefixlen <= 0) return Qnil;
11412
11413 return rb_str_drop_bytes(str, prefixlen);
11414}
11415
11416/*
11417 * call-seq:
11418 * delete_prefix(prefix) -> new_string
11419 *
11420 * :include: doc/string/delete_prefix.rdoc
11421 *
11422 */
11423
11424static VALUE
11425rb_str_delete_prefix(VALUE str, VALUE prefix)
11426{
11427 long prefixlen;
11428
11429 prefixlen = deleted_prefix_length(str, prefix);
11430 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11431
11432 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11433}
11434
11444static long
11445deleted_suffix_length(VALUE str, VALUE suffix)
11446{
11447 const char *strptr, *suffixptr;
11448 long olen, suffixlen;
11449 rb_encoding *enc;
11450
11451 StringValue(suffix);
11452 if (is_broken_string(suffix)) return 0;
11453 enc = rb_enc_check(str, suffix);
11454
11455 /* return 0 if not start with suffix */
11456 suffixlen = RSTRING_LEN(suffix);
11457 if (suffixlen <= 0) return 0;
11458 olen = RSTRING_LEN(str);
11459 if (olen < suffixlen) return 0;
11460 strptr = RSTRING_PTR(str);
11461 suffixptr = RSTRING_PTR(suffix);
11462 const char *strend = strptr + olen;
11463 const char *before_suffix = strend - suffixlen;
11464 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11465 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11466
11467 return suffixlen;
11468}
11469
11470/*
11471 * call-seq:
11472 * delete_suffix!(suffix) -> self or nil
11473 *
11474 * Like String#delete_suffix, except that +self+ is modified in place;
11475 * returns +self+ if the suffix is removed, +nil+ otherwise.
11476 *
11477 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11478 */
11479
11480static VALUE
11481rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11482{
11483 long olen, suffixlen, len;
11484 str_modifiable(str);
11485
11486 suffixlen = deleted_suffix_length(str, suffix);
11487 if (suffixlen <= 0) return Qnil;
11488
11489 olen = RSTRING_LEN(str);
11490 str_modify_keep_cr(str);
11491 len = olen - suffixlen;
11492 STR_SET_LEN(str, len);
11493 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11494 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11496 }
11497 return str;
11498}
11499
11500/*
11501 * call-seq:
11502 * delete_suffix(suffix) -> new_string
11503 *
11504 * :include: doc/string/delete_suffix.rdoc
11505 *
11506 */
11507
11508static VALUE
11509rb_str_delete_suffix(VALUE str, VALUE suffix)
11510{
11511 long suffixlen;
11512
11513 suffixlen = deleted_suffix_length(str, suffix);
11514 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11515
11516 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11517}
11518
11519void
11520rb_str_setter(VALUE val, ID id, VALUE *var)
11521{
11522 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11523 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11524 }
11525 *var = val;
11526}
11527
11528static void
11529rb_fs_setter(VALUE val, ID id, VALUE *var)
11530{
11531 val = rb_fs_check(val);
11532 if (!val) {
11533 rb_raise(rb_eTypeError,
11534 "value of %"PRIsVALUE" must be String or Regexp",
11535 rb_id2str(id));
11536 }
11537 if (!NIL_P(val)) {
11538 rb_warn_deprecated("'$;'", NULL);
11539 }
11540 *var = val;
11541}
11542
11543
11544/*
11545 * call-seq:
11546 * force_encoding(encoding) -> self
11547 *
11548 * :include: doc/string/force_encoding.rdoc
11549 *
11550 */
11551
11552static VALUE
11553rb_str_force_encoding(VALUE str, VALUE enc)
11554{
11555 str_modifiable(str);
11556
11557 rb_encoding *encoding = rb_to_encoding(enc);
11558 int idx = rb_enc_to_index(encoding);
11559
11560 // If the encoding is unchanged, we do nothing.
11561 if (ENCODING_GET(str) == idx) {
11562 return str;
11563 }
11564
11565 rb_enc_associate_index(str, idx);
11566
11567 // If the coderange was 7bit and the new encoding is ASCII-compatible
11568 // we can keep the coderange.
11569 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11570 return str;
11571 }
11572
11574 return str;
11575}
11576
11577/*
11578 * call-seq:
11579 * b -> new_string
11580 *
11581 * :include: doc/string/b.rdoc
11582 *
11583 */
11584
11585static VALUE
11586rb_str_b(VALUE str)
11587{
11588 VALUE str2;
11589 if (STR_EMBED_P(str)) {
11590 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11591 }
11592 else {
11593 str2 = str_alloc_heap(rb_cString);
11594 }
11595 str_replace_shared_without_enc(str2, str);
11596
11597 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11598 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11599 // If we know the receiver's code range then we know the result's code range.
11600 int cr = ENC_CODERANGE(str);
11601 switch (cr) {
11602 case ENC_CODERANGE_7BIT:
11604 break;
11608 break;
11609 default:
11610 ENC_CODERANGE_CLEAR(str2);
11611 break;
11612 }
11613 }
11614
11615 return str2;
11616}
11617
11618/*
11619 * call-seq:
11620 * valid_encoding? -> true or false
11621 *
11622 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11623 *
11624 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11625 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11626 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11627 */
11628
11629static VALUE
11630rb_str_valid_encoding_p(VALUE str)
11631{
11632 int cr = rb_enc_str_coderange(str);
11633
11634 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11635}
11636
11637/*
11638 * call-seq:
11639 * ascii_only? -> true or false
11640 *
11641 * Returns whether +self+ contains only ASCII characters:
11642 *
11643 * 'abc'.ascii_only? # => true
11644 * "abc\u{6666}".ascii_only? # => false
11645 *
11646 * Related: see {Querying}[rdoc-ref:String@Querying].
11647 */
11648
11649static VALUE
11650rb_str_is_ascii_only_p(VALUE str)
11651{
11652 int cr = rb_enc_str_coderange(str);
11653
11654 return RBOOL(cr == ENC_CODERANGE_7BIT);
11655}
11656
11657VALUE
11659{
11660 static const char ellipsis[] = "...";
11661 const long ellipsislen = sizeof(ellipsis) - 1;
11662 rb_encoding *const enc = rb_enc_get(str);
11663 const long blen = RSTRING_LEN(str);
11664 const char *const p = RSTRING_PTR(str), *e = p + blen;
11665 VALUE estr, ret = 0;
11666
11667 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11668 if (len * rb_enc_mbminlen(enc) >= blen ||
11669 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11670 ret = str;
11671 }
11672 else if (len <= ellipsislen ||
11673 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11674 if (rb_enc_asciicompat(enc)) {
11675 ret = rb_str_new(ellipsis, len);
11676 rb_enc_associate(ret, enc);
11677 }
11678 else {
11679 estr = rb_usascii_str_new(ellipsis, len);
11680 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11681 }
11682 }
11683 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11684 rb_str_cat(ret, ellipsis, ellipsislen);
11685 }
11686 else {
11687 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11688 rb_enc_from_encoding(enc), 0, Qnil);
11689 rb_str_append(ret, estr);
11690 }
11691 return ret;
11692}
11693
11694static VALUE
11695str_compat_and_valid(VALUE str, rb_encoding *enc)
11696{
11697 int cr;
11698 str = StringValue(str);
11699 cr = rb_enc_str_coderange(str);
11700 if (cr == ENC_CODERANGE_BROKEN) {
11701 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11702 }
11703 else {
11704 rb_encoding *e = STR_ENC_GET(str);
11705 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11706 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11707 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11708 }
11709 }
11710 return str;
11711}
11712
11713static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11714
11715VALUE
11717{
11718 rb_encoding *enc = STR_ENC_GET(str);
11719 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11720}
11721
11722VALUE
11723rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11724{
11725 int cr = ENC_CODERANGE_UNKNOWN;
11726 if (enc == STR_ENC_GET(str)) {
11727 /* cached coderange makes sense only when enc equals the
11728 * actual encoding of str */
11729 cr = ENC_CODERANGE(str);
11730 }
11731 return enc_str_scrub(enc, str, repl, cr);
11732}
11733
11734static VALUE
11735enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11736{
11737 int encidx;
11738 VALUE buf = Qnil;
11739 const char *rep, *p, *e, *p1, *sp;
11740 long replen = -1;
11741 long slen;
11742
11743 if (rb_block_given_p()) {
11744 if (!NIL_P(repl))
11745 rb_raise(rb_eArgError, "both of block and replacement given");
11746 replen = 0;
11747 }
11748
11749 if (ENC_CODERANGE_CLEAN_P(cr))
11750 return Qnil;
11751
11752 if (!NIL_P(repl)) {
11753 repl = str_compat_and_valid(repl, enc);
11754 }
11755
11756 if (rb_enc_dummy_p(enc)) {
11757 return Qnil;
11758 }
11759 encidx = rb_enc_to_index(enc);
11760
11761#define DEFAULT_REPLACE_CHAR(str) do { \
11762 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11763 rep = replace; replen = (int)sizeof(replace); \
11764 } while (0)
11765
11766 slen = RSTRING_LEN(str);
11767 p = RSTRING_PTR(str);
11768 e = RSTRING_END(str);
11769 p1 = p;
11770 sp = p;
11771
11772 if (rb_enc_asciicompat(enc)) {
11773 int rep7bit_p;
11774 if (!replen) {
11775 rep = NULL;
11776 rep7bit_p = FALSE;
11777 }
11778 else if (!NIL_P(repl)) {
11779 rep = RSTRING_PTR(repl);
11780 replen = RSTRING_LEN(repl);
11781 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11782 }
11783 else if (encidx == rb_utf8_encindex()) {
11784 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11785 rep7bit_p = FALSE;
11786 }
11787 else {
11788 DEFAULT_REPLACE_CHAR("?");
11789 rep7bit_p = TRUE;
11790 }
11791 cr = ENC_CODERANGE_7BIT;
11792
11793 p = search_nonascii(p, e);
11794 if (!p) {
11795 p = e;
11796 }
11797 while (p < e) {
11798 int ret = rb_enc_precise_mbclen(p, e, enc);
11799 if (MBCLEN_NEEDMORE_P(ret)) {
11800 break;
11801 }
11802 else if (MBCLEN_CHARFOUND_P(ret)) {
11804 p += MBCLEN_CHARFOUND_LEN(ret);
11805 }
11806 else if (MBCLEN_INVALID_P(ret)) {
11807 /*
11808 * p1~p: valid ascii/multibyte chars
11809 * p ~e: invalid bytes + unknown bytes
11810 */
11811 long clen = rb_enc_mbmaxlen(enc);
11812 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11813 if (p > p1) {
11814 rb_str_buf_cat(buf, p1, p - p1);
11815 }
11816
11817 if (e - p < clen) clen = e - p;
11818 if (clen <= 2) {
11819 clen = 1;
11820 }
11821 else {
11822 const char *q = p;
11823 clen--;
11824 for (; clen > 1; clen--) {
11825 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11826 if (MBCLEN_NEEDMORE_P(ret)) break;
11827 if (MBCLEN_INVALID_P(ret)) continue;
11829 }
11830 }
11831 if (rep) {
11832 rb_str_buf_cat(buf, rep, replen);
11833 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11834 }
11835 else {
11836 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11837 str_mod_check(str, sp, slen);
11838 repl = str_compat_and_valid(repl, enc);
11839 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11842 }
11843 p += clen;
11844 p1 = p;
11845 p = search_nonascii(p, e);
11846 if (!p) {
11847 p = e;
11848 break;
11849 }
11850 }
11851 else {
11853 }
11854 }
11855 if (NIL_P(buf)) {
11856 if (p == e) {
11857 ENC_CODERANGE_SET(str, cr);
11858 return Qnil;
11859 }
11860 buf = rb_str_buf_new(RSTRING_LEN(str));
11861 }
11862 if (p1 < p) {
11863 rb_str_buf_cat(buf, p1, p - p1);
11864 }
11865 if (p < e) {
11866 if (rep) {
11867 rb_str_buf_cat(buf, rep, replen);
11868 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11869 }
11870 else {
11871 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11872 str_mod_check(str, sp, slen);
11873 repl = str_compat_and_valid(repl, enc);
11874 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11877 }
11878 }
11879 }
11880 else {
11881 /* ASCII incompatible */
11882 long mbminlen = rb_enc_mbminlen(enc);
11883 if (!replen) {
11884 rep = NULL;
11885 }
11886 else if (!NIL_P(repl)) {
11887 rep = RSTRING_PTR(repl);
11888 replen = RSTRING_LEN(repl);
11889 }
11890 else if (encidx == ENCINDEX_UTF_16BE) {
11891 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11892 }
11893 else if (encidx == ENCINDEX_UTF_16LE) {
11894 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11895 }
11896 else if (encidx == ENCINDEX_UTF_32BE) {
11897 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11898 }
11899 else if (encidx == ENCINDEX_UTF_32LE) {
11900 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11901 }
11902 else {
11903 DEFAULT_REPLACE_CHAR("?");
11904 }
11905
11906 while (p < e) {
11907 int ret = rb_enc_precise_mbclen(p, e, enc);
11908 if (MBCLEN_NEEDMORE_P(ret)) {
11909 break;
11910 }
11911 else if (MBCLEN_CHARFOUND_P(ret)) {
11912 p += MBCLEN_CHARFOUND_LEN(ret);
11913 }
11914 else if (MBCLEN_INVALID_P(ret)) {
11915 const char *q = p;
11916 long clen = rb_enc_mbmaxlen(enc);
11917 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11918 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11919
11920 if (e - p < clen) clen = e - p;
11921 if (clen <= mbminlen * 2) {
11922 clen = mbminlen;
11923 }
11924 else {
11925 clen -= mbminlen;
11926 for (; clen > mbminlen; clen-=mbminlen) {
11927 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11928 if (MBCLEN_NEEDMORE_P(ret)) break;
11929 if (MBCLEN_INVALID_P(ret)) continue;
11931 }
11932 }
11933 if (rep) {
11934 rb_str_buf_cat(buf, rep, replen);
11935 }
11936 else {
11937 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11938 str_mod_check(str, sp, slen);
11939 repl = str_compat_and_valid(repl, enc);
11940 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11941 }
11942 p += clen;
11943 p1 = p;
11944 }
11945 else {
11947 }
11948 }
11949 if (NIL_P(buf)) {
11950 if (p == e) {
11952 return Qnil;
11953 }
11954 buf = rb_str_buf_new(RSTRING_LEN(str));
11955 }
11956 if (p1 < p) {
11957 rb_str_buf_cat(buf, p1, p - p1);
11958 }
11959 if (p < e) {
11960 if (rep) {
11961 rb_str_buf_cat(buf, rep, replen);
11962 }
11963 else {
11964 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11965 str_mod_check(str, sp, slen);
11966 repl = str_compat_and_valid(repl, enc);
11967 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11968 }
11969 }
11971 }
11972 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11973 return buf;
11974}
11975
11976/*
11977 * call-seq:
11978 * scrub(replacement_string = default_replacement) -> new_string
11979 * scrub{|bytes| ... } -> new_string
11980 *
11981 * :include: doc/string/scrub.rdoc
11982 *
11983 */
11984static VALUE
11985str_scrub(int argc, VALUE *argv, VALUE str)
11986{
11987 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11988 VALUE new = rb_str_scrub(str, repl);
11989 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11990}
11991
11992/*
11993 * call-seq:
11994 * scrub! -> self
11995 * scrub!(replacement_string = default_replacement) -> self
11996 * scrub!{|bytes| ... } -> self
11997 *
11998 * Like String#scrub, except that any replacements are made in +self+.
11999 *
12000 */
12001static VALUE
12002str_scrub_bang(int argc, VALUE *argv, VALUE str)
12003{
12004 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12005 VALUE new = rb_str_scrub(str, repl);
12006 if (!NIL_P(new)) rb_str_replace(str, new);
12007 return str;
12008}
12009
12010static ID id_normalize;
12011static ID id_normalized_p;
12012static VALUE mUnicodeNormalize;
12013
12014static VALUE
12015unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12016{
12017 static int UnicodeNormalizeRequired = 0;
12018 VALUE argv2[2];
12019
12020 if (!UnicodeNormalizeRequired) {
12021 rb_require("unicode_normalize/normalize.rb");
12022 UnicodeNormalizeRequired = 1;
12023 }
12024 argv2[0] = str;
12025 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12026 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12027}
12028
12029/*
12030 * call-seq:
12031 * unicode_normalize(form = :nfc) -> string
12032 *
12033 * Returns a copy of +self+ with
12034 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
12035 *
12036 * Argument +form+ must be one of the following symbols
12037 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
12038 *
12039 * - +:nfc+: Canonical decomposition, followed by canonical composition.
12040 * - +:nfd+: Canonical decomposition.
12041 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
12042 * - +:nfkd+: Compatibility decomposition.
12043 *
12044 * The encoding of +self+ must be one of:
12045 *
12046 * - Encoding::UTF_8
12047 * - Encoding::UTF_16BE
12048 * - Encoding::UTF_16LE
12049 * - Encoding::UTF_32BE
12050 * - Encoding::UTF_32LE
12051 * - Encoding::GB18030
12052 * - Encoding::UCS_2BE
12053 * - Encoding::UCS_4BE
12054 *
12055 * Examples:
12056 *
12057 * "a\u0300".unicode_normalize # => "à" (same as "\u00E0")
12058 * "\u00E0".unicode_normalize(:nfd) # => "à" (same as "a\u0300")
12059 *
12060 * Related: String#unicode_normalize!, String#unicode_normalized?.
12061 */
12062static VALUE
12063rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12064{
12065 return unicode_normalize_common(argc, argv, str, id_normalize);
12066}
12067
12068/*
12069 * call-seq:
12070 * unicode_normalize!(form = :nfc) -> self
12071 *
12072 * Like String#unicode_normalize, except that the normalization
12073 * is performed on +self+.
12074 *
12075 * Related: String#unicode_normalized?.
12076 *
12077 */
12078static VALUE
12079rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12080{
12081 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12082}
12083
12084/* call-seq:
12085 * unicode_normalized?(form = :nfc) -> true or false
12086 *
12087 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12088 * +false+ otherwise.
12089 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12090 *
12091 * Examples:
12092 *
12093 * "a\u0300".unicode_normalized? # => false
12094 * "a\u0300".unicode_normalized?(:nfd) # => true
12095 * "\u00E0".unicode_normalized? # => true
12096 * "\u00E0".unicode_normalized?(:nfd) # => false
12097 *
12098 *
12099 * Raises an exception if +self+ is not in a Unicode encoding:
12100 *
12101 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12102 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12103 *
12104 * Related: String#unicode_normalize, String#unicode_normalize!.
12105 *
12106 */
12107static VALUE
12108rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12109{
12110 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12111}
12112
12113/**********************************************************************
12114 * Document-class: Symbol
12115 *
12116 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12117 *
12118 * You can create a +Symbol+ object explicitly with:
12119 *
12120 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12121 *
12122 * The same +Symbol+ object will be
12123 * created for a given name or string for the duration of a program's
12124 * execution, regardless of the context or meaning of that name. Thus
12125 * if <code>Fred</code> is a constant in one context, a method in
12126 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12127 * will be the same object in all three contexts.
12128 *
12129 * module One
12130 * class Fred
12131 * end
12132 * $f1 = :Fred
12133 * end
12134 * module Two
12135 * Fred = 1
12136 * $f2 = :Fred
12137 * end
12138 * def Fred()
12139 * end
12140 * $f3 = :Fred
12141 * $f1.object_id #=> 2514190
12142 * $f2.object_id #=> 2514190
12143 * $f3.object_id #=> 2514190
12144 *
12145 * Constant, method, and variable names are returned as symbols:
12146 *
12147 * module One
12148 * Two = 2
12149 * def three; 3 end
12150 * @four = 4
12151 * @@five = 5
12152 * $six = 6
12153 * end
12154 * seven = 7
12155 *
12156 * One.constants
12157 * # => [:Two]
12158 * One.instance_methods(true)
12159 * # => [:three]
12160 * One.instance_variables
12161 * # => [:@four]
12162 * One.class_variables
12163 * # => [:@@five]
12164 * global_variables.grep(/six/)
12165 * # => [:$six]
12166 * local_variables
12167 * # => [:seven]
12168 *
12169 * A +Symbol+ object differs from a String object in that
12170 * a +Symbol+ object represents an identifier, while a String object
12171 * represents text or data.
12172 *
12173 * == What's Here
12174 *
12175 * First, what's elsewhere. Class +Symbol+:
12176 *
12177 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12178 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12179 *
12180 * Here, class +Symbol+ provides methods that are useful for:
12181 *
12182 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12183 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12184 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12185 *
12186 * === Methods for Querying
12187 *
12188 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12189 * - #=~: Returns the index of the first substring in symbol that matches a
12190 * given Regexp or other object; returns +nil+ if no match is found.
12191 * - #[], #slice : Returns a substring of symbol
12192 * determined by a given index, start/length, or range, or string.
12193 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12194 * - #encoding: Returns the Encoding object that represents the encoding
12195 * of symbol.
12196 * - #end_with?: Returns +true+ if symbol ends with
12197 * any of the given strings.
12198 * - #match: Returns a MatchData object if symbol
12199 * matches a given Regexp; +nil+ otherwise.
12200 * - #match?: Returns +true+ if symbol
12201 * matches a given Regexp; +false+ otherwise.
12202 * - #length, #size: Returns the number of characters in symbol.
12203 * - #start_with?: Returns +true+ if symbol starts with
12204 * any of the given strings.
12205 *
12206 * === Methods for Comparing
12207 *
 *  - #<=>: Returns -1, 0, or 1 as +self+ is smaller than, equal to,
 *    or larger than a given symbol.
 *  - #==, #===: Returns +true+ if a given object is the same object
 *    as +self+.
 *  - #casecmp: Ignoring case, returns -1, 0, or 1 as +self+
 *    is smaller than, equal to, or larger than a given symbol.
12214 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12215 * after Unicode case folding; +false+ otherwise.
12216 *
12217 * === Methods for Converting
12218 *
12219 * - #capitalize: Returns symbol with the first character upcased
12220 * and all other characters downcased.
12221 * - #downcase: Returns symbol with all characters downcased.
12222 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12223 * - #name: Returns the frozen string corresponding to symbol.
12224 * - #succ, #next: Returns the symbol that is the successor to symbol.
12225 * - #swapcase: Returns symbol with all upcase characters downcased
12226 * and all downcase characters upcased.
12227 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12228 * - #to_s, #id2name: Returns the string corresponding to +self+.
12229 * - #to_sym, #intern: Returns +self+.
12230 * - #upcase: Returns symbol with all characters upcased.
12231 *
12232 */
12233
12234
/*
 *  call-seq:
 *    symbol == object -> true or false
 *
 *  Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
 */

/* Symbol equality has no implementation of its own: it aliases rb_obj_equal. */
#define sym_equal rb_obj_equal
12243
12244static int
12245sym_printable(const char *s, const char *send, rb_encoding *enc)
12246{
12247 while (s < send) {
12248 int n;
12249 int c = rb_enc_precise_mbclen(s, send, enc);
12250
12251 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12252 n = MBCLEN_CHARFOUND_LEN(c);
12253 c = rb_enc_mbc_to_codepoint(s, send, enc);
12254 if (!rb_enc_isprint(c, enc)) return FALSE;
12255 s += n;
12256 }
12257 return TRUE;
12258}
12259
12260int
12261rb_str_symname_p(VALUE sym)
12262{
12263 rb_encoding *enc;
12264 const char *ptr;
12265 long len;
12266 rb_encoding *resenc = rb_default_internal_encoding();
12267
12268 if (resenc == NULL) resenc = rb_default_external_encoding();
12269 enc = STR_ENC_GET(sym);
12270 ptr = RSTRING_PTR(sym);
12271 len = RSTRING_LEN(sym);
12272 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12273 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12274 return FALSE;
12275 }
12276 return TRUE;
12277}
12278
12279VALUE
12280rb_str_quote_unprintable(VALUE str)
12281{
12282 rb_encoding *enc;
12283 const char *ptr;
12284 long len;
12285 rb_encoding *resenc;
12286
12287 Check_Type(str, T_STRING);
12288 resenc = rb_default_internal_encoding();
12289 if (resenc == NULL) resenc = rb_default_external_encoding();
12290 enc = STR_ENC_GET(str);
12291 ptr = RSTRING_PTR(str);
12292 len = RSTRING_LEN(str);
12293 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12294 !sym_printable(ptr, ptr + len, enc)) {
12295 return rb_str_escape(str);
12296 }
12297 return str;
12298}
12299
12300VALUE
12301rb_id_quote_unprintable(ID id)
12302{
12303 VALUE str = rb_id2str(id);
12304 if (!rb_str_symname_p(str)) {
12305 return rb_str_escape(str);
12306 }
12307 return str;
12308}
12309
12310/*
12311 * call-seq:
12312 * inspect -> string
12313 *
12314 * Returns a string representation of +self+ (including the leading colon):
12315 *
12316 * :foo.inspect # => ":foo"
12317 *
12318 * Related: Symbol#to_s, Symbol#name.
12319 *
12320 */
12321
12322static VALUE
12323sym_inspect(VALUE sym)
12324{
12325 VALUE str = rb_sym2str(sym);
12326 const char *ptr;
12327 long len;
12328 char *dest;
12329
12330 if (!rb_str_symname_p(str)) {
12331 str = rb_str_inspect(str);
12332 len = RSTRING_LEN(str);
12333 rb_str_resize(str, len + 1);
12334 dest = RSTRING_PTR(str);
12335 memmove(dest + 1, dest, len);
12336 }
12337 else {
12338 rb_encoding *enc = STR_ENC_GET(str);
12339 VALUE orig_str = str;
12340
12341 len = RSTRING_LEN(orig_str);
12342 str = rb_enc_str_new(0, len + 1, enc);
12343
12344 // Get data pointer after allocation
12345 ptr = RSTRING_PTR(orig_str);
12346 dest = RSTRING_PTR(str);
12347 memcpy(dest + 1, ptr, len);
12348
12349 RB_GC_GUARD(orig_str);
12350 }
12351 dest[0] = ':';
12352
12354
12355 return str;
12356}
12357
12358VALUE
12360{
12361 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12362 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12363 return str;
12364}
12365
12366VALUE
12367rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12368{
12369 VALUE obj;
12370
12371 if (argc < 1) {
12372 rb_raise(rb_eArgError, "no receiver given");
12373 }
12374 obj = argv[0];
12375 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12376}
12377
12378/*
12379 * call-seq:
12380 * succ
12381 *
12382 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12383 *
12384 * :foo.succ # => :fop
12385 *
12386 * Related: String#succ.
12387 */
12388
12389static VALUE
12390sym_succ(VALUE sym)
12391{
12392 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12393}
12394
12395/*
12396 * call-seq:
12397 * symbol <=> object -> -1, 0, +1, or nil
12398 *
12399 * If +object+ is a symbol,
12400 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12401 *
12402 * :bar <=> :foo # => -1
12403 * :foo <=> :foo # => 0
12404 * :foo <=> :bar # => 1
12405 *
12406 * Otherwise, returns +nil+:
12407 *
12408 * :foo <=> 'bar' # => nil
12409 *
12410 * Related: String#<=>.
12411 */
12412
12413static VALUE
12414sym_cmp(VALUE sym, VALUE other)
12415{
12416 if (!SYMBOL_P(other)) {
12417 return Qnil;
12418 }
12419 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12420}
12421
12422/*
12423 * call-seq:
12424 * casecmp(object) -> -1, 0, 1, or nil
12425 *
12426 * :include: doc/symbol/casecmp.rdoc
12427 *
12428 */
12429
12430static VALUE
12431sym_casecmp(VALUE sym, VALUE other)
12432{
12433 if (!SYMBOL_P(other)) {
12434 return Qnil;
12435 }
12436 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12437}
12438
12439/*
12440 * call-seq:
12441 * casecmp?(object) -> true, false, or nil
12442 *
12443 * :include: doc/symbol/casecmp_p.rdoc
12444 *
12445 */
12446
12447static VALUE
12448sym_casecmp_p(VALUE sym, VALUE other)
12449{
12450 if (!SYMBOL_P(other)) {
12451 return Qnil;
12452 }
12453 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12454}
12455
12456/*
12457 * call-seq:
12458 * symbol =~ object -> integer or nil
12459 *
12460 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12461 * including possible updates to global variables;
12462 * see String#=~.
12463 *
12464 */
12465
12466static VALUE
12467sym_match(VALUE sym, VALUE other)
12468{
12469 return rb_str_match(rb_sym2str(sym), other);
12470}
12471
12472/*
12473 * call-seq:
12474 * match(pattern, offset = 0) -> matchdata or nil
12475 * match(pattern, offset = 0) {|matchdata| } -> object
12476 *
12477 * Equivalent to <tt>self.to_s.match</tt>,
12478 * including possible updates to global variables;
12479 * see String#match.
12480 *
12481 */
12482
12483static VALUE
12484sym_match_m(int argc, VALUE *argv, VALUE sym)
12485{
12486 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12487}
12488
12489/*
12490 * call-seq:
12491 * match?(pattern, offset) -> true or false
12492 *
12493 * Equivalent to <tt>sym.to_s.match?</tt>;
12494 * see String#match.
12495 *
12496 */
12497
12498static VALUE
12499sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12500{
12501 return rb_str_match_m_p(argc, argv, sym);
12502}
12503
12504/*
12505 * call-seq:
12506 * symbol[index] -> string or nil
12507 * symbol[start, length] -> string or nil
12508 * symbol[range] -> string or nil
12509 * symbol[regexp, capture = 0] -> string or nil
12510 * symbol[substring] -> string or nil
12511 *
12512 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12513 *
12514 */
12515
12516static VALUE
12517sym_aref(int argc, VALUE *argv, VALUE sym)
12518{
12519 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12520}
12521
12522/*
12523 * call-seq:
12524 * length -> integer
12525 *
12526 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12527 */
12528
12529static VALUE
12530sym_length(VALUE sym)
12531{
12532 return rb_str_length(rb_sym2str(sym));
12533}
12534
12535/*
12536 * call-seq:
12537 * empty? -> true or false
12538 *
12539 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12540 *
12541 */
12542
12543static VALUE
12544sym_empty(VALUE sym)
12545{
12546 return rb_str_empty(rb_sym2str(sym));
12547}
12548
12549/*
12550 * call-seq:
12551 * upcase(mapping) -> symbol
12552 *
12553 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12554 *
12555 * See String#upcase.
12556 *
12557 */
12558
12559static VALUE
12560sym_upcase(int argc, VALUE *argv, VALUE sym)
12561{
12562 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12563}
12564
12565/*
12566 * call-seq:
12567 * downcase(mapping) -> symbol
12568 *
12569 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12570 *
12571 * See String#downcase.
12572 *
12573 * Related: Symbol#upcase.
12574 *
12575 */
12576
12577static VALUE
12578sym_downcase(int argc, VALUE *argv, VALUE sym)
12579{
12580 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12581}
12582
12583/*
12584 * call-seq:
12585 * capitalize(mapping) -> symbol
12586 *
12587 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12588 *
12589 * See String#capitalize.
12590 *
12591 */
12592
12593static VALUE
12594sym_capitalize(int argc, VALUE *argv, VALUE sym)
12595{
12596 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12597}
12598
12599/*
12600 * call-seq:
12601 * swapcase(mapping) -> symbol
12602 *
12603 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12604 *
12605 * See String#swapcase.
12606 *
12607 */
12608
12609static VALUE
12610sym_swapcase(int argc, VALUE *argv, VALUE sym)
12611{
12612 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12613}
12614
12615/*
12616 * call-seq:
12617 * start_with?(*string_or_regexp) -> true or false
12618 *
12619 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12620 *
12621 */
12622
12623static VALUE
12624sym_start_with(int argc, VALUE *argv, VALUE sym)
12625{
12626 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12627}
12628
12629/*
12630 * call-seq:
12631 * end_with?(*strings) -> true or false
12632 *
12633 *
12634 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12635 *
12636 */
12637
12638static VALUE
12639sym_end_with(int argc, VALUE *argv, VALUE sym)
12640{
12641 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12642}
12643
12644/*
12645 * call-seq:
12646 * encoding -> encoding
12647 *
12648 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12649 *
12650 */
12651
12652static VALUE
12653sym_encoding(VALUE sym)
12654{
12655 return rb_obj_encoding(rb_sym2str(sym));
12656}
12657
12658static VALUE
12659string_for_symbol(VALUE name)
12660{
12661 if (!RB_TYPE_P(name, T_STRING)) {
12662 VALUE tmp = rb_check_string_type(name);
12663 if (NIL_P(tmp)) {
12664 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12665 name);
12666 }
12667 name = tmp;
12668 }
12669 return name;
12670}
12671
12672ID
12674{
12675 if (SYMBOL_P(name)) {
12676 return SYM2ID(name);
12677 }
12678 name = string_for_symbol(name);
12679 return rb_intern_str(name);
12680}
12681
12682VALUE
12684{
12685 if (SYMBOL_P(name)) {
12686 return name;
12687 }
12688 name = string_for_symbol(name);
12689 return rb_str_intern(name);
12690}
12691
12692/*
12693 * call-seq:
12694 * Symbol.all_symbols -> array_of_symbols
12695 *
12696 * Returns an array of all symbols currently in Ruby's symbol table:
12697 *
12698 * Symbol.all_symbols.size # => 9334
12699 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12700 *
12701 */
12702
12703static VALUE
12704sym_all_symbols(VALUE _)
12705{
12706 return rb_sym_all_symbols();
12707}
12708
12709VALUE
12710rb_str_to_interned_str(VALUE str)
12711{
12712 return rb_fstring(str);
12713}
12714
12715VALUE
12716rb_interned_str(const char *ptr, long len)
12717{
12718 struct RString fake_str;
12719 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12720}
12721
12722VALUE
12724{
12725 return rb_interned_str(ptr, strlen(ptr));
12726}
12727
12728VALUE
12729rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12730{
12731 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12732 rb_enc_autoload(enc);
12733 }
12734
12735 struct RString fake_str;
12736 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12737}
12738
12739VALUE
12740rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12741{
12742 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12743 rb_enc_autoload(enc);
12744 }
12745
12746 struct RString fake_str;
12747 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12748}
12749
12750VALUE
12752{
12753 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12754}
12755
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when the string's inlined encoding index is the ASCII-8BIT
 * one and the codepoint, read as ssize_t, satisfies 0 <= code < 0xff, the
 * single byte is appended with rb_str_buf_cat_byte().  Every other case
 * (including the value 0xff itself) goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary_p = ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex();

    if (RB_LIKELY(binary_p)) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(0 <= code && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12772
12773static int
12774fstring_set_class_i(VALUE *str, void *data)
12775{
12776 RBASIC_SET_CLASS(*str, rb_cString);
12777
12778 return ST_CONTINUE;
12779}
12780
/*
 * Defines the String and Symbol classes and registers their C-implemented
 * methods, the UnicodeNormalize module, the case-mapping option symbols
 * and the $; / $-F hooked variables.
 *
 * NOTE(review): the listing this function was recovered from has gaps
 * inside this body, so some registrations may be absent here -- verify
 * the list against upstream before relying on it.
 */
void
Init_String(void)
{
    rb_cString = rb_define_class("String", rb_cObject);

    /* Walk the existing fstring table and set each entry's class to the
     * String class just created (see fstring_set_class_i). */
    rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);

    /* --- String: construction, comparison, element access, searching --- */
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    /* succ! and next! share one implementation */
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
    rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
    rb_define_method(rb_cString, "scrub", str_scrub, -1);
    rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
    rb_define_method(rb_cString, "+@", str_uplus, 0);
    rb_define_method(rb_cString, "-@", str_uminus, 0);
    rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
    /* dedup is an alias of the unary minus defined just above */
    rb_define_alias(rb_cString, "dedup", "-@");

    /* --- String: conversions (to_s and to_str share one implementation) --- */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "undump", str_undump, 0);

    /* Symbols :ascii, :turkic, :lithuanian and :fold, created once here. */
    sym_ascii = ID2SYM(rb_intern_const("ascii"));
    sym_turkic = ID2SYM(rb_intern_const("turkic"));
    sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
    sym_fold = ID2SYM(rb_intern_const("fold"));

    /* --- String: case mapping --- */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);

    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);

    /* --- String: parsing, splitting, enumeration-style accessors --- */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
    rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* --- String: padding --- */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* --- String: substitution and trimming (non-bang forms) --- */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
    rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
    rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);

    /* --- String: substitution and trimming (bang forms) --- */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
    rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
    rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);

    /* --- String: character-set operations --- */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* --- String: iteration --- */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* slice shares its implementation with [] registered above */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* --- String: encoding --- */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* define UnicodeNormalize module here so that we don't have to look it up */
    mUnicodeNormalize = rb_define_module("UnicodeNormalize");
    id_normalize = rb_intern_const("normalize");
    id_normalized_p = rb_intern_const("normalized?");

    rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
    rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
    rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);

    /* $; and $-F are two names for the same variable (rb_fs); it starts
     * as nil, is written through rb_fs_setter, and its address is
     * registered with the GC. */
    rb_fs = Qnil;
    rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
    rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
    rb_gc_register_address(&rb_fs);

    /* --- Symbol --- */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);

    /* == and === share sym_equal; succ and next share sym_succ */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* [] and slice share sym_aref; length and size share sym_length */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
    rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);

    rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
    rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
12991
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1695
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1478
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1596
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2847
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2667
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3137
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1036
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2926
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1681
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:676
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2164
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2182
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1341
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3578
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:583
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:177
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1329
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1327
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:932
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1192
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3003
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1211
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12729
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2309
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3707
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1140
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1432
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1333
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:951
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12751
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:816
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2677
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2940
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:701
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1879
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1885
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1927
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4220
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3717
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1727
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1497
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2462
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1582
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:944
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:938
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3772
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1408
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12359
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2535
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1384
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1721
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3031
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5408
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4136
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3128
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11658
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1763
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1680
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1174
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1531
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:986
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1503
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1971
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4122
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3540
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2398
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:1989
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1638
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1566
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6660
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3136
rb_gvar_setter_t rb_str_setter
This is an rb_gvar_setter_t that rejects non-string assignments.
Definition string.h:1145
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12723
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1414
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1603
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3738
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3078
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4243
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3362
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7327
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2765
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12716
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4190
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4009
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4165
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1691
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3714
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3253
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5918
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11716
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1624
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1677
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:630
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2925
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3225
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1655
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3344
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1186
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1548
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2719
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7434
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1396
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1693
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2412
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5836
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9495
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1180
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1825
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1984
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2063
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3340
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1603
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12683
ID rb_to_id(VALUE str)
Definition string.c:12673
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3496
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4464
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1426
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2902
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2784
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1420
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2797
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1754
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:456
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1596
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition string.c:8378
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113